In [20]:
from pyspark.sql import SparkSession
from pyspark.mllib.stat import Statistics
import pyspark.sql.functions as fun
import matplotlib.pyplot as plt
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import os, sys

In [2]:
# Point both the Spark workers and the driver at the interpreter running this
# kernel, so they all use the same Python installation.
os.environ.update(
    {
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

In [3]:
# Start (or reuse) a Spark session with master "local[1]" for this notebook.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("ML_LogisticRegression")
    .getOrCreate()
)

In [4]:
# Load the diabetes dataset; the first row holds the column names and the
# column types are inferred from the data.
df = spark.read.options(header=True, inferSchema=True).csv("diabetes.csv")

In [5]:
# Print the inferred schema (column names, types, nullability) as a tree.
df.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



In [6]:
# Preview the first 5 rows of the raw data.
df.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
only showing top 5 rows



In [7]:
# Column names paired with their Spark SQL type names.
df.dtypes

[('Pregnancies', 'int'),
 ('Glucose', 'int'),
 ('BloodPressure', 'int'),
 ('SkinThickness', 'int'),
 ('Insulin', 'int'),
 ('BMI', 'double'),
 ('DiabetesPedigreeFunction', 'double'),
 ('Age', 'int'),
 ('Outcome', 'int')]

In [8]:
# Summary statistics (count, mean, stddev, min, max) for a few key columns.
df.describe("Glucose", "Insulin", "BMI", "Age").show()

+-------+-----------------+------------------+------------------+------------------+
|summary|          Glucose|           Insulin|               BMI|               Age|
+-------+-----------------+------------------+------------------+------------------+
|  count|              768|               768|               768|               768|
|   mean|     120.89453125| 79.79947916666667|31.992578124999977|33.240885416666664|
| stddev|31.97261819513622|115.24400235133803| 7.884160320375441|11.760231540678689|
|    min|                0|                 0|               0.0|                21|
|    max|              199|               846|              67.1|                81|
+-------+-----------------+------------------+------------------+------------------+



In [9]:
# Every column except the label is used as a model feature. Selecting by name
# (rather than "all but the last column") keeps this correct even if the
# column order of the CSV changes.
feature_names = [name for name in df.columns if name != "Outcome"]

In [10]:
# Show the list of columns that will be assembled into the feature vector.
feature_names

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

In [11]:
# Pack the individual feature columns into a single vector column named
# "features", then keep only that vector and the "Outcome" label.
assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
output = (
    assembler
    .transform(df)
    .select("features", "Outcome")
)

In [12]:
# Preview the first 5 assembled feature vectors alongside their labels.
output.show(5)

+--------------------+-------+
|            features|Outcome|
+--------------------+-------+
|[6.0,148.0,72.0,3...|      1|
|[1.0,85.0,66.0,29...|      0|
|[8.0,183.0,64.0,0...|      1|
|[1.0,89.0,66.0,23...|      0|
|[0.0,137.0,40.0,3...|      1|
+--------------------+-------+
only showing top 5 rows



In [21]:
# Split the data into training (75%) and testing (25%) sets.
# A fixed seed makes the split, and therefore the metrics computed from it,
# repeatable when the notebook is re-run.
train_data, test_data = output.randomSplit([0.75, 0.25], seed=42)

In [22]:
# Configure the logistic-regression estimator. Naming note: despite its name,
# `model` is the unfitted estimator; the fitted model is `lr` below.
model = LogisticRegression(featuresCol="features", labelCol="Outcome")
# Earlier variant, kept for reference: fit on the full assembled dataset.
# lr = model.fit(output)
# Fit on the training split only, so the test split stays unseen.
lr = model.fit(train_data)

In [24]:
# Earlier variant, kept for reference: score the full assembled dataset.
# train_pred = lr.transform(output)
# Score the held-out test split with the fitted model.
test_pred = lr.transform(test_data)

In [25]:
# Preview the first 10 scored test rows (label next to the model's prediction).
test_pred.show(10)

+--------------------+-------+--------------------+--------------------+----------+
|            features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[0,1,6,7],[6.0...|      0|[2.72271260887795...|[0.93835363426636...|       0.0|
|(8,[1,5,6,7],[99....|      0|[2.07138411922271...|[0.88809059699580...|       0.0|
|(8,[1,5,6,7],[138...|      1|[-0.7475586227295...|[0.32135349725086...|       1.0|
|(8,[1,6,7],[94.0,...|      0|[4.18893434885566...|[0.98506403150641...|       0.0|
|[0.0,91.0,68.0,32...|      0|[2.24300208107199...|[0.90404519732709...|       0.0|
|[0.0,94.0,70.0,27...|      0|[1.83133582721838...|[0.86192078496882...|       0.0|
|[0.0,98.0,82.0,15...|      0|[3.30856711895124...|[0.96472154674658...|       0.0|
|[0.0,101.0,64.0,1...|      0|[3.16261028119505...|[0.95940273660461...|       0.0|
|[0.0,104.0,64.0,2...|      0|[2.53232388927175...|[0.92637700593134...|    

In [29]:
# Evaluator that compares the "prediction" column against the "Outcome" label;
# the metric to compute is chosen per call to evaluate().
evaluator = MulticlassClassificationEvaluator(labelCol="Outcome", predictionCol="prediction")

In [33]:
# Score the test predictions once per metric, passing the metric name as a
# per-call param override rather than changing the evaluator itself.
accuracy, precision = (
    evaluator.evaluate(test_pred, {evaluator.metricName: metric})
    for metric in ("accuracy", "weightedPrecision")
)

In [34]:
# Test-set accuracy and weighted precision (`precision` holds the
# "weightedPrecision" metric).
accuracy, precision

(0.7609756097560976, 0.7515142276422765)

In [26]:
# y_true = train_pred.select("Outcome")
# y_true = y_true.toPandas()

# y_pred = train_pred.select("prediction")
# y_pred = y_pred.toPandas()

In [17]:
# from sklearn.metrics import accuracy_score

In [27]:
# accuracy_score(y_true, y_pred)

In [28]:
# confusion_matrix(y_true, y_pred)