In [138]:
from __future__ import print_function
import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression

In [139]:
# create spark session
spark = SparkSession.builder.appName("Logistic_Regression_Classifier_Demo").getOrCreate()

In [140]:
#Load the data 
df = spark.read.csv("diabetes.csv",header=True)

In [141]:
# shape of df
print((df.count(),len(df.columns)))

(768, 9)


In [142]:
#show the data
df.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|  31|                   0.248| 26|      1|


In [113]:
#print the schema
df.printSchema()

root
 |-- Pregnancies: string (nullable = true)
 |-- Glucose: string (nullable = true)
 |-- BloodPressure: string (nullable = true)
 |-- SkinThickness: string (nullable = true)
 |-- Insulin: string (nullable = true)
 |-- BMI: string (nullable = true)
 |-- DiabetesPedigreeFunction: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Outcome: string (nullable = true)



In [143]:
# EDA

df.describe().show()

+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------------+------------------+------------------+
|summary|       Pregnancies|          Glucose|     BloodPressure|     SkinThickness|           Insulin|               BMI|DiabetesPedigreeFunction|               Age|           Outcome|
+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------------+------------------+------------------+
|  count|               768|              768|               768|               768|               768|               768|                     768|               768|               768|
|   mean|3.8450520833333335|     120.89453125|       69.10546875|20.536458333333332| 79.79947916666667|31.992578124999977|      0.4718763020833327|33.240885416666664|0.3489583333333333|
| stddev|  3.36957806269887|31.97261819513622|19.355807170644777|15.95

In [144]:
# convert Sting to float 
from pyspark.sql.functions import col
new_df = df.select(*(col(c).cast("float").alias(c) for c in df.columns))

In [145]:
#check the data types
new_df.printSchema()

root
 |-- Pregnancies: float (nullable = true)
 |-- Glucose: float (nullable = true)
 |-- BloodPressure: float (nullable = true)
 |-- SkinThickness: float (nullable = true)
 |-- Insulin: float (nullable = true)
 |-- BMI: float (nullable = true)
 |-- DiabetesPedigreeFunction: float (nullable = true)
 |-- Age: float (nullable = true)
 |-- Outcome: float (nullable = true)



In [188]:
#check for missing data 
from pyspark.sql.functions import col, count, isnan, when
new_df.select([count(when(col(c).isNull(), c)).alias(c) for c in new_df.columns]).show()

+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|          0|      0|            0|            0|      0|  0|                       0|  0|      0|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+



In [189]:
# convert the attributes into single features column
featurescols = new_df.drop("Outcome").columns
assembler = VectorAssembler(inputCols=featurescols,outputCol="features")

df_features= assembler.transform(new_df)


In [190]:
df_features.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction| Age|Outcome|            features|
+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+--------------------+
|        6.0|  148.0|         72.0|         35.0|    0.0|33.6|                   0.627|50.0|    1.0|[6.0,148.0,72.0,3...|
|        1.0|   85.0|         66.0|         29.0|    0.0|26.6|                   0.351|31.0|    0.0|[1.0,85.0,66.0,29...|
|        8.0|  183.0|         64.0|          0.0|    0.0|23.3|                   0.672|32.0|    1.0|[8.0,183.0,64.0,0...|
|        1.0|   89.0|         66.0|         23.0|   94.0|28.1|                   0.167|21.0|    0.0|[1.0,89.0,66.0,23...|
|        0.0|  137.0|         40.0|         35.0|  168.0|43.1|                   2.288|33.0|    1.0|[0.0,137.0,40.0,3...|
|        5.0|  116.0|   

In [214]:
(train, test) = df_features.randomSplit([0.7, 0.3],seed=1)

In [215]:
lgclsf = LogisticRegression(labelCol="Outcome", featuresCol="features")

In [216]:
model = lgclsf.fit(train)

In [217]:
predictions = model.transform(test)

In [218]:
predictions.select("features","Outcome","prediction").show()

+--------------------+-------+----------+
|            features|Outcome|prediction|
+--------------------+-------+----------+
|[0.0,78.0,88.0,29...|    0.0|       0.0|
|[0.0,93.0,60.0,0....|    0.0|       0.0|
|[0.0,95.0,80.0,45...|    0.0|       0.0|
|[0.0,97.0,64.0,36...|    0.0|       0.0|
|(8,[1,5,6,7],[99....|    0.0|       0.0|
|[0.0,101.0,64.0,1...|    0.0|       0.0|
|[0.0,102.0,64.0,4...|    0.0|       0.0|
|[0.0,104.0,64.0,2...|    0.0|       0.0|
|[0.0,105.0,68.0,2...|    0.0|       0.0|
|[0.0,105.0,90.0,0...|    0.0|       0.0|
|[0.0,106.0,70.0,3...|    0.0|       0.0|
|[0.0,107.0,60.0,2...|    0.0|       0.0|
|[0.0,107.0,76.0,0...|    0.0|       0.0|
|[0.0,108.0,68.0,2...|    0.0|       0.0|
|[0.0,114.0,80.0,3...|    0.0|       0.0|
|[0.0,118.0,64.0,2...|    0.0|       0.0|
|[0.0,119.0,66.0,2...|    0.0|       0.0|
|[0.0,123.0,88.0,3...|    0.0|       0.0|
|[0.0,137.0,70.0,3...|    0.0|       0.0|
|[0.0,146.0,70.0,0...|    1.0|       1.0|
+--------------------+-------+----

In [219]:
#Evaluate model perfromance 
ev = BinaryClassificationEvaluator(
    labelCol="Outcome", rawPredictionCol="prediction")
accuracy_details  = ev.evaluate(predictions)


In [220]:
print ("Accuracy for Logistic Regression metric AUC ",round(accuracy_details*100,2) ,'%')



Accuracy for Logistic Regression metric AUC  74.06 %


In [221]:
TP = predictions.filter((col("Outcome") == 1) & (col("prediction") == 1)).count()

TN = predictions.filter((col("Outcome") == 0) & (col("prediction") == 0)).count()
FP = predictions.filter((col("Outcome") == 0) & (col("prediction") == 1)).count()
FP = predictions.filter((col("Outcome") == 0 )& (col("prediction") == 1)).count()
FN = predictions.filter((col("Outcome") == 1) & (col("prediction") == 0)).count()
FN1 = predictions[(predictions.Outcome == 1) & (predictions.prediction == 0)].count()

In [222]:

recall = float(TP/(TP+FN))
precision = float((TP) / (TP + FP))
accuracy=float((TP+TN) /(predictions.count()))
f1_score =2*float((precision*recall)/(precision + recall))

print(recall)
print(precision)
print(accuracy)
print(f1_score)


0.5679012345679012
0.7796610169491526
0.7922077922077922
0.6571428571428571
