In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook. The application name
# is what appears in the Spark UI and logs, so it is named after the dataset
# being analysed here rather than the unrelated 'salary'.
spark = SparkSession.builder.appName('diabetes').getOrCreate()
spark

In [2]:
import os

# Location of the diabetes CSV. Set the DIABETES_CSV environment variable to
# point at the file on another machine; the original local path remains the
# default so existing runs behave the same.
DATA_PATH = os.environ.get("DIABETES_CSV", "C:\\Users\\HAI\\Downloads\\diabetes (1).csv")

# header=True takes column names from the first row; inferSchema=True asks
# Spark to infer each column's type instead of reading everything as strings.
data = spark.read.csv(DATA_PATH, header=True, inferSchema=True)
data.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
only showing top 5 rows



In [3]:
from pyspark.sql.functions import isnan,when,count,col

# Count NULL entries per column: when() produces a non-null value only for
# rows where the column is NULL, and count() tallies those non-null values.
null_count_exprs = []
for column_name in data.columns:
    null_marker = when(col(column_name).isNull(), column_name)
    null_count_exprs.append(count(null_marker).alias(column_name))

data.select(null_count_exprs).show()


+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|          0|      0|            0|            0|      0|  0|                       0|  0|      0|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+



In [4]:
# Summary statistics for every column of the loaded DataFrame.
summary_stats = data.describe()
summary_stats.show()

+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------------+------------------+------------------+
|summary|       Pregnancies|          Glucose|     BloodPressure|     SkinThickness|           Insulin|               BMI|DiabetesPedigreeFunction|               Age|           Outcome|
+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------------+------------------+------------------+
|  count|               768|              768|               768|               768|               768|               768|                     768|               768|               768|
|   mean|3.8450520833333335|     120.89453125|       69.10546875|20.536458333333332| 79.79947916666667|31.992578124999977|      0.4718763020833327|33.240885416666664|0.3489583333333333|
| stddev|  3.36957806269887|31.97261819513622|19.355807170644777|15.95

In [5]:
# List the column names of the loaded DataFrame (used below to pick the
# feature columns and the 'Outcome' label).
data.columns

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [6]:
from pyspark.ml.feature import VectorAssembler

# Every column except the 'Outcome' label is a model input. Deriving the list
# from the DataFrame keeps it in sync with the schema instead of maintaining a
# hand-copied list of column names.
feature_cols = [name for name in data.columns if name != 'Outcome']

# Pack the individual feature columns into the single vector column that
# Spark ML estimators expect.
assembler = VectorAssembler(inputCols=feature_cols,
                            outputCol='Independent Features')

output = assembler.transform(data)
output.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|Independent Features|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|[6.0,148.0,72.0,3...|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|[1.0,85.0,66.0,29...|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|[8.0,183.0,64.0,0...|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|[1.0,89.0,66.0,23...|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|[0.0,137.0,40.0,3...|
|          5|    116|           

In [7]:
# Keep only what the model needs: the assembled feature vector and the label.
model_columns = ['Independent Features', 'Outcome']
finilized_data = output.select(*model_columns)
finilized_data.show()

+--------------------+-------+
|Independent Features|Outcome|
+--------------------+-------+
|[6.0,148.0,72.0,3...|      1|
|[1.0,85.0,66.0,29...|      0|
|[8.0,183.0,64.0,0...|      1|
|[1.0,89.0,66.0,23...|      0|
|[0.0,137.0,40.0,3...|      1|
|[5.0,116.0,74.0,0...|      0|
|[3.0,78.0,50.0,32...|      1|
|[10.0,115.0,0.0,0...|      0|
|[2.0,197.0,70.0,4...|      1|
|[8.0,125.0,96.0,0...|      1|
|[4.0,110.0,92.0,0...|      0|
|[10.0,168.0,74.0,...|      1|
|[10.0,139.0,80.0,...|      0|
|[1.0,189.0,60.0,2...|      1|
|[5.0,166.0,72.0,1...|      1|
|[7.0,100.0,0.0,0....|      1|
|[0.0,118.0,84.0,4...|      1|
|[7.0,107.0,74.0,0...|      1|
|[1.0,103.0,30.0,3...|      0|
|[1.0,115.0,70.0,3...|      1|
+--------------------+-------+
only showing top 20 rows



In [14]:
from pyspark.ml.classification import LogisticRegression

# 75/25 train/test split. The fixed seed makes the split, and therefore the
# downstream metrics, reproducible from one run to the next.
train_data, test_data = finilized_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so that `LR` only ever refers
# to one kind of object: the fitted model used by the following cells.
lr_estimator = LogisticRegression(featuresCol='Independent Features', labelCol='Outcome')
LR = lr_estimator.fit(train_data)

In [15]:
# Score the held-out rows with the fitted model; the result shown below gains
# rawPrediction, probability and prediction columns.
pred_test = LR.transform(test_data)
pred_test.show()


+--------------------+-------+--------------------+--------------------+----------+
|Independent Features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[0,1,6,7],[2.0...|      0|[4.82299423487801...|[0.99202149903683...|       0.0|
|(8,[0,1,6,7],[3.0...|      0|[4.41175443997774...|[0.98801159427908...|       0.0|
|(8,[1,5,6,7],[131...|      1|[-0.4716591799569...|[0.38422361322083...|       1.0|
|(8,[1,5,6,7],[141...|      1|[-0.7139239259861...|[0.32873237645375...|       1.0|
|[0.0,91.0,68.0,32...|      0|[2.09790914672264...|[0.89069979277969...|       0.0|
|[0.0,91.0,80.0,0....|      0|[2.29350638893305...|[0.90833781112550...|       0.0|
|[0.0,93.0,60.0,25...|      0|[2.58555910560882...|[0.92992638570546...|       0.0|
|[0.0,100.0,70.0,2...|      0|[2.22046586137962...|[0.90207235659027...|       0.0|
|[0.0,100.0,88.0,6...|      0|[0.89860103675377...|[0.71066193042924...|    

In [16]:
# Side-by-side view of the true label and the predicted label (first 10 rows).
outcome_vs_prediction = pred_test.select('Outcome', 'prediction')
outcome_vs_prediction.show(10)

+-------+----------+
|Outcome|prediction|
+-------+----------+
|      0|       0.0|
|      0|       0.0|
|      1|       1.0|
|      1|       1.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
+-------+----------+
only showing top 10 rows



In [17]:
# Summary of the fitted model. NOTE: per the PySpark docs this summary is
# computed on the data the model was trained on (train_data), so the metrics
# read from it are training-set metrics, not test-set metrics.
lr_summary = LR.summary

In [18]:
# Overall accuracy as a percentage, taken from the model summary
# (i.e. measured on the training data, not on the held-out test set).
lr_summary.accuracy*100

77.21739130434783

In [19]:
# Area under the ROC curve as a percentage, taken from the model summary
# (training data).
print('areaUnderROC:',lr_summary.areaUnderROC*100)

areaUnderROC: 83.04883958860624


In [20]:
# Precision for each label (one list entry per class), taken from the model
# summary (training data).
print('precision:',lr_summary.precisionByLabel)

precision: [0.7952380952380952, 0.7096774193548387]


In [21]:
# Recall for each label (one list entry per class), taken from the model
# summary (training data).
print('recall:',lr_summary.recallByLabel)

recall: [0.8812664907651715, 0.5612244897959183]


In [22]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy on the held-out predictions: compares the 'Outcome' label with the
# model's 'prediction' column in pred_test.
evaluater = MulticlassClassificationEvaluator(
    labelCol='Outcome',
    predictionCol='prediction',
    metricName='accuracy',
)
accuracy_F = evaluater.evaluate(pred_test)
print('accuracy:', accuracy_F * 100)

accuracy: 80.31088082901555
