In [1]:
import itertools
import os

# Star imports are kept in their original relative order: on a name clash
# the later import wins, so reordering them could change what a name means.
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.ml import Pipeline
from pyspark.ml.classification import *
from pyspark.ml.regression import *
from pyspark.ml.tree import *
from pyspark.ml.feature import *

In [2]:
# Create the SparkSession used by every later cell (or reuse one that is
# already running), requesting 4g of memory for the driver and the executors.
spark = (
    SparkSession.builder
    .appName("Structured Data")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)


In [3]:
# Location of the diabetes CSV file(s). Set the DIABETES_CSV_GLOB environment
# variable to point the notebook at another machine's copy; when it is unset
# the original hard-coded location is used, so existing runs are unaffected.
DATA_GLOB = os.environ.get("DIABETES_CSV_GLOB", "D:\\DataSets\\Diabetes\\*.csv")

# header=True takes column names from the first line of each file. No schema
# inference is requested, so every column is loaded as a string.
df = spark.read.csv(DATA_GLOB, header=True)

In [4]:
# Preview the first five rows, converted to pandas for rich notebook display.
df.limit(5).toPandas()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Print the column names and types of the freshly loaded DataFrame.
df.printSchema()

root
 |-- Pregnancies: string (nullable = true)
 |-- Glucose: string (nullable = true)
 |-- BloodPressure: string (nullable = true)
 |-- SkinThickness: string (nullable = true)
 |-- Insulin: string (nullable = true)
 |-- BMI: string (nullable = true)
 |-- DiabetesPedigreeFunction: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Outcome: string (nullable = true)



In [6]:
# Target Spark type for each column. The columns arrive as strings from the
# CSV reader; the entries are applied in the order listed here.
column_types = {
    "Pregnancies": IntegerType(),
    "Age": IntegerType(),
    "Outcome": IntegerType(),
    "Glucose": IntegerType(),
    "SkinThickness": IntegerType(),
    "BloodPressure": IntegerType(),
    "BMI": FloatType(),
    "DiabetesPedigreeFunction": FloatType(),
    "Insulin": IntegerType(),
}

# withColumn on an existing name replaces that column in place, so the
# column order of df is unchanged.
for column_name, spark_type in column_types.items():
    df = df.withColumn(column_name, col(column_name).cast(spark_type))

In [7]:
# Five rows with the highest BloodPressure, shown as a pandas DataFrame.
df.sort("BloodPressure", ascending=False).limit(5).toPandas()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,96,122,0,0,22.4,0.207,27,0
1,13,158,114,0,0,42.299999,0.257,44,1
2,9,171,110,24,240,45.400002,0.721,54,1
3,0,129,110,46,130,67.099998,0.319,26,1
4,4,189,110,31,0,28.5,0.68,37,0


In [8]:
# List the distinct values of the label column.
df.select("Outcome").distinct().show()

+-------+
|Outcome|
+-------+
|      1|
|      0|
+-------+



In [9]:
# Report the DataFrame's shape as "<rows> , <columns>".
row_count = df.count()
column_count = len(df.columns)
print(row_count, ",", column_count)

768 , 9


In [10]:
# Count rows per Outcome value to see how the two classes are balanced.
df.groupBy("Outcome").count().show()

+-------+-----+
|Outcome|count|
+-------+-----+
|      1|  268|
|      0|  500|
+-------+-----+



In [11]:
# Re-print the schema; at this point df holds the integer/float cast columns.
df.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: float (nullable = true)
 |-- DiabetesPedigreeFunction: float (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



In [12]:
# Show the first five rows without truncating cell contents.
df.show(n=5, truncate=False)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI |DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|6          |148    |72           |35           |0      |33.6|0.627                   |50 |1      |
|1          |85     |66           |29           |0      |26.6|0.351                   |31 |0      |
|8          |183    |64           |0            |0      |23.3|0.672                   |32 |1      |
|1          |89     |66           |23           |94     |28.1|0.167                   |21 |0      |
|0          |137    |40           |35           |168    |43.1|2.288                   |33 |1      |
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
only showing top 5 rows



In [13]:
# Five rows with the highest Age, printed without truncation.
df.sort("Age", ascending=False).show(n=5, truncate=False)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI |DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|9          |134    |74           |33           |60     |25.9|0.46                    |81 |0      |
|2          |119    |0            |0            |0      |19.6|0.832                   |72 |0      |
|4          |145    |82           |18           |0      |32.5|0.235                   |70 |1      |
|5          |136    |82           |0            |0      |0.0 |0.64                    |69 |0      |
|5          |132    |80           |0            |0      |26.8|0.186                   |69 |0      |
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
only showing top 5 rows



In [37]:
# Distinct Age values from highest to lowest (show() prints the first 20).
df.select("Age").distinct().sort("Age", ascending=False).show()

+---+
|Age|
+---+
| 81|
| 72|
| 70|
| 69|
| 68|
| 67|
| 66|
| 65|
| 64|
| 63|
| 62|
| 61|
| 60|
| 59|
| 58|
| 57|
| 56|
| 55|
| 54|
| 53|
+---+
only showing top 20 rows



In [14]:
# Total number of rows in df.
df.count()

768

In [15]:
# First five rows as a pandas DataFrame, flipped so that each column name
# becomes a row label, which is easier to read for wide tables.
pd.DataFrame(data=df.take(5), columns=df.columns).T

Unnamed: 0,0,1,2,3,4
Pregnancies,6.0,1.0,8.0,1.0,0.0
Glucose,148.0,85.0,183.0,89.0,137.0
BloodPressure,72.0,66.0,64.0,66.0,40.0
SkinThickness,35.0,29.0,0.0,23.0,35.0
Insulin,0.0,0.0,0.0,94.0,168.0
BMI,33.599998,26.6,23.299999,28.1,43.099998
DiabetesPedigreeFunction,0.627,0.351,0.672,0.167,2.288
Age,50.0,31.0,32.0,21.0,33.0
Outcome,1.0,0.0,1.0,0.0,1.0


In [16]:
from pyspark.ml.feature import VectorAssembler

# Model inputs: every column of df except the label column, Outcome.
required_features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# Pack the input columns into a single vector column named 'features'.
assembler = VectorAssembler(outputCol='features', inputCols=required_features)
transformed_data = assembler.transform(df)

In [17]:
# Random 70/30 train/test split; the seed is fixed so the split is repeatable.
train_data, test_data = transformed_data.randomSplit(weights=[0.7, 0.3], seed=2018)

# Report the size of each partition.
train_count = train_data.count()
test_count = test_data.count()
print("Training data set count is: ", train_count)
print("Test dataset count is: ", test_count)

Training data set count is:  547
Test dataset count is:  221


In [18]:
# Inspect the first five training rows, including the assembled feature vector.
train_data.show(n=5, truncate=False)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+---------------------------------------------------------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI |DiabetesPedigreeFunction|Age|Outcome|features                                                             |
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+---------------------------------------------------------------------+
|0          |67     |76           |0            |0      |45.3|0.194                   |46 |0      |[0.0,67.0,76.0,0.0,0.0,45.29999923706055,0.1940000057220459,46.0]    |
|0          |73     |0            |0            |0      |21.1|0.342                   |25 |0      |(8,[1,5,6,7],[73.0,21.100000381469727,0.34200000762939453,25.0])     |
|0          |74     |52           |10           |36     |27.8|0.269                   |22 |0      |[0.0,74.0,52.0,10.0,36.0,27.799999237060547,0.26899

In [31]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest predicting Outcome from the assembled 'features' vector,
# with each tree limited to depth 4.
# The seed is set explicitly because forest training is randomised; without
# it the fitted model (and the reported accuracy) may differ between kernel
# sessions. 2018 matches the seed already used for the train/test split.
randomForest = RandomForestClassifier(
    labelCol='Outcome',
    featuresCol='features',
    maxDepth=4,
    seed=2018,
)
model = randomForest.fit(train_data)


In [32]:
# Apply the fitted model to the held-out test rows; the evaluator below reads
# the resulting 'prediction' column.
predictions = model.transform(test_data)

In [33]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator that scores the 'prediction' column against the 'Outcome' label
# using the accuracy metric.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='Outcome',
    predictionCol='prediction',
)

In [34]:
# Score the test-set predictions with the accuracy evaluator and report it.
accuracy = evaluator.evaluate(predictions)
print("Test accuracy is: ", accuracy)

Test accuracy is:  0.7647058823529411
