In [192]:
import pyspark
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
from pyspark.sql.functions import col, sum
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier

##### Business and Data Understanding

The Diabetes prediction dataset is a collection of medical and demographic data from patients, along with their diabetes status (positive or negative). The data includes features such as age, gender, body mass index (BMI), hypertension, heart disease, smoking history, HbA1c level, and blood glucose level. This dataset can be used to build machine learning models to predict diabetes in patients based on their medical history and demographic information. This can be useful for healthcare professionals in identifying patients who may be at risk of developing diabetes and in developing personalized treatment plans. Additionally, the dataset can be used by researchers to explore the relationships between various medical and demographic factors and the likelihood of developing diabetes.

Explanation about the features :
- Gender - Gender refers to the biological sex of the individual, which can have an impact on their susceptibility to diabetes. There are three categories in it male ,female and other.
- Age - Age is an important factor as diabetes is more commonly diagnosed in older adults.Age ranges from 0-80 in our dataset. 
- Hypertension - Hypertension is a medical condition in which the blood pressure in the arteries is persistently elevated. It has values a 0 or 1 where 0 indicates they don’t have hypertension and for 1 it means they have hypertension.
- Heart_Disease - Heart disease is another medical condition that is associated with an increased risk of developing diabetes. It has values a 0 or 1 where 0 indicates they don’t have heart disease and for 1 it means they have heart disease.
- Smoking_History - Smoking history is also considered a risk factor for diabetes and can exacerbate the complications associated with diabetes.In our dataset we have 5 categories i.e not current,former,No Info,current,never and ever.
- BMI - BMI (Body Mass Index) is a measure of body fat based on weight and height. Higher BMI values are linked to a higher risk of diabetes. The range of BMI in the dataset is from 10.16 to 71.55. BMI less than 18.5 is underweight, 18.5-24.9 is normal, 25-29.9 is overweight, and 30 or more is obese.
- HbA1c_level - HbA1c (Hemoglobin A1c) level is a measure of a person's average blood sugar level over the past 2-3 months. Higher levels indicate a greater risk of developing diabetes. Mostly more than 6.5% of HbA1c Level indicates diabetes.
- Blood_Glucose_Level - Blood glucose level refers to the amount of glucose in the bloodstream at a given time. High blood glucose levels are a key indicator of diabetes. 
- Diabetes - Diabetes is the target variable being predicted, with values of 1 indicating the presence of diabetes and 0 indicating the absence of diabetes.
###### Explanation about the dataset :
- Target coloumns is diabetes

# Spark Session

In [193]:
spark = SparkSession.builder.getOrCreate()

## Data Engineering

In [194]:
#Load the data
dataset = spark.read.csv("diabetes_prediction_dataset.csv", header=True, inferSchema=True)
dataset.show()

+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|gender| age|hypertension|heart_disease|smoking_history|  bmi|HbA1c_level|blood_glucose_level|diabetes|
+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|Female|80.0|           0|            1|          never|25.19|        6.6|                140|       0|
|Female|54.0|           0|            0|        No Info|27.32|        6.6|                 80|       0|
|  Male|28.0|           0|            0|          never|27.32|        5.7|                158|       0|
|Female|36.0|           0|            0|        current|23.45|        5.0|                155|       0|
|  Male|76.0|           1|            1|        current|20.14|        4.8|                155|       0|
|Female|20.0|           0|            0|          never|27.32|        6.6|                 85|       0|
|Female|44.0|           0|            0|          never|19.31|  

In [195]:
dataset.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- smoking_history: string (nullable = true)
 |-- bmi: double (nullable = true)
 |-- HbA1c_level: double (nullable = true)
 |-- blood_glucose_level: integer (nullable = true)
 |-- diabetes: integer (nullable = true)



## Data Cleansing

In [196]:
# Missing Values

missing_count = dataset.select([sum(col(c).isNull().cast("int")).alias(c) for c in dataset.columns])
missing_count.show()

+------+---+------------+-------------+---------------+---+-----------+-------------------+--------+
|gender|age|hypertension|heart_disease|smoking_history|bmi|HbA1c_level|blood_glucose_level|diabetes|
+------+---+------------+-------------+---------------+---+-----------+-------------------+--------+
|     0|  0|           0|            0|              0|  0|          0|                  0|       0|
+------+---+------------+-------------+---------------+---+-----------+-------------------+--------+



In [197]:
# Duplicat Values
dups = dataset.groupBy(dataset.columns).count().filter("count > 1")
dups.show()

+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+-----+
|gender| age|hypertension|heart_disease|smoking_history|  bmi|HbA1c_level|blood_glucose_level|diabetes|count|
+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+-----+
|Female|80.0|           0|            0|        No Info|27.32|        6.0|                140|       0|    4|
|Female|48.0|           0|            0|        No Info|27.32|        3.5|                130|       0|    2|
|  Male|80.0|           0|            0|        No Info|27.32|        6.1|                155|       0|    3|
|Female|70.0|           0|            0|        No Info|27.32|        4.5|                145|       0|    2|
|Female|57.0|           0|            0|         former|27.32|        6.6|                159|       0|    2|
|Female|80.0|           0|            0|        No Info|27.32|        5.8|                140|       0|    4|
|Female|49

In [198]:
dataset_new = dataset.dropDuplicates()
dataset_new.show()

+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|gender| age|hypertension|heart_disease|smoking_history|  bmi|HbA1c_level|blood_glucose_level|diabetes|
+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|Female|21.0|           0|            0|          never|27.32|        5.8|                126|       0|
|  Male|26.0|           0|            0|          never|27.32|        6.6|                100|       0|
|Female|49.0|           0|            0|          never| 21.7|        5.8|                158|       0|
|Female|24.0|           0|            0|         former|20.47|        4.8|                100|       0|
|Female|53.0|           0|            0|          never| 31.4|        5.7|                 85|       0|
|Female|74.0|           0|            0|         former| 40.5|        3.5|                160|       0|
|  Male|76.0|           0|            0|         former|27.76|  

In [199]:
dataset_new.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- smoking_history: string (nullable = true)
 |-- bmi: double (nullable = true)
 |-- HbA1c_level: double (nullable = true)
 |-- blood_glucose_level: integer (nullable = true)
 |-- diabetes: integer (nullable = true)



In [200]:
#Converting categorical into numerical values

categorical_columns = ["gender", "age",  "smoking_history", "bmi","HbA1c_level"]

# Perform indexation for each column separately
for col in categorical_columns:
     indexer = StringIndexer(inputCol=col, outputCol=col + "_index")
    
     # Perform indexation on the DataFrame using the appropriate StringIndexer
     dataset_new= indexer.fit(dataset_new).transform(dataset_new)

In [201]:
dataset_new.show(5)

+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+------------+---------+---------------------+---------+-----------------+
|gender| age|hypertension|heart_disease|smoking_history|  bmi|HbA1c_level|blood_glucose_level|diabetes|gender_index|age_index|smoking_history_index|bmi_index|HbA1c_level_index|
+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+------------+---------+---------------------+---------+-----------------+
|Female|21.0|           0|            0|          never|27.32|        5.8|                126|       0|         0.0|     40.0|                  0.0|      0.0|              3.0|
|  Male|26.0|           0|            0|          never|27.32|        6.6|                100|       0|         1.0|     35.0|                  0.0|      0.0|              0.0|
|Female|49.0|           0|            0|          never| 21.7|        5.8|                158|       0|         0.0

In [202]:
dataset_new.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- smoking_history: string (nullable = true)
 |-- bmi: double (nullable = true)
 |-- HbA1c_level: double (nullable = true)
 |-- blood_glucose_level: integer (nullable = true)
 |-- diabetes: integer (nullable = true)
 |-- gender_index: double (nullable = false)
 |-- age_index: double (nullable = false)
 |-- smoking_history_index: double (nullable = false)
 |-- bmi_index: double (nullable = false)
 |-- HbA1c_level_index: double (nullable = false)



In [203]:
#We didn't need the old data set, so I decided to delete the column and I used a column name ending in _index which means it has been converted to numerical values
delete = ["gender", "age",  "smoking_history", "bmi","HbA1c_level"]

clean_dataset = dataset_new.drop(*delete)

In [204]:
clean_dataset.show(5)

+------------+-------------+-------------------+--------+------------+---------+---------------------+---------+-----------------+
|hypertension|heart_disease|blood_glucose_level|diabetes|gender_index|age_index|smoking_history_index|bmi_index|HbA1c_level_index|
+------------+-------------+-------------------+--------+------------+---------+---------------------+---------+-----------------+
|           0|            0|                126|       0|         0.0|     40.0|                  0.0|      0.0|              3.0|
|           0|            0|                100|       0|         1.0|     35.0|                  0.0|      0.0|              0.0|
|           0|            0|                158|       0|         0.0|      6.0|                  0.0|     45.0|              3.0|
|           0|            0|                100|       0|         0.0|     43.0|                  2.0|   1790.0|              8.0|
|           0|            0|                 85|       0|         0.0|      4.0|   

## Splitting Dataset

In [205]:
clean_dataset.show(5)

+------------+-------------+-------------------+--------+------------+---------+---------------------+---------+-----------------+
|hypertension|heart_disease|blood_glucose_level|diabetes|gender_index|age_index|smoking_history_index|bmi_index|HbA1c_level_index|
+------------+-------------+-------------------+--------+------------+---------+---------------------+---------+-----------------+
|           0|            0|                126|       0|         0.0|     40.0|                  0.0|      0.0|              3.0|
|           0|            0|                100|       0|         1.0|     35.0|                  0.0|      0.0|              0.0|
|           0|            0|                158|       0|         0.0|      6.0|                  0.0|     45.0|              3.0|
|           0|            0|                100|       0|         0.0|     43.0|                  2.0|   1790.0|              8.0|
|           0|            0|                 85|       0|         0.0|      4.0|   

In [206]:
#Feature and Label

feature_columns = [col for col in clean_dataset.columns if col != 'diabetes']
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

dataset_transformed = assembler.transform(clean_dataset).select('features', 'diabetes')


In [207]:
dataset_transformed.show(5)

+--------------------+--------+
|            features|diabetes|
+--------------------+--------+
|(8,[2,4,7],[126.0...|       0|
|(8,[2,3,4],[100.0...|       0|
|(8,[2,4,6,7],[158...|       0|
|[0.0,0.0,100.0,0....|       0|
|(8,[2,4,6,7],[85....|       0|
+--------------------+--------+
only showing top 5 rows



In [208]:
#Splitting Dataset

train_data, test_data = dataset_transformed.randomSplit([0.8, 0.2], seed=42)

train_data.count(), test_data.count()

(77043, 19103)

## Decision Tree Model

### ML Model Engineering

In [209]:
# Initialize model Decision Tree
dt = DecisionTreeClassifier(featuresCol='features', labelCol='diabetes', maxBins=5000)

# Train dt model
model = dt.fit(train_data)

In [210]:
# Prediction using test data
predictions = model.transform(test_data)
predictions.show(10)

+--------------------+--------+----------------+--------------------+----------+
|            features|diabetes|   rawPrediction|         probability|prediction|
+--------------------+--------+----------------+--------------------+----------+
|(8,[0,1,2,7],[1.0...|       0|[26475.0,3420.0]|[0.88559959859508...|       0.0|
|(8,[0,2,3,4],[1.0...|       0|[26475.0,3420.0]|[0.88559959859508...|       0.0|
|(8,[0,2,3,4],[1.0...|       1|[26475.0,3420.0]|[0.88559959859508...|       0.0|
|(8,[0,2,3,6],[1.0...|       1|[26475.0,3420.0]|[0.88559959859508...|       0.0|
|(8,[0,2,4],[1.0,8...|       0| [15245.0,780.0]|[0.95132605304212...|       0.0|
|(8,[0,2,4],[1.0,1...|       1|[26475.0,3420.0]|[0.88559959859508...|       0.0|
|(8,[0,2,4,5],[1.0...|       0|[26475.0,3420.0]|[0.88559959859508...|       0.0|
|(8,[0,2,4,5],[1.0...|       1|[26475.0,3420.0]|[0.88559959859508...|       0.0|
|(8,[0,2,4,6],[1.0...|       0|[26475.0,3420.0]|[0.88559959859508...|       0.0|
|(8,[0,2,4,6],[1.0...|      

In [211]:
predictions.select('features', 'diabetes', 'prediction').show()

+--------------------+--------+----------+
|            features|diabetes|prediction|
+--------------------+--------+----------+
|(8,[0,1,2,7],[1.0...|       0|       0.0|
|(8,[0,2,3,4],[1.0...|       0|       0.0|
|(8,[0,2,3,4],[1.0...|       1|       0.0|
|(8,[0,2,3,6],[1.0...|       1|       0.0|
|(8,[0,2,4],[1.0,8...|       0|       0.0|
|(8,[0,2,4],[1.0,1...|       1|       0.0|
|(8,[0,2,4,5],[1.0...|       0|       0.0|
|(8,[0,2,4,5],[1.0...|       1|       0.0|
|(8,[0,2,4,6],[1.0...|       0|       0.0|
|(8,[0,2,4,6],[1.0...|       0|       0.0|
|(8,[0,2,4,6],[1.0...|       0|       0.0|
|(8,[0,2,4,6],[1.0...|       0|       0.0|
|(8,[0,2,4,6],[1.0...|       0|       0.0|
|(8,[0,2,4,6],[1.0...|       0|       0.0|
|(8,[0,2,4,6],[1.0...|       0|       0.0|
|(8,[0,2,4,6],[1.0...|       0|       0.0|
|(8,[0,2,4,6],[1.0...|       1|       0.0|
|(8,[0,2,4,7],[1.0...|       0|       0.0|
|(8,[0,2,4,7],[1.0...|       0|       0.0|
|(8,[0,2,4,7],[1.0...|       0|       0.0|
+----------

### ML Model Evaluation

In [212]:
# Accuracy 

evaluator = MulticlassClassificationEvaluator(labelCol='diabetes', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print(f"Akurasi: {accuracy}")


Akurasi: 0.9496414175783908


In [213]:

# Precision
evaluator_precision = MulticlassClassificationEvaluator(
    labelCol='diabetes', predictionCol='prediction', metricName='precisionByLabel')

precision = evaluator_precision.evaluate(predictions)
print(f"Precision: {precision}")

# Recall
evaluator_recall = MulticlassClassificationEvaluator(
    labelCol='diabetes', predictionCol='prediction', metricName='recallByLabel')

recall = evaluator_recall.evaluate(predictions)
print(f"Recall: {recall}")

# F1 Score
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol='diabetes', predictionCol='prediction', metricName='f1')

f1_score = evaluator_f1.evaluate(predictions)
print(f"F1 Score: {f1_score}")


Precision: 0.947802495930548
Recall: 1.0
F1 Score: 0.9398211681473817


## Random Forest Model

### ML Model Engineering


In [214]:
# Initialize model Decision Tree
rf = RandomForestClassifier(featuresCol='features', labelCol='diabetes', maxBins=5000)

# Train dt model
model_rf = rf.fit(train_data)

In [215]:
# Prediction using test data
predictions_rf = model_rf.transform(test_data)
predictions_rf.show(10)

+--------------------+--------+--------------------+--------------------+----------+
|            features|diabetes|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+----------+
|(8,[0,1,2,7],[1.0...|       0|[15.2111263644680...|[0.76055631822340...|       0.0|
|(8,[0,2,3,4],[1.0...|       0|[16.9957549582276...|[0.84978774791138...|       0.0|
|(8,[0,2,3,4],[1.0...|       1|[16.8574973824701...|[0.84287486912350...|       0.0|
|(8,[0,2,3,6],[1.0...|       1|[16.2454528208412...|[0.81227264104206...|       0.0|
|(8,[0,2,4],[1.0,8...|       0|[17.9503665870200...|[0.89751832935100...|       0.0|
|(8,[0,2,4],[1.0,1...|       1|[17.1235425348697...|[0.85617712674348...|       0.0|
|(8,[0,2,4,5],[1.0...|       0|[16.9957549582276...|[0.84978774791138...|       0.0|
|(8,[0,2,4,5],[1.0...|       1|[16.8574973824701...|[0.84287486912350...|       0.0|
|(8,[0,2,4,6],[1.0...|       0|[17.8507219761459...|[0.8925360988

In [216]:
predictions_rf.select('features', 'diabetes', 'prediction').show()

+--------------------+--------+----------+
|            features|diabetes|prediction|
+--------------------+--------+----------+
|(8,[0,1,2,7],[1.0...|       0|       0.0|
|(8,[0,2,3,4],[1.0...|       0|       0.0|
|(8,[0,2,3,4],[1.0...|       1|       0.0|
|(8,[0,2,3,6],[1.0...|       1|       0.0|
|(8,[0,2,4],[1.0,8...|       0|       0.0|
|(8,[0,2,4],[1.0,1...|       1|       0.0|
|(8,[0,2,4,5],[1.0...|       0|       0.0|
|(8,[0,2,4,5],[1.0...|       1|       0.0|
|(8,[0,2,4,6],[1.0...|       0|       0.0|
|(8,[0,2,4,6],[1.0...|       0|       0.0|
|(8,[0,2,4,6],[1.0...|       0|       0.0|
|(8,[0,2,4,6],[1.0...|       0|       0.0|
|(8,[0,2,4,6],[1.0...|       0|       0.0|
|(8,[0,2,4,6],[1.0...|       0|       0.0|
|(8,[0,2,4,6],[1.0...|       0|       0.0|
|(8,[0,2,4,6],[1.0...|       0|       0.0|
|(8,[0,2,4,6],[1.0...|       1|       0.0|
|(8,[0,2,4,7],[1.0...|       0|       0.0|
|(8,[0,2,4,7],[1.0...|       0|       0.0|
|(8,[0,2,4,7],[1.0...|       0|       0.0|
+----------

### ML Model Evaluation

In [217]:
# Accuracy 
evaluator_rf = MulticlassClassificationEvaluator(labelCol='diabetes', predictionCol='prediction', metricName='accuracy')
accuracy_rf = evaluator_rf.evaluate(predictions_rf)
print(f"Akurasi: {accuracy_rf}")


# Precision
evaluator_precision_rf = MulticlassClassificationEvaluator(
    labelCol='diabetes', predictionCol='prediction', metricName='precisionByLabel')

precision_rf = evaluator_precision_rf.evaluate(predictions_rf)
print(f"Precision: {precision_rf}")

# Recall
evaluator_recall = MulticlassClassificationEvaluator(
    labelCol='diabetes', predictionCol='prediction', metricName='recallByLabel')

recall_rf = evaluator_recall.evaluate(predictions_rf)
print(f"Recall: {recall_rf}")

# F1 Score
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol='diabetes', predictionCol='prediction', metricName='f1')

f1_score_rf = evaluator_f1.evaluate(predictions_rf)
print(f"F1 Score: {f1_score_rf}")

Akurasi: 0.9479662880175889
Precision: 0.9461596793413498
Recall: 1.0
F1 Score: 0.9373236715173976


## Summary

In [218]:
print("Decision Tree")
print("Accuracy: %f" % accuracy)
print("Precision: %f" % precision)
print("Recall: %f" % recall)
print("f1_score: %f" % f1_score)
print("")
print("Random Forest")
print("Accuracy: %f" % accuracy_rf)
print("Precision: %f" % precision_rf)
print("Recall: %f" % recall_rf)
print("f1_score: %f" % f1_score_rf)

Decision Tree
Accuracy: 0.949641
Precision: 0.947802
Recall: 1.000000
f1_score: 0.939821

Random Forest
Accuracy: 0.947966
Precision: 0.946160
Recall: 1.000000
f1_score: 0.937324


# Spark Off

In [219]:
spark.stop()