In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum
import matplotlib.pyplot as plt
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import PCA
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

### Business and Data Understanding

The Diabetes prediction dataset is a collection of medical and demographic data from patients, along with their diabetes status (positive or negative). The data includes features such as age, gender, body mass index (BMI), hypertension, heart disease, smoking history, HbA1c level, and blood glucose level. This dataset can be used to build machine learning models to predict diabetes in patients based on their medical history and demographic information. This can be useful for healthcare professionals in identifying patients who may be at risk of developing diabetes and in developing personalized treatment plans. Additionally, the dataset can be used by researchers to explore the relationships between various medical and demographic factors and the likelihood of developing diabetes.

Explanation about the features :

- Gender - Gender refers to the biological sex of the individual, which can have an impact on their susceptibility to diabetes. There are three categories in it male ,female and other.
- Age - Age is an important factor as diabetes is more commonly diagnosed in older adults.Age ranges from 0-80 in our dataset.
- Hypertension - Hypertension is a medical condition in which the blood pressure in the arteries is persistently elevated. It has values a 0 or 1 where 0 indicates they don’t have hypertension and for 1 it means they have hypertension.
- Heart_Disease - Heart disease is another medical condition that is associated with an increased risk of developing diabetes. It has values a 0 or 1 where 0 indicates they don’t have heart disease and for 1 it means they have heart disease.
- Smoking_History - Smoking history is also considered a risk factor for diabetes and can exacerbate the complications associated with diabetes.In our dataset we have 5 categories i.e not current,former,No Info,current,never and ever.
- BMI - BMI (Body Mass Index) is a measure of body fat based on weight and height. Higher BMI values are linked to a higher risk of diabetes. The range of BMI in the dataset is from 10.16 to 71.55. BMI less than 18.5 is underweight, 18.5-24.9 is normal, 25-29.9 is overweight, and 30 or more is obese.
- HbA1c_level - HbA1c (Hemoglobin A1c) level is a measure of a person's average blood sugar level over the past 2-3 months. Higher levels indicate a greater risk of developing diabetes. Mostly more than 6.5% of HbA1c Level indicates diabetes.
- Blood_Glucose_Level - Blood glucose level refers to the amount of glucose in the bloodstream at a given time. High blood glucose levels are a key indicator of diabetes.
- Diabetes - Diabetes is the target variable being predicted, with values of 1 indicating the presence of diabetes and 0 indicating the absence of diabetes.

##### Target Column : Diabetes

In [0]:
#Load Data
dataset = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/rahelcecilia9@gmail.com/diabetes_prediction_dataset.csv")

In [0]:
dataset.show(5)

+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|gender| age|hypertension|heart_disease|smoking_history|  bmi|HbA1c_level|blood_glucose_level|diabetes|
+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|Female|80.0|           0|            1|          never|25.19|        6.6|                140|       0|
|Female|54.0|           0|            0|        No Info|27.32|        6.6|                 80|       0|
|  Male|28.0|           0|            0|          never|27.32|        5.7|                158|       0|
|Female|36.0|           0|            0|        current|23.45|        5.0|                155|       0|
|  Male|76.0|           1|            1|        current|20.14|        4.8|                155|       0|
+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
only showing top 5 rows



In [0]:
dataset.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- hypertension: string (nullable = true)
 |-- heart_disease: string (nullable = true)
 |-- smoking_history: string (nullable = true)
 |-- bmi: string (nullable = true)
 |-- HbA1c_level: string (nullable = true)
 |-- blood_glucose_level: string (nullable = true)
 |-- diabetes: string (nullable = true)



In [0]:

#Change the string data type to int
column_int = ["age", "hypertension", "heart_disease", "bmi", "HbA1c_level", "blood_glucose_level", "diabetes"]

for col_name in column_int:
    dataset = dataset.withColumn(col_name, col(col_name).cast("int"))
    
dataset.printSchema()


root
 |-- gender: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- smoking_history: string (nullable = true)
 |-- bmi: integer (nullable = true)
 |-- HbA1c_level: integer (nullable = true)
 |-- blood_glucose_level: integer (nullable = true)
 |-- diabetes: integer (nullable = true)



In [0]:
#Information about the dataset
row_count = dataset.count()
col_count = len(dataset.columns)

shape = (row_count, col_count)
print(shape)


(100000, 9)


### Data Cleansing

In [0]:
#Converting categorical into numerical values

categorical_columns = ["gender",  "smoking_history"]

# Perform indexation for each column separately
for col in categorical_columns:
     # Create a StringIndexer for each column
     indexer = StringIndexer(inputCol=col, outputCol=col + "_index")
    
     # Perform indexation on the DataFrame using the appropriate StringIndexer
     dataset = indexer.fit(dataset).transform(dataset)

In [0]:
dataset.show(5)

+------+---+------------+-------------+---------------+---+-----------+-------------------+--------+------------+---------------------+
|gender|age|hypertension|heart_disease|smoking_history|bmi|HbA1c_level|blood_glucose_level|diabetes|gender_index|smoking_history_index|
+------+---+------------+-------------+---------------+---+-----------+-------------------+--------+------------+---------------------+
|Female| 80|           0|            1|          never| 25|          6|                140|       0|         0.0|                  1.0|
|Female| 54|           0|            0|        No Info| 27|          6|                 80|       0|         0.0|                  0.0|
|  Male| 28|           0|            0|          never| 27|          5|                158|       0|         1.0|                  1.0|
|Female| 36|           0|            0|        current| 23|          5|                155|       0|         0.0|                  3.0|
|  Male| 76|           1|            1|        c

In [0]:
#We didn't need the old data set, so I decided to delete the column and I used a column name ending in _index which means it has been converted to numerical values
delete = ["gender","smoking_history"]

clean_dataset = dataset.drop(*delete)
clean_dataset.show(5)

+---+------------+-------------+---+-----------+-------------------+--------+------------+---------------------+
|age|hypertension|heart_disease|bmi|HbA1c_level|blood_glucose_level|diabetes|gender_index|smoking_history_index|
+---+------------+-------------+---+-----------+-------------------+--------+------------+---------------------+
| 80|           0|            1| 25|          6|                140|       0|         0.0|                  1.0|
| 54|           0|            0| 27|          6|                 80|       0|         0.0|                  0.0|
| 28|           0|            0| 27|          5|                158|       0|         1.0|                  1.0|
| 36|           0|            0| 23|          5|                155|       0|         0.0|                  3.0|
| 76|           1|            1| 20|          4|                155|       0|         1.0|                  3.0|
+---+------------+-------------+---+-----------+-------------------+--------+------------+------

In [0]:
from pyspark.sql.functions import col, sum

#Missing Values
missing_count = clean_dataset.select([sum(col(c).isNull().cast("int")).alias(c) for c in clean_dataset.columns])
missing_count.show()

+---+------------+-------------+---+-----------+-------------------+--------+------------+---------------------+
|age|hypertension|heart_disease|bmi|HbA1c_level|blood_glucose_level|diabetes|gender_index|smoking_history_index|
+---+------------+-------------+---+-----------+-------------------+--------+------------+---------------------+
|  0|           0|            0|  0|          0|                  0|       0|           0|                    0|
+---+------------+-------------+---+-----------+-------------------+--------+------------+---------------------+



### ML Model Engineering


In [0]:
#Split dataset

 #data_train will have about 80% of the data, and data_test will have about 20% of the data. The seed value (seed=123) is used to ensure consistent sharing.
data_train = clean_dataset.sample(False, 0.8, seed=123)
data_test = clean_dataset.subtract(data_train)

In [0]:
#Information about data_train

row_count = data_train.count()
col_count = len(data_test.columns)

shape = (row_count, col_count)

data_train.show(10)
print(shape)

+---+------------+-------------+---+-----------+-------------------+--------+------------+---------------------+
|age|hypertension|heart_disease|bmi|HbA1c_level|blood_glucose_level|diabetes|gender_index|smoking_history_index|
+---+------------+-------------+---+-----------+-------------------+--------+------------+---------------------+
| 80|           0|            1| 25|          6|                140|       0|         0.0|                  1.0|
| 54|           0|            0| 27|          6|                 80|       0|         0.0|                  0.0|
| 36|           0|            0| 23|          5|                155|       0|         0.0|                  3.0|
| 76|           1|            1| 20|          4|                155|       0|         1.0|                  3.0|
| 20|           0|            0| 27|          6|                 85|       0|         0.0|                  1.0|
| 79|           0|            0| 23|          5|                 85|       0|         0.0|      

In [0]:
#Information about data_test
row_count = data_test.count()
col_count = len(data_test.columns)

shape = (row_count, col_count)
data_test.show(10)
print(shape)

+---+------------+-------------+---+-----------+-------------------+--------+------------+---------------------+
|age|hypertension|heart_disease|bmi|HbA1c_level|blood_glucose_level|diabetes|gender_index|smoking_history_index|
+---+------------+-------------+---+-----------+-------------------+--------+------------+---------------------+
| 53|           0|            0| 31|          4|                200|       0|         0.0|                  0.0|
| 78|           0|            0| 36|          5|                130|       0|         0.0|                  2.0|
| 67|           0|            0| 25|          5|                200|       0|         0.0|                  1.0|
| 76|           0|            0| 23|          5|                 85|       0|         0.0|                  1.0|
| 50|           1|            0| 27|          5|                260|       1|         1.0|                  3.0|
| 72|           0|            1| 27|          6|                130|       0|         0.0|      

In [0]:
#because the purpose of the test data is to test the model, the target column (diabetes) must be deleted.
delete = [
    "diabetes"
]

data_test_fix = data_test.drop(*delete)
data_test_fix.show(5)

+---+------------+-------------+---+-----------+-------------------+------------+---------------------+
|age|hypertension|heart_disease|bmi|HbA1c_level|blood_glucose_level|gender_index|smoking_history_index|
+---+------------+-------------+---+-----------+-------------------+------------+---------------------+
| 53|           0|            0| 31|          4|                200|         0.0|                  0.0|
| 78|           0|            0| 36|          5|                130|         0.0|                  2.0|
| 67|           0|            0| 25|          5|                200|         0.0|                  1.0|
| 72|           0|            1| 27|          6|                130|         0.0|                  2.0|
| 28|           0|            0| 27|          5|                158|         1.0|                  1.0|
+---+------------+-------------+---+-----------+-------------------+------------+---------------------+
only showing top 5 rows



In [0]:
feature_columns = ["age", "hypertension", "heart_disease","bmi","HbA1c_level","blood_glucose_level", "gender_index", "smoking_history_index"]

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data_train = assembler.transform(data_train)

assembler_test = VectorAssembler(inputCols=feature_columns, outputCol="features")
data_test_fix = assembler_test.transform(data_test_fix)


In [0]:
# Initialize SVM model

svm = LinearSVC(labelCol="diabetes", featuresCol="features")

# Train SVM model
svm_model = svm.fit(data_train)

In [0]:
# Prediction using test data
predictions = svm_model.transform(data_test_fix)
predictions.show(10)

+---+------------+-------------+---+-----------+-------------------+------------+---------------------+--------------------+--------------------+----------+
|age|hypertension|heart_disease|bmi|HbA1c_level|blood_glucose_level|gender_index|smoking_history_index|            features|       rawPrediction|prediction|
+---+------------+-------------+---+-----------+-------------------+------------+---------------------+--------------------+--------------------+----------+
| 46|           0|            0| 21|          6|                158|         0.0|                  2.0|[46.0,0.0,0.0,21....|[1.59465533288541...|       0.0|
| 38|           0|            0| 26|          6|                155|         1.0|                  3.0|[38.0,0.0,0.0,26....|[1.44317507340094...|       0.0|
| 57|           0|            0| 22|          6|                 85|         0.0|                  3.0|[57.0,0.0,0.0,22....|[2.62903430788985...|       0.0|
| 62|           0|            0| 40|          3|          

In [0]:
from pyspark.sql.functions import count

#To calculate how much the result is from the predicted value
unique = predictions.groupBy('prediction').agg(count('*').alias('count'))

unique.show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|13369|
|       1.0|  986|
+----------+-----+



### ML Model Evaluation

In [0]:

evaluator = MulticlassClassificationEvaluator(labelCol="prediction", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

#if accuracy = 1.0, so model is making a correct predictions

Accuracy: 1.0


In [0]:
# precision
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
print("Precision: {:.2f}".format(precision))

# recall
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
print("Recall: {:.2f}".format(recall))

# ROC (AUC)
auc = evaluator.evaluate(predictions)
print("Area under ROC = {:.2f}".format(auc))

Precision: 1.00
Recall: 1.00
Area under ROC = 1.00
