In [9]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import (
    LogisticRegression, DecisionTreeClassifier,
    RandomForestClassifier, LinearSVC)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import pyspark.sql.functions as F

file_path = "/Users/pepijnschouten/Desktop/Python_Scripts" \
                "/Python_Scripts_Books/Distributed_ML_with_PySpark" \
                    "/Python_Own_Files/Chapter 2/data/diabetes.csv"

"""
Data Preprocessing
"""
pandas_df = pd.read_csv(file_path)

spark = SparkSession.builder.appName("diabetes_data").getOrCreate()
spark_df = spark.createDataFrame(pandas_df)
spark_df = spark_df.filter((spark_df["Glucose"] != 0)
                          & (spark_df["BloodPressure"] != 0)
                          & (spark_df["BMI"] != 0)).select(
                              ["Pregnancies", "Glucose", "BloodPressure",
                               "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"]
                          )

"""
PySpark Model Selection with Cross Validation
"""

# required for distributed training
assembler = VectorAssembler(
    inputCols=["Pregnancies",
               "Glucose",
               "BloodPressure",
               "BMI",
               "DiabetesPedigreeFunction",
               "Age"],
    outputCol="features")
data = assembler.transform(spark_df)
data.show(5, truncate=False)

# rename outcome column to label
data = data.withColumnRenamed("Outcome", "label")

# standardize features
scaler = StandardScaler(inputCol="features",
                        outputCol="scaled_features")
data = scaler.fit(data).transform(data)

# make classifier list
classifiers = [LogisticRegression(),
               DecisionTreeClassifier(),
               RandomForestClassifier(),
               LinearSVC(maxIter=1500)]

# build empty param grid
param_grid = ParamGridBuilder().build()

# build evaluator
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC")

# build cross validator
for classifier in classifiers:
    cv = CrossValidator(estimator=classifier,
                        estimatorParamMaps=param_grid,
                        evaluator=evaluator,
                        numFolds=10)
    cv_model = cv.fit(data)
    results = cv_model.transform(data)
    accuracy = evaluator.evaluate(results)

    print(f"{classifier.__class__.__name__} ROC AUC mean: {accuracy:.2f}")

+-----------+-------+-------------+----+------------------------+---+-------+--------------------------------+
|Pregnancies|Glucose|BloodPressure|BMI |DiabetesPedigreeFunction|Age|Outcome|features                        |
+-----------+-------+-------------+----+------------------------+---+-------+--------------------------------+
|6          |148    |72           |33.6|0.627                   |50 |1      |[6.0,148.0,72.0,33.6,0.627,50.0]|
|1          |85     |66           |26.6|0.351                   |31 |0      |[1.0,85.0,66.0,26.6,0.351,31.0] |
|8          |183    |64           |23.3|0.672                   |32 |1      |[8.0,183.0,64.0,23.3,0.672,32.0]|
|1          |89     |66           |28.1|0.167                   |21 |0      |[1.0,89.0,66.0,28.1,0.167,21.0] |
|0          |137    |40           |43.1|2.288                   |33 |1      |[0.0,137.0,40.0,43.1,2.288,33.0]|
+-----------+-------+-------------+----+------------------------+---+-------+--------------------------------+
o

24/10/24 15:34:34 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


LogisticRegression ROC AUC mean: 0.84
DecisionTreeClassifier ROC AUC mean: 0.78
RandomForestClassifier ROC AUC mean: 0.91


24/10/24 15:35:08 ERROR OWLQN: Failure! Resetting history: breeze.optimize.NaNHistory: 
24/10/24 15:35:29 ERROR OWLQN: Failure! Resetting history: breeze.optimize.NaNHistory: 


LinearSVC ROC AUC mean: 0.84
