In [11]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import col
import pandas as pd

# getOrCreate() hands back an already-running session when there is one,
# so re-running this cell does not start a second session.
spark = SparkSession.builder.appName("DiabetesLogReg").getOrCreate()

# read data
def read_data(path):
    """Load a CSV file into a Spark DataFrame by way of pandas.

    The file is parsed by ``pd.read_csv`` and the resulting pandas frame is
    handed to the module-level ``spark`` session's ``createDataFrame``.

    Args:
        path: Location of the CSV file, in any form ``pd.read_csv`` accepts.

    Returns:
        The Spark DataFrame built from the parsed CSV.
    """
    return spark.createDataFrame(pd.read_csv(path))

# split data
def split_data(spark_df, weights=(0.8, 0.2), seed=87):
    """Randomly split a DataFrame into parts (train/test by default).

    Args:
        spark_df: Object exposing ``randomSplit(weights, seed=...)``
            (a Spark DataFrame in this notebook).
        weights: Relative size of each part. The default ``(0.8, 0.2)``
            yields the 80/20 train/test split used by this notebook.
        seed: Seed passed to ``randomSplit`` so the split is repeatable.

    Returns:
        Whatever ``spark_df.randomSplit`` returns: one part per weight,
        in the same order as ``weights``.
    """
    # Pass a list regardless of the input sequence type, matching the
    # original call that used a list literal.
    return spark_df.randomSplit(list(weights), seed=seed)
    
# vectorize features
def vectorize_features(train_data, test_data, feature_cols):
    """Add a ``features`` column to both the train and the test set.

    A single ``VectorAssembler`` configured with ``feature_cols`` as input
    columns is applied to both DataFrames.

    Args:
        train_data: Training DataFrame.
        test_data: Test DataFrame.
        feature_cols: Names of the columns to assemble.

    Returns:
        A ``(train, test)`` tuple of the transformed DataFrames.
    """
    to_vector = VectorAssembler(outputCol="features",
                                inputCols=feature_cols)
    train_vectorized = to_vector.transform(train_data)
    test_vectorized = to_vector.transform(test_data)
    return train_vectorized, test_vectorized
    
# standardize features
def scale_features(train_data, test_data):
    """Scale the ``features`` column of both sets into ``scaled_features``.

    The ``StandardScaler`` (created with only its input/output columns set)
    is fitted on the training set alone; the fitted model is then applied
    to both sets, so the test set does not influence the fitted scaler.

    Args:
        train_data: Training DataFrame containing a ``features`` column.
        test_data: Test DataFrame containing a ``features`` column.

    Returns:
        A ``(train, test)`` tuple of the transformed DataFrames.
    """
    fitted_scaler = StandardScaler(
        outputCol="scaled_features",
        inputCol="features",
    ).fit(train_data)
    train_scaled = fitted_scaler.transform(train_data)
    test_scaled = fitted_scaler.transform(test_data)
    return train_scaled, test_scaled
    
# train model
def train_model(train_data, features_col="scaled_features",
                label_col="Outcome"):
    """Fit a logistic-regression classifier on ``train_data``.

    Args:
        train_data: Training DataFrame holding the feature-vector column
            and the label column.
        features_col: Name of the feature-vector column. Defaults to the
            ``scaled_features`` column this notebook produces.
        label_col: Name of the label column. Defaults to ``Outcome``.

    Returns:
        The fitted model returned by ``LogisticRegression.fit``.
    """
    # No seed is configured here: LogisticRegression takes no seed
    # parameter, and a "spark.seed" session setting is not something it
    # reads, so setting one only suggested a reproducibility guarantee
    # that was never in effect.
    log_reg = LogisticRegression(featuresCol=features_col,
                                 labelCol=label_col)
    return log_reg.fit(train_data)

# evaluate model
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 for a zero denominator."""
    return numerator / denominator if denominator else 0.0


def evaluate_model(model, test_data):
    """Score ``model`` on ``test_data`` and print binary-classification metrics.

    Prints accuracy, precision, recall, F1 and a confusion matrix laid out
    as ``[[tp, fp], [fn, tn]]``, then shows the first five
    ``Outcome``/``prediction`` pairs. Class ``1`` is treated as positive.

    A metric whose denominator is zero (no predicted positives, no actual
    positives, or an empty test set) is reported as 0.0 instead of raising
    ``ZeroDivisionError``.

    Args:
        model: Fitted model exposing ``transform(test_data)``; the result
            must contain ``Outcome`` and ``prediction`` columns.
        test_data: DataFrame to score.

    Returns:
        None. Results are printed.
    """
    preds = model.transform(test_data)
    # Cast the prediction column to int before counting and display.
    preds = preds.withColumn("prediction",
                             col("prediction").cast("int"))

    # One grouped count gives every (Outcome, prediction) cell at once,
    # instead of four separate filter + count() actions that each
    # re-evaluate the scoring pipeline.
    cell_counts = {
        (row["Outcome"], row["prediction"]): row["count"]
        for row in preds.groupBy("Outcome", "prediction").count().collect()
    }
    tp = cell_counts.get((1, 1), 0)
    tn = cell_counts.get((0, 0), 0)
    fp = cell_counts.get((0, 1), 0)
    fn = cell_counts.get((1, 0), 0)

    accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)
    confusion = [[tp, fp], [fn, tn]]

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"Confusion Matrix:\n{confusion}")
    preds.select('Outcome', 'prediction').show(5)
    

In [13]:
# TODO: hard-coded absolute path to a local copy of the dataset; point this
# at your own copy (ideally relative to a configurable data directory).
path = (
    "/Users/pepijnschouten/Desktop/Python_Scripts/"
    "Python_Scripts_Books/PySpark/Distributed_ML_with_"
    "PySpark/Python_Own_Files/Chapter 7 Logistic Reg"
    "/data/diabetes.csv"
)

spark_df = read_data(path)

# relevant columns; the label ('Outcome') is deliberately kept last so the
# feature list below can be taken as "everything but the final entry"
columns = ['Pregnancies', 'Glucose',
           'BloodPressure', 'BMI',
           'DiabetesPedigreeFunction', 'Age',
           'Outcome']

spark_df = spark_df.select(columns)
# Drop rows where any of these measurements is 0
# (presumably a placeholder for a missing value - confirm against the data).
spark_df = spark_df.filter(
    (col('Glucose') != 0) &
    (col('BloodPressure') != 0) &
    (col('BMI') != 0))

train_data, test_data = split_data(spark_df)

# Slice instead of `*feature_cols, _ = columns`: assigning to `_` in a
# notebook shadows IPython's "last output" variable.
feature_cols = columns[:-1]

train_data, test_data = vectorize_features(
    train_data, test_data, feature_cols)

train_data, test_data = scale_features(
    train_data, test_data)

log_reg_model = train_model(train_data)

evaluate_model(log_reg_model, test_data)

24/11/14 10:06:12 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


Accuracy: 0.7721518987341772
Precision: 0.6382978723404256
Recall: 0.6122448979591837
F1 Score: 0.625
Confusion Matrix:
[[30, 17], [19, 92]]
+-------+----------+
|Outcome|prediction|
+-------+----------+
|      1|         0|
|      0|         0|
|      0|         0|
|      0|         0|
|      0|         0|
+-------+----------+
only showing top 5 rows

