In [2]:
import os

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score,
                             recall_score, f1_score,
                             confusion_matrix)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# load data
def load_diabetes_csv(path):
    """Read the CSV at ``path`` into a pandas DataFrame.

    Args:
        path: Location of the CSV data; passed unchanged to
            ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(path)
    return frame

# split data
def split_data(X, y, test_size=0.2, random_state=87):
    """Split features and target into train and test partitions.

    Thin wrapper around ``train_test_split`` that supplies this
    notebook's default test fraction and random seed.

    Args:
        X: Feature data.
        y: Target data.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as
            ``random_state`` so the split is repeatable.

    Returns:
        Whatever ``train_test_split`` returns for ``(X, y)``; the
        notebook unpacks it as ``X_train, X_test, y_train, y_test``.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    partitions = train_test_split(X, y, **split_options)
    return partitions

# scaling the data
def scale_data(X_train, X_test):
    """Scale the train and test features with one ``StandardScaler``.

    The scaler is fitted on the training features only; the test
    features are transformed with that already-fitted scaler, so no
    statistics are learned from the test data.

    Args:
        X_train: Training features; used to fit the scaler.
        X_test: Test features; only transformed.

    Returns:
        Tuple ``(scaled_train, scaled_test)``.
    """
    standardizer = StandardScaler()
    scaled_train = standardizer.fit_transform(X_train)
    scaled_test = standardizer.transform(X_test)
    return scaled_train, scaled_test

# Logistic regression model
def train_log_reg(X_train, y_train):
    """Fit a ``LogisticRegression`` with default settings.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    # ``fit`` returns the estimator itself, so the call can be chained.
    return LogisticRegression().fit(X_train, y_train)

# model evaluation
def evaluate_model(model, X_test, y_test):
    """Print classification metrics for ``model`` on held-out data.

    Predicts labels for ``X_test`` and prints, in order: accuracy,
    precision, recall and F1 score (two decimals each), the confusion
    matrix, and the first rows of an actual-vs-predicted table.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        None. The results are only printed.
    """
    predictions = model.predict(X_test)

    # Every metric compares the same (truth, prediction) pair.
    scored = (y_test, predictions)
    accuracy = accuracy_score(*scored)
    precision = precision_score(*scored)
    recall = recall_score(*scored)
    f1 = f1_score(*scored)
    confusion = confusion_matrix(*scored)

    print(f'Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1 Score: {f1:.2f}')
    print(f'Confusion Matrix:\n{confusion}')

    # Side-by-side preview of true versus predicted labels.
    comparison = pd.DataFrame({'Actual': y_test,
                               'Predicted': predictions})
    print(comparison.head())

In [None]:
# Location of the diabetes CSV. Set the DIABETES_CSV environment variable to
# run this notebook on another machine; when it is unset, the original local
# path below is used.
path = os.environ.get(
    "DIABETES_CSV",
    "/Users/pepijnschouten/Desktop/Python_Scripts/"
    "Python_Scripts_Books/PySpark/Distributed_ML_with_"
    "PySpark/Python_Own_Files/Chapter 7 Logistic Reg"
    "/data/diabetes.csv",
)

pandas_df = load_diabetes_csv(path)

# relevant columns
columns = ['Pregnancies', 'Glucose',
           'BloodPressure', 'BMI',
           'DiabetesPedigreeFunction', 'Age',
           'Outcome']

# clean data: keep only the relevant columns, then drop every row where
# Glucose, BloodPressure or BMI equals 0
# (presumably zeros stand for missing measurements -- confirm against the
# dataset description)
pandas_df = pandas_df[columns]
nonzero_rows = ((pandas_df.Glucose != 0)
                & (pandas_df.BloodPressure != 0)
                & (pandas_df.BMI != 0))
pandas_df = pandas_df.loc[nonzero_rows]

# features are every remaining column except the target, 'Outcome'
X = pandas_df.drop('Outcome', axis=1)
y = pandas_df['Outcome']

X_train, X_test, y_train, y_test = split_data(X, y)

# from here on X_train / X_test hold the scaled features
X_train, X_test = scale_data(X_train, X_test)

model = train_log_reg(X_train, y_train)

evaluate_model(model, X_test, y_test)


Accuracy: 0.69
Precision: 0.67
Recall: 0.48
F1 Score: 0.56
Confusion Matrix:
[[71 14]
 [31 29]]
     Actual  Predicted
319       1          1
658       0          1
534       0          0
73        0          0
322       1          0
