# Building a Machine learning model pipeline

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
import pickle

In [74]:
pd.options.mode.chained_assignment = None
df = pd.read_csv("../data/healthcare-dataset-stroke-data.csv")

In [75]:
df.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [76]:
df['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [77]:
## Split and format dataframe
def split_train_test(df):

    X = df.iloc[:,:-1]
    Y = df.iloc[:, -1]
    xtrain, xtest, ytrain, ytest = train_test_split(X, Y, random_state=43, test_size=0.33)
    xtrain.to_csv("../data/stroke_prediction_train.csv")
    xtest.to_csv("../data/stroke_prediction_test.csv")

    return xtrain, xtest, ytrain, ytest

In [78]:
## format dataframe
def reformat_work_type(df):
    df["work_type"] = df["work_type"].replace("Self-employed", "Self employed")
    df["work_type"] = df["work_type"].replace("Govt_job", "Govt job")
    df["work_type"] = df["work_type"].replace("children", "Children")
    df["work_type"] = df["work_type"].replace("Never_worked", "Never worked")

    return df

In [79]:
## rename column
def rename_columns(df):
    df = df.rename(columns={"heart_disease": "heart disease",
        "avg_glucose_level": "avg glucose level", "ever_married": "ever married",
        "Residence_type": "residence type", "work_type": "work type",
        "smoking_status": "smoking status"})

    return df

In [80]:
def format_df(df):
    xtrain, xtest, ytrain, ytest = split_train_test(df)
    xtrain = reformat_work_type(xtrain)
    xtrain = rename_columns(xtrain)

    #xtest = reformat_work_type(xtest)
    #xtest = rename_columns(xtest)
    
    return xtrain, xtest, ytrain, ytest

In [81]:
def format_inference_df(df):
    df = reformat_work_type(df)
    df = rename_columns(df)
    
    return df

## Preprocessing

### Outliers and missing values

In [82]:
def remove_outliers(df):
    upper_fence_age = df["age"].mean() + 3*df["age"].std()
    lower_fence_age = df["age"].mean() - 3*df["age"].std()

    age_index = df[(df["age"] > upper_fence_age) | (df["age"] < lower_fence_age)].index.tolist()

    upper_fence_glucose = df["avg_glucose_level"].mean() + 3*df["avg_glucose_level"].std()
    lower_fence_glucose = df["avg_glucose_level"].mean() - 3*df["avg_glucose_level"].std()

    glucose_index = df[(df["avg_glucose_level"] > upper_fence_glucose | df["avg_glucose_level"] < lower_fence_glucose)].index.tolist()

    upper_fence_bmi = df["bmi"].mean() + 3*df["bmi"].std()
    lower_fence_bmi = df["bmi"].mean() - 3*df["bmi"].std()

    bmi_index = df[(df["bmi"] > upper_fence_bmi | df["bmi"] < lower_fence_bmi)].index.tolist()

    index_to_be_removed = bmi_index + age_index + glucose_index

    df = df.drop(index=index_to_be_removed)

    return df

In [83]:
def fit_KNN_missing_values(df):
    file = "../models/imputer_KNN.pkl"
    imputer = KNNImputer(n_neighbors=5)
    imputer.fit(df[["bmi"]])
    pickle.dump(imputer, open(file, "wb"))

    return df

In [84]:
def transform_imputer(df):
    file = "../models/imputer_KNN.pkl"
    imputer = pickle.load(open(file, "rb"))
    df["bmi"] = imputer.transform(df[["bmi"]])
    
    return df

## Feature Engineering

In [85]:
def preprocess_gender(df):
    df = df.replace("Male", 0)
    df = df.replace("Female", 1)
    other_index = df[df["gender"] == "Other"].index
    df = df.drop(other_index)
    
    return df, other_index

In [86]:
def preprocess_ever_married(df):
    df = df.replace("no", 0)
    df = df.replace("yes", 1)
    
    return df

In [87]:
def fit_scaler(df):
    cols = ["age", "avg glucose level", "bmi"]
    file = "../models/Scaler.pkl"
    scaler = MinMaxScaler()
    scaler.fit(df[cols])
    pickle.dump(scaler, open(file, "wb"))

In [88]:
def transform_scaler(df):
    cols = ["age", "avg glucose level", "bmi"]
    file = "../models/Scaler.pkl"
    scaler = pickle.load(open(file, "rb"))
    df[cols] = scaler.transform(df[cols])
    return df

In [89]:
def preprocess_residence(df):
    df = df.replace("Urban", 0)
    df = df.replace("Rural", 1)
    
    return df

In [90]:
def preprocess_ever_maried(df):
    df = df.replace("Yes", 0)
    df = df.replace("No", 1)

    return df

In [91]:
def fit_encoder(df):
    file = "../models/OneHot.pkl"
    enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
    enc.fit(df[["work type", "smoking status"]])

    pickle.dump(enc, open(file, "wb"))

In [92]:
def transform_encoder(df):
    file = "../models/OneHot.pkl"
    enc = pickle.load(open(file, "rb"))
    list_name = enc.get_feature_names_out(["work type", "smoking status"])
    df[list_name] = enc.transform(df[["work type", "smoking status"]])
    df = df.drop(columns=["work type", "smoking status"])

    return df

In [93]:
def fit_scaler_encoder(df):
    fit_KNN_missing_values(df)
    fit_encoder(df)
    fit_scaler(df)

def transform_scaler_encoder(df):
    df = transform_imputer(df)
    df = transform_encoder(df)
    df = transform_scaler(df)

    return df

### Removing Ids and Building model

In [94]:
def store_id(df):
    df_ids = df["id"]
    df = df.drop(columns=["id"])
    return df_ids, df

In [95]:
def build_model(xtrain, ytrain):
    file = "../models/classifier.pkl"
    kfold = StratifiedKFold(n_splits=10)
    clf = LogisticRegression()
    clf.fit(xtrain, ytrain)
    pickle.dump(clf, open(file, "wb"))

### Creating a pipeline train

In [96]:
def pipeline_train(df):
    xtrain, xtest, ytrain, ytest = format_df(df)
    fit_scaler_encoder(xtrain)
    xtrain = transform_scaler_encoder(xtrain)
    xtrain, ind = preprocess_gender(xtrain)
    ytrain = ytrain.drop(index=ind)
    xtrain = preprocess_ever_maried(xtrain)
    xtrain = preprocess_residence(xtrain)
    ids, xtrain = store_id(xtrain)
    
    return xtrain, ytrain, xtest, ytest

### Creating a pipeline test

In [97]:
def pipeline_test(df):
    df = format_inference_df(df)
    
    if df["bmi"].isnull().sum() > 0:
        df = transform_imputer(df)

    df, ind = preprocess_gender(df)
    df = preprocess_ever_maried(df)
    df = preprocess_residence(df)
    df = transform_scaler_encoder(df)

    ids, df = store_id(df)

    return df

In [98]:
def make_model_train(df):
    xtrain, ytrain, xtest, ytest = pipeline_train(df)
    build_model(xtrain, ytrain)

    return xtest, ytest

In [99]:
def evaluate_model(xtest, ytest):
    xtest = pipeline_test(xtest)
    file = "../models/classifier.pkl"
    clf = pickle.load(open(file, "rb"))

    return clf.score(xtest, ytest)

In [100]:
xtest, ytest = make_model_train(df)



In [108]:
def make_prediction(X):
    X = pipeline_test(X)
    file = "/Users/quangtn/Desktop/01_work/01_job/02_ml/PySpark/HeartStrokePrediction/models/classifier.pkl"
    clf = pickle.load(open(file, "rb"))

    return clf.predict(X)

In [109]:
make_prediction(xtest)

array([0, 0, 0, ..., 0, 0, 0])

In [103]:
xtest.head(4)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
2342,28258,Female,80.0,0,0,Yes,Self employed,Urban,75.06,29.7,Unknown
2805,35759,Female,16.0,0,0,No,Private,Rural,92.77,24.9,Unknown
3986,18134,Male,10.0,0,0,No,Children,Rural,95.8,17.3,Unknown
4740,70677,Male,60.0,0,0,Yes,Private,Rural,234.45,36.8,formerly smoked
