In [61]:
import pandas as pd
import numpy as np

In [62]:
# Load the cleaned heart-disease survey data; the relative path is resolved against the
# kernel's working directory.
heart = pd.read_csv('../data/heart_2020_cleaned.csv')

In [63]:
# Separate the predictors from the label column.
X = heart.drop(columns='HeartDisease')
y = heart['HeartDisease']

In [64]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils import Bunch

In [65]:
# Empty container; the cells below fill in target_names, target, data and feature_names,
# and the model-training functions read .data and .target from it.
heart_model_data = Bunch()

In [66]:
# Turn the HeartDisease labels into integer codes; `le` is kept so that
# le.classes_ can translate the codes back to the original labels.
le = LabelEncoder()
fitted_y = le.fit_transform(y)

In [67]:
# Store the encoded labels together with the label names they were derived from.
heart_model_data.target_names = le.classes_
heart_model_data.target = fitted_y

In [68]:
# These four columns are kept as-is; every remaining column is treated as categorical
# and goes through the ordinal encoder.
numerical_columns = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']
X_numerical = X[numerical_columns]
X_categorical = X.drop(columns=numerical_columns)

In [69]:
# Replace every categorical value with the numeric code the encoder learns for it.
# `oe` stays around so new rows can be encoded the same way later.
# NOTE(review): with default settings the codes follow the encoder's automatically
# determined category order, not a domain ordering -- confirm that is acceptable for
# ordered fields such as GenHealth or AgeCategory.
oe = OrdinalEncoder()
fitted_X_categorical = oe.fit_transform(X_categorical)

In [70]:
# Re-attach the encoded categorical columns to the numerical ones.
# The encoder output is a bare ndarray, so it is wrapped in a DataFrame that reuses the
# original column names and row index; concat(axis=1) then lines the two frames up by
# that index. Nothing is mutated in place, so this cell can be re-run without adding
# stray 'index'/'level_0' columns to the helper frames.
fitted_X_categorical = pd.DataFrame(
    fitted_X_categorical,
    columns=X_categorical.columns,
    index=X_categorical.index,
)
X = pd.concat([X_numerical, fitted_X_categorical], axis=1)

In [76]:
# Save the encoded feature table to disk; index=False keeps the row index out of the file.
X.to_csv('../data/heart_encoded.csv', index=False)

In [113]:
# Feature matrix as a plain ndarray, plus the column labels that go with it.
heart_model_data.data = X.to_numpy()
heart_model_data.feature_names = X.columns

In [78]:
# Display the final column order: the four numerical columns first, then the encoded
# categorical ones.
X.columns

Index(['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime', 'Smoking',
       'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma',
       'KidneyDisease', 'SkinCancer'],
      dtype='object')

In [97]:
# Build two encoded sample rows (rows 0 and 5 of the raw data) for the predict_proba
# spot checks further down.
# Both rows are selected as one DataFrame in the model's column order. The first four
# columns are the numerical ones and are used unchanged; the categorical columns are
# passed to the already-fitted encoder by name, which is how it was fitted.
# `test` is assigned exactly once, so this cell can be re-run at any time.
sample_rows = heart.loc[[0, 5], X.columns]
sample_numerical = sample_rows[X.columns[:4]].to_numpy(dtype=float)
sample_categorical = oe.transform(sample_rows[X_categorical.columns])

# One flat list of 17 floats per sample row.
test = np.hstack([sample_numerical, sample_categorical]).tolist()

# Model training

In [118]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

In [119]:
def try_SGD():
    """Fit a scaled SGD classifier with modified Huber loss and report its hold-out score.

    Reads the feature matrix and encoded target from the module-level
    ``heart_model_data``, holds out 10% of the rows with a fixed seed, fits the
    pipeline on the remainder and prints the pipeline's ``score`` on the held-out rows.

    Returns:
        The fitted ``StandardScaler`` + ``SGDClassifier`` pipeline.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        heart_model_data.data,
        heart_model_data.target,
        test_size=0.1,
        random_state=2000,
    )

    classifier = SGDClassifier(
        loss='modified_huber',
        penalty='l2',
        max_iter=1000,
        class_weight='balanced',
        random_state=0,
    )
    pipeline = make_pipeline(StandardScaler(), classifier)
    pipeline.fit(features_train, target_train)

    holdout_score = pipeline.score(features_test, target_test)
    print(f'Its score is {holdout_score}.')

    return pipeline


sgd = try_SGD()

Its score is 0.7583489681050657.


In [124]:
# Spot-check: class-probability estimates from the modified-Huber SGD pipeline for the
# sample rows held in `test`.
sgd.predict_proba(test)
# looks good

array([[0.37858369, 0.62141631],
       [0.27997911, 0.72002089]])

In [125]:
def try_SGD_log():
    """Fit a scaled SGD classifier with logistic loss and report its hold-out score.

    Reads the feature matrix and encoded target from the module-level
    ``heart_model_data``, holds out 10% of the rows with a fixed seed, fits the
    pipeline on the remainder and prints the pipeline's ``score`` on the held-out rows.

    Requires scikit-learn >= 1.1, where the logistic loss is spelled ``'log_loss'``.

    Returns:
        The fitted ``StandardScaler`` + ``SGDClassifier`` pipeline.
    """
    X = heart_model_data.data
    y = heart_model_data.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2000)

    # 'log' was deprecated in scikit-learn 1.1 and removed in 1.3; 'log_loss' is the
    # same logistic-regression loss under its current name.
    sgd = make_pipeline(
        StandardScaler(),
        SGDClassifier(
            loss='log_loss',
            penalty='l2',
            max_iter=1000,
            class_weight='balanced',
            random_state=0,
        ),
    )
    sgd.fit(X_train, y_train)

    print(f'Its score is {sgd.score(X_test, y_test)}.')

    return sgd


sgd_log = try_SGD_log()

Its score is 0.743370856785491.


In [126]:
# Spot-check: class-probability estimates from the logistic-loss SGD pipeline for the
# sample rows held in `test`.
sgd_log.predict_proba(test)
# looks good

array([[0.46878456, 0.53121544],
       [0.26855211, 0.73144789]])

In [127]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [128]:
def try_logistic_regression():
    """Fit a scaled logistic-regression model and report its hold-out score.

    Reads the feature matrix and encoded target from the module-level
    ``heart_model_data``, holds out 10% of the rows with a fixed seed, fits the
    pipeline on the remainder and prints the pipeline's ``score`` on the held-out rows.

    Returns:
        The fitted ``StandardScaler`` + ``LogisticRegression`` pipeline.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        heart_model_data.data,
        heart_model_data.target,
        test_size=0.1,
        random_state=2000,
    )

    estimator = LogisticRegression(
        class_weight='balanced',
        penalty='l2',
        random_state=0,
        max_iter=1000,
    )
    pipeline = make_pipeline(StandardScaler(), estimator)
    pipeline.fit(features_train, target_train)

    holdout_score = pipeline.score(features_test, target_test)
    print(f'Its score is {holdout_score}.')

    return pipeline


log = try_logistic_regression()

Its score is 0.7429643527204502.


In [129]:
# Spot-check: class-probability estimates from the logistic-regression pipeline for the
# sample rows held in `test`.
log.predict_proba(test)

array([[0.41094576, 0.58905424],
       [0.35774992, 0.64225008]])