In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned heart-disease survey table (path is relative to the notebook's working directory).
heart = pd.read_csv(filepath_or_buffer='../data/heart_2020_cleaned.csv')

In [3]:
# Encode the binary Yes/No answers as integers (Yes -> 1, No -> 0).
binary_columns = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking',
                  'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']

# The explicit astype(int) matters: replace() on object columns only produced an
# integer dtype through silent downcasting, which pandas 2.2 deprecates. If the
# columns stayed object, the later get_dummies call would one-hot encode them
# and there would be no plain 'HeartDisease' column left to use as the target.
# Any value other than Yes/No (or an already-encoded 1/0) now fails loudly here.
# Re-running the cell is safe: already-encoded columns pass through unchanged.
heart[binary_columns] = heart[binary_columns].replace({'Yes': 1, 'No': 0}).astype(int)

In [4]:
# One-hot encode every remaining categorical (string) column.
# dtype is pinned to uint8 so the indicator columns are numeric 0/1 on every
# pandas version; pandas >= 2.0 would otherwise produce boolean columns, which
# changes both the saved CSV and the dtype of the row vectors built later.
heart_dummies = pd.get_dummies(heart, dtype=np.uint8)

In [50]:
# Persist the encoded table without the index column.
# NOTE(review): heart_dummies is mutated in place further down (the 'HeartDisease'
# column is dropped), so what gets written depends on when this cell is executed.
# Run it in top-to-bottom order if the target column should be part of the file.
heart_dummies.to_csv(path_or_buf='../data/heart_dummies.csv', index=False)

# Model training

In [5]:
from sklearn.utils import Bunch

In [6]:
# Bundle labels and features in a Bunch (dict with attribute access).
# Step 1: capture the label column while it is still part of the frame.
heart_model_data = Bunch(target=heart_dummies['HeartDisease'].to_list())

# Step 2: remove the label column in place so that only features remain.
# This mutates heart_dummies: later cells see it without 'HeartDisease', and
# re-running this cell raises KeyError because the column is already gone.
heart_dummies.drop(columns='HeartDisease', inplace=True)

# Step 3: everything left in the frame is a feature.
heart_model_data.update(
    data=heart_dummies.values.tolist(),
    feature_names=heart_dummies.columns.to_list(),
    target_names=['No', 'Yes'],
)

In [7]:
# Two hand-picked feature rows used to eyeball predictions:
# the row labelled 0 did not have heart problems, the row labelled 5 did.
test = [heart_dummies.loc[row_label].to_list() for row_label in (0, 5)]

In [8]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

In [9]:
def try_SGD(X=None, y=None, test_size=0.1, random_state=2000):
    """Fit a scaled SGD classifier (modified Huber loss) and print its hold-out score.

    Parameters
    ----------
    X : array-like, optional
        Feature rows. Defaults to ``heart_model_data.data``.
    y : array-like, optional
        Labels aligned with ``X``. Defaults to ``heart_model_data.target``.
    test_size : float, default 0.1
        Fraction of the samples held out for scoring.
    random_state : int, default 2000
        Seed for the train/test split (the classifier keeps its own fixed seed of 0).

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted ``StandardScaler`` + ``SGDClassifier`` pipeline.
    """
    # Fall back to the notebook-level dataset so existing no-argument calls keep working.
    if X is None:
        X = heart_model_data.data
    if y is None:
        y = heart_model_data.target

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    sgd = make_pipeline(
        StandardScaler(),
        SGDClassifier(loss='modified_huber', penalty='l2', max_iter=1000, class_weight='balanced', random_state=0),
    )
    sgd.fit(X_train, y_train)

    # score() on a classifier pipeline is plain accuracy on the held-out split.
    # NOTE(review): class_weight='balanced' suggests the classes are imbalanced, in
    # which case accuracy alone can be misleading; consider a balanced metric too.
    print(f'Its score is {sgd.score(X_test, y_test)}.')

    return sgd

sgd = try_SGD()

Its score is 0.7189493433395873.


In [10]:
# Class probabilities for the two sample rows in `test` (one output row per sample).
sgd.predict_proba(test)
# looks good: in the recorded output the first row (no heart problems) leans towards
# the first class and the second row (had heart problems) clearly towards the second.

array([[0.53234597, 0.46765403],
       [0.19835945, 0.80164055]])

In [11]:
def _sgd_log_loss_name():
    """Return the SGDClassifier name for logistic loss on the installed scikit-learn."""
    import re  # local imports: only needed for this version check
    import sklearn

    version_match = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
    if version_match is None:
        # Unparseable version string: assume a current release.
        return 'log_loss'
    major, minor = int(version_match.group(1)), int(version_match.group(2))
    # 'log' was renamed to 'log_loss' in scikit-learn 1.1 and removed in 1.3.
    return 'log_loss' if (major, minor) >= (1, 1) else 'log'


def try_SGD_log(X=None, y=None, test_size=0.1, random_state=2000):
    """Fit a scaled SGD classifier with logistic loss and print its hold-out score.

    Parameters
    ----------
    X : array-like, optional
        Feature rows. Defaults to ``heart_model_data.data``.
    y : array-like, optional
        Labels aligned with ``X``. Defaults to ``heart_model_data.target``.
    test_size : float, default 0.1
        Fraction of the samples held out for scoring.
    random_state : int, default 2000
        Seed for the train/test split (the classifier keeps its own fixed seed of 0).

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted ``StandardScaler`` + ``SGDClassifier`` pipeline.
    """
    # Fall back to the notebook-level dataset so existing no-argument calls keep working.
    if X is None:
        X = heart_model_data.data
    if y is None:
        y = heart_model_data.target

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # The loss name is resolved per installed version so the cell runs on both
    # old releases (which only know 'log') and current ones (which reject it).
    sgd = make_pipeline(
        StandardScaler(),
        SGDClassifier(loss=_sgd_log_loss_name(), penalty='l2', max_iter=1000, class_weight='balanced', random_state=0),
    )
    sgd.fit(X_train, y_train)

    # score() on a classifier pipeline is plain accuracy on the held-out split.
    print(f'Its score is {sgd.score(X_test, y_test)}.')

    return sgd

sgd_log = try_SGD_log()

Its score is 0.7651657285803627.


In [12]:
# Class probabilities for the two sample rows in `test` (one output row per sample).
sgd_log.predict_proba(test)
# bad predictions: in the recorded output both rows get nearly the same probabilities,
# so this model does not separate the healthy sample from the one with heart problems.

array([[0.43208986, 0.56791014],
       [0.45877196, 0.54122804]])

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [14]:
def try_logistic_regression(X=None, y=None, test_size=0.1, random_state=2000):
    """Fit a scaled, class-balanced logistic regression and print its hold-out score.

    Parameters
    ----------
    X : array-like, optional
        Feature rows. Defaults to ``heart_model_data.data``.
    y : array-like, optional
        Labels aligned with ``X``. Defaults to ``heart_model_data.target``.
    test_size : float, default 0.1
        Fraction of the samples held out for scoring.
    random_state : int, default 2000
        Seed for the train/test split (the classifier keeps its own fixed seed of 0).

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted ``StandardScaler`` + ``LogisticRegression`` pipeline.
    """
    # Fall back to the notebook-level dataset so existing no-argument calls keep working.
    if X is None:
        X = heart_model_data.data
    if y is None:
        y = heart_model_data.target

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    log = make_pipeline(
        StandardScaler(),
        LogisticRegression(class_weight='balanced', penalty='l2', random_state=0, max_iter=1000),
    )
    log.fit(X_train, y_train)

    # score() on a classifier pipeline is plain accuracy on the held-out split.
    print(f'Its score is {log.score(X_test, y_test)}.')

    return log

log = try_logistic_regression()

Its score is 0.750093808630394.


In [15]:
# Class probabilities for the two sample rows in `test` (one output row per sample).
log.predict_proba(test)
# looks good: in the recorded output the first row (no heart problems) leans towards
# the first class and the second row (had heart problems) towards the second.

array([[0.52487795, 0.47512205],
       [0.30856725, 0.69143275]])