In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned heart-disease CSV into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
heart = pd.read_csv('../data/heart_2020_cleaned.csv')

In [3]:
# Predictors: every column of the frame except the label column.
X = heart.drop('HeartDisease', axis='columns')
# Label: the raw HeartDisease column (it is integer-encoded in a later cell).
y = heart['HeartDisease']

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils import Bunch

In [5]:
# Empty dict-like container; later cells fill it with the encoded features,
# the encoded target and the corresponding names.
heart_model_data = Bunch()

In [6]:
# Learn the distinct label values from y, then map every label to its integer code.
le = LabelEncoder().fit(y)
fitted_y = le.transform(y)

In [7]:
# Keep the original class names next to the integer-encoded labels.
# Attribute assignment on a Bunch stores the value under the same key as
# item assignment would.
heart_model_data.target_names = le.classes_
heart_model_data.target = fitted_y

In [8]:
# Fit an ordinal encoder on the full predictor frame and encode it.
# NOTE(review): the encoder is fitted on every column of X. If some columns are
# genuinely numeric they are replaced by category ranks as well, and values not
# seen here would be rejected at transform time (default handle_unknown) —
# confirm this is intended.
oe = OrdinalEncoder().fit(X)
fitted_X = oe.transform(X)

In [9]:
# Store the encoded feature matrix and the column names the encoder was fitted on.
# Attribute assignment on a Bunch stores the value under the same key as
# item assignment would.
heart_model_data.data = fitted_X
heart_model_data.feature_names = oe.feature_names_in_

In [10]:
# Two raw sample rows (index labels 0 and 5) with the label column removed,
# kept aside for spot-checking the fitted models.
test = [heart.loc[row_label].drop('HeartDisease') for row_label in (0, 5)]

In [11]:
# Encode the two spot-check rows (index labels 0 and 5) with the fitted encoder.
# The rows are read straight from X rather than from the previous value of
# `test`, so this cell can be re-run safely: the old form rebound `test` from
# itself and failed on a second run because the elements were already arrays.
# Passing a DataFrame slice also gives the encoder the column names it was
# fitted on instead of a bare ndarray.
# The result is still a list of two 1-D encoded rows, as before.
test = list(oe.transform(X.loc[[0, 5]]))



# Model training

In [12]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

In [13]:
def try_SGD():
    """Fit a scaled SGD classifier with modified-Huber loss and print its hold-out score.

    Reads the encoded features and target from the notebook-level
    ``heart_model_data`` bunch, holds out 10% of the rows using a fixed split
    seed, fits the pipeline on the remaining rows and prints the pipeline's
    ``score`` on the held-out rows.

    Returns:
        The fitted ``StandardScaler`` -> ``SGDClassifier`` pipeline.
    """
    features, labels = heart_model_data.data, heart_model_data.target

    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.1, random_state=2000
    )

    classifier = SGDClassifier(
        loss='modified_huber',
        penalty='l2',
        max_iter=1000,
        class_weight='balanced',
        random_state=0,
    )
    model = make_pipeline(StandardScaler(), classifier)
    model.fit(features_train, labels_train)

    holdout_score = model.score(features_test, labels_test)
    print(f'Its score is {holdout_score}.')

    return model


sgd = try_SGD()

Its score is 0.7581613508442777.


In [14]:
# Per-class probabilities from the modified-Huber SGD pipeline for the two
# encoded spot-check rows in `test` (one output row per sample).
sgd.predict_proba(test)
# looks good (visual sanity check of the displayed output only)

array([[0.38266308, 0.61733692],
       [0.2800126 , 0.7199874 ]])

In [15]:
def try_SGD_log():
    """Fit a scaled SGD classifier with logistic loss and print its hold-out score.

    Reads the encoded features and target from the notebook-level
    ``heart_model_data`` bunch, holds out 10% of the rows using a fixed split
    seed, fits the pipeline on the remaining rows and prints the pipeline's
    ``score`` on the held-out rows.

    Returns:
        The fitted ``StandardScaler`` -> ``SGDClassifier`` pipeline.
    """
    X = heart_model_data.data
    y = heart_model_data.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2000)

    # 'log_loss' is the current name of the logistic loss. The old alias 'log'
    # was deprecated in scikit-learn 1.1 and removed in 1.3, so it makes this
    # cell fail on recent releases. Requires scikit-learn >= 1.1.
    sgd = make_pipeline(
        StandardScaler(),
        SGDClassifier(loss='log_loss', penalty='l2', max_iter=1000, class_weight='balanced', random_state=0),
    )
    sgd.fit(X_train, y_train)

    print(f'Its score is {sgd.score(X_test, y_test)}.')

    return sgd

sgd_log = try_SGD_log()

Its score is 0.7429018136335209.


In [17]:
# Per-class probabilities from the logistic-loss SGD pipeline for the two
# encoded spot-check rows in `test` (one output row per sample).
sgd_log.predict_proba(test)
# looks good (visual sanity check of the displayed output only)

array([[0.4711882, 0.5288118],
       [0.2673095, 0.7326905]])

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [19]:
def try_logistic_regression():
    """Fit a scaled, class-balanced logistic regression and print its hold-out score.

    Reads the encoded features and target from the notebook-level
    ``heart_model_data`` bunch, holds out 10% of the rows using a fixed split
    seed, fits the pipeline on the remaining rows and prints the pipeline's
    ``score`` on the held-out rows.

    Returns:
        The fitted ``StandardScaler`` -> ``LogisticRegression`` pipeline.
    """
    features, labels = heart_model_data.data, heart_model_data.target

    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.1, random_state=2000
    )

    estimator = LogisticRegression(
        class_weight='balanced',
        penalty='l2',
        random_state=0,
        max_iter=1000,
    )
    model = make_pipeline(StandardScaler(), estimator)
    model.fit(features_train, labels_train)

    holdout_score = model.score(features_test, labels_test)
    print(f'Its score is {holdout_score}.')

    return model


log = try_logistic_regression()

Its score is 0.742714196372733.


In [20]:
# Per-class probabilities from the logistic-regression pipeline for the two
# encoded spot-check rows in `test` (one output row per sample).
log.predict_proba(test)

array([[0.41493172, 0.58506828],
       [0.3562869 , 0.6437131 ]])