In [36]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, KFold

from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

In [37]:
data = pd.read_csv('heart.csv')
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [38]:
data.ExerciseAngina.value_counts()

N    547
Y    371
Name: ExerciseAngina, dtype: int64

In [39]:
# data.isna().sum()

In [40]:
# df = data.sample(frac=0.8,random_state=1)
# test = data.drop(df.index)
df, test = train_test_split(data, test_size=0.2, shuffle=True)

In [41]:
df.corr()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
Age,1.0,0.25793,-0.085609,0.193174,-0.375764,0.263849,0.270633
RestingBP,0.25793,1.0,0.089975,0.072651,-0.125442,0.196766,0.122898
Cholesterol,-0.085609,0.089975,1.0,-0.271712,0.237133,0.075085,-0.213981
FastingBS,0.193174,0.072651,-0.271712,1.0,-0.144404,0.049968,0.256848
MaxHR,-0.375764,-0.125442,0.237133,-0.144404,1.0,-0.155297,-0.418219
Oldpeak,0.263849,0.196766,0.075085,0.049968,-0.155297,1.0,0.396223
HeartDisease,0.270633,0.122898,-0.213981,0.256848,-0.418219,0.396223,1.0


In [42]:
# sns.barplot(x='Sex', y='HeartDisease', data=df);

In [43]:
# plt.hist(df.Age)
# plt.show()
# plt.bar(np.sort(np.unique(df.Age)),df.groupby('Age')['HeartDisease'].sum())

In [44]:
df.groupby('HeartDisease').Age.count()

HeartDisease
0    328
1    406
Name: Age, dtype: int64

In [45]:
X, y = df[df.columns.difference(['HeartDisease'])], df['HeartDisease']
X.shape

(734, 11)

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [47]:
cat_columns = ['ChestPainType', 'RestingECG', 'ST_Slope']
le_columns = ['Sex', 'FastingBS', 'ExerciseAngina']

In [48]:
X_test.Sex.value_counts()

M    147
F     37
Name: Sex, dtype: int64

In [49]:
le = LabelEncoder()

for le_column in le_columns:
    le.fit(X_train[le_column])
    X_train.loc[:, le_column] = pd.Series(le.transform(X_train[le_column]), name=le_column, index=X_train.index)
    X_test.loc[:, le_column] = pd.Series(le.transform(X_test[le_column]), name=le_column, index=X_test.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [50]:
# sscaler = MinMaxScaler()
# sscaler.fit(X_train[X_train.columns.difference(cat_columns)])

# train_scaled = pd.DataFrame(
#     sscaler.transform(X_train[X_train.columns.difference(cat_columns)]),
#     columns=X_train.columns.difference(cat_columns),
#     index=X_train.index)
# test_scaled = pd.DataFrame(
#     sscaler.transform(X_test[X_train.columns.difference(cat_columns)]),
#     columns=X_train.columns.difference(cat_columns),
#     index=X_test.index)

In [51]:
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder.fit(X_train[cat_columns])
    
train_encoded = pd.DataFrame(
    encoder.transform(X_train[cat_columns]),
    columns=encoder.get_feature_names(cat_columns),
    index=X_train.index)
test_encoded = pd.DataFrame(
    encoder.transform(X_test[cat_columns]),
    columns=encoder.get_feature_names(cat_columns),
    index=X_test.index)

In [52]:
X_train.columns.difference(cat_columns)

Index(['Age', 'Cholesterol', 'ExerciseAngina', 'FastingBS', 'MaxHR', 'Oldpeak',
       'RestingBP', 'Sex'],
      dtype='object')

In [53]:
X_train =  pd.concat([X_train[X_train.columns.difference(cat_columns)], train_encoded], axis=1)
X_test = pd.concat([X_test[X_test.columns.difference(cat_columns)], test_encoded], axis=1)

In [54]:
X_train.head()

Unnamed: 0,Age,Cholesterol,ExerciseAngina,FastingBS,MaxHR,Oldpeak,RestingBP,Sex,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
335,62,0,0,1,134,-0.8,120,1,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
324,46,0,0,1,133,-2.6,100,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
28,53,468,0,0,127,0.0,113,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
743,52,201,0,0,158,0.8,134,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
631,46,311,1,0,120,1.8,140,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [55]:
out_index = X_train[np.abs(stats.zscore(X_train)) > 4].index
X_train.drop(out_index, inplace=True)
y_train.drop(out_index, inplace=True)

In [56]:
# class_weight = dict((y_train.value_counts().sum() - y_train.value_counts()) / y_train.value_counts().sum())

In [57]:
# logR = LogisticRegression(C=1, tol=0.0001, fit_intercept=True, class_weight=class_weight, max_iter=500)

In [58]:
# logR.fit(X_train, y_train)

In [59]:
# logR.score(X_train, y_train), logR.score(X_test, y_test)

In [60]:
# s1 = []
# s = []
# for i in range(10):
#     rf = RandomForestClassifier(
#         n_estimators=100,
#         max_depth=None,
#         min_samples_split=10,
#         max_leaf_nodes=20,
#         class_weight=class_weight,
#         n_jobs=2)
#     rf.fit(X_train, y_train)
#     s1.append(rf.score(X_train, y_train)), s.append(rf.score(X_test, y_test))

# np.mean(np.array(s1)), np.mean(np.array(s))

In [61]:
# rf.score(X_train, y_train), rf.score(X_test, y_test)

In [62]:
# f1_score(rf.predict(X_train), y_train), f1_score(rf.predict(X_test), y_test)

In [63]:
# pca = PCA(n_components=15)
# pca.fit(X_train)
# pca.transform(X_train).shape, pca.transform(X_test).shape

In [64]:
# X_train = pd.DataFrame(
#     pca.transform(X_train),
#     # columns=encoder.get_feature_names(cat_columns),
#     index=X_train.index)
# X_test = pd.DataFrame(
#     pca.transform(X_test),
#     # columns=encoder.get_feature_names(cat_columns),
#     index=X_test.index)

In [65]:
dict((y.value_counts().sum() - y.value_counts()) / y.value_counts().sum())

{1: 0.44686648501362397, 0: 0.553133514986376}

In [66]:
y.shape[0] / (2 * np.bincount(y))
# np.bincount(y)

array([1.11890244, 0.90394089])

In [67]:
# s1 = []
# s = []
# for i in range(10):
#     rf = RandomForestClassifier(
#         n_estimators=100,
#         max_depth=None,
#         min_samples_split=10,
#         max_leaf_nodes=20,
#         class_weight=class_weight,
#         n_jobs=2)
#     rf.fit(X_train, y_train)
#     s1.append(rf.score(X_train, y_train)), s.append(rf.score(X_test, y_test))

# np.mean(np.array(s1)), np.mean(np.array(s))

In [101]:
def full_pipeline(data,
                  clf=None,
                  encoder=None,
                  le_d=None,
                  pca=None,
                  k=10):

    X, y = data[data.columns.difference(['HeartDisease'])], data['HeartDisease']
    print(X.shape)
    
    cat_columns = ['Sex', 'ChestPainType', 'FastingBS', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
    le_columns = ['Sex', 'FastingBS', 'ExerciseAngina']
    
    if le_d is None:
        le_d = dict()

        for le_column in le_columns:
            le_d[le_column] = LabelEncoder()
            le_d[le_column].fit(X.loc[:, le_column])

    for le_column in le_columns:
        X.loc[:, le_column] = pd.Series(le_d[le_column].transform(X.loc[:, le_column]), name=le_column, index=X.index)

    if encoder is None:
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
        encoder.fit(X[cat_columns])
    X_encoded = pd.DataFrame(
        encoder.transform(X[cat_columns]),
        columns=encoder.get_feature_names(cat_columns),
        index=X.index)

    X =  pd.concat([X[X.columns.difference(cat_columns)], X_encoded], axis=1)
    
    if pca is None:
        pca = PCA(n_components=15)
        pca.fit(X)
    X = pd.DataFrame(pca.transform(X), index=X.index)

    class_weight = dict((y.value_counts().sum() - y.value_counts()) / y.value_counts().sum())

    if clf is None:
        
        out_index = X[np.abs(stats.zscore(X)) > 4].index
        X.drop(out_index, inplace=True)
        y.drop(out_index, inplace=True)
        
        rf = RandomForestClassifier(
            n_estimators=100,
            max_depth=None,
            min_samples_split=10,
            max_leaf_nodes=15,
            class_weight=class_weight,
            n_jobs=4)
        
        clf = AdaBoostClassifier(base_estimator=rf, n_estimators=10)

        train_acc = []
        test_acc = []

        kf_cv = KFold(n_splits=k, shuffle=True)

        for train_index, test_index in kf_cv.split(X):
            # Train-test split
            X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]

            # Fit
            clf.fit(X_train, y_train)
            # Prediction
            y_pred_train = clf.predict(X_train)    # Local train
            y_pred_test = clf.predict(X_test)      # Local test

            # RMSE
            train_acc.append(accuracy_score(y_train, y_pred_train))
            test_acc.append(accuracy_score(y_test, y_pred_test))

        print('Local train ACC:', np.array(train_acc).mean())
        print('Local test ACC: ', np.array(test_acc).mean())
        
        clf.fit(X, y)
    else:
        y_pred = clf.predict(X)
        acc = accuracy_score(y, y_pred)
        print('Unseen test ACC:', acc)
    
    return le_d, encoder, pca, clf

In [102]:
le_d, encoder, pca, clf = full_pipeline(data)

(918, 11)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Local train ACC: 0.9959374737969157
Local test ACC:  0.8742685475444096


In [103]:
full_pipeline(test, clf=clf, pca=pca, encoder=encoder, le_d=le_d)

(184, 11)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unseen test ACC: 0.9891304347826086


({'Sex': LabelEncoder(),
  'FastingBS': LabelEncoder(),
  'ExerciseAngina': LabelEncoder()},
 OneHotEncoder(handle_unknown='ignore', sparse=False),
 PCA(n_components=15),
 AdaBoostClassifier(base_estimator=RandomForestClassifier(class_weight={0: 0.5533769063180828,
                                                                        1: 0.4466230936819172},
                                                          max_leaf_nodes=15,
                                                          min_samples_split=10,
                                                          n_jobs=4),
                    n_estimators=10))

In [71]:
# LabelEncoding
# 449 index outlier
# 