In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Index 
<a id='index'></a>
[Read Data](#read_data) <br>
[Analyze Data](#analyze_data) <br>
[Feature engineering and feature selection](#feature_engeneering) <br>
[Split train test](#split_train_test) <br>
[Logistic Regression](#logistic_regression) <br>
[Classification tree](#classification_tree) <br>
[XGBoost](#xgboost) <br>
[Support Vector Classifier](#svc) <br>
[KNN](#knn) <br>
[MLP](#mlp) <br>

 

### Read Data <a id='read_data'></a>

[Back to Index](#index)

In [None]:
# Locations of the two CSV files of the dataset
heart_csv = '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
o2_csv = '/kaggle/input/heart-attack-analysis-prediction-dataset/o2Saturation.csv'

# Load each file into its own pandas DataFrame
dfheart = pd.read_csv(heart_csv)
dfo2 = pd.read_csv(o2_csv)

### Analyze the data <a id='analyze_data'></a>

[Back to Index](#index)

In [None]:
# Column dtypes and non-null counts: a quick check for missing values
dfheart.info()

In [None]:
# Same dtype / non-null summary for the o2Saturation table
dfo2.info()

In [None]:
# The plotting calls below (plt.figure, plt.xticks, ...) live in the pyplot
# submodule, not in the top-level matplotlib package.
import matplotlib.pyplot as plt

In [None]:
# The columns live on very different scales; draw one box per column.
# figsize and rot are passed to pandas directly: pandas writes the tick labels
# itself using its own `rot` (default 0), so a rotation set through
# plt.xticks before the call does not survive.
dfheart.boxplot(figsize=(20, 10), rot=90);


In [None]:
# Histogram of each numeric column to inspect the distributions
dfheart.hist()

The histogram of `oldpeak` decays roughly exponentially, so its logarithm is taken later on. <br>

In [None]:
# Columns treated as numerical variables during the exploration
numer_var = ['age','trtbps','chol','thalachh','oldpeak']

In [None]:
# Columns treated as categorical variables during the exploration
categ_var = ['sex','cp','fbs','restecg','exng','slp','caa','thall']

In [None]:
import seaborn as sns
# Pairwise plots of the columns, coloured by the 'output' column
sns.pairplot(dfheart, hue = 'output')

Variables that appear to separate the two `output` classes: <br>
sex,
cp,
restecg,
thalachh,
exng,
oldpeak,
slp,
caa,
thall


In [None]:
# Correlation matrix of the numerical variables.
# Observation: age/thalachh and thalachh/oldpeak are slightly negatively correlated.
sns.heatmap(dfheart[numer_var].corr(),annot=True)

# Feature Engineering & Feature Selection <a id='feature_engeneering'></a>

[Back to Index](#index)

In [None]:
# Reduce the number of features: compared with the exploration lists,
# 'trtbps' and 'chol' are dropped from the numerical variables and
# 'fbs' from the categorical ones. Both names are rebound here.
numer_var = ['age','thalachh','oldpeak']
categ_var = ['sex','cp','restecg','exng','slp','caa','thall']

In [None]:
# Start the modelling frame from a copy of the selected numerical columns
df = dfheart[numer_var].copy()

In [None]:
# Add the selected categorical columns
df[categ_var] = dfheart[categ_var].copy()

In [None]:
# Add the 'output' column, used as the target further down
df['output'] = dfheart['output']

In [None]:
# Convert every categorical column to the pandas 'category' dtype in a single
# call (a dict-based astype replaces the per-column loop and its `copy=`
# keyword), so that pd.get_dummies one-hot encodes them.
df = df.astype({col: 'category' for col in categ_var})

In [None]:
# Check the dtypes after the conversion to 'category'
df.info()

In [None]:
# One-hot encode the categorical columns (one indicator column per category level)
df = pd.get_dummies(df)

In [None]:
# Check the columns produced by the one-hot encoding
df.info()

In [None]:
# Replace oldpeak by log(1 + oldpeak); a value of 0 maps to 0.
# The column is overwritten in place, so re-running this cell applies the
# transform a second time.
df['oldpeak']=np.log(1+df['oldpeak'])

In [None]:
# Standardise the numerical variables.
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split further down, so the test rows contribute to the scaling statistics.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df[numer_var])

In [None]:
# fit_transform refits the scaler before transforming, so the separate
# scaler.fit call in the previous cell is redundant.
df[numer_var] = scaler.fit_transform(df[numer_var])

In [None]:
# Box plot of the numerical variables after scaling
df[numer_var].boxplot()

# Split train test <a id='split_train_test'></a>

[Back to Index](#index)

In [None]:
# Separate the target column from the features, then hold out 25% of the rows
# as a test set; the fixed random_state makes the split reproducible.
y = df['output']
X = df.drop('output', axis = 1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.25,random_state=1234)

In [None]:
# Number of rows in the train and test sets
print(len(X_train),',',len(X_test))

In [None]:
# Share of each class in the train and test targets, plotted side by side
y_train_dist = y_train.groupby(y_train).size() / len(y_train)
y_test_dist = y_test.groupby(y_test).size() / len(y_test)

train_test_dist = pd.DataFrame({'train': y_train_dist, 'test': y_test_dist})
# rot=0 keeps the class labels on the x axis horizontal
ax = train_test_dist.plot.bar(rot=0)

# Logistic Regression <a id='logistic_regression'></a>

[Back to Index](#index)

In [None]:
# First try a simple model: logistic regression tuned by grid search.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# The default 'lbfgs' solver only supports the l2 penalty, so every "l1"
# candidate would fail to fit and be scored as NaN. 'liblinear' supports
# both penalties, which makes the whole grid valid.
parameter = {"C": [0.01, 0.1, 1, 5, 10, 20],
             "penalty": ["l1", "l2"],
             "solver": ["liblinear"],
             "random_state": [1234]}
logreg = LogisticRegression()
# liblinear does not use n_jobs, so the parallelism is moved to the search
# itself (candidates and folds are evaluated in parallel).
logreg_cv = GridSearchCV(logreg, parameter, cv=10, scoring='f1', n_jobs=-1)
logreg_cv.fit(X_train, y_train)


In [None]:
# Estimator built from the best parameter combination found by the search
logreg_cv.best_estimator_

In [None]:
# With the default refit=True, GridSearchCV has already refitted
# best_estimator_ on the data passed to fit, so the explicit fit repeats that work.
best_logreg = logreg_cv.best_estimator_
best_logreg.fit(X_train,y_train)

In [None]:
# Class predictions on the held-out test set
y_pred = best_logreg.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# Text report of the classification metrics on the test set
print(classification_report(y_test, y_pred))

In [None]:
def print_performance(gs_model, model_name, X_train,X_test,y_train,y_test):
    """Print a fitted search's cross-validation result and train/test f1 scores.

    Parameters
    ----------
    gs_model : fitted search object exposing ``best_score_``, ``best_params_``
        and ``best_estimator_`` (e.g. a fitted GridSearchCV).
    model_name : str
        Label printed at the top of the report.
    X_train, X_test : feature sets passed to ``best_estimator_.predict``.
    y_train, y_test : true labels matching ``X_train`` / ``X_test``.

    Returns
    -------
    None. The report is written to standard output.
    """
    # Imported here so the function does not rely on another cell having
    # imported sklearn.metrics before it is called.
    from sklearn.metrics import f1_score

    best_model = gs_model.best_estimator_
    y_pred_test = best_model.predict(X_test)
    y_pred_train = best_model.predict(X_train)

    print('##################### \n')
    # Identify which model the report belongs to
    print(model_name)
    print("Best score: %f using %s" % (gs_model.best_score_, gs_model.best_params_))
    print('##################### \n')
    print('#####################')
    print('f1 Score on the train: ', f1_score(y_train, y_pred_train))
    print('f1 Score on the test: ', f1_score(y_test, y_pred_test))
    print('##################### \n')

In [None]:
# print_performance looks up `metrics` at call time, so this import has to
# run before the call below.
from sklearn import metrics
model_name = 'Logistic Regression'
print_performance(logreg_cv, model_name, X_train,X_test,y_train,y_test)

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

def plot_ROC_curve(model, y_test, y_pred, name, X=None):
    """Plot, save and show the ROC curve of a fitted classifier.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    y_test : true binary labels of the evaluated rows.
    y_pred : hard class predictions. No longer used: the area is computed from
        the same probabilities as the curve. Kept so existing calls still work.
    name : str
        File name passed to ``plt.savefig``.
    X : features to score. Defaults to the notebook-level ``X_test`` so that
        existing four-argument calls behave as before.

    Returns
    -------
    None. The figure is saved under ``name`` and displayed.
    """
    if X is None:
        X = X_test
    # Probability of the positive class drives both the curve and its area,
    # so the number in the legend matches the curve that is drawn.
    scores = model.predict_proba(X)[:, 1]
    roc_auc = roc_auc_score(y_test, scores)
    fpr, tpr, _thresholds = roc_curve(y_test, scores)
    plt.figure()
    plt.plot(fpr, tpr, label='Area = %0.2f' % roc_auc)
    # Diagonal reference line of a random classifier
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig(name)
    plt.show()
# File name under which plot_ROC_curve saves the figure
name = 'Log_ROC'
plot_ROC_curve(best_logreg,y_test, y_pred, name)

# Classification Tree <a id='classification_tree'></a> <br>

Even with a small tree the model overfits. <br>

[Back to Index](#index)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Grid over the split criterion and the tree depth (np.arange(2, 4) gives
# depths 2 and 3), scored by 10-fold cross-validated f1
parameter = { 'criterion':['gini','entropy'],
             'max_depth': np.arange(2, 4),#np.arange(3, 15)
             'random_state' : [1234]
            }
tree = DecisionTreeClassifier()
tree_cv = GridSearchCV(tree,parameter,cv=10,scoring='f1')
tree_cv.fit(X_train,y_train)



In [None]:
# Take the best tree found by the search, fit it on the training set and
# predict the held-out test set
best_tree = tree_cv.best_estimator_
best_tree.fit(X_train,y_train)
y_pred = best_tree.predict(X_test)

In [None]:
# Display the selected tree
best_tree

In [None]:
# Classification metrics of the tree on the test set
print(classification_report(y_test, y_pred))

In [None]:
# Cross-validation score and train/test f1 of the tree search
model_name = 'Classification Tree'
print_performance(tree_cv, model_name, X_train,X_test,y_train,y_test)

In [None]:
# Use a file name of its own: the shared `name` variable still holds 'Log_ROC',
# so reusing it would overwrite the logistic-regression figure.
plot_ROC_curve(best_tree, y_test, y_pred, 'Tree_ROC')

# XGBoost <a id='xgboost'></a>

[Back to Index](#index)

In [None]:
# Switch for the XGBoost grid search, which is slow on this machine
RUN_XGBOOST = False # set to True to fit XGBoost

In [None]:

from xgboost import XGBClassifier
import xgboost as xgb
#classification tree obtained better results with depth 6
# Hyper-parameter grid for the XGBoost search
parameters = {
        'learning_rate' : [0.01,0.1,0.8],
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8],
        'colsample_bytree': [0.6, 0.8],
        'max_depth': [3, 4, 5, 6, 9],
        'lambda' : [0.1,0.5],
        'alpha' : [0.1,0.5],
        'random_state' : [1234]
        }
xgb_model = XGBClassifier()
gs_xbg = GridSearchCV(xgb_model, parameters, cv=10, scoring = 'f1', n_jobs=-1)
# The search is only fitted when the RUN_XGBOOST switch is on
# (plain truthiness test instead of comparing against True).
if RUN_XGBOOST:
    gs_xbg = gs_xbg.fit(X_train,y_train)
    best_xgboost = gs_xbg.best_estimator_
    best_xgboost.fit(X_train,y_train)
    y_pred = best_xgboost.predict(X_test)
    print(classification_report(y_test, y_pred))

# Support Vector Classifier <a id='svc'></a>

This classifier is overfitting.

[Back to Index](#index)

In [None]:
from sklearn.svm import SVC

In [None]:
SuppVectC = SVC()
# 'poly' is the name scikit-learn uses for the polynomial kernel; the previous
# spelling 'polinomial' is not a valid kernel, so those candidates could never
# be fitted and the `degree` values were never actually compared.
parameters = {'kernel':['linear','rbf','poly'],
                 'C': [20,50,100],#[8,9,10],#[5,10,100],#[0.1,1,10,100], # tried different regularisations
                 'gamma':[1],
                 'degree':[2,3,4,5], # degree of polynomial kernel
                 'random_state' : [1234]
                 }
gs_svc = GridSearchCV(SuppVectC, parameters, cv=10, scoring = 'f1', n_jobs=-1)
gs_svc = gs_svc.fit(X_train, y_train)

In [None]:
# Estimator built from the best parameter combination found by the search
gs_svc.best_estimator_

In [None]:
# Take the best SVC found by the search, fit it on the training set and
# predict the held-out test set
best_svc = gs_svc.best_estimator_
best_svc.fit(X_train,y_train)
y_pred = best_svc.predict(X_test)


In [None]:
# Classification metrics of the SVC on the test set
print(classification_report(y_test, y_pred))

In [None]:
# Cross-validation score and train/test f1 of the SVC search
model_name = 'Support Vector Classifier'
print_performance(gs_svc, model_name, X_train,X_test,y_train,y_test)

# KNN <a id='knn'></a>

Less overfitting. Good candidate

[Back to Index](#index)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Grid search over the number of neighbours (np.arange(3,30) gives 3 to 29),
# scored by 10-fold cross-validated f1; verbose=10 logs the progress of the fits
KN_model = KNeighborsClassifier()
parameters = {'n_neighbors':np.arange(3,30)}
gs_KN = GridSearchCV(KN_model, parameters, cv=10, scoring = 'f1', verbose=10, n_jobs=-1)
gs_KN = gs_KN.fit(X_train,y_train)

In [None]:
# Estimator built from the best parameter combination found by the search
gs_KN.best_estimator_

In [None]:
# Take the best KNN model found by the search and fit it on the training set
best_knn = gs_KN.best_estimator_
best_knn.fit(X_train,y_train)
# Predict with the KNN model itself; predicting with best_svc here would make
# the KNN report below describe the support vector classifier instead.
y_pred = best_knn.predict(X_test)

In [None]:
# Classification metrics on the test set for the current y_pred
print(classification_report(y_test, y_pred))

In [None]:
# Cross-validation score and train/test f1 of the KNN search
model_name = 'KNN Classifier'
print_performance(gs_KN, model_name, X_train,X_test,y_train,y_test)

In [None]:
# Use a file name of its own (the shared `name` variable still holds 'Log_ROC')
# and pass predictions made by the KNN model itself.
plot_ROC_curve(best_knn, y_test, best_knn.predict(X_test), 'KNN_ROC')

# Multi-Layer Perceptron <a id='mlp'></a>


[Back to Index](#index)

In [None]:
from sklearn.neural_network import MLPClassifier
# Grid search over the hidden-layer layout, the iteration cap and the L2
# penalty (alpha), scored by 10-fold cross-validated f1
mlp = MLPClassifier()
parameters = {"hidden_layer_sizes": [(5, 2),(2,2,2,2,2,2),(3,3,3,3),(10,8,5)],#[(5, 2),(2,2,2,2,2,2),(3,3,3,3),(10,8,5),(100)], 
              "max_iter": [100,200], 
              "alpha": [1,2,5],#[0.00001,0.1,0.5,1], # #L2 penalty
              #"activation" : ['logistic', 'tanh', 'relu'] <- with these overfitting
              "random_state" : [1234]}
gs_mlp = GridSearchCV(mlp, parameters, cv=10, scoring = 'f1', n_jobs=-1)
gs_mlp = gs_mlp.fit(X_train, y_train)

In [None]:
# Estimator built from the best parameter combination found by the search
gs_mlp.best_estimator_

In [None]:
# Take the best MLP found by the search, fit it on the training set and
# predict the held-out test set
best_mlp = gs_mlp.best_estimator_
best_mlp.fit(X_train,y_train)
y_pred = best_mlp.predict(X_test)

In [None]:
# Classification metrics of the MLP on the test set
print(classification_report(y_test, y_pred))

In [None]:
# Cross-validation score and train/test f1 of the MLP search
model_name = 'MLP Classifier'
print_performance(gs_mlp, model_name, X_train,X_test,y_train,y_test)

In [None]:
# Use a file name of its own: the shared `name` variable still holds 'Log_ROC',
# so reusing it would overwrite the logistic-regression figure.
plot_ROC_curve(best_mlp, y_test, y_pred, 'MLP_ROC')