# Aim
The aim is to predict whether a patient has a 10-year risk of coronary heart disease (CHD). Additionally, participants are asked to create data visualizations in order to gain actionable insight into the topic.

# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


#Models import
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss 

#plot imports
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
sns.set(style='white', context='notebook', palette='deep')

import warnings
def ignore_warn(*_args, **_kwargs):
    """Accept any arguments and do nothing; used as a silent stand-in for ``warnings.warn``."""
    return None


# Route every call made through the ``warnings.warn`` attribute to the no-op above.
warnings.warn = ignore_warn

# constants
SEED = 0
NFOLDS = 5
np.random.seed(SEED)  # seed NumPy's global random generator

# Load Data

In [None]:
# Read both competition files. The 'id' column is removed from the feature
# frames; the test ids are kept in IDtest because the submission file needs them.
raw_train = pd.read_csv("../input/cardiovascular-study-dataset-predict-heart-disea/train.csv")
original_train = raw_train.drop(columns=['id'])

raw_test = pd.read_csv("../input/cardiovascular-study-dataset-predict-heart-disea/test.csv")
IDtest = raw_test['id']
original_test = raw_test.drop(columns=['id'])


In [None]:
#convert binary values to 0 and 1
def give_binary_values(dataset, col, one, zero):
    """Recode a two-valued column of ``dataset`` to 1 / 0 in place.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that owns the column; it is modified in place.
    col : str
        Name of the column to recode.
    one, zero : object
        Raw values that become 1 and 0 respectively. Any other value
        (including NaN) is left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, for call chaining.
    """
    # Recode on a standalone object-typed copy and assign the whole column back.
    # The previous ``dataset[col].loc[mask] = ...`` form is chained assignment:
    # it may write to a temporary and is a silent no-op under copy-on-write.
    values = dataset[col].astype(object)
    values[values == one] = 1
    values[values == zero] = 0
    # Let pandas pick a numeric dtype when only numbers remain, so the column
    # is usable by numeric routines instead of staying ``object``.
    dataset[col] = values.infer_objects()
    return dataset


#fill missing values
def handle_missing(dataset):
    """Encode ``sex`` / ``is_smoking`` as 0/1 and impute missing values in place.

    Steps, in order:
      * ``sex`` ('F' -> 1, 'M' -> 0) and ``is_smoking`` ('YES' -> 1, 'NO' -> 0)
        are recoded.
      * ``education`` is filled with its overall median.
      * ``cigsPerDay`` is set to 0 for every non-smoker.
      * Within each sex: ``heartRate``, ``BMI`` and ``totChol`` are filled with
        the group mean; ``BPMeds`` and ``glucose`` with the group median;
        ``cigsPerDay`` with the median of the *smokers* of that sex.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the columns named above; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object.
    """
    #convert sex and is_smoking to numerical category
    dataset = give_binary_values(dataset, 'sex', 'F', 'M')
    dataset = give_binary_values(dataset, 'is_smoking', 'YES', 'NO')

    dataset['education'] = dataset['education'].fillna(dataset['education'].median())

    # A non-smoker smokes zero cigarettes by definition. After this line the
    # only rows that can still miss 'cigsPerDay' are smokers.
    smoker = dataset['is_smoking'] == 1
    dataset.loc[dataset['is_smoking'] == 0, 'cigsPerDay'] = 0

    for sex_code in range(2):
        same_sex = dataset['sex'] == sex_code

        for col in ['heartRate', 'BMI', 'totChol']:
            avg = dataset.loc[same_sex, col].mean()
            dataset.loc[same_sex, col] = dataset.loc[same_sex, col].fillna(avg)

        for col in ['BPMeds', 'glucose']:
            med = dataset.loc[same_sex, col].median()
            dataset.loc[same_sex, col] = dataset.loc[same_sex, col].fillna(med)

        # Take the median over smokers only: the zeros just written for
        # non-smokers would otherwise drag the fill value towards 0 and turn
        # smokers with an unknown count into apparent non-smokers.
        cigs_med = dataset.loc[same_sex & smoker, 'cigsPerDay'].median()
        if pd.isna(cigs_med):
            # No smoker of this sex has a known count: fall back to the
            # whole-group median rather than leaving gaps.
            cigs_med = dataset.loc[same_sex, 'cigsPerDay'].median()
        dataset.loc[same_sex, 'cigsPerDay'] = dataset.loc[same_sex, 'cigsPerDay'].fillna(cigs_med)

    return dataset

In [None]:
# Impute the missing values of the train and test frames, then discard the
# 'is_smoking' column from each of them.
original_train = handle_missing(original_train).drop(columns=['is_smoking'])
original_test = handle_missing(original_test).drop(columns=['is_smoking'])

In [None]:
# Division of the features into groups.
# 'totChol' was previously in neither list, so it was never dropped and showed
# up in both partial datasets; it is mean-imputed like the other continuous
# measurements, so it belongs with the numerical features.
numerical_features = ["age", "education", "cigsPerDay", "BPMeds", "diabetes", "totChol",
                      "sysBP", "diaBP", "BMI", "heartRate", "glucose"]  # numerical features
# 'prevalentStroke' used to be listed twice; each feature appears once now.
categorical_features = ["sex", "prevalentStroke", "prevalentHyp"]  # categorical features

# Each partial dataset keeps the label column because only the other group is dropped.
numerical_dataset = original_train.drop(columns=categorical_features)  # numerical features + label
categorical_dataset = original_train.drop(columns=numerical_features)  # categorical features + label

# Cross Correlation
Checking correlation between features

In [None]:
def preset_correlation_of_Xy(dataset, title):
    """Show an annotated heatmap of the pairwise correlations in ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose ``.corr()`` matrix is drawn (features and label columns).
    title : str
        Text placed above the heatmap.
    """
    figure, axes = plt.subplots(figsize=(24, 20))
    correlations = dataset.corr()
    sns.heatmap(
        correlations,
        ax=axes,
        annot=True,
        annot_kws={'size': 20},
        cmap='coolwarm_r',
    )
    axes.set_title(title, fontsize=20)
    plt.show()
    


In [None]:
# Present the correlation matrix of the numerical features and the label
# (plot title spelling corrected: "correlation").
preset_correlation_of_Xy(numerical_dataset, "correlation matrix - numerical features")


In [None]:
# Present the correlation matrix of the categorical features and the label
# (plot title spelling corrected: "correlation").
preset_correlation_of_Xy(categorical_dataset, "correlation matrix - categorical features")

In [None]:
# According to the correlation matrices above, 'diabetes' is quite correlated
# with 'glucose' and 'diaBP' is quite correlated with 'sysBP'.
# Therefore 'diabetes' and 'diaBP' are dropped from the train and test files.
redundant_features = ['diabetes', 'diaBP']
original_train = original_train.drop(columns=redundant_features)  # train file
original_train.head(10)

In [None]:
# Remove the same two columns ('diabetes', 'diaBP') from the test file.
original_test = original_test.drop(columns=['diabetes', 'diaBP'])
original_test.head(10)

# Modeling

In [None]:
# Separate the label from the features and hold out 20% of the rows.
y_on_train = original_train['TenYearCHD'].to_frame()
X_on_train = original_train.drop(['TenYearCHD'], axis=1)
X_train_on_train, X_test_on_train, y_train_on_train, y_test_on_train = train_test_split(
    X_on_train, y_on_train, test_size=0.2, random_state=42)

# shuffle must be True for random_state to have any effect; scikit-learn raises
# a ValueError for KFold(shuffle=False, random_state=<int>). Shuffling also
# matters later: the same splitter is applied to the SMOTE-augmented frame,
# and contiguous folds on that frame can be heavily skewed towards one class.
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

In [None]:
# Modeling step: compare different algorithms with cross-validated accuracy.
random_state = 42

# Keep each display name next to its estimator so the labels on the chart
# cannot drift out of sync with the list of models.
named_classifiers = [
    ("SVC", SVC(random_state=random_state)),
    ("DecisionTree", DecisionTreeClassifier(random_state=random_state)),
    ("AdaBoost", AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state), random_state=random_state, learning_rate=0.1)),
    ("RandomForest", RandomForestClassifier(random_state=random_state, warm_start=True)),
    ("ExtraTrees", ExtraTreesClassifier(random_state=random_state, warm_start=True)),
    ("GradientBoosting", GradientBoostingClassifier(random_state=random_state, warm_start=True)),
    ("MultipleLayerPerceptron", MLPClassifier(random_state=random_state, warm_start=True)),
    ("KNeighboors", KNeighborsClassifier()),
    ("LogisticRegression", LogisticRegression(random_state=random_state, warm_start=True)),
    ("LinearDiscriminantAnalysis", LinearDiscriminantAnalysis()),
]
classifiers = [classifier for _, classifier in named_classifiers]

# The label is handed over as a 1-D array, the shape scikit-learn expects.
cv_target = y_train_on_train.values.ravel()
cv_results = []
for classifier in classifiers:
    cv_results.append(cross_val_score(classifier, X_train_on_train, y=cv_target, scoring="accuracy", cv=kf, n_jobs=-1))

cv_means = [cv_result.mean() for cv_result in cv_results]
cv_std = [cv_result.std() for cv_result in cv_results]

cv_res = pd.DataFrame({
    "CrossValMeans": cv_means,
    "CrossValerrors": cv_std,
    "Algorithm": [name for name, _ in named_classifiers],
})

# x / y are passed by keyword: recent seaborn releases reject positional data
# arguments. The error bars are drawn with a separate Axes.errorbar call so the
# plot does not depend on how seaborn forwards extra keywords to matplotlib.
g = sns.barplot(x="CrossValMeans", y="Algorithm", data=cv_res, palette="Set3", orient="h")
g.errorbar(
    x=cv_res["CrossValMeans"],
    y=np.arange(len(cv_res)),  # categorical bars sit at 0..n-1 in row order
    xerr=cv_res["CrossValerrors"],
    fmt="none",
    ecolor="black",
)
g.set_xlabel("Mean Accuracy")
g.set_title("Cross validation scores")
plt.show()

# Out-of-Fold Predictions

In [None]:

def get_oof(clf, x_train, y_train, x_test, kf):
    """Compute out-of-fold predictions for one first-level model.

    For every split produced by ``kf`` the same ``clf`` object is refitted on
    the training part, used to predict the held-out part of ``x_train`` and to
    predict all of ``x_test``. ``clf.fit`` is expected to train from scratch on
    each call.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    x_train, y_train : pandas.DataFrame
        Training features and target; rows are selected with ``.iloc``.
    x_test : pandas.DataFrame or array-like
        Test features, with the same column order as ``x_train``.
    kf : cross-validation splitter
        Object exposing ``split(X)`` and ``get_n_splits(X)``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``oof_train`` of shape ``(len(x_train), 1)`` holding the held-out
        predictions, and ``oof_test`` of shape ``(len(x_test), 1)`` holding the
        per-fold test predictions averaged over the folds.
    """
    # Size the per-fold buffer from the splitter that is actually used, not
    # from a module-level constant that may disagree with it.
    n_splits = kf.get_n_splits(x_train)

    # Predict on plain arrays, the same representation the model is fitted on.
    x_test_values = np.asarray(x_test)

    oof_train = np.zeros((x_train.shape[0],))
    oof_test_skf = np.empty((n_splits, x_test_values.shape[0]))

    for fold, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.iloc[train_index].values
        y_tr = y_train.iloc[train_index].values.ravel()
        x_te = x_train.iloc[test_index].values

        clf.fit(x_tr, y_tr)
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[fold, :] = clf.predict(x_test_values)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
# Create 5 objects that represent our 5 first-level models.
# warm_start is deliberately left at its default (False): these objects are
# refitted once per fold, and with warm_start=True a second fit() with an
# unchanged n_estimators keeps the trees from the first fit instead of
# retraining, so later folds would be scored by a model that saw their rows.
rf = RandomForestClassifier(n_estimators=1000, max_depth=6, min_samples_leaf=2, max_features='sqrt', n_jobs=-1, random_state=0, verbose=0)
et = ExtraTreesClassifier(n_estimators=1000, max_depth=8, min_samples_leaf=2, n_jobs=-1, random_state=0, verbose=0)
ada = AdaBoostClassifier(n_estimators=1000, learning_rate=0.75, random_state=0)
gb = GradientBoostingClassifier(n_estimators=1000, max_depth=5, min_samples_leaf=2, random_state=0, verbose=0)
svc = SVC(kernel='linear', C=0.025)

In [None]:
# Build the feature frame and the target (TenYearCHD) frame that feed the models.
y_train = original_train['TenYearCHD'].to_frame()
X_train = original_train.drop(['TenYearCHD'], axis=1)

# Balance the classes by oversampling with SMOTE.
# fit_resample is the current imbalanced-learn API (fit_sample was removed),
# and the sampler is seeded so the synthetic rows are reproducible.
# NOTE(review): resampling happens before the folds are cut, so synthetic rows
# in a training fold may be interpolated from rows of the held-out fold;
# consider resampling inside each fold.
sm = SMOTE(random_state=SEED)
X_train, y_train = sm.fit_resample(X_train, y_train)

X_test = original_test

In [None]:
# Create our OOF train and test predictions. These base results will be used as new features.
first_level_models = [
    ('et', et),    # Extra Trees
    ('rf', rf),    # Random Forest
    ('ada', ada),  # AdaBoost
    ('gb', gb),    # Gradient Boost
    ('svc', svc),  # Support Vector Classifier
]
oof_by_model = {
    label: get_oof(model, X_train, y_train, X_test, kf)
    for label, model in first_level_models
}

et_oof_train, et_oof_test = oof_by_model['et']
rf_oof_train, rf_oof_test = oof_by_model['rf']
ada_oof_train, ada_oof_test = oof_by_model['ada']
gb_oof_train, gb_oof_test = oof_by_model['gb']
svc_oof_train, svc_oof_test = oof_by_model['svc']

# Second-Level Predictions from the First-level Output

**First-level output as new features**

Having now obtained our first-level predictions, we can treat them as a new set of features to be used as training data for the next classifier. In the code below, the new columns are the first-level predictions from our earlier classifiers, and the next classifier is trained on them.

In [None]:
# One column of out-of-fold training predictions per first-level model.
# The SVC column is included so this table lists the same five models that are
# stacked into the second-level training matrix.
base_predictions_train = pd.DataFrame({
    'RandomForest': rf_oof_train.ravel(),
    'ExtraTrees': et_oof_train.ravel(),
    'AdaBoost': ada_oof_train.ravel(),
    'GradientBoost': gb_oof_train.ravel(),
    'SVC': svc_oof_train.ravel(),
})
base_predictions_train.head(10)

Correlation Heatmap of the Second Level Training set


In [None]:
# Pairwise correlation between the first-level models' out-of-fold predictions.
oof_correlation = base_predictions_train.astype(float).corr()
model_labels = base_predictions_train.columns.values

heatmap_trace = go.Heatmap(
    z=oof_correlation.values,
    x=model_labels,
    y=model_labels,
    colorscale='Viridis',
    showscale=True,
    reversescale=True,
)
data = [heatmap_trace]
py.iplot(data, filename='labelled-heatmap')

In [None]:
# Stack the first-level outputs column-wise: one column per model.
x_train = np.concatenate((et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1).astype(int)

# The test columns hold the mean of the per-fold class predictions, i.e. a
# fraction between 0 and 1. Round to the nearest label (majority vote across
# folds) before casting; a bare astype(int) truncates, turning e.g. 0.8 into 0.
x_test = np.rint(
    np.concatenate((et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)
).astype(int)


In [None]:
# Display the second-level training matrix (first-level predictions, one column per model).
x_train

In [None]:
# Display the second-level test matrix (first-level predictions, one column per model).
x_test

**Second-level learning model via XGBoost**



I chose XGBoost because it is built to optimize large-scale boosted-tree algorithms. I create an XGBClassifier, fit it to the first-level train and target data, and use the learned model to predict the test data as follows:

In [None]:
def feature_importances(clf, x, y):
    """Fit ``clf`` on ``(x, y)``, print its feature importances and return them.

    Parameters
    ----------
    clf : estimator
        Object whose ``fit(x, y)`` returns an object exposing ``feature_importances_``.
    x, y :
        Training features and target, passed to ``clf.fit`` unchanged.

    Returns
    -------
    The fitted model's ``feature_importances_``. Previously the function only
    printed them and returned ``None``, so the value assigned by callers was
    always empty.
    """
    importances = clf.fit(x, y).feature_importances_
    print(importances)
    return importances
        
# Refit each tree-based first-level model on the full training data and print
# its feature importances.
rf_feature, et_feature, ada_feature, gb_feature = (
    feature_importances(model, X_train, y_train)
    for model in (rf, et, ada, gb)
)

# Second-level model settings. learning_rate is not passed, so the library
# default applies (a value of 0.02 and gamma=1 were tried and left disabled).
xgb_params = dict(
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
)
gbm = xgb.XGBClassifier(**xgb_params)

# Train on the first-level training predictions, then label the test rows.
gbm.fit(x_train, y_train)
predictions = gbm.predict(x_test)

# Submission

In [None]:
# Generate the submission file: one row per test id with its predicted label.
submission_columns = {'id': IDtest, 'TenYearCHD': predictions}
StackingSubmission = pd.DataFrame(submission_columns)
StackingSubmission.to_csv("StackingSubmission.csv", index=False)