# Load Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,StratifiedKFold,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

# Read Data

In [None]:
# Read the raw table once; `df` is the untouched source for every later step.
csv_path='../input/heart-disease-uci/heart.csv'
df=pd.read_csv(csv_path)

# Report (rows, columns), then preview the first ten records as the cell output.
print(df.shape)
df.head(10)

# Divide Data

In [None]:
# Hold out 15% of the rows as the final test set.
# random_state pins the shuffle so a fresh "Restart & Run All" yields the same
# partition; stratify keeps the class ratio of `target` equal in both parts.
train_df,test_df=train_test_split(
    df,
    test_size=0.15,
    random_state=42,
    stratify=df['target'],
)
print(train_df.shape)
print(test_df.shape)

# Basic Preprocessing

* Treat Outliers
* Scale data

In [None]:
# Shared state for the preprocessing cells: outlier correction and scaling.
# Only the numerical columns are checked for outliers and rescaled.
columns=[
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]

# One scaler instance, fitted on the training data and reused for the test data.
ss=StandardScaler()

def outlier_(X,df=None):
    """Return the values of column ``X`` lying outside the 1.5*IQR fences.

    Parameters
    ----------
    X : str
        Name of the column to inspect.
    df : pandas.DataFrame, optional
        Frame holding the column. When omitted, the notebook-level
        ``train_df`` is used, which keeps the original one-argument call working.

    Returns
    -------
    tuple of numpy.ndarray
        ``(lower, upper)``: the values below ``Q1 - 1.5*IQR`` and the values
        above ``Q3 + 1.5*IQR``. Either array may be empty.
    """
    frame=train_df if df is None else df
    series=frame[X]
    Q3=series.quantile(0.75)
    Q1=series.quantile(0.25)
    IQR=Q3-Q1
    lower=series[series<Q1-1.5*IQR].values
    upper=series[series>Q3+1.5*IQR].values
    return lower,upper

# Names of the numerical features in which at least one outlier was found.
outlier_col=[]
for feature in columns:
    below,above=outlier_(feature)
    has_outliers=len(below)>0 or len(above)>0
    if not has_outliers:
        print('Feature {} contain no outliers'.format(feature))
        continue
    # Flag the feature so the treatment step knows it must be capped.
    print('>>Feature {} contain outliers'.format(feature))
    outlier_col.append(feature)

In [None]:
iqr_values={} #Maps column name -> [lower fence, upper fence]; reused to treat the test data
def outlier_treatment(X,df=None):
    """Cap column ``X`` to its 1.5*IQR fences and remember the fences.

    Parameters
    ----------
    X : str
        Name of the column to treat.
    df : pandas.DataFrame, optional
        Frame to modify in place. When omitted, the notebook-level
        ``train_df`` is used, which keeps the original one-argument call working.

    Side effects
    ------------
    * Replaces ``df[X]`` with a copy clipped to ``[lb, ub]``.
    * Stores ``[lb, ub]`` under ``iqr_values[X]``.
    """
    frame=train_df if df is None else df
    Q3=frame[X].quantile(0.75)
    Q1=frame[X].quantile(0.25)
    IQR=Q3-Q1
    lb=Q1-1.5*IQR
    # The upper fence is measured from Q3 (it was previously computed from Q1,
    # which capped legitimate values sitting between Q1+1.5*IQR and Q3+1.5*IQR).
    ub=Q3+1.5*IQR
    # Whole-column assignment instead of chained indexing
    # (`frame[X][mask]=...`), which may write to a temporary and be lost.
    frame[X]=frame[X].clip(lower=lb,upper=ub)
    iqr_values[X]=[lb,ub]

# Treat every feature that was flagged as containing outliers; each call also
# records that feature's bounds in `iqr_values`.
for feature in outlier_col:
    outlier_treatment(feature)

In [None]:
# Target vector of the training split.
y=train_df['target'].to_numpy()

# Feature table: everything except the target column.
features=train_df.drop(columns=['target'])

# Fit the scaler on the training split only and rescale the numerical columns;
# the remaining columns are passed through unchanged.
features[columns]=ss.fit_transform(features[columns])

# The cross-validation loop indexes with integer arrays, so hand over a plain array.
X=features.to_numpy()

# Stratified_KFold + GridSearch
* With each fold, we will tune the model and save it
* Use all saved models from the K folds to make an averaging classifier

In [None]:

# Hyper-parameter grid. Every penalty is paired with a solver that supports it:
# 'liblinear' handles 'l1' and 'l2', while 'elasticnet' requires 'saga' together
# with an l1_ratio. With a single shared solver some of these candidates fail
# to fit and are therefore never really part of the search.
c_values=[i for i in range(1,150,5)]
params=[
    {'solver':['liblinear'],'penalty':['l1','l2'],'C':c_values},
    {'solver':['saga'],'penalty':['elasticnet'],'l1_ratio':[0.5],'C':c_values},
]

splits=5
kf=StratifiedKFold(n_splits=splits)

model_dict={} #Tuned estimator of every fold, keyed 'LogReg<fold index>'
model_ix=0

for train_ix,test_ix in kf.split(X,y):
    print('Fold: {}/{}'.format(model_ix+1,splits))
    train_data_X,train_data_y=X[train_ix],y[train_ix]
    test_data_X,test_data_y=X[test_ix],y[test_ix]

    # Tune on this fold's training part. Candidates are ranked by ROC AUC so the
    # selection criterion matches the metric reported below; max_iter is raised
    # to give the 'saga' solver room to converge.
    gs=GridSearchCV(LogisticRegression(class_weight='balanced',max_iter=1000),
                    param_grid=params,
                    scoring='roc_auc')
    gs.fit(train_data_X,train_data_y)
    best_model=gs.best_estimator_

    # ROC AUC is a ranking metric: score it on the positive-class probability,
    # not on hard 0/1 predictions (which collapse the curve to a single point).
    val_proba=best_model.predict_proba(test_data_X)[:,1]
    print('Validation Score: {:.3f}'.format(roc_auc_score(test_data_y,val_proba)))
    print('-------------------------------------')

    #Save Model
    model_dict['LogReg'+str(model_ix)]=best_model
    model_ix+=1

# Preprocess Test Data

In [None]:
test_X=test_df.drop('target',axis=1)
test_y=test_df['target']

# Cap the outlier-prone features with the fences stored in `iqr_values`
# ([lower, upper] per column). Values below the lower fence are raised to the
# LOWER fence (they were previously overwritten with the upper one), and the
# whole column is assigned at once instead of relying on chained indexing.
for col in outlier_col:
    low,high=iqr_values[col]
    test_X[col]=test_X[col].clip(lower=low,upper=high)

# Reuse the already-fitted scaler: transform only, never re-fit on test data.
test_X[columns]=ss.transform(test_X[columns])

# Make Test Predictions

In [None]:
# The fold models were fitted on plain arrays, so predict on an array as well.
test_features=test_X.values

# One column of positive-class probabilities per saved model; the width follows
# the number of models instead of a hard-coded 5.
predictions=np.zeros((len(test_features),len(model_dict)))
for ix,k in enumerate(model_dict):
    predictions[:,ix]=model_dict[k].predict_proba(test_features)[:,1]

# Averaging classifier: mean probability across the fold models.
mean_proba=predictions.mean(axis=1)

# Hard 0/1 labels from the averaged probability (same final type as before).
predictions=[1 if p>=0.5 else 0 for p in mean_proba]

# ROC AUC must be computed from scores, not from thresholded labels.
print('Test Score: {:.3f}'.format(roc_auc_score(test_y,mean_proba)))

* Nothing fancy or technical was done in this notebook; we just used three things:
  * > Cross Validation
  * > Grid Search
  * > Averaging

**Fin**