# Coronary-Heart-Disease-Prediction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.utils import resample
# Data Feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Data Splitting
from sklearn.model_selection import train_test_split
# Data Scaling
from sklearn.preprocessing import MinMaxScaler
#Data Modelling
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, classification_report
# Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
#Ensembling
from mlxtend.classifier import StackingCVClassifier


In [None]:
#data overview
# Load the Framingham heart-study CSV into a DataFrame and preview the first 20 rows.
df_data = pd.read_csv(r"../input/heart-diseases/datasets_4123_6408_framingham.csv")
df_data.head(20)

In [None]:
# (rows, columns) of the raw dataset.
df_data.shape

The dataset contains 4240 records and 16 columns, including the target column.

In [None]:
# Column names, non-null counts and dtypes.
df_data.info()

This shows an overview of the columns, their non-null counts and the data types of the dataset.

## EXPLORATORY DATA ANALYSIS

* Handling missing and duplicate data.
* Univariate, Bivariate and Multivariate Analysis.
* Individual Features descriptive statistics.
* Visualizing Target attribute shows if the dataset is imbalanced.
* Resampling imbalanced dataset by oversampling positive case.
* Feature Selection


### missing data handling

In [None]:
# Number of missing values in each column.
df_data.isnull().sum()

It shows the number of missing (NaN / null) values present in each column of the dataset:

   * education: 105
   * cigsPerDay: 29
   * BPMeds: 53
   * totChol: 50
   * BMI: 19
   * heartRate: 1
   * glucose: 388

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df_data.duplicated().sum()

 There are no duplicated rows present in the dataset.

In [None]:
# Most frequent glucose value; mode() returns a Series, so take its first entry.
print((df_data["glucose"].mode())[0])

In [None]:
# Impute missing glucose readings with the most frequent value (the mode).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated in recent pandas and does not update df_data under copy-on-write.
df_data["glucose"] = df_data["glucose"].fillna(df_data["glucose"].mode()[0])

In [None]:
# Drop every row that still contains a missing value, then confirm that
# no nulls remain in any column.
df_data.dropna(inplace=True)
df_data.isnull().sum()

The missing glucose values have been imputed with the mode, and all remaining rows containing missing values have been removed.

In [None]:
# One box plot per column on a single wide figure, to spot outliers visually.
plt.figure(figsize=(40,15), facecolor='w')
sns.boxplot(data=df_data)
plt.show()

In [None]:
# Largest total-cholesterol value in the data.
df_data['totChol'].max()

In [None]:
# Largest systolic blood-pressure value in the data.
df_data['sysBP'].max()

In [None]:
# Keep only rows with totChol below 600 and sysBP below 295 (strict comparisons,
# so rows exactly at either threshold are removed), then show the new shape.
df_data = df_data[df_data['totChol']<600.0]
df_data = df_data[df_data['sysBP']<295.0]
df_data.shape

* Removable outliers were detected in the totChol and sysBP columns of our dataset. Outliers in all other numerical columns are considered important and therefore are not removed.
* Now that the missing values, outliers and duplicate values have been dealt with, we perform EDA.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_data.describe()

Continuous value features analysis:

* Age : We can see that Min. age of subject found in given records is 32 while Max. being 70. So our values are ranging from 32 to 70.
* cigsPerDay : Subject smoking Cig. per day is as low as nill while we have 70 Cigs. per day making the Peak.
* totChol : Min. Cholesterol level recorded in our dataset is 107 while Max. is 696.
* sysBP : Min. Systolic Blood Pressure observed in Subject is 83 while Max. is 295.
* diaBP : Min. Diastolic Blood Pressure observed in Subject is 48 while Max. is 142.
* BMI : Body Mass Index in our dataset ranges from 15.54 to 57 .
* heartRate : Observed Heartrate in our case study is 44 to 143.
* glucose : Glucose sugar level range is 40 to 394.

In [None]:
#Checking relationship between variables
# Pairwise correlation matrix of all columns, drawn as an annotated heatmap;
# the matrix itself is also displayed as the cell's output.
cor=df_data.corr()
plt.figure(figsize=(15,15), facecolor='w')
sns.heatmap(cor,xticklabels=cor.columns,yticklabels=cor.columns,annot=True, cmap ='Blues' )
plt.title("Correlation among all the Variables of the Dataset", size=10)
cor

The Pearson correlation between the attributes provides information to deduce whether a feature is useful or not.

* currentSmoker and cigsPerDay have a strong correlation of 0.77.
* prevalentHyp is positively correlated with sysBP and diaBP (0.70 and 0.62 respectively).
* glucose and diabetes are positively correlated, as are sysBP and diaBP.
* The column education has a negative correlation with the outcome variable 'TenYearCHD'. In practice, distinguishing between patients by a feature such as education would be subjective and would not affect the prediction.

## Univariate Analysis

In [None]:
# Discrete (binary or ordinal) columns inspected with value counts and count plots.
# 'male' is the binary gender flag; the continuous 'age' column does not belong here.
categorical_features = ['male', 'education', 'currentSmoker', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes']

In [None]:
# Print the frequency table of every categorical column, each followed by a rule.
for feature in categorical_features:
    feature_counts = df_data[feature].value_counts()
    print(feature, ':')
    print(feature_counts)
    print("-----------------")

In [None]:
# Draw one count plot per categorical feature on a two-column grid.
num_plots = len(categorical_features)
total_cols = 2
# Ceiling division: exactly as many rows as needed (no spare row for even counts).
total_rows = -(-num_plots // total_cols)
# squeeze=False keeps axs two-dimensional even when there is a single row.
fig, axs = plt.subplots(nrows=total_rows, ncols=total_cols,
                        figsize=(5*total_cols, 5*total_rows), facecolor='w',
                        constrained_layout=True, squeeze=False)
panel_axes = axs.ravel()
for ax, var in zip(panel_axes, categorical_features):
    plot = sns.countplot(x=var, data=df_data, ax=ax)
# Hide the unused trailing panel(s) so no empty axes are rendered.
for ax in panel_axes[num_plots:]:
    ax.set_visible(False)

Among the categorical features:
* BPmeds, prevalentStroke and diabetes are highly imbalanced.
* BPMeds, currentSmoker, diabetes, male, prevalentHyp, and prevalentStroke are binary variable features of the dataset.
* The number of Smokers and non-Smokers in currentSmoker is almost the same

In [None]:
#numeric_features
# Density plots of the continuous measurements on a 2x3 grid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases
# (histplot/displot replace it) — confirm against the installed version.
plt.figure(figsize=(23,15))
plt.subplots_adjust(wspace=0.3, hspace=0.3)

# (column, colour, title) of each panel, in panel order.
panels = [
    ('glucose', 'blue', 'Distribution of Glucose'),
    ('totChol', 'orange', 'Distribution of Total Cholesterol'),
    ('sysBP', 'r', 'Distribution of Systolic BP'),
    ('diaBP', 'purple', 'Distribution of Dia. BP'),
    ('BMI', 'g', 'Distribution of BMI'),
    ('heartRate', 'lime', 'Distribution of HeartRate'),
]
for position, (column, shade, heading) in enumerate(panels, start=1):
    plt.subplot(2, 3, position)
    sns.distplot(df_data[column], color=shade)
    plt.title(heading)

* Glucose, Total Cholesterol, SysBP, and BMI are right-skewed.
* DiaBP and HeartRate are close to a normal distribution.

In [None]:
# Draw one violin plot per numeric feature on a two-column grid.
numeric_features = ['cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']
num_plots = len(numeric_features)
total_cols = 2
# Ceiling division: exactly as many rows as needed (no spare row for even counts).
total_rows = -(-num_plots // total_cols)
# squeeze=False keeps axs two-dimensional even when there is a single row.
fig, axs = plt.subplots(nrows=total_rows, ncols=total_cols,
                        figsize=(7*total_cols, 7*total_rows), facecolor='w',
                        constrained_layout=True, squeeze=False)
panel_axes = axs.ravel()
for ax, var in zip(panel_axes, numeric_features):
    plot = sns.violinplot(y=var, data=df_data, ax=ax, linewidth=3)
# Hide the unused trailing panel(s) so no empty axes are rendered.
for ax in panel_axes[num_plots:]:
    ax.set_visible(False)

* cigsPerDay has a highly uneven distribution with the most data present in 0.
* The majority portions of the following columns lie in the range:
 * totChol: 150 to 300
 * sysBP: 100 to 150
 * diaBP: 60 to 100
 * BMI: 20 to 30
 * heartRate: 50 to 100
 * glucose: 50 to 150

In [None]:
#Target variable
#Distribution of outcome variable, Heart Disease
plt.figure(figsize=(10, 8), facecolor='w')
plt.subplots_adjust(right=1.5)
plt.subplot(121)
sns.countplot(x="TenYearCHD", data=df_data)
plt.title("Count distribution of TenYearCHD", size=10)
plt.subplot(122)
# value_counts() orders by frequency, not by class label. Sort by label and take
# the wedge labels from the index so each wedge is always named (and coloured)
# after the class it actually represents.
class_counts = df_data["TenYearCHD"].value_counts().sort_index()
labels = class_counts.index.tolist()
plt.pie(class_counts, autopct="%1.1f%%", labels=labels, colors=["green","red"])
plt.show()

The distribution is highly imbalanced. As in, the number of negative cases outweigh the number of positive cases.
This would lead to class imbalance problem while fitting our models. 
Therefore, this problem needs to be addressed and taken care of.

## Bivariate Analysis

In [None]:
#Relationship between education and cigsPerDay
#Grouping education and cigsPerDay
# graph_1 holds the mean cigsPerDay for each education level; regplot draws
# those means together with a fitted regression line.
graph_1 = df_data.groupby("education", as_index=False).cigsPerDay.mean()
plt.figure(figsize=(10,8), facecolor='w')
sns.regplot(x=graph_1["education"], y=graph_1["cigsPerDay"])
plt.title("Graph showing cigsPerDay in every level of education.", size=15)
plt.xlabel("education", size=15)
plt.ylabel("cigsPerDay", size=15)
plt.xticks(size=10)
plt.yticks(size=10)

In [None]:
#checking for which gender has more risk of coronary heart disease CHD
# Total number of positive TenYearCHD cases per gender. The grouping key is the
# binary 'male' column (0 = female, 1 = male), matching the title and axis label.
graph_2 = df_data.groupby("male", as_index=False).TenYearCHD.sum()
#Ploting the above values
plt.figure(figsize=(10,8), facecolor='w')
sns.barplot(x=graph_2["male"], y=graph_2["TenYearCHD"])
plt.title("Graph showing which gender has more risk of coronary heart disease CHD", size=15)
plt.xlabel("Gender\n0 is female and 1 is male",size=15)
plt.ylabel("TenYearCHD cases", size=15)
plt.xticks(size=10)
plt.yticks(size=10)

According to this dataset, males have shown a slightly higher risk of coronary heart disease (TenYearCHD).

In [None]:
#Distribution of current smokers with respect to age
# Row counts per age, split into smoker / non-smoker bars via hue.
plt.figure(figsize=(20,10), facecolor='w')
sns.countplot(x="age",data=df_data,hue="currentSmoker")
plt.title("Graph showing which age group has more smokers.", size=15)
plt.xlabel("age", size=15)
plt.ylabel("age Count", size=15)
plt.xticks(size=10)
plt.yticks(size=10)

* Mid-age groups ranging from the age of 38 - 46 have more number of currentSmokers.
* No currentSmokers observed below the age of 32.
* maximum age for a currentSmokers is 70.

In [None]:
#Relation between cigsPerDay and risk of coronary heart disease.
# Row counts for each TenYearCHD class, with one bar per distinct cigsPerDay value.
plt.figure(figsize=(30,12), facecolor='w')
sns.countplot(x="TenYearCHD",data=df_data,hue="cigsPerDay")
plt.legend(title='cigsPerDay', fontsize='large')
plt.title("Graph showing the relation between cigsPerDay and risk of coronary heart disease.", size=15)
plt.xlabel("Risk of TenYearCHD", size=15)
plt.ylabel("Count of TenYearCHD", size=15)
plt.xticks(size=15)
plt.yticks(size=15)
plt.show()

* Low cigsPerDay comes with lower risk of CHD.
* Those who don't smoke, i.e., with a cigsPerDay of 0.0 has a really low risk of contracting the disease
* Although that is the case, low cigsPerDay doesn't actually guarantee a much lower risk of CHD

In [None]:
#Relation between sysBP and risk of CHD
# Grouping up the data and ploting it
# graph_3 holds the mean sysBP of each TenYearCHD class.
graph_3 = df_data.groupby("TenYearCHD", as_index=False).sysBP.mean()

plt.figure(figsize=(10,8), facecolor='w')
sns.barplot(x=graph_3["TenYearCHD"], y=graph_3["sysBP"])
plt.title("Graph showing the relation between sysBP and risk of CHD", size=15)
plt.xlabel("Risk of CHD", size=15)
plt.ylabel("sysBP", size=15)
plt.xticks(size=10)
plt.yticks(size=10)

In [None]:
# Regression plot over graph_3 (the per-class mean sysBP computed earlier).
# NOTE(review): graph_3 has one row per TenYearCHD class, so the fitted line
# only joins the class means; it does not show the distribution of sysBP.
plt.figure(figsize=(10,8), facecolor='w')
sns.regplot(x=graph_3["TenYearCHD"], y=graph_3["sysBP"])
plt.title("Distribution of sysBP in relation to the risk of CHD", size=15)
plt.xticks(size=10)
plt.yticks(size=10)

* Minor relation of higher risk of TenYearCHD found with higher sysBP
* Majority of people with sysBP ranging from 72 - 130 has lower chance of contracting the disease.

In [None]:
# Grouping up the data and ploting it
# Relation between diaBP and risk of CHD
# graph_4 holds the mean diaBP of each TenYearCHD class.
graph_4 = df_data.groupby("TenYearCHD", as_index=False).diaBP.mean()

plt.figure(figsize=(12,8), facecolor='w')
sns.barplot(x=graph_4["TenYearCHD"], y=graph_4["diaBP"])
plt.title("Graph showing the relation between diaBP and risk of CHD", size=15)
plt.xlabel("Risk of CHD", size=15)
plt.ylabel("diaBP", size=15)
plt.xticks(size=10)
plt.yticks(size=10)

In [None]:
# Regression plot over graph_4 (the per-class mean diaBP computed earlier).
# NOTE(review): graph_4 has one row per TenYearCHD class, so the fitted line
# only joins the class means; it does not show the distribution of diaBP.
plt.figure(figsize=(10,8), facecolor='w')
sns.regplot(x=graph_4["TenYearCHD"], y=graph_4["diaBP"])
plt.title("Distribution of diaBP in relation to the risk of CHD", size=15)
plt.xticks(size=10)
plt.yticks(size=10)

* Minor relation found between higher risk of TenYearCHD with higher diaBP similar to the previous one
* Majority of people with diaBP ranging upto 80.0 has lower chance of contracting the disease.

In [None]:
#elation between age and totChol
graph_5 = df_data.groupby("TenYearCHD", as_index=False).totChol.mean()

plt.figure(figsize=(10,8), facecolor='w')
sns.barplot(x=graph_5["TenYearCHD"], y=graph_5["totChol"])
plt.title("Graph showing the relation between age and totChol", size=15)
plt.xlabel("age", size=15)
plt.ylabel("totChol", size=15)
plt.xticks(size=10)
plt.yticks(size=10)

In [None]:
plt.figure(figsize=(10,8), facecolor='w')
sns.regplot(x=graph_5["TenYearCHD"], y=graph_5["totChol"])
plt.title("Distribution of age with respect to totChol", size=15)
plt.xticks(size=10)
plt.yticks(size=10)

* aged people have more cholesterol
* bad cholesterol in general

### Multivariate Analysis

In [None]:
#Relationship between age and cigsPerDay, totChol, glucose.
# One box plot per measurement, each in its own figure, grouped by age.
# Each entry pairs the plotted column with the font size of its title.
for measure, title_size in (("totChol", 20), ("cigsPerDay", 20), ("glucose", 15)):
    plt.figure(figsize=(20,10), facecolor='w')
    sns.boxplot(x="age", y=measure, data=df_data)
    plt.title("Distribution of age with respect to " + measure, size=title_size)
    plt.show()

* There is a minor relation between totChol and glucose.
* totChol has a steep, linear and inverse graph for lower ranges of age
* cigsPerDay has a fairly parallel relationship with age

In [None]:
#Distribution of sysBP vs diaBP with respect to currentSmoker and male attributes

#sysBP vs diaBP with respect to currentSmoker and male attributes
# lmplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (that would only leave an empty figure behind).
# x and y are passed by keyword because recent seaborn releases no longer
# accept them positionally; the keyword form works on older releases as well.
# Facet columns by gender ('male') and rows by smoking status; faceting on
# 'age' would create one column per distinct age value.
sns.lmplot(x='sysBP', y='diaBP',
           data=df_data,
           hue="TenYearCHD",
           col="male", row="currentSmoker")
plt.show()

The above graph plots the relationship between systolic blood pressure and diastolic blood pressure for patients based on their gender and whether they are current smokers or not and plots the best fit line

## Resampling imbalanced dataset by oversampling positive case

In [None]:
# Split the rows by outcome: target1 = positive cases, target0 = negative cases.
target1=df_data[df_data['TenYearCHD']==1]
target0=df_data[df_data['TenYearCHD']==0]

In [None]:
# Oversample the positive cases (sampling with replacement) until there are as
# many of them as negative cases, then stack both groups and show the class counts.
# NOTE(review): this happens before train_test_split is applied further down, so
# copies of the same positive row can end up in both the training and the test
# partition, which makes test scores optimistic — consider resampling only the
# training partition after the split.
target1=resample(target1,replace=True,n_samples=len(target0),random_state=40)
target=pd.concat([target0,target1])
target['TenYearCHD'].value_counts()

In [None]:
# From here on df_data refers to the class-balanced (oversampled) frame.
df_data=target
np.shape(df_data)

In [None]:
# Class distribution after oversampling: count plot on the left, pie on the right.
plt.figure(figsize=(12, 10), facecolor='w')
plt.subplots_adjust(right=1.5)
plt.subplot(121)
sns.countplot(x="TenYearCHD", data=df_data)
plt.title("Count of TenYearCHD column", size=15)
plt.subplot(122)
# value_counts() orders by frequency, which is arbitrary once both classes have
# the same count. Sort by class label and take the wedge labels from the index so
# each wedge is always named (and coloured) after the class it represents.
class_counts = df_data["TenYearCHD"].value_counts().sort_index()
labels = class_counts.index.tolist()
plt.pie(class_counts, autopct="%1.1f%%", labels=labels, colors=["crimson","seagreen"])
plt.show()

The number of positive and negative cases are equal.
Hence the classes are now balanced for model fitting

# Feature Selection

In [None]:
# Score the candidate predictors against the target with a chi-squared test.
y = df_data.iloc[:, -1]       # last column
X = df_data.iloc[:, 0:15]     # first 15 columns

bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)
df_columns = pd.DataFrame(X.columns)
df_scores = pd.DataFrame(fit.scores_)

# Side-by-side table: column name ('Specs') and its chi-squared score ('Score').
featureScores = pd.concat([df_columns, df_scores], axis=1)
featureScores.columns = ['Specs', 'Score']
top_scores = featureScores.nlargest(11, 'Score')
print(top_scores)

In [None]:
# Order the score table from highest to lowest chi-squared score and display it.
featureScores = featureScores.sort_values(by='Score', ascending=False)
featureScores


In [None]:
#Visualization of Feature Selection:
# Bar chart of the score of every feature, in the row order of featureScores.

plt.figure(figsize=(20,5))
sns.barplot(x='Specs', y='Score', data=featureScores, palette = "Blues_r")
plt.box(False)
plt.title('Feature importance', fontsize=15)
plt.xlabel('\n Features', fontsize=15)
plt.ylabel('Importance \n', fontsize=15)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Names in the first ten rows of featureScores (the ten highest-scoring
# features once the table has been sorted by descending score).
features_list = featureScores["Specs"].tolist()[:10]
features_list


These are the 10 best features that can be used to predict the output variable:

* Systolic Blood Pressure
* Glucose
* Age
* Cholesterin
* Cigarettes per Day
* Diastolic Blood Pressure
* Hypertensive
* Diabetes
* Blood Pressure Medication
* Gender

The columns removed are:

* Education: Due to irrelevance to outcome variable and being out of subject.
* CurrentSmoker: Due to presence of a more informative similar variable(CigsPerDay).
* PrevalentStroke: Due to high imbalance caused by this variable.
* BMI: Due to unimpactful effect on outcome variable.
* HeartRate: Due to the prediction made by sklearn algorithm.

In [None]:
#A new dataset with the most important features is created.
df = df_data[['sysBP', 'glucose','age','totChol','cigsPerDay','diaBP','prevalentHyp','diabetes','BPMeds','TenYearCHD']]
df.head()

In [None]:
#Final Correlation Check:
# Correlations (in percent) among the selected features and the target only,
# i.e. the reduced frame df rather than the full df_data.
sns.set_context('talk')
plt.figure(figsize=(22,10))
sns.heatmap(df.corr()*100, annot=True, cmap='Blues')

We can observe that almost all features have a strong correlation to the output variable.

## Feature Spliting And Scaling

In [None]:
# Min-max scale every column of df_data (target included) into [0, 1] and
# summarise the scaled values.
# NOTE(review): df_scaled is not referenced by any later cell visible in this
# notebook, and `scaler` is rebound before modelling — confirm this cell is needed.
scaler = MinMaxScaler(feature_range=(0,1))
df_scaled = pd.DataFrame(scaler.fit_transform(df_data), columns=df_data.columns)
df_scaled.describe()

## Training And Testing Data

In [None]:
# y is the target column and X is every other column of df; 40% of the rows are
# held out as the test set, with a fixed seed for a repeatable split.
y = df['TenYearCHD']
X = df.drop(['TenYearCHD'], axis=1)
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.4, random_state=1)

* The X train and test tables contain the feature columns and their values.
* The y train and test tables contain only the target column (TenYearCHD).

In [None]:
# Fit the min-max scaler on the training features only and apply the same
# transformation to the test features; both become NumPy arrays.
scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)
print ('Train set:', train_x.shape,  train_y.shape) 
print ('Test set:', test_x.shape,  test_y.shape) 

## Predictive Modelling

### * Logistic Regression

In [None]:
#evaluation and accuracy
# Fit a default LogisticRegression on the training data and predict the test set.
m1 = 'Logistic Regression'
logreg = LogisticRegression() 
logreg.fit(train_x, train_y) 
pred_y = logreg.predict(test_x)

from sklearn.metrics import jaccard_score  
# NOTE: the value printed is the Jaccard similarity score, which is a different
# metric from classification accuracy.
print('Accuracy of the model in jaccard similarity score is = ',  
      jaccard_score(test_y, pred_y))

In [None]:
#confusion matrix
# Rows are the true classes, columns the predicted classes (logistic regression).
cm = confusion_matrix(test_y, pred_y)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Blues")
plt.show()

# Text report for the same predictions.
print('The details for confusion matrix is =')
print(classification_report(test_y, pred_y))

### K Nearest Neighbors

In [None]:
#Evaluation And Accuracy
# Fit a 1-nearest-neighbour classifier on the training data and predict the test set.
m2 = 'KNeighborsClassifier'
knn = KNeighborsClassifier(n_neighbors=1)
model = knn.fit(train_x, train_y)
knn_predict = knn.predict(test_x)
# Plain classification accuracy, stored but not printed in this cell.
knn_acc_score = accuracy_score(test_y, knn_predict)

from sklearn.metrics import jaccard_score 
# NOTE: the value printed is the Jaccard similarity score, not accuracy.
print('Accuracy of the model in jaccard similarity score is = ',  
      jaccard_score(test_y, knn_predict))

In [None]:
#Confusion Matrix
# Rows are the true classes, columns the predicted classes (k-nearest neighbours).
cm = confusion_matrix(test_y, knn_predict)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Blues_r")
plt.show()

# Text report for the same predictions.
print('The details for confusion matrix is =')
print(classification_report(test_y, knn_predict))

### Decision tree

In [None]:
#Evaluation and Accuracy
# Fit an entropy-criterion decision tree (depth capped at 30, fixed seed) on the
# training data and predict the test set.
m3 = 'DecisionTreeClassifier'
dt = DecisionTreeClassifier(criterion = 'entropy',random_state=0,max_depth = 30)
dt.fit(train_x,train_y)
dt_predict = dt.predict(test_x)

from sklearn.metrics import jaccard_score 
# NOTE: the value printed is the Jaccard similarity score, not accuracy.
print('Accuracy of the model in jaccard similarity score is = ',  
      jaccard_score(test_y, dt_predict))

In [None]:
#Confusion Matrix
# Rows are the true classes, columns the predicted classes (decision tree).
cm = confusion_matrix(test_y, dt_predict)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="winter")
plt.show()

# Text report for the same predictions.
print('The details for confusion matrix is =')
print(classification_report(test_y, dt_predict))

### Random Forest

In [None]:
#Evaluation And Accuracy
# Fit a 200-tree random forest (depth capped at 12, fixed seed) on the training
# data and predict the test set.
m4 = 'Random Forest Classfier'
rf = RandomForestClassifier(n_estimators=200, random_state=0,max_depth=12)
rf.fit(train_x,train_y)
rf_predict = rf.predict(test_x)

from sklearn.metrics import jaccard_score 
# NOTE: the value printed is the Jaccard similarity score, not accuracy.
print('Accuracy of the model in jaccard similarity score is = ',  
      jaccard_score(test_y, rf_predict))

In [None]:
#Confusion Matrix
# Rows are the true classes, columns the predicted classes (random forest).
cm = confusion_matrix(test_y, rf_predict)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Greens")
plt.show()

# Text report for the same predictions.
print('The details for confusion matrix is =')
print(classification_report(test_y, rf_predict))

### Gradient Boosting Classifier

In [None]:
# Fit a default GradientBoostingClassifier on the training data and predict the test set.
m5 = 'Gradient Boosting Classifier'
gbc =  GradientBoostingClassifier()
gbc.fit(train_x,train_y)
gbc_predict = gbc.predict(test_x)

from sklearn.metrics import jaccard_score 
# NOTE: the value printed is the Jaccard similarity score, not accuracy.
print('Accuracy of the model in jaccard similarity score is = ',  
      jaccard_score(test_y, gbc_predict))

In [None]:
#Confusion Matrix
# Rows are the true classes, columns the predicted classes (gradient boosting).
cm = confusion_matrix(test_y, gbc_predict)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Greens_r")
plt.show()

# Text report for the same predictions.
print('The details for confusion matrix is =')
print(classification_report(test_y, gbc_predict))

## Hyperparameter Tuning for best Classifier

### Random Forest

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split.
# 'auto' is not offered: current scikit-learn rejects it, and for a classifier
# it meant the same as 'sqrt'. 'log2' gives the search a real alternative.
max_features = ['sqrt', 'log2']

# Maximum number of levels in tree (None = grow until the leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
rf = RandomForestClassifier()

# Random search of parameters using 2-fold cross validation (cv = 2),
# sampling 150 parameter combinations (n_iter = 150) and using all cores.
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 150, 
                               cv = 2, 
                               verbose=2, 
                               random_state=7, 
                               n_jobs = -1)

# Fit the random search model
rf_random.fit(train_x,train_y)

In [None]:
# best_estimator_ has already been refit on the whole training set by the search
# (RandomizedSearchCV refits by default), so it is used as-is; fitting it again
# would only repeat that work.
rf_hyper = rf_random.best_estimator_
print("Accuracy on training set is : {}".format(rf_hyper.score(train_x,train_y)))
print("Accuracy on validation set is : {}".format(rf_hyper.score(test_x, test_y)))
rf_predict = rf_hyper.predict(test_x)
# NOTE: the value printed next is the Jaccard similarity score, not accuracy.
print("Accuracy of Hyper-tuned Random Forest Classifier:",jaccard_score(test_y, rf_predict))
print(classification_report(test_y, rf_predict))

### Gradient Boosting Classifier

In [None]:
#Number of trees
n_estimators = [int(i) for i in np.linspace(start=100,stop=1000,num=10)]
#Number of features to consider at every split.
#'auto' is not offered: current scikit-learn rejects it for this estimator.
max_features = ['sqrt', 'log2']
#Maximum number of levels in tree
max_depth = [int(i) for i in np.linspace(10, 100, num=10)]
max_depth.append(None)
#Minimum number of samples required to split a node
min_samples_split=[2,5,10]
#Minimum number of samples required at each leaf node
min_samples_leaf = [1,2,4]

#Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

In [None]:
gb=GradientBoostingClassifier(random_state=0)
#Random search of parameters using 2-fold cross validation (cv=2),
#sampling 150 parameter combinations (n_iter=150) and ranking them by F1 score
gb_random = RandomizedSearchCV(estimator=gb, param_distributions=random_grid,
                              n_iter=150, scoring='f1', 
                              cv=2, verbose=2, random_state=0, n_jobs=-1,
                              return_train_score=True)

# Fit the random search model
gb_random.fit(train_x,train_y)

In [None]:
# best_estimator_ has already been refit on the whole training set by the search
# (RandomizedSearchCV refits by default), so it is used as-is; fitting it again
# would only repeat that work.
gb_hyper = gb_random.best_estimator_
print("Accuracy on training set is : {}".format(gb_hyper.score(train_x,train_y)))
print("Accuracy on validation set is : {}".format(gb_hyper.score(test_x, test_y)))
gbc_predict = gb_hyper.predict(test_x)
gbc_acc_score = accuracy_score(test_y, gbc_predict)
print("Accuracy of Hyper-tuned Gradient Boosting Classifier:",gbc_acc_score*100,'\n')
print(classification_report(test_y, gbc_predict))

In [None]:
# ROC curves are built from each model's predicted probability of the positive
# class (column 1 of predict_proba). Hard 0/1 predictions would collapse every
# curve to a single operating point joined to the corners.
# The random forest and gradient boosting curves use the tuned models.
lr_false_positive_rate,lr_true_positive_rate,lr_threshold = roc_curve(test_y,logreg.predict_proba(test_x)[:, 1])
knn_false_positive_rate,knn_true_positive_rate,knn_threshold = roc_curve(test_y,knn.predict_proba(test_x)[:, 1])
dt_false_positive_rate,dt_true_positive_rate,dt_threshold = roc_curve(test_y,dt.predict_proba(test_x)[:, 1])
rf_false_positive_rate,rf_true_positive_rate,rf_threshold = roc_curve(test_y,rf_hyper.predict_proba(test_x)[:, 1])
gbc_false_positive_rate,gbc_true_positive_rate,gbc_threshold = roc_curve(test_y,gb_hyper.predict_proba(test_x)[:, 1])


sns.set_style('whitegrid')
plt.figure(figsize=(15,8), facecolor='w')
plt.title('Reciever Operating Characterstic Curve')
plt.plot(lr_false_positive_rate,lr_true_positive_rate,label='Logistic Regression')
plt.plot(knn_false_positive_rate,knn_true_positive_rate,label='K-Nearest Neighbor')
plt.plot(dt_false_positive_rate,dt_true_positive_rate,label='Desion Tree')
plt.plot(rf_false_positive_rate,rf_true_positive_rate,label='Random Forest')
plt.plot(gbc_false_positive_rate,gbc_true_positive_rate,label='Gradient Boosting Classifier')
# Chance diagonal, then the left and top edges of the unit square in grey.
plt.plot([0,1],ls='--')
plt.plot([0,0],[1,0],c='.5')
plt.plot([1,1],c='.5')
plt.ylabel('True positive rate')
plt.xlabel('False positive rate')
plt.legend()
plt.show()

## Model Evaluation

In [None]:
# Test-set accuracy of each model, in percent. accuracy_score (not the Jaccard
# score) is used so the 'Accuracy' column holds what its name says, on the same
# 0-100 scale as the "Accuracy %" axis label and the stacking row added later.
# rf_predict and gbc_predict hold the predictions of the tuned models at this point.
model_predictions = [
    ('Logistic Regression', pred_y),
    ('K-Nearest Neighbour', knn_predict),
    ('Decision Tree', dt_predict),
    ('Random Forest', rf_predict),
    ('Gradient Boosting', gbc_predict),
]
model_ev = pd.DataFrame({
    'Model': [label for label, _ in model_predictions],
    'Accuracy': [accuracy_score(test_y, predicted) * 100 for _, predicted in model_predictions],
})
model_ev

In [None]:
# Bar chart of the 'Accuracy' column of model_ev, one colour per model.
colors = ['red','green','blue','gold','silver']
plt.figure(figsize=(20,15), facecolor='w')
plt.title("Barplot Representing Accuracy of different models")
plt.ylabel("Accuracy %")
plt.xlabel("Models")
plt.bar(model_ev['Model'],model_ev['Accuracy'],color = colors)
plt.show()

## Ensembling
* In order to increase the accuracy of the model we use ensembling, specifically the stacking technique: the four highest-accuracy models are stacked to create an ensemble model.

In [None]:
# Stack the tuned random forest and gradient boosting models with KNN and the
# decision tree; `rf` serves as the meta-classifier.
scv=StackingCVClassifier(classifiers=[rf_hyper, gb_hyper, knn, dt], meta_classifier= rf)
# Train and evaluate on the existing min-max-scaled train/test partitions instead
# of re-splitting X and y: a fresh split would feed the stack unscaled features
# and overwrite the scaled arrays the other models were evaluated on.
# np.asarray gives fit and predict the same plain-array input type.
scv.fit(np.asarray(train_x), np.asarray(train_y))
scv_predict = scv.predict(np.asarray(test_x))
scv_acc_score = accuracy_score(test_y, scv_predict)
print("Accuracy of StackingCVClassifier:",scv_acc_score*100,'\n')

In [None]:
# Rows are the true classes, columns the predicted classes (stacking ensemble).
cm = confusion_matrix(test_y, scv_predict)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Blues_r")
plt.show()

# Text report for the same predictions.
print('The details for confusion matrix is =')
print(classification_report(test_y, scv_predict))

## Conclusion

In [None]:
# Add the stacking ensemble's accuracy (in percent) as a new row.
# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions.
stacking_row = pd.DataFrame([{"Model":"Stacking Ensemble", "Accuracy":scv_acc_score*100}])
model_ev = pd.concat([model_ev, stacking_row], ignore_index=True)
model_ev

* Individual Best Model: Gradient Boosting( 96.6% )
* Overall Best Model: Stacking Ensemble Classification( 97% )