# Heart Stroke Prediction using XGBoost and Random Forest

In [None]:
import numpy as np
import pandas  as pd
import seaborn as sns
import matplotlib.pyplot as plt

#### Reading data from the files

In [None]:
# Load the stroke-prediction CSV (Kaggle input path) into a DataFrame.
df=pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
# Preview the first three rows as the cell output.
df.head(3)

#### Exploratory Data Analysis

In [None]:
# (n_rows, n_columns) of the raw data.
df.shape

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Correlation heatmap of the numeric columns.
# DataFrame.corr() no longer drops string columns silently (pandas >= 2.0
# raises on them), so select the numeric columns explicitly first.
plt.figure(figsize=(10,8))
sns.heatmap(df.select_dtypes(include=np.number).corr(),annot=True)
plt.show()

In [None]:
# Drop every row that contains a missing value and renumber the index from 0.
# NOTE(review): dropna() removes rows with a null in ANY column; the original
# note says only bmi has nulls — confirm against the isnull() counts.
df=df.dropna().reset_index(drop=True)

In [None]:
# sanity check: every column should now report zero missing values
df.isnull().sum()

In [None]:
# 'id' is a unique per-row identifier and carries no signal, so remove it.
df=df.drop('id',axis=1)

In [None]:
# Box plots of all numeric columns drawn on one shared axis.
plt.figure(figsize=(12,8))
df.boxplot()
plt.show()

### Boxplot of Numerical features in the data shows the presence of outliers in avg_glucose_level and bmi

In [None]:
# One box plot per numeric column.
# The Series is passed as x= explicitly: newer seaborn releases treat the first
# positional argument as `data`, which changes how the plot is drawn.
for i in df.select_dtypes(include=np.number).columns:
    sns.boxplot(x=df[i])
    plt.show()

### Boxplot of Numerical features in the data shows the presence of outliers in avg_glucose_level and bmi

In [None]:
# Treat the 0/1 flag columns (and the target) as categorical by storing them
# with object dtype, so they are excluded from the numeric column selection.
df=df.astype({'hypertension':object,'heart_disease':object,'stroke':object})


In [None]:
# Partition the columns by dtype: numeric ones vs. everything else.
df_int=df.select_dtypes(include='number')
df_cat=df.select_dtypes(exclude='number')

In [None]:
# sanity check: first rows of the numeric-only frame
df_int.head()

In [None]:
# sanity check: first rows of the non-numeric (categorical) frame
df_cat.head()

In [None]:
# Count plot of every categorical variable (iterating a DataFrame yields its
# column names). The Series is passed as x= explicitly because newer seaborn
# releases interpret the first positional argument as `data`, not `x`.
for i in df_cat:
    sns.countplot(x=df[i])
    plt.show()

### From this we infer that our target variable (stroke) is very highly imbalanced. To treat the imbalance we could use oversampling techniques (like SMOTE) or undersampling techniques; later in this notebook we use SMOTE (oversampling)

# Bivariate Analysis

In [None]:
# Stacked bar chart of stroke counts within each marital-status group.
pd.crosstab(index=df['ever_married'],columns=df['stroke']).plot.bar(stacked=True)
plt.show()

### We can see that people who have ever been married have a larger number of strokes than those who have never married

In [None]:
# Stacked bar chart of stroke counts within each work type.
pd.crosstab(index=df['work_type'],columns=df['stroke']).plot.bar(stacked=True)
plt.show()

### We can see that people working in the private sector have a larger number of strokes than any other work type

In [None]:
# Age density for the no-stroke and stroke groups, overlaid on one axis.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
plt.figure(figsize=(12,8))
sns.kdeplot(df[df['stroke']==0]['age'],fill=True,label='no_stroke')
sns.kdeplot(df[df['stroke']==1]['age'],fill=True,label='stroke')
plt.xlabel('Age')
plt.title('Stroke Density vs Age')
plt.legend()

plt.show()

### We can see that people aged between 60 and 90 are the most likely to have a stroke

In [None]:
# BMI density for the no-stroke and stroke groups, overlaid on one axis.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
plt.figure(figsize=(12,8))
sns.kdeplot(df[df['stroke']==0]['bmi'],fill=True,label='no_stroke')
sns.kdeplot(df[df['stroke']==1]['bmi'],fill=True,label='stroke')
plt.legend()
plt.title('Stroke Density vs BMI ')
plt.show()

### We can easily see that BMI does not affect the target very much

In [None]:
# Separating Dependent and Predictor variables
# The target is removed by name BEFORE one-hot encoding. Slicing off the last
# two encoded columns by position (as done previously) removed the stroke dummy
# and also a genuine feature dummy, and silently depended on column order.
feature_cat_cols=[c for c in df_cat.columns if c!='stroke']
X=pd.get_dummies(df.drop(columns=['stroke']),columns=feature_cat_cols,drop_first=True)
# 'stroke' is stored with object dtype, so convert it back to a numeric label.
y=pd.to_numeric(df['stroke'])

In [None]:
# Split into train and test sets (library-default test size), stratified on y
# so both splits keep the same class ratio; random_state fixes the shuffle.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,stratify=y,random_state=8)

In [None]:
# sanity check: class counts in the training labels
y_train.value_counts()

In [None]:
# sanity check: class counts in the test labels
y_test.value_counts()

In [None]:
# Applying SMOTE for treating imbalance in our data.
# Only the training split is resampled; X_test / y_test are left untouched.
from imblearn.over_sampling import SMOTE
smt = SMOTE(random_state=8)
X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)
# Class counts after resampling.
print(y_train_sm.value_counts())

### Logistic regression without SMOTE

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline logistic regression trained on the original (un-resampled) split.
log_reg=LogisticRegression(max_iter=1000).fit(X_train,y_train)

# Accuracy on each split, train first.
for tag,feats,target in (('Train:',X_train,y_train),('Test:',X_test,y_test)):
    print(tag,log_reg.score(feats,target))

# Hard class predictions on the held-out test split.
y_pred_lr=log_reg.predict(X_test)

In [None]:
# How many test rows were predicted as each class.
pd.DataFrame(y_pred_lr).value_counts()

In [None]:
# Metric helpers; roc_auc_score / roc_curve are used by the ROC cells further down.
from sklearn.metrics import classification_report,roc_auc_score,roc_curve
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test,y_pred_lr))

### Logistic Regression with SMOTE

In [None]:
# Logistic regression trained on the SMOTE-resampled training data.
log_regsm=LogisticRegression(max_iter=1000).fit(X_train_sm,y_train_sm)

# Train accuracy is measured on the resampled data, test accuracy on the
# untouched test split.
for tag,feats,target in (('Train:',X_train_sm,y_train_sm),('Test:',X_test,y_test)):
    print(tag,log_regsm.score(feats,target))

y_pred_lrsm=log_regsm.predict(X_test)

In [None]:
# Per-class metrics of the SMOTE-trained logistic regression on the test split.
print(classification_report(y_test,y_pred_lrsm))

### Logistic Regression with SMOTE gives better Precision, Recall and F1-score compared to Logistic Regression without SMOTE

### Decision tree without SMOTE

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Default decision tree trained on the original training split.
dtc=DecisionTreeClassifier().fit(X_train,y_train)

for tag,feats,target in (('Train:',X_train,y_train),('Test:',X_test,y_test)):
    print(tag,dtc.score(feats,target))

y_pred_dt=dtc.predict(X_test)

In [None]:
# Per-class metrics of the default decision tree on the test split.
print(classification_report(y_test,y_pred_dt))

## Decision Tree with SMOTE

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Default decision tree trained on the SMOTE-resampled training data.
dtcsm=DecisionTreeClassifier().fit(X_train_sm,y_train_sm)

for tag,feats,target in (('Train:',X_train_sm,y_train_sm),('Test:',X_test,y_test)):
    print(tag,dtcsm.score(feats,target))

y_pred_dtsm=dtcsm.predict(X_test)

In [None]:
# Per-class metrics of the SMOTE-trained decision tree on the test split.
print(classification_report(y_test,y_pred_dtsm))

### Decision Tree with SMOTE gives better Precision, Recall and F1-score compared to Decision Tree without SMOTE

## KNN without SMOTE

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours with library defaults, trained on the original split.
knn=KNeighborsClassifier().fit(X_train,y_train)

for tag,feats,target in (('Train:',X_train,y_train),('Test:',X_test,y_test)):
    print(tag,knn.score(feats,target))

y_pred_knn=knn.predict(X_test)


In [None]:
# Per-class metrics of KNN on the test split.
print(classification_report(y_test,y_pred_knn))

### KNN with SMOTE

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours trained on the SMOTE-resampled training data.
knnsm=KNeighborsClassifier().fit(X_train_sm,y_train_sm)

for tag,feats,target in (('Train:',X_train_sm,y_train_sm),('Test:',X_test,y_test)):
    print(tag,knnsm.score(feats,target))

y_pred_knnsm=knnsm.predict(X_test)

In [None]:
# Per-class metrics of the SMOTE-trained KNN on the test split.
print(classification_report(y_test,y_pred_knnsm))

### KNN with SMOTE gives better Recall and F1-score compared to KNN without SMOTE

## Naive Bayes without SMOTE

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes trained on the original training split.
naive=GaussianNB().fit(X_train,y_train)

for tag,feats,target in (('Train:',X_train,y_train),('Test:',X_test,y_test)):
    print(tag,naive.score(feats,target))

y_pred_gnb=naive.predict(X_test)

In [None]:
# Per-class metrics of Gaussian naive Bayes on the test split.
print(classification_report(y_test,y_pred_gnb))

## Naive Bayes with SMOTE

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes trained on the SMOTE-resampled training data.
naivesm=GaussianNB().fit(X_train_sm,y_train_sm)

for tag,feats,target in (('Train:',X_train_sm,y_train_sm),('Test:',X_test,y_test)):
    print(tag,naivesm.score(feats,target))

y_pred_naivesm=naivesm.predict(X_test)

In [None]:
# Per-class metrics of the SMOTE-trained naive Bayes on the test split.
print(classification_report(y_test,y_pred_naivesm))

### Naive Bayes with SMOTE gives a lower F1-score compared to Naive Bayes without SMOTE

# Applying Cross validation to check Sampling Bias

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of a fresh logistic regression on the
# training split; one score per fold.
cv_estimator=LogisticRegression(max_iter=1000)
score=cross_val_score(cv_estimator,X_train,y_train,scoring='accuracy',cv=10)
print(score)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of the decision tree on the training split.
score=cross_val_score(estimator=dtc,X=X_train,y=y_train,scoring='accuracy',cv=10)
print(score)

In [None]:
from sklearn.metrics import roc_auc_score,roc_curve

def roc_curve1(model, X=None, y=None):
    """Plot the ROC curve of a fitted classifier and annotate it with its AUC.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    X, y : optional evaluation features / true labels. When omitted, the
        notebook-level ``X_test`` / ``y_test`` are used, which keeps existing
        single-argument calls working unchanged.
    """
    X = X_test if X is None else X
    y = y_test if y is None else y

    # Score of the positive class (second column of predict_proba).
    pos_proba = model.predict_proba(X)[:, 1]
    fpr, tpr, th = roc_curve(y, pos_proba)
    auc = roc_auc_score(y, pos_proba)

    plt.plot(fpr, tpr)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    # Diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.title('ROC curve for Classifier')
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    # Pass a formatted string: a tuple here would be rendered as its repr,
    # parentheses and quotes included.
    plt.text(x=0.02, y=0.9, s=f'AUC Score: {auc:.4f}')
    plt.grid(True)
    plt.show()

In [None]:
# ROC curve of the logistic regression trained without SMOTE.
roc_curve1(log_reg)

In [None]:
#### ROC curve along with scores for comparing 2 algorithms
# Positive-class probabilities on the test split for both logistic regressions.
y_pred_proba= log_reg.predict_proba(X_test)[:,1]

y_pred_probasm= log_regsm.predict_proba(X_test)[:,1]

plt.figure(figsize = (8, 8))
## 1st classifier
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label='log_reg')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.title('ROC-curves for Logistic Regression with and without SMOTE', fontsize = 15)
plt.xlabel('False positive rate (1-Specificity)', fontsize = 15)
plt.ylabel('True positive rate (Sensitivity)', fontsize = 15)
# Formatted strings are used for the annotations: passing a tuple to plt.text
# renders its repr, parentheses and quotes included.
plt.text(x = 0.02, y = 0.9, s = f'AUC Score for log_reg model: {roc_auc_score(y_test, y_pred_proba):.4f}')
## 2nd classifier
fpr1, tpr1, thresholds1 = roc_curve(y_test, y_pred_probasm)
plt.plot(fpr1, tpr1, label='log_reg_sm')
plt.text(x = 0.02, y = 0.8, s = f'AUC Score for log_reg_sm model: {roc_auc_score(y_test, y_pred_probasm):.4f}')
plt.grid(True)
# The legend tells the two curves apart.
plt.legend(loc='lower right')
plt.show()

## Validation Curve for Logistic Regression Model without SMOTE 

In [None]:
# NOTE(review): this silences ALL warnings for the rest of the session,
# including convergence and failed-fit warnings from later cells.
from warnings import filterwarnings
filterwarnings('ignore')

from sklearn.model_selection import validation_curve

# Inverse regularisation strengths to sweep, spanning several decades.
C_param_range = [0.001,0.01,0.1,1,10,100,1000]

plt.figure(figsize=(15, 10))

# Logistic Regression validation curve
train_scores, test_scores = validation_curve(estimator=log_reg,X=X_train,y=y_train ,param_name='C',param_range=C_param_range)

# Mean / standard deviation across CV folds for each C value.
# (The standard deviations are computed but not plotted in this cell.)
train_mean = np.mean(train_scores,axis=1)
train_std = np.std(train_scores,axis=1)
test_mean = np.mean(test_scores,axis=1)
test_std = np.std(test_scores,axis=1)

# Only the first slot of a 2x2 grid is used; x axis is logarithmic in C.
plt.subplot(2,2,1)
plt.semilogx(C_param_range
            ,train_mean
            ,color='blue'
            ,marker='o'
            ,markersize=5
            ,label='training accuracy')
    
plt.semilogx(C_param_range
            ,test_mean
            ,color='green'
            ,marker='x'
            ,markersize=5
            ,label='test accuracy') 
    
plt.xlabel('C_parameter')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.5,1])

## Validation Curve for Decision Tree Model without SMOTE 

In [None]:
# Decision Tree validation curve: accuracy as a function of max_depth (1..12).
dtc_param_range = np.arange(1,13)

plt.figure(figsize=(15, 10))

train_scores, test_scores = validation_curve(dtc, X_train, y_train, param_name='max_depth', param_range=dtc_param_range)

# Mean and spread across the CV folds for every depth.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

lw=0.5
plt.subplot(2,2,1)
# Each entry: (mean, std, line colour, marker, legend label, band colour).
for mean, std, line_color, mark, legend_label, band_color in (
        (train_mean, train_std, 'blue', 'o', 'training accuracy', 'darkorange'),
        (test_mean, test_std, 'green', 'x', 'test accuracy', 'navy')):
    plt.semilogx(dtc_param_range, mean, color=line_color, marker=mark, markersize=5, label=legend_label)
    # Shaded band of one standard deviation around the mean.
    plt.fill_between(dtc_param_range, mean - std, mean + std, alpha=0.2, color=band_color, lw=lw)

plt.xlabel('max_depth_parameter')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.5,1])

## Validation Curve for Decision Tree Model with SMOTE 

In [None]:
# Decision Tree (SMOTE) validation curve: accuracy vs. max_depth (1..24),
# evaluated on the resampled training data.
dtc_param_range = np.arange(1,25)

plt.figure(figsize=(15, 10))

train_scores, test_scores = validation_curve(dtcsm, X_train_sm, y_train_sm, param_name='max_depth', param_range=dtc_param_range)

# Mean and spread across the CV folds for every depth.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

lw=0.5
plt.subplot(2,2,1)
# Each entry: (mean, std, line colour, marker, legend label, band colour).
for mean, std, line_color, mark, legend_label, band_color in (
        (train_mean, train_std, 'blue', 'o', 'training accuracy', 'darkorange'),
        (test_mean, test_std, 'green', 'x', 'test accuracy', 'navy')):
    plt.semilogx(dtc_param_range, mean, color=line_color, marker=mark, markersize=5, label=legend_label)
    # Shaded band of one standard deviation around the mean.
    plt.fill_between(dtc_param_range, mean - std, mean + std, alpha=0.2, color=band_color, lw=lw)

plt.xlabel('max_depth_parameter')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.5,1])

## Tuning Decision Tree without SMOTE by taking Hyperparameter range as per Validation Curve

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the decision tree trained without SMOTE.
# min_samples_split starts at 2: scikit-learn rejects an integer value of 1,
# so candidates using it would fail to fit and only waste search time.
params={'criterion':['entropy', 'gini'],'max_depth': range(2, 10),'min_samples_split' : range(2,5)}
dt=DecisionTreeClassifier()
# 5-fold cross-validated exhaustive search on the training split.
grid=GridSearchCV(dt,params,cv=5)
grid.fit(X_train,y_train)

print('The best value of hyperparameters "criterion", "max_depth", and "min_samples_split"')
print(grid.best_params_)

## Fitting Decision Tree algo without SMOTE using the hyperparameters deduced above

In [None]:
# Decision tree refitted with fixed hyperparameters.
# NOTE(review): these values are hard-coded, presumably copied from the grid
# search output — confirm they still match grid.best_params_ after a re-run.
dt_tuned=DecisionTreeClassifier(criterion='entropy',max_depth=2,min_samples_split=2)
dt_tuned.fit(X_train,y_train)

y_pred_dttuned= dt_tuned.predict(X_test)

# Training accuracy (shown as the cell output).
dt_tuned.score(X_train,y_train)

In [None]:
# Per-class metrics of the tuned decision tree on the test split.
print(classification_report(y_test,y_pred_dttuned))

In [None]:
# Test accuracy of the tuned decision tree.
dt_tuned.score(X_test,y_test)

## Tuning Decision Tree with SMOTE by taking Hyperparameter range as per Validation Curve

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate depths for the SMOTE-trained decision tree.
Parameter_Trials={'max_depth': [11,12,13,14,15,16]}

# 5-fold search over max_depth only, fitted on the resampled training data.
Grid_Search = GridSearchCV(dtcsm, Parameter_Trials, cv=5, n_jobs=1)
GridSearchResults=Grid_Search.fit(X_train_sm,y_train_sm)

print('The best value of hyperparameters "max_depth" :')
print(Grid_Search.best_params_)

In [None]:
# SMOTE-trained decision tree refitted with fixed hyperparameters.
# NOTE(review): max_depth is hard-coded, presumably from the grid search
# output; criterion='entropy' was not part of that search grid — confirm intended.
dtsm_tuned=DecisionTreeClassifier(criterion='entropy',max_depth=16)
dtsm_tuned.fit(X_train_sm,y_train_sm)

y_pred_dtsmtuned= dtsm_tuned.predict(X_test)

# Accuracy on the resampled training data (shown as the cell output).
dtsm_tuned.score(X_train_sm,y_train_sm)

In [None]:
# Per-class metrics of the tuned SMOTE decision tree on the test split.
print(classification_report(y_test,y_pred_dtsmtuned))

In [None]:
# ROC-Curve for the default Decision Tree (trained without SMOTE)
roc_curve1(dtc)

In [None]:
# ROC-Curve for the tuned Decision Tree (trained without SMOTE)
roc_curve1(dt_tuned)

## Comparing AUC Score along with ROC curve for Decision Tree, tuned Decision Tree, Decision Tree with SMOTE and tuned Decision Tree with SMOTE 

In [None]:
# Positive-class probabilities on the test split for the four decision trees.
y_pred_probad= dtc.predict_proba(X_test)[:,1]

y_pred_probadsm= dtcsm.predict_proba(X_test)[:,1]

y_pred_probad_tuned= dt_tuned.predict_proba(X_test)[:,1]

y_pred_probadsm_tuned= dtsm_tuned.predict_proba(X_test)[:,1]

plt.figure(figsize = (12, 8))
## 1st classifier
fpr, tpr, thresholds = roc_curve(y_test, y_pred_probad)
plt.plot(fpr, tpr, label='DT')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.title('ROC curve for different Models', fontsize = 15)
plt.xlabel('False positive rate (1-Specificity)', fontsize = 15)
plt.ylabel('True positive rate (Sensitivity)', fontsize = 15)
# Formatted strings are used for the annotations: passing a tuple to plt.text
# renders its repr, parentheses and quotes included.
plt.text(x = 0.02, y = 0.9, s = f'AUC Score for DT model: {roc_auc_score(y_test, y_pred_probad):.4f}')
## 2nd classifier
fpr1, tpr1, thresholds1 = roc_curve(y_test, y_pred_probadsm)
plt.plot(fpr1, tpr1, label='DT_SMOTE')
plt.text(x = 0.02, y = 0.8, s = f'AUC Score for DT_SMOTE model: {roc_auc_score(y_test, y_pred_probadsm):.4f}')

## 3rd classifier
fpr2, tpr2, thresholds2 = roc_curve(y_test, y_pred_probad_tuned)
plt.plot(fpr2, tpr2, label='tuned_DT')
plt.text(x = 0.02, y = 0.7, s = f'AUC Score for tuned_DT model: {roc_auc_score(y_test, y_pred_probad_tuned):.4f}')

## 4th classifier
fpr3, tpr3, thresholds3 = roc_curve(y_test, y_pred_probadsm_tuned)
plt.plot(fpr3, tpr3, label='tuned_DT_SMOTE')
plt.text(x = 0.02, y = 0.6, s = f'AUC Score for tuned_DT_SMOTE model: {roc_auc_score(y_test, y_pred_probadsm_tuned):.4f}')
plt.grid(True)
# The legend tells the four curves apart.
plt.legend(loc='lower right')
plt.show()

## Applying Ensemble Techniques 

# Random Forest without SMOTE

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library defaults, trained on the original training split.
rf=RandomForestClassifier().fit(X_train,y_train)

# Training accuracy (shown as the cell output).
rf.score(X_train,y_train)

In [None]:
# Class predictions of the default random forest on the test split.
y_pred_rf=rf.predict(X_test)

In [None]:
# Per-class metrics of the default random forest on the test split.
print(classification_report(y_test,y_pred_rf))

In [None]:
# Test accuracy of the default random forest.
rf.score(X_test,y_test)

In [None]:
# Validation Curve for Random Forest without SMOTE
rf_param_range = np.arange(1,1000,100)

plt.figure(figsize=(12,8))

train_scores, test_scores = validation_curve(estimator=rf,X=X_train,y=y_train ,param_name='n_estimators',param_range=rf_param_range)

train_mean = np.mean(train_scores,axis=1)
train_std = np.std(train_scores,axis=1)
test_mean = np.mean(test_scores,axis=1)
test_std = np.std(test_scores,axis=1)

lw=0.5
plt.subplot(2,2,1)
plt.plot(rf_param_range
            ,train_mean
            ,color='blue'
            ,marker='o'
            ,markersize=5
            ,label='training accuracy')
plt.fill_between(rf_param_range, train_mean - train_std,
                 train_mean + train_std, alpha=0.2,
                 color="darkorange", lw=lw)

plt.plot(rf_param_range
            ,test_mean
            ,color='green'
            ,marker='x'
            ,markersize=5
            ,label='test accuracy') 
plt.fill_between(rf_param_range, test_mean - test_std,
                 test_mean + test_std, alpha=0.2,
                 color="navy", lw=lw)

    
plt.xlabel('n_estimators_parameter')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.9,1])

In [None]:
# Validation Curve for Random Forest without SMOTE
rf_param_range = np.arange(1,13)

plt.figure(figsize=(15, 10))

train_scores, test_scores = validation_curve(estimator=rf,X=X_train,y=y_train ,param_name='max_depth',param_range=rf_param_range)

train_mean = np.mean(train_scores,axis=1)
train_std = np.std(train_scores,axis=1)
test_mean = np.mean(test_scores,axis=1)
test_std = np.std(test_scores,axis=1)

lw=0.5
plt.subplot(2,2,1)
plt.plot(rf_param_range
            ,train_mean
            ,color='blue'
            ,marker='o'
            ,markersize=5
            ,label='training accuracy')
plt.fill_between(rf_param_range, train_mean - train_std,
                 train_mean + train_std, alpha=0.2,
                 color="darkorange", lw=lw)

plt.plot(rf_param_range
            ,test_mean
            ,color='green'
            ,marker='x'
            ,markersize=5
            ,label='test accuracy') 
plt.fill_between(rf_param_range, test_mean - test_std,
                 test_mean + test_std, alpha=0.2,
                 color="navy", lw=lw)

    
plt.xlabel('max_depth_parameter')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.9,1])

## Tuning Random Forest without SMOTE by taking Hyperparameter range as per Validation Curve

In [None]:
# Search space for the random forest trained without SMOTE:
# split criterion, tree depth and number of trees (1, 51, 101, 151).
params={'criterion':['entropy', 'gini'],'max_depth': range(2,8),'n_estimators' : range(1,200,50)}

# 5-fold cross-validated exhaustive search on the training split.
grid_rf=GridSearchCV(rf,params,cv=5)
grid_rf.fit(X_train,y_train)

# The message names the hyperparameters that are actually tuned in this grid
# (n_estimators, not min_samples_split).
print('The best value of hyperparameters "criterion", "max_depth", and "n_estimators"')
print(grid_rf.best_params_)

In [None]:
# Random forest refitted with fixed hyperparameters.
# NOTE(review): values are hard-coded, presumably from the grid search output;
# n_estimators=1 makes this a single-tree "forest" — confirm intended.
rf_tuned=RandomForestClassifier(n_estimators=1,max_depth=2,criterion='entropy')

rf_tuned.fit(X_train,y_train)
# Training accuracy (not displayed: it is not the last expression of the cell).
rf_tuned.score(X_train,y_train)

y_pred_tunedrf=rf_tuned.predict(X_test)

In [None]:
# Test accuracy of the tuned random forest.
rf_tuned.score(X_test,y_test)

In [None]:

# Per-class metrics of the tuned random forest on the test split.
print(classification_report(y_test,y_pred_tunedrf))

## Applying Random Forest with SMOTE 

In [None]:
# Random forest with library defaults, trained on the SMOTE-resampled data.
rf_sm=RandomForestClassifier().fit(X_train_sm,y_train_sm)
# Accuracy on the resampled training data (shown as the cell output).
rf_sm.score(X_train_sm,y_train_sm)

In [None]:
# Class predictions of the SMOTE-trained random forest on the test split.
y_pred_rfsm=rf_sm.predict(X_test)

In [None]:
# Per-class metrics of the SMOTE-trained random forest on the test split.
print(classification_report(y_test,y_pred_rfsm))

In [None]:
# Validation curve for Random Forest with SMOTE: accuracy vs. max_depth (1..39),
# evaluated on the resampled training data.
rfsm_param_range = np.arange(1,40)

plt.figure(figsize=(15, 10))

train_scores, test_scores = validation_curve(rf_sm, X_train_sm, y_train_sm, param_name='max_depth', param_range=rfsm_param_range)

# Mean and spread across the CV folds for every depth.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

lw=0.5
plt.subplot(2,2,1)
# Each entry: (mean, std, line colour, marker, legend label, band colour).
for mean, std, line_color, mark, legend_label, band_color in (
        (train_mean, train_std, 'blue', 'o', 'training accuracy', 'darkorange'),
        (test_mean, test_std, 'green', 'x', 'test accuracy', 'navy')):
    plt.plot(rfsm_param_range, mean, color=line_color, marker=mark, markersize=5, label=legend_label)
    # Shaded band of one standard deviation around the mean.
    plt.fill_between(rfsm_param_range, mean - std, mean + std, alpha=0.2, color=band_color, lw=lw)

plt.xlabel('max_depth_parameters')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.9,1])

In [None]:
# Validation curve for Random Forest with SMOTE: accuracy vs. n_estimators
# (1, 101, ..., 901), evaluated on the resampled training data.
rfsm_param_range = np.arange(1,1000,100)

plt.figure(figsize=(15, 10))

train_scores, test_scores = validation_curve(rf_sm, X_train_sm, y_train_sm, param_name='n_estimators', param_range=rfsm_param_range)

# Mean and spread across the CV folds for every setting.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

lw=0.5
plt.subplot(2,2,1)
# Each entry: (mean, std, line colour, marker, legend label, band colour).
for mean, std, line_color, mark, legend_label, band_color in (
        (train_mean, train_std, 'blue', 'o', 'training accuracy', 'darkorange'),
        (test_mean, test_std, 'green', 'x', 'test accuracy', 'navy')):
    plt.plot(rfsm_param_range, mean, color=line_color, marker=mark, markersize=5, label=legend_label)
    # Shaded band of one standard deviation around the mean.
    plt.fill_between(rfsm_param_range, mean - std, mean + std, alpha=0.2, color=band_color, lw=lw)

plt.xlabel('n_estimators_parameters')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.9,1])

In [None]:
from sklearn.model_selection import GridSearchCV
# Search space for the SMOTE-trained random forest.
Parameter_Trials={'criterion':['gini','entropy'],
                  'n_estimators': range(100,500,100),'max_depth':[15,16,17,18,19,20,21,22,23]}
 
Grid_Search = GridSearchCV(rf_sm, Parameter_Trials, cv=5, n_jobs=1)
# Tune on the resampled TRAINING data only. Fitting on the full X, y would let
# the test rows influence the chosen hyperparameters (data leakage) and would
# not match the SMOTE data the tuned model is later trained on.
GridSearchResults=Grid_Search.fit(X_train_sm,y_train_sm)
Grid_Search.best_params_

In [None]:
# SMOTE-trained random forest refitted with fixed hyperparameters.
# NOTE(review): values are hard-coded, presumably from the grid search output —
# confirm they still match Grid_Search.best_params_ after a re-run.
rf_smtuned=RandomForestClassifier(criterion='gini',n_estimators=300,max_depth=17)

rf_smtuned.fit(X_train_sm,y_train_sm)
# Accuracy on the resampled training data (not displayed: not the last expression).
rf_smtuned.score(X_train_sm,y_train_sm)

y_pred_rfsmtuned=rf_smtuned.predict(X_test)

In [None]:
# Per-class metrics of the tuned SMOTE random forest on the test split.
print(classification_report(y_test,y_pred_rfsmtuned))

## Comparing AUC Score along with ROC curve for Random Forest, tuned Random Forest, Random Forest with SMOTE and tuned Random Forest with SMOTE 

In [None]:
# Positive-class probabilities on the test split for the four random forests.
y_pred_probarf= rf.predict_proba(X_test)[:,1]

y_pred_probarfsm= rf_sm.predict_proba(X_test)[:,1]

y_pred_probarf_tuned= rf_tuned.predict_proba(X_test)[:,1]

y_pred_probadrfsm_tuned= rf_smtuned.predict_proba(X_test)[:,1]

plt.figure(figsize = (12, 8))
## 1st classifier
fpr, tpr, thresholds = roc_curve(y_test, y_pred_probarf)
plt.plot(fpr, tpr, label='rf')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.title('ROC curve for Different Models', fontsize = 15)
plt.xlabel('False positive rate (1-Specificity)', fontsize = 15)
plt.ylabel('True positive rate (Sensitivity)', fontsize = 15)
# Formatted strings are used for the annotations: passing a tuple to plt.text
# renders its repr, parentheses and quotes included.
plt.text(x = 0.02, y = 0.9, s = f'AUC Score for rf model: {roc_auc_score(y_test, y_pred_probarf):.4f}')
## 2nd classifier
fpr1, tpr1, thresholds1 = roc_curve(y_test, y_pred_probarfsm)
plt.plot(fpr1, tpr1, label='rf_sm')
plt.text(x = 0.02, y = 0.8, s = f'AUC Score for rf_sm model: {roc_auc_score(y_test, y_pred_probarfsm):.4f}')

## 3rd classifier
fpr2, tpr2, thresholds2 = roc_curve(y_test, y_pred_probarf_tuned)
plt.plot(fpr2, tpr2, label='rf_tuned')
plt.text(x = 0.02, y = 0.7, s = f'AUC Score for rf_tuned model: {roc_auc_score(y_test, y_pred_probarf_tuned):.4f}')

## 4th classifier
fpr3, tpr3, thresholds3 = roc_curve(y_test, y_pred_probadrfsm_tuned)
plt.plot(fpr3, tpr3, label='rfsm_tuned')
plt.text(x = 0.02, y = 0.6, s = f'AUC Score for rfsm_tuned model: {roc_auc_score(y_test, y_pred_probadrfsm_tuned):.4f}')
plt.grid(True)
# The legend tells the four curves apart.
plt.legend(loc='lower right')
plt.show()

# Applying few Ensemble Techniques

# XGBoost Classifier

In [None]:
from xgboost import XGBClassifier

# Cast to concrete dtypes for XGBoost. np.number is an abstract scalar type,
# not a real dtype, so it must not be passed to astype(); use float for the
# features and int for the 0/1 class labels instead.
X_train_xgb=X_train.astype(float)
X_test_xgb=X_test.astype(float)
y_train_xgb=y_train.astype(int)
y_test_xgb=y_test.astype(int)

xgb=XGBClassifier()
xgb.fit(X_train_xgb,y_train_xgb)

# Training accuracy (not displayed: it is not the last expression of the cell).
xgb.score(X_train_xgb,y_train_xgb)

# Predict on the same converted test frame the model is scored on elsewhere,
# rather than on the unconverted X_test.
y_pred_xgb=xgb.predict(X_test_xgb)



In [None]:
# Per-class metrics of XGBoost on the test split.
print(classification_report(y_test,y_pred_xgb))

In [None]:
# Test accuracy of XGBoost.
xgb.score(X_test_xgb,y_test_xgb)

In [None]:
from sklearn.metrics import classification_report,accuracy_score,precision_score,confusion_matrix

# Hard predictions and class probabilities of XGBoost on the test split.
y_pred_xgb= xgb.predict(X_test_xgb)
y_proba_xgb= xgb.predict_proba(X_test_xgb)
# AUC is computed from the positive-class probability (column 1).
print('Roc_auc score:' ,roc_auc_score(y_test_xgb,y_proba_xgb[:,1]))
print('Classification Report:')
print(classification_report(y_test_xgb,y_pred_xgb))

print('precision_score')
print(precision_score(y_test_xgb,y_pred_xgb))

print('confusion_matrix')
print(confusion_matrix(y_test_xgb,y_pred_xgb))

In [None]:
from sklearn.feature_selection import SelectKBest,f_classif
# Univariate selector using the ANOVA F-test; k='all' keeps every feature.
# NOTE(review): the selector is created but never fitted or used in the
# visible part of the notebook.
select_features = SelectKBest(f_classif,k='all')

# AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 100 boosting rounds, trained on the original training split.
ad = AdaBoostClassifier(n_estimators=100).fit(X_train,y_train)

# Report test accuracy first, then training accuracy.
for heading,feats,target in (("What is the Testing Accuracy",X_test,y_test),
                             ("What is the Training Accuracy",X_train,y_train)):
    print(heading)
    print(ad.score(feats,target))

y_pred_ad=ad.predict(X_test)

In [None]:
# Per-class metrics of AdaBoost on the test split.
print(classification_report(y_test,y_pred_ad))

# Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble with library defaults, trained on the original split.
et=ExtraTreesClassifier().fit(X_train,y_train)
# Training accuracy (not displayed: it is not the last expression of the cell).
et.score(X_train,y_train)

y_pred_et=et.predict(X_test)

In [None]:
# Per-class metrics of the Extra Trees model on the test split.
y_pred_et=et.predict(X_test)
# Report the Extra Trees predictions, not the AdaBoost ones (y_pred_ad).
print(classification_report(y_test,y_pred_et))

In [None]:
# Test accuracy of the Extra Trees model.
et.score(X_test,y_test)

# Bagging Classifier

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of two base estimators, trained on the original split.
bg = BaggingClassifier(n_estimators=2).fit(X_train,y_train)

# Report test accuracy first, then training accuracy.
for heading,feats,target in (("What is the Testing Accuracy",X_test,y_test),
                             ("What is the Training Accuracy",X_train,y_train)):
    print(heading)
    print(bg.score(feats,target))

In [None]:
# Class predictions and per-class metrics of the Bagging model on the test split.
y_pred_bg=bg.predict(X_test)
print(classification_report(y_test,y_pred_bg))

# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with library defaults, trained on the original split.
gb = GradientBoostingClassifier()
gb.fit(X_train,y_train)

print("What is the Training Accuracy")
print(gb.score(X_train,y_train))

print("What is the Testing Accuracy")
print(gb.score(X_test,y_test))
# Per-feature importances, in the column order of X_train (cell output).
gb.feature_importances_

In [None]:
# Per-class metrics of the Gradient Boosting model on the test split.
# Predictions must come from `gb`; using `et` would re-report Extra Trees.
y_pred_gb=gb.predict(X_test)
print(classification_report(y_test,y_pred_gb))