In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd 
import numpy as np
from numpy import mean
from numpy import std
import feature_engine as ft
import seaborn as sns
import matplotlib.pyplot as plt
import scikitplot as skplt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.feature_selection import VarianceThreshold
from feature_engine.encoding import OneHotEncoder

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report

import imblearn
from imblearn.over_sampling import SMOTE

In [None]:
# Load the heart-attack dataset; the location is kept in one named constant
# so it is easy to repoint when the notebook runs somewhere else.
DATA_PATH = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to check that the file was parsed as expected.
data.head()

**We have over 12 features and one output, all of which have no missing values**

In [None]:
# Column dtypes and non-null counts (used to confirm there are no missing values).
data.info()

In [None]:
# Summary statistics of the numeric columns.
data.describe()

**Let's see which features are continuous and which are categorical**

In [None]:
# Number of distinct values per column; low counts indicate categorical features.
data.nunique()

**Dropping the output column**

In [None]:
# Separate the target column from the predictors.
TARGET = 'output'
y = data[TARGET]
X = data.drop(columns=[TARGET])

**Dividing the features into continuous and categorical: a feature with fewer than 6 unique values is treated as categorical**

In [None]:
# A column with fewer than 6 distinct values is treated as categorical;
# every remaining non-object column is treated as continuous.
distinct_counts = X.nunique()
categorical = [column for column in X.columns if distinct_counts[column] < 6]
continous = [
    column
    for column in X.columns
    if column not in categorical and X[column].dtype != 'object'
]

In [None]:
# Report which columns fell on each side of the cardinality threshold, and how many.
print('Catgorical variable are ' , categorical, ',total they are', len(categorical))
print('Continous variables are ', continous, ',total they are ', len(continous))

**Now assigning the Data types**

In [None]:
# Cast the categorical columns to object dtype; all other columns keep their dtype.
# NOTE(review): presumably needed so the feature_engine encoder used later accepts them — confirm.
X = X.astype({column: 'object' for column in categorical})

**We are using test size of 0.1, which means more data for our train set**

In [None]:
# Stratified 90/10 split with a fixed seed, so the class ratio is preserved
# in both parts and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

**Since none of our categorical features are ordinal, we will use a one-hot encoder**

In [None]:
# One-hot encode the categorical columns. The encoder learns its categories from the
# training split only and is then applied unchanged to the test split.
# drop_last=False keeps one indicator column per category.
encoder_settings = dict(top_categories=None, variables=categorical, drop_last=False)
oht = OneHotEncoder(**encoder_settings)

X_train = oht.fit_transform(X_train)
X_test = oht.transform(X_test)

In [None]:
# Shapes after encoding: both splits must end up with the same number of columns.
X_train.shape, X_test.shape

**There is a class imbalance of 20 samples; we will use SMOTE to balance the classes and avoid bias**

In [None]:
# Class counts in the training split before resampling.
# The target is passed as `x=`: since seaborn 0.12 the only positional argument of
# countplot is `data`, so a bare Series is no longer read as the variable to count.
sns.countplot(x=y_train)
plt.title('Before SMOTE')
plt.show()

**Now we are balanced**

In [None]:
# Oversample the training split with SMOTE (seeded) so both classes are equally represented.
# Only the training data is resampled; X_test / y_test are left untouched.
# The resampled data is stored as x_train / Y_train (note the swapped capitalisation);
# X_train / y_train remain the original, un-resampled split.
# NOTE(review): SMOTE interpolates between rows, so the one-hot columns of synthetic rows
# may hold non-binary values; imblearn's SMOTENC targets mixed data — consider it.
smote = SMOTE(random_state=42)
x_train, Y_train = smote.fit_resample(X_train, y_train)

# Keyword `x=` for the same reason as above: since seaborn 0.12 the first positional
# argument of countplot is `data`, not the variable to count.
sns.countplot(x=Y_train)
plt.title('After Smote')
plt.show()

### Grid Search to find the best parameters

In [None]:
# Exhaustive hyper-parameter search for the random forest on the SMOTE-balanced training data.
# The grid holds 2 * 3 * 5 * 5 * 4 * 2 * 2 = 2400 candidates, each fitted 3 times (cv=3),
# so the fit is guarded by a flag. Set it to True to actually run the search.
RUN_GRID_SEARCH = False

# Fixed seed so repeated searches rank the candidates identically.
clf = RandomForestClassifier(random_state=42)

param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [1, 5, 10],
    "min_samples_split": [2, 4, 10, 12, 16],
    "n_estimators": [50, 100, 400, 700, 1000],
    "max_depth": [None, 5, 10, 20],
    # 'auto' is left out: for classifiers it was an alias of 'sqrt', and it is
    # rejected by scikit-learn >= 1.3, so it only added failing or duplicate fits.
    "max_features": ['sqrt', 'log2'],
    "bootstrap": [True, False],
}
gs = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=2)

if RUN_GRID_SEARCH:
    gs.fit(x_train, Y_train)




**Grid search takes a long time, so we will not re-run it here; instead we reuse the best parameters it found**

In [None]:
# Random forest built with hyper-parameters stored from an earlier grid-search run,
# so the expensive search does not have to be repeated.
# To use a freshly fitted search instead: Model = gs.best_estimator_
# random_state is pinned (as for model_final) so the fit, the feature ranking derived
# from it and every reported metric are reproducible between runs.
Model = RandomForestClassifier(max_depth=5, max_features='log2', min_samples_leaf=5,
                               n_estimators=50, random_state=42)
Model.fit(x_train, Y_train)

In [None]:
# Accuracy on the (resampled) training data the model was fitted on —
# an optimistic figure, not an estimate of generalisation.
Model.score(x_train, Y_train)

In [None]:
# ROC AUC on the held-out test split, computed from the second column of predict_proba.
roc_auc_score(y_test, Model.predict_proba(X_test)[:, 1])

In [None]:
# Per-class probabilities for the test split (all columns are kept for the ROC plot below).
y_pred = Model.predict_proba(X_test)



In [None]:
# ROC curves of the full-feature model on the test split.
# NOTE(review): scikit-plot marks plot_roc_curve as deprecated in favour of
# skplt.metrics.plot_roc — confirm against the installed version before switching.
skplt.metrics.plot_roc_curve(y_test, y_pred)
plt.show()


### Using recursive feature addition to build an efficient model with a minimal set of features

In [None]:
# Rank the encoded columns by the importance score of the fitted forest,
# highest first, and show the ranking as a bar chart.
importance = Model.feature_importances_
features = pd.Series(importance, index=X_train.columns).sort_values(ascending=False)
features.plot.bar(orientation='vertical', color='r', edgecolor='k', linewidth=1.2)
plt.show()

In [None]:
# Keep only the column names, still ordered from most to least important.
# Not re-runnable on its own: `features` must be the ranked Series from the previous cell.
features = features.index.tolist()

In [None]:
## Making with 1 feature at first
# A forest trained on the single highest-ranked feature, scored by ROC AUC on the test split.
# random_state is pinned so this score is the same on every run.
# NOTE(review): unlike Model, this is fitted on the original (not SMOTE-resampled)
# training split X_train / y_train — confirm this is intended.
model_one_feature = RandomForestClassifier(max_depth=5, max_features='log2', min_samples_leaf=5,
                                           n_estimators=50, random_state=42)

top_feature = features[0]
model_one_feature.fit(X_train[[top_feature]], y_train)

# Second column of predict_proba = probability of the second class in classes_.
y_pred = model_one_feature.predict_proba(X_test[[top_feature]])[:, 1]

roc_first = roc_auc_score(y_test, y_pred)
print('Test one feature  ROC AUC=%f' % (roc_first))


In [None]:
# Forward feature addition: walk through the features in decreasing order of importance
# and keep a feature only if it raises the ROC AUC by at least `tol`.
#
# Candidates are scored with 5-fold cross-validation on the training split (stratified
# by scikit-learn for classifiers). Scoring them on X_test / y_test would tune the
# feature set to the test data and bias every test metric reported afterwards.
# The forest is seeded, so the selected feature set is identical on every run.
tol = 0.00001
N_SPLITS = 5


def cv_roc_auc(columns):
    """Return the mean cross-validated ROC AUC of the forest trained on `columns` of X_train."""
    estimator = RandomForestClassifier(max_depth=5, max_features='log2', min_samples_leaf=5,
                                       n_estimators=50, random_state=42)
    fold_scores = cross_val_score(estimator, X_train[columns], y_train,
                                  scoring='roc_auc', cv=N_SPLITS)
    return fold_scores.mean()


feature_to_keep = [features[0]]
# The baseline is recomputed here so it is on the same cross-validated scale as the candidates.
roc_first = cv_roc_auc(feature_to_keep)

for feature in features[1:]:
    roc_int = cv_roc_auc(feature_to_keep + [feature])
    diff_roc = roc_int - roc_first

    if diff_roc >= tol:
        # The improved score becomes the bar the next candidate has to beat.
        roc_first = roc_int
        feature_to_keep.append(feature)
        print('FEATURE ADDED: ->', feature)
    else:
        print('REMOVED :->', feature)

print(len(feature_to_keep))

### Using only six features out of 30 gives us improved performance

In [None]:
# Names of the features retained by the selection loop.
print(feature_to_keep)

In [None]:
# Refit the forest on the selected columns only (seeded) and report its test ROC AUC.
selected_train = X_train[feature_to_keep]
selected_test = X_test[feature_to_keep]

model_final = RandomForestClassifier(
    n_estimators=50,
    max_depth=5,
    min_samples_leaf=5,
    max_features='log2',
    random_state=42,
)
model_final.fit(selected_train, y_train)

# Second column of predict_proba = probability of the second class in classes_.
y_pred_test = model_final.predict_proba(selected_test)[:, 1]
roc_final = roc_auc_score(y_test, y_pred_test)
print('Test selected features ROC AUC=%f' % (roc_final))


In [None]:
# Importance of each retained feature in the final model, as a percentage, highest first.
importance = model_final.feature_importances_
feat = pd.Series(importance * 100, index=X_train[feature_to_keep].columns).sort_values(ascending=False)
feat.plot.bar(orientation='vertical', color='r', edgecolor='k', linewidth=1.2)
plt.show()

In [None]:
# Hard class predictions of the final model on the test split, summarised
# per class (precision / recall / F1 / support).
y_pred_tested = model_final.predict(X_test[feature_to_keep])
print(classification_report(y_test, y_pred_tested))