# Understanding Decision Tree and Random Forest using heart disease patient data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.express as px
import warnings
import pickle
from sklearn.model_selection import train_test_split

# Silence every warning category for the rest of the session so library
# notices do not clutter the notebook output.
# NOTE(review): this blanket filter also hides deprecation and API-change
# warnings that may matter — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

### Reading and exploring dataset

In [None]:
# Load the heart-failure clinical records CSV from the relative `../input/`
# directory and preview the first rows.
df=pd.read_csv("../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv")
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the columns.
df.describe()

### Checking for null values and imputing them if any exist

In [None]:
# Number of missing values per column; a non-zero entry would call for imputation.
df.isnull().sum()

### EDA

In [None]:

df.DEATH_EVENT.value_counts()

In [None]:
plt.figure(figsize=(10,8))
ax=sns.countplot(df['DEATH_EVENT'], palette='OrRd')
ax.set_xticklabels(['Survived','Not Survived'])
for p in ax.patches:
    ax.annotate('{:d}'.format(p.get_height()), (p.get_x()+0.40, p.get_height()+1))

#### Here, 203 patients survived and 96 did not survive

In [None]:
# Pie chart of the target distribution.
# value_counts() is indexed by the class codes (0 / 1); translate them into the
# same names used on the count plot so every slice is identifiable.
death_counts = df.DEATH_EVENT.value_counts()
class_labels = {0: 'Survived', 1: 'Not Survived'}
sources_pie = go.Pie(labels=[class_labels.get(code, str(code)) for code in death_counts.index],
                     values=death_counts)


layout = go.Layout(height = 600,
                   width = 800,
                   autosize = False,
                   title = 'Death Event')
fig = go.Figure(data = [sources_pie], layout = layout)
fig.show()

In [None]:
import plotly.express as px
import plotly.figure_factory as ff
# Interactive histogram of the `age` column.
fig = px.histogram(data_frame=df, x="age")
fig.show()

## Most patients are in the 50-70 age group

In [None]:


sns.pairplot(df)

In [None]:
c=df.corr()

In [None]:
f, ax = plt.subplots(figsize=(11, 11)) 
sns.heatmap(c,annot=True)

In [None]:
df.columns

In [None]:
X = df[['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time']]
y = df['DEATH_EVENT']

In [None]:
X.head()

In [None]:
y.head()

In [None]:
# 70% of the rows go to training and the remainder to testing; the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)
(X_train.shape, X_test.shape)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.tree import plot_tree

In [None]:
# Shallow tree (depth 3) as a first, easy-to-read model.
# random_state is fixed because scikit-learn permutes the features at each
# split; without a seed, ties between equally good splits can resolve
# differently on every run.
dt = DecisionTreeClassifier(max_depth=3, random_state=42)
dt.fit(X_train, y_train)

In [None]:
#from IPython.display import Image  
#from sklearn.externals.six import StringIO  
#from sklearn.tree import export_graphviz
#import pydotplus, graphviz

In [None]:
# Draw the fitted tree on a very large canvas so node text stays legible.
plt.figure(figsize=(60,30))
# feature_names is passed as a plain list rather than a pandas Index, which is
# the form plot_tree documents and accepts across scikit-learn versions.
plot_tree(dt, feature_names=list(X.columns), class_names=['Survived', "Death"], filled=True);

#### Evaluating model performance

In [None]:
y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)

In [None]:
print("Accuracy Score-Train: ",accuracy_score(y_train, y_train_pred))
print("Train ROC_AUC Score :", roc_auc_score(y_train, y_train_pred))
confusion_matrix(y_train, y_train_pred)

In [None]:
print("Accuracy Score-Test: ",accuracy_score(y_test, y_test_pred))
print("Test ROC_AUC Score :", roc_auc_score(y_test, y_test_pred))
confusion_matrix(y_test, y_test_pred)

In [None]:
#Helper Function

In [None]:
def get_dt_graph(dt_classifier, feature_names=None, class_names=None, figsize=(60, 30)):
    """Draw a fitted decision tree on a new matplotlib figure.

    Parameters
    ----------
    dt_classifier
        Fitted tree estimator to draw (anything ``plot_tree`` accepts).
    feature_names
        Names shown on the split nodes. Defaults to the columns of the
        notebook-level feature frame ``X``.
    class_names
        Names shown for the classes, in ascending class order. Defaults to
        ``['Survived', 'Death']``.
    figsize
        Figure size in inches; the large default keeps node text legible.

    Returns
    -------
    None
        The tree is drawn as a side effect on the current figure.
    """
    if feature_names is None:
        feature_names = X.columns
    if class_names is None:
        class_names = ['Survived', "Death"]
    plt.figure(figsize=figsize)
    # plot_tree is given plain lists (not a pandas Index), which is the form it
    # documents and accepts across scikit-learn versions.
    plot_tree(
        dt_classifier,
        feature_names=list(feature_names),
        class_names=list(class_names),
        filled=True,
    )

In [None]:
def _report_split(split_name, classifier, features, target):
    """Print accuracy, ROC AUC and the confusion matrix for one data split."""
    # Predict once and reuse the result for every metric.
    predicted = classifier.predict(features)
    # ROC AUC is a ranking metric: score it with the positive-class
    # probability instead of the thresholded 0/1 predictions.
    positive_proba = classifier.predict_proba(features)[:, 1]
    print(split_name + " Accuracy :", accuracy_score(target, predicted))
    print(split_name + " ROC_AUC Score :", roc_auc_score(target, positive_proba))
    print(split_name + " Confusion Matrix:")
    print(confusion_matrix(target, predicted))


def evaluate_model(dt_classifier):
    """Print train and test metrics for a fitted classifier.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` splits and prints, for each of them, the accuracy, the ROC AUC
    and the confusion matrix, separated by a dashed line.

    Parameters
    ----------
    dt_classifier
        Fitted estimator exposing ``predict`` and ``predict_proba``.

    Returns
    -------
    None
    """
    _report_split("Train", dt_classifier, X_train, y_train)
    print("-"*50)
    # The test block is labelled "Test" (the ROC AUC line previously said "Train").
    _report_split("Test", dt_classifier, X_test, y_test)

### Without setting any hyper-parameters

In [None]:
# Tree with library-default hyper-parameters; only the seed is set.
dt_default = DecisionTreeClassifier(random_state=42)
dt_default.fit(X_train, y_train)

In [None]:
# Plot the default tree.
get_dt_graph(dt_default)


In [None]:
# Print train and test metrics for the default tree.
evaluate_model(dt_default)

### Controlling the depth of the tree

In [None]:
dt_depth = DecisionTreeClassifier(max_depth=3)
dt_depth.fit(X_train, y_train)

In [None]:
get_dt_graph(dt_depth) 


In [None]:
evaluate_model(dt_depth)

### Specifying minimum samples before split

In [None]:
dt_min_split = DecisionTreeClassifier(min_samples_split=20)
dt_min_split.fit(X_train, y_train)

In [None]:
get_dt_graph(dt_min_split) 


In [None]:
evaluate_model(dt_min_split)

### Specifying minimum samples in leaf node

In [None]:
# Require at least 20 samples in every leaf; seed fixed for reproducibility.
dt_min_leaf = DecisionTreeClassifier(min_samples_leaf=20, random_state=42)
dt_min_leaf.fit(X_train, y_train)

In [None]:
# Plot the tree built with the minimum-leaf constraint.
get_dt_graph(dt_min_leaf)


In [None]:
# Print train and test metrics for this tree.
evaluate_model(dt_min_leaf)

### Using Entropy instead of Gini

In [None]:
# Same minimum-leaf constraint as before, but splits are chosen with the
# entropy criterion instead of the default.
dt_min_leaf_entropy = DecisionTreeClassifier(min_samples_leaf=20, random_state=42, criterion="entropy")
dt_min_leaf_entropy.fit(X_train, y_train)

In [None]:
# Plot the entropy-based tree.
get_dt_graph(dt_min_leaf_entropy)


In [None]:
# Print train and test metrics for the entropy-based tree.
evaluate_model(dt_min_leaf_entropy)

### Hyper-parameter tuning

In [None]:
dt = DecisionTreeClassifier(random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}

In [None]:
grid_search = GridSearchCV(estimator=dt, 
                           param_grid=params, 
                           cv=4, n_jobs=-1, verbose=1, scoring = "roc_auc")

In [None]:
%%time
# Run the search on the training split only, so the test split stays unseen
# during model selection.
grid_search.fit(X_train, y_train)

In [None]:
# One row per hyper-parameter combination tried by the search.
score_df = pd.DataFrame(grid_search.cv_results_)
score_df.head()

In [None]:
# The five combinations with the highest mean cross-validated score.
score_df.nlargest(5,"mean_test_score")

In [None]:
# Estimator configured with the best combination found by the search.
grid_search.best_estimator_

In [None]:
# Print train and test metrics for the best tree found by the search.
dt_best = grid_search.best_estimator_
evaluate_model(dt_best)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, dt_best.predict(X_test)))

In [None]:
# Plot the best tree.
get_dt_graph(dt_best)


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Small baseline forest: 10 trees, each limited to depth 3, with a fixed seed.
rf = RandomForestClassifier(random_state=42, n_estimators=10, max_depth=3)

In [None]:
rf.fit(X_train, y_train)

In [None]:
# The fitted forest exposes its individual trees through `estimators_`.
rf.estimators_[0]

In [None]:
# Pick one member tree to inspect.
sample_tree = rf.estimators_[4]

In [None]:
# Plot the selected member tree.
get_dt_graph(sample_tree)


In [None]:
# Plot a second member tree for comparison.
get_dt_graph(rf.estimators_[2])


In [None]:
# Print train and test metrics for the whole forest.
evaluate_model(rf)

### Grid search for hyper-parameter tuning

In [None]:
classifier_rf = RandomForestClassifier(random_state=42, n_jobs=-1)

In [None]:
params = {
    'max_depth': [1, 2, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'max_features': [2,3,4],
    'n_estimators': [10, 30, 50, 100, 200]
}

In [None]:
grid_search = GridSearchCV(estimator=classifier_rf, param_grid=params, 
                          cv=4, n_jobs=-1, verbose=1, scoring = "roc_auc")

In [None]:
%%time
# Fit the search on the training split only. Fitting on the full X, y would let
# the test rows influence model selection and the refit best estimator, which
# inflates the test metrics reported afterwards.
grid_search.fit(X_train, y_train)

In [None]:
# Forest configured with the best combination found by the search.
rf_best = grid_search.best_estimator_
rf_best

In [None]:
# Print train and test metrics for the best forest.
evaluate_model(rf_best)

### Variable importance in RandomForest and Decision trees

In [None]:
rf_best.feature_importances_

In [None]:
imp_df = pd.DataFrame({
    "Varname": X_train.columns,
    "Imp": rf_best.feature_importances_
})

In [None]:
imp_df.sort_values(by="Imp", ascending=False)

In [None]:
#with open('HDClassifierRF.pkl','wb')as pickle_file:
   # pickle.dump(rf_best,pickle_file)