In [None]:
!pip install seaborn --upgrade

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from sklearn import metrics

# Outline of this study

1. Evaluate Random Forest Classifier vs. AdaBoost Classifier to compare averaging and boosting methods
2. Scale the data and check the results again
3. Evaluate SVC vs. NuSVC
4. Scale the data and check the results again
5. Compare all the estimators

In [None]:
df = pd.read_csv('../input/heart-disease-uci/heart.csv')

In [None]:
Y = df['target']

In [None]:
X = df.drop(columns = 'target')

In [None]:
seed = 1000

In [None]:
# Hold out 20% of the rows for validation. The split reuses the shared `seed`
# instead of repeating the literal, so it cannot drift apart from the
# estimators' random_state if the seed is ever changed.
train_X, val_X, train_Y, val_Y = train_test_split(X, Y, test_size = 0.20, random_state = seed)

# RAW Random Forest Classifier
  This method uses an averaging approach to predict values

In [None]:
rfc = RandomForestClassifier(random_state = seed)

In [None]:
rfc.fit(train_X, train_Y)

In [None]:
rfc_pred = rfc.predict(val_X)

In [None]:
metrics.confusion_matrix(val_Y,rfc_pred)

In [None]:
# Annotated confusion-matrix heatmap for the untuned forest.
sns.heatmap(metrics.confusion_matrix(val_Y,rfc_pred), annot = True)
plt.ylim(0,2)
# Label the figure the same way as the final AdaBoost heatmap so it can be
# read on its own (rows = actual class, columns = predicted class).
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.title('Confusion Matrix for RandomForestClassifier - Heart Disease Prediction')

In [None]:
metrics.f1_score(val_Y, rfc_pred)

In [None]:
metrics.accuracy_score(val_Y, rfc_pred)

# RAW AdaBoost Classifier
  This method uses a boosting approach to predict values

In [None]:
abc = AdaBoostClassifier(random_state = seed)

In [None]:
abc.fit(train_X, train_Y)

In [None]:
abc_pred = abc.predict(val_X)

In [None]:
metrics.confusion_matrix(val_Y, abc_pred)

In [None]:
# Annotated confusion-matrix heatmap for the untuned AdaBoost model.
sns.heatmap(metrics.confusion_matrix(val_Y,abc_pred), annot = True)
plt.ylim(0,2)
# Label the figure the same way as the final AdaBoost heatmap so it can be
# read on its own (rows = actual class, columns = predicted class).
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.title('Confusion Matrix for untuned AdaBoostClassifier - Heart Disease Prediction')

In [None]:
metrics.f1_score(val_Y, abc_pred)

In [None]:
metrics.accuracy_score(val_Y, abc_pred)

# At first, without any estimator tuning, AdaBoost has a better F1 score

# LET'S USE GRIDSEARCHCV TO TUNE OUR MODELS AND COMPARE THE EFFICIENCY

# RANDOM FOREST CLASSIFIER

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the random forest: ensemble size and two tree-size limits.
rfc_param = dict(
    n_estimators=[100, 120, 150, 200],
    max_leaf_nodes=[2, 3, 4, 5, 8, 10, 12, 15],
    max_depth=[2, 4, 6, 8, 10, 12, 14, 15],
)

In [None]:
gs_rfc = GridSearchCV(rfc, rfc_param, scoring = 'f1', cv = 10)

In [None]:
gs_rfc.fit(X,Y)

In [None]:
gs_rfc.best_params_

In [None]:
results_rfc = pd.DataFrame(gs_rfc.cv_results_)

In [None]:
results_rfc = results_rfc[['rank_test_score', 'mean_test_score', 'params']]

In [None]:
results_rfc.sort_values(by = 'rank_test_score')

# ADA BOOST CLASSIFIER

In [None]:
# Search space for AdaBoost: number of boosting rounds and learning rate.
abc_param = dict(
    n_estimators=[1, 3, 5, 6, 8, 10, 12, 14, 15, 17],
    learning_rate=[0.1, 0.25, 0.40, 0.55, 0.70, 0.85, 1],
)

In [None]:
gs_abc = GridSearchCV(abc, abc_param, scoring = 'f1', cv = 10)

In [None]:
gs_abc.fit(X,Y)

In [None]:
gs_abc.best_params_

In [None]:
results_abc = pd.DataFrame(gs_abc.cv_results_)

In [None]:
results_abc = results_abc[['rank_test_score','mean_test_score','params']]

In [None]:
results_abc.sort_values(by = 'rank_test_score')

# NOW LET'S USE STANDARDSCALER TO STANDARDIZE OUR DATA AND CHECK IF THE ESTIMATORS IMPROVE THEIR EFFICIENCY

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
X.describe()

In [None]:
scaler = StandardScaler()

In [None]:
X_scaled = scaler.fit_transform(X)

In [None]:
teste = pd.DataFrame(X_scaled)

In [None]:
teste.describe()

In [None]:
gs_rfc.fit(teste,Y)

In [None]:
gs_rfc.best_params_

In [None]:
rfc_result_scale = pd.DataFrame(gs_rfc.cv_results_)

In [None]:
rfc_result_scale = rfc_result_scale[['rank_test_score', 'mean_test_score']]

In [None]:
rfc_result_scale.sort_values(by = 'rank_test_score')

In [None]:
gs_abc.fit(teste, Y)

In [None]:
gs_abc.best_params_

In [None]:
gs_abc_scale = pd.DataFrame(gs_abc.cv_results_)

In [None]:
gs_abc_scale = gs_abc_scale[['rank_test_score', 'mean_test_score']]

In [None]:
gs_abc_scale.sort_values(by = 'rank_test_score')

# RAW SVC 

In [None]:
svc = svm.SVC(random_state = seed)

In [None]:
svc.fit(train_X,train_Y)

In [None]:
svc_pred = svc.predict(val_X)

In [None]:
metrics.f1_score(val_Y, svc_pred)

# RAW NuSVC

In [None]:
nusvc = svm.NuSVC(random_state = seed)

In [None]:
nusvc.fit(train_X, train_Y)

In [None]:
nusvc_pred = nusvc.predict(val_X)

In [None]:
metrics.f1_score(val_Y,nusvc_pred)

# At first, SVC performs better than NuSVC, but worse than Random Forest Classifier and AdaBoost Classifier
# Let's do some tuning to check again

In [None]:
# Search space for SVC, as a list of sub-grids. `degree` only affects the
# polynomial kernel, so it is varied for 'poly' alone; crossing it with
# 'rbf'/'sigmoid' would refit the same model seven times per C value.
svc_param = [
    {'C':[0.1,0.25,0.5,1,2,6],
     'kernel':['poly'],
     'degree':[1,2,3,4,5,6,7],
     'gamma':['scale']},
    {'C':[0.1,0.25,0.5,1,2,6],
     'kernel':['rbf', 'sigmoid'],
     'gamma':['scale']},
]

In [None]:
gs_svc = GridSearchCV(svc, svc_param, scoring = 'f1', cv = 10)

In [None]:
gs_svc.fit(X,Y)

In [None]:
gs_svc.best_params_

In [None]:
svc_results = pd.DataFrame(gs_svc.cv_results_)

In [None]:
svc_results = svc_results[['rank_test_score','mean_test_score']].sort_values(by = 'rank_test_score')

In [None]:
svc_results

In [None]:
# Search space for NuSVC, as a list of sub-grids. `degree` only affects the
# polynomial kernel, so it is varied for 'poly' alone; crossing it with
# 'rbf'/'sigmoid' would refit the same model seven times per nu value.
nusvc_param = [
    {'nu':[0.1,0.2,0.5,0.8],
     'kernel':['poly'],
     'degree':[1,2,3,4,5,6,7],
     'gamma':['scale']},
    {'nu':[0.1,0.2,0.5,0.8],
     'kernel':['rbf', 'sigmoid'],
     'gamma':['scale']},
]

In [None]:
gs_nusvc = GridSearchCV(nusvc, nusvc_param, scoring = 'f1', cv = 10)

In [None]:
gs_nusvc.fit(X,Y)

In [None]:
gs_nusvc.best_params_

In [None]:
results_nusvc = pd.DataFrame(gs_nusvc.cv_results_)

In [None]:
results_nusvc = results_nusvc[['rank_test_score', 'mean_test_score']].sort_values(by = 'rank_test_score')

In [None]:
results_nusvc

# NOW LET'S REPEAT SVC AND NUSVC GRIDSEARCH TUNING USING SCALED FEATURES

In [None]:
gs_svc.fit(teste, Y)

In [None]:
gs_svc.best_params_

In [None]:
results_svc = pd.DataFrame(gs_svc.cv_results_)

In [None]:
results_svc = results_svc[['rank_test_score', 'mean_test_score']].sort_values(by = 'rank_test_score')

In [None]:
results_svc

In [None]:
gs_nusvc.fit(teste,Y)

In [None]:
gs_nusvc.best_params_

In [None]:
results_nusvc = pd.DataFrame(gs_nusvc.cv_results_)

In [None]:
results_nusvc = results_nusvc[['rank_test_score', 'mean_test_score']].sort_values(by = 'rank_test_score')

In [None]:
results_nusvc

# After tuning the parameters and scaling the features, GridSearchCV gives the following F1 score for each estimator:

# Random Forest Classifier     = 0.8665
# AdaBoost Classifier          = 0.8668
# Support Vector Classifier    = 0.8615
# Nu Support Vector Classifier = 0.8609

# Now let's conclude the study with AdaBoost

In [None]:
abc = AdaBoostClassifier(random_state = seed, learning_rate = 0.1, n_estimators = 12)

In [None]:
abc.fit(train_X,train_Y)

In [None]:
abc_pred = abc.predict(val_X)

In [None]:
metrics.f1_score(val_Y, abc_pred)

Not so bad after all: with GridSearchCV we had 0.8668, and on the hold-out validation split we have 0.8125

In [None]:
metrics.recall_score(val_Y, abc_pred)

In [None]:
metrics.accuracy_score(val_Y, abc_pred)

In [None]:
metrics.precision_score(val_Y, abc_pred)

In [None]:
metrics.confusion_matrix(val_Y, abc_pred)

In [None]:
# Annotated confusion-matrix heatmap for the tuned AdaBoost model.
sns.heatmap(data=metrics.confusion_matrix(y_true=val_Y, y_pred=abc_pred), annot=True)
# Explicit y-limits for the two class rows.
plt.ylim(bottom=0, top=2)
plt.ylabel(ylabel='Actual Values')
plt.xlabel(xlabel='Predicted Values')
plt.title(label='Confusion Matrix for AdaBoostClassifier - Heart Disease Prediction')