In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import tensorflow as tf 
import sklearn
import os 
import pathlib
from PIL import Image
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.metrics import classification_report

### - I mostly avoided Python for loops, functions, and classes so that a reader who is a beginner in Python can follow this notebook easily. As a result, the code is repeated for each model.


### - I used full, descriptive variable names to make this notebook more readable and to make the modeling sequence easier to follow.

### see https://www.kaggle.com/general/253378 
### to learn how to deal with Imbalanced Dataset Metrics in binary classification problems

### see  https://www.kaggle.com/general/253592
### to understand the Ensemble Learning

# take a look at the data

In [None]:
# Relative path of the heart-failure clinical records CSV.
data_path = (
    "../input/heart-failure-clinical-data/"
    "heart_failure_clinical_records_dataset.csv"
)

# Read the whole CSV into a DataFrame; later cells refer to it as `data`.
data = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
data.head()

In [None]:
print("The shape of data: {}".format(data.shape))

In [None]:
#NO missing values
data.info()

In [None]:
# The five categorical columns.
cat_variables = [
    'anaemia',
    'diabetes',
    'high_blood_pressure',
    'sex',
    'smoking',
]

# Cast every categorical column to the pandas `category` dtype in a single call.
data = data.astype({column_name: 'category' for column_name in cat_variables})

In [None]:
data.info()

In [None]:
# 6 numerical variables
numerical_variables = ['age', 'creatinine_phosphokinase',
                        'platelets', 'serum_creatinine',
                        'serum_sodium', 'time']

In [None]:
data[numerical_variables].describe().T

In [None]:
# no duplicated rows
data.duplicated().sum()

# Choosing the metrics

In [None]:
data["DEATH_EVENT"].value_counts()

In [None]:
print("{:.2F} of the data is a negative class and {:.2F} is positive".format(203/299, 69/299))

If we choose accuracy as our only metric, it will give optimistic results on this imbalanced dataset even if the classifier does not work well.
After comparing the different classification algorithms on accuracy, I will use the `confusion matrix`, `ROC`, `average_precision`, and `ROC_AUC`
to evaluate the performance of the selected classifiers.  

# preparing the data

Some classifiers, such as the SVC, need the numeric features to be scaled,
so I will prepare two versions of the dataset: one with scaled numeric features and one-hot encoded
categorical variables, and one that is left unscaled.

## Unscaled data

In [None]:
unscaled_dataset_version = data

In [None]:
unscaled_dataset_version.head()

In [None]:
x_unscaled_version = unscaled_dataset_version.drop('DEATH_EVENT', axis=1)
y_unscaled_version = unscaled_dataset_version[['DEATH_EVENT']]

In [None]:
x_unscaled_version_np = np.array(x_unscaled_version)
y_unscaled_version_np = np.array(y_unscaled_version).reshape(-1)

In [None]:
# Shuffled 80/20 train/test split of the unscaled arrays, seeded for reproducibility.
(x_train_unscaled,
 x_test_unscaled,
 y_train_unscaled,
 y_test_unscaled) = train_test_split(
    x_unscaled_version_np,
    y_unscaled_version_np,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

## Scaled data

In [None]:
# Numeric feature pipeline, applied in this order:
#   1. expand the numeric columns with degree-2 polynomial terms (no bias column),
#   2. standardise the expanded columns,
#   3. keep at most 12 features, ranked by a logistic-regression model.
num_pipeline = Pipeline(steps=[
    ("polynomial", PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", StandardScaler()),
    ("feature_selection", SelectFromModel(estimator=LogisticRegression(max_iter=1000),
                                          max_features=12)),
])

In [None]:
# Full preprocessing: the numeric pipeline for the numerical columns and
# one-hot encoding for the categorical columns.
data_prep_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numerical_variables),
        ('cat', OneHotEncoder(), cat_variables),
    ]
)

In [None]:
# Fit the preprocessing (scaling statistics, supervised feature selection, one-hot
# categories) on the training rows only, then transform every row.
# Fitting on the whole dataset would leak information from the future test rows
# (including their labels, through SelectFromModel) into the features.
# train_test_split partitions by row position from (n_samples, test_size, random_state),
# so splitting the positions with the same settings used for the preprocessed data
# below (test_size=0.2, shuffle=True, random_state=0) selects exactly the rows that
# end up in the training part of that split.
row_positions = np.arange(len(x_unscaled_version))
train_row_positions, unused_test_row_positions = train_test_split(row_positions,
                                                                  test_size=0.2,
                                                                  shuffle=True,
                                                                  random_state=0)
data_prep_pipeline.fit(x_unscaled_version.iloc[train_row_positions],
                       y_unscaled_version_np[train_row_positions])
x_preprocessed_data = data_prep_pipeline.transform(x_unscaled_version)
y_preprocessed_data = np.array(y_unscaled_version).reshape(-1)

In [None]:
x_preprocessed_data.shape

In [None]:
#spliting the scaled data
x_train_preprocessed, x_test_preprocessed, y_train_preprocessed, y_test_preprocessed = train_test_split(x_preprocessed_data,
                                                                                           y_preprocessed_data,
                                                                                           test_size=0.2,
                                                                                           shuffle=True,
                                                                                           random_state=0)

# Prediction models training and evaluation

## 1- KNeighbors Classifier

In [None]:
# K-nearest-neighbours baseline on the unprocessed training split.
# fit() returns the estimator itself, so construction and fitting are chained.
kn_model = KNeighborsClassifier().fit(x_train_unscaled, y_train_unscaled)
kn_train_score = kn_model.score(x_train_unscaled, y_train_unscaled)

print(f"KNeighbors Classifier Training Score: {kn_train_score:.3F}")

In [None]:
# K-nearest-neighbours on the preprocessed training split.
# This replaces `kn_model` / `kn_train_score` from the previous cell.
kn_model = KNeighborsClassifier().fit(x_train_preprocessed, y_train_preprocessed)
kn_train_score = kn_model.score(x_train_preprocessed, y_train_preprocessed)

print(f"KNeighbors Classifier Training Score: {kn_train_score:.3F}")


# Three stratified 80/20 shuffle splits of the training data serve as validation folds.
kn_shuffle_split = StratifiedShuffleSplit(n_splits=3, train_size=0.8, test_size=0.2, random_state=0)
kn_val_scores = cross_val_score(estimator=kn_model,
                                X=x_train_preprocessed,
                                y=y_train_preprocessed,
                                cv=kn_shuffle_split)
print(f"KNeighbors Classifier Cross validation Score: {kn_val_scores.mean():.3F}")

## 2-  Logistic Regression

In [None]:
# Logistic regression fitted on the unscaled training split.
lr_model = LogisticRegression(max_iter=1000, random_state=0)
lr_model.fit(x_train_unscaled , y_train_unscaled)
lr_train_score = lr_model.score(x_train_unscaled , y_train_unscaled)

print("Logistic Regression Training Score: {:.3F}".format(lr_train_score))


# LogisticRegression evaluated using shuffle-split cross-validation.
# Cross-validate on the training split only, as the KNeighbors and Random Forest cells do:
# running it on the full dataset would let the held-out test rows influence the
# validation score used to compare models.
lr_shuffle_split = StratifiedShuffleSplit(train_size=0.8, test_size=0.2, n_splits=3, random_state=0)
lr_val_scores = cross_val_score(lr_model, x_train_unscaled , y_train_unscaled, cv=lr_shuffle_split)
print("Logistic Regression Cross validation Score: {:.3F}".format(np.mean(lr_val_scores)))

## 3- LinearSVC

In [None]:
# SVMs are sensitive to feature scale, so the preprocessed (scaled) data is used here.
liniar_svc_model = LinearSVC(max_iter=10000, random_state=0).fit(x_train_preprocessed,
                                                                 y_train_preprocessed)
Linear_svc_train_score = liniar_svc_model.score(x_train_preprocessed, y_train_preprocessed)

print(f"LinearSVC Training Score: {Linear_svc_train_score:.3F}")


# Three stratified 80/20 shuffle splits of the training data serve as validation folds.
Linear_svc_shuffle_split = StratifiedShuffleSplit(n_splits=3, train_size=0.8, test_size=0.2, random_state=0)
Linear_svc_val_scores = cross_val_score(estimator=liniar_svc_model,
                                        X=x_train_preprocessed,
                                        y=y_train_preprocessed,
                                        cv=Linear_svc_shuffle_split)
print(f"LinearSVC Cross validation Score: {Linear_svc_val_scores.mean():.3F}")

## 4- SVC

### polynomial kernel

In [None]:
# SVC with a degree-2 polynomial kernel, fitted on the preprocessed training split.
poly_kernel_svc_model = SVC(kernel='poly', degree=2, coef0=0.01, C=1, random_state=0)
poly_kernel_svc_model.fit(x_train_preprocessed , y_train_preprocessed)
poly_kernel_train_score = poly_kernel_svc_model.score(x_train_preprocessed , y_train_preprocessed)

print("SVC polynomial kernel Training Score: {:.3F}".format(poly_kernel_train_score))


# polynomial kernel svc evaluated using shuffle-split cross-validation 
poly_kernel_shuffle_split = StratifiedShuffleSplit(train_size=0.8, test_size=0.2, n_splits=3, random_state=0)
poly_kernel_val_scores = cross_val_score(poly_kernel_svc_model,
                                         x_train_preprocessed ,
                                         y_train_preprocessed,
                                         cv=poly_kernel_shuffle_split)

# The label previously said "LinearSVC", which mislabelled this model's score.
print("SVC polynomial kernel Cross validation Score: {:.3F}".format(np.mean(poly_kernel_val_scores)))

### RBF kernel

In [None]:
# SVC with an RBF kernel, fitted on the preprocessed training split.
rbf_kernel_svc_model = SVC(kernel='rbf', gamma=6, C=5, random_state=0)
rbf_kernel_svc_model.fit(x_train_preprocessed , y_train_preprocessed)
rbf_kernel_train_score = rbf_kernel_svc_model.score(x_train_preprocessed , y_train_preprocessed)

print("SVC RBF kernel Training Score: {:.3F}".format(rbf_kernel_train_score))


# RBF kernel svc evaluated using shuffle-split cross-validation 
rbf_kernel_shuffle_split = StratifiedShuffleSplit(train_size=0.8, test_size=0.2, n_splits=3, random_state=0)
rbf_kernel_val_scores = cross_val_score(rbf_kernel_svc_model,
                                         x_train_preprocessed ,
                                         y_train_preprocessed,
                                         cv=rbf_kernel_shuffle_split)

# The label previously said "LinearSVC", which mislabelled this model's score.
print("SVC RBF kernel Cross validation Score: {:.3F}".format(np.mean(rbf_kernel_val_scores)))

#This model is overfitting 

## 5- DecisionTree Classifier

In [None]:
#DecisionTreeClassifier does not require feature scaling
#This model is overfitting

dt_model = DecisionTreeClassifier(random_state=0)
dt_model.fit(x_train_unscaled , y_train_unscaled)
dt_train_score = dt_model.score(x_train_unscaled , y_train_unscaled)

print("DecisionTree Classifier Training Score: {:.3F}".format(dt_train_score))


# DecisionTreeClassifier evaluated using shuffle-split cross-validation.
# Cross-validate on the training split only, as the KNeighbors and Random Forest cells do:
# running it on the full dataset would let the held-out test rows influence the
# validation score used to compare models.
dt_shuffle_split = StratifiedShuffleSplit(train_size=0.8, test_size=0.2, n_splits=3, random_state=0)
dt_val_scores = cross_val_score(dt_model, x_train_unscaled , y_train_unscaled, cv=dt_shuffle_split)
print("DecisionTree Classifier Cross validation Score: {:.3F}".format(np.mean(dt_val_scores)))

## 6- Ensemble Methods

### Random Forest

In [None]:
# Random forest of 100 trees with at most 15 leaves each, without bootstrapping
# (every tree sees the whole training split).
# `max_samples` was removed: it only has a meaning when bootstrap=True, and newer
# scikit-learn releases raise a ValueError when it is combined with bootstrap=False.
rf_model = RandomForestClassifier(n_estimators=100,
                                 max_leaf_nodes=15,
                                 bootstrap=False,
                                 n_jobs=-1,
                                 random_state=0)

rf_model.fit(x_train_unscaled , y_train_unscaled)
rf_train_score = rf_model.score(x_train_unscaled , y_train_unscaled)

print("Random Forest Training Score: {:.3F}".format(rf_train_score))


# RandomForestClassifier evaluated using shuffle-split cross-validation 
rf_shuffle_split = StratifiedShuffleSplit(train_size=0.8, test_size=0.2, n_splits=3, random_state=0)
rf_val_scores = cross_val_score(rf_model, x_train_unscaled , y_train_unscaled, cv=rf_shuffle_split)
print("Random Forest Cross validation Score: {:.3F}".format(np.mean(rf_val_scores)))

### Voting Classifiers

In [None]:
# Hard Voting: the ensemble predicts the class chosen by the majority of its members.
# Every stochastic member is seeded so that re-running the notebook reproduces the scores
# (the random forest and the probability calibration of SVC are both randomised).
v_log_clf = LogisticRegression(max_iter=1000, random_state=0)
v_random_forest = RandomForestClassifier(random_state=0)
v_svc = SVC(probability=True, random_state=0)


hard_voting_model = VotingClassifier(estimators=[("lr",v_log_clf),
                                            ("rf",v_random_forest),
                                            ("svc",v_svc)],
                               voting='hard')

hard_voting_model.fit(x_train_preprocessed , y_train_preprocessed)
hard_voting_train_score = hard_voting_model.score(x_train_preprocessed , y_train_preprocessed)

print("Hard Voting Classifiers Training Score: {:.3F}".format(hard_voting_train_score))


# VotingClassifier evaluated using shuffle-split cross-validation 
hard_voting_shuffle_split = StratifiedShuffleSplit(train_size=0.8, test_size=0.2, n_splits=3, random_state=0)
hard_voting_val_scores = cross_val_score(hard_voting_model, x_train_preprocessed , y_train_preprocessed, cv=hard_voting_shuffle_split)
print("Hard Voting Classifiers Cross validation Score: {:.3F}".format(np.mean(hard_voting_val_scores)))

In [None]:
# Soft Voting: the ensemble averages the members' predicted class probabilities.
# Every stochastic member is seeded so that re-running the notebook reproduces the scores
# (the random forest and the probability calibration of SVC are both randomised).
v_log_clf = LogisticRegression(max_iter=1000, random_state=0)
v_random_forest = RandomForestClassifier(random_state=0)
v_svc = SVC(probability=True, random_state=0)


soft_voting_model = VotingClassifier(estimators=[("lr",v_log_clf),
                                            ("rf",v_random_forest),
                                            ("svc",v_svc)],
                               voting='soft')

soft_voting_model.fit(x_train_preprocessed , y_train_preprocessed)
soft_voting_train_score = soft_voting_model.score(x_train_preprocessed , y_train_preprocessed)

print("Soft Voting Classifiers Training Score: {:.3F}".format(soft_voting_train_score))


# VotingClassifier evaluated using shuffle-split cross-validation 
soft_voting_shuffle_split = StratifiedShuffleSplit(train_size=0.8, test_size=0.2, n_splits=3, random_state=0)
soft_voting_val_scores = cross_val_score(soft_voting_model, x_train_preprocessed , y_train_preprocessed, cv=soft_voting_shuffle_split)
print("Soft Voting Classifiers Cross validation Score: {:.3F}".format(np.mean(soft_voting_val_scores)))

### bagging and Pasting

#### bagging

In [None]:
# Bagging (sampling with replacement) of 500 logistic regressions, each fitted on
# 100 training rows.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the first
# positional argument works with both the old and the new name.
lr_bagging_model = BaggingClassifier(LogisticRegression(max_iter=1000),
                                     n_estimators=500,
                                     bootstrap=True,
                                     max_samples=100,
                                     n_jobs=-1,
                                     random_state=0)

lr_bagging_model.fit(x_train_unscaled , y_train_unscaled)
lr_bagging_train_score = lr_bagging_model.score(x_train_unscaled , y_train_unscaled)

print("LogisticRegression Bagging Classifier Training Score: {:.3F}".format(lr_bagging_train_score))


# BaggingClassifier evaluated using shuffle-split cross-validation 
lr_bagging_shuffle_split = StratifiedShuffleSplit(train_size=0.8, test_size=0.2, n_splits=3, random_state=0)
lr_baggign_val_scores = cross_val_score(lr_bagging_model, x_train_unscaled , y_train_unscaled, cv=lr_bagging_shuffle_split)
print("LogisticRegression Bagging Classifier Cross validation Score: {:.3F}".format(np.mean(lr_baggign_val_scores)))

In [None]:
# Bagging (sampling with replacement) of 100 random forests, each fitted on
# 100 training rows.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the first
# positional argument works with both the old and the new name.
rf_bagging_model = BaggingClassifier(RandomForestClassifier(),
                                     n_estimators=100,
                                     bootstrap=True,
                                     max_samples=100,
                                     n_jobs=-1,
                                     random_state=0)

rf_bagging_model.fit(x_train_unscaled , y_train_unscaled)
rf_bagging_train_score = rf_bagging_model.score(x_train_unscaled , y_train_unscaled)

print("RandomForest Bagging Classifier Training Score: {:.3F}".format(rf_bagging_train_score))


# BaggingClassifier evaluated using shuffle-split cross-validation 
rf_bagging_shuffle_split = StratifiedShuffleSplit(train_size=0.8, test_size=0.2, n_splits=3, random_state=0)
rf_baggign_val_scores = cross_val_score(rf_bagging_model, x_train_unscaled , y_train_unscaled, cv=rf_bagging_shuffle_split)
print("RandomForest Bagging Classifier Cross validation Score: {:.3F}".format(np.mean(rf_baggign_val_scores)))

#### pasting

In [None]:
# Pasting (sampling without replacement, bootstrap=False) of 100 random forests,
# each fitted on 100 training rows.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the first
# positional argument works with both the old and the new name.
rf_pasting_model = BaggingClassifier(RandomForestClassifier(max_depth=8,
                                                            max_leaf_nodes=10,
                                                            n_estimators=100,
                                                            n_jobs=-1,
                                                            random_state=0),
                                     n_estimators=100,
                                     bootstrap=False,
                                     max_samples=100,
                                     n_jobs=-1,
                                     random_state=0)

rf_pasting_model.fit(x_train_unscaled , y_train_unscaled)
rf_pasting_train_score = rf_pasting_model.score(x_train_unscaled , y_train_unscaled)

print("RandomForest pasting Classifier Training Score: {:.3F}".format(rf_pasting_train_score))


# BaggingClassifier evaluated using shuffle-split cross-validation 
rf_pasting_shuffle_split = StratifiedShuffleSplit(train_size=0.8, test_size=0.2, n_splits=3, random_state=0)
rf_pasting_val_scores = cross_val_score(rf_pasting_model, x_train_unscaled , y_train_unscaled, cv=rf_pasting_shuffle_split)
print("RandomForest pasting Classifier Cross validation Score: {:.3F}".format(np.mean(rf_pasting_val_scores)))

In [None]:
# Pasting (sampling without replacement, bootstrap=False) of 100 logistic regressions,
# each fitted on 100 rows of the preprocessed training split.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the first
# positional argument works with both the old and the new name.
lr_pasting_model = BaggingClassifier(LogisticRegression(max_iter=1000),
                                     n_estimators=100,
                                     bootstrap=False,
                                     max_samples=100,
                                     n_jobs=-1,
                                     random_state=0)

lr_pasting_model.fit(x_train_preprocessed , y_train_preprocessed)
lr_pasting_train_score = lr_pasting_model.score(x_train_preprocessed , y_train_preprocessed)

# The labels previously said "RandomForest", but this ensemble is built from LogisticRegression.
print("LogisticRegression Pasting Classifier Training Score: {:.3F}".format(lr_pasting_train_score))


# BaggingClassifier evaluated using shuffle-split cross-validation 
lr_pasting_shuffle_split = StratifiedShuffleSplit(train_size=0.8, test_size=0.2, n_splits=3, random_state=0)
lr_pasting_val_scores = cross_val_score(lr_pasting_model, x_train_preprocessed , y_train_preprocessed, cv=lr_pasting_shuffle_split)
print("LogisticRegression Pasting Classifier Cross validation Score: {:.3F}".format(np.mean(lr_pasting_val_scores)))

## Boosting

### AdaBoost

In [None]:
# AdaBoost over 100 logistic regressions with a learning rate of 0.1.
# Two compatibility points:
#   * the base learner is passed positionally, because its keyword was renamed from
#     `base_estimator` to `estimator` in newer scikit-learn releases;
#   * `algorithm='SAMME.R'` is no longer passed explicitly: it was the default in the
#     releases that supported it, and it has since been deprecated and removed, so
#     naming it breaks on current releases.
lr_adaboost_model = AdaBoostClassifier(LogisticRegression(max_iter=1000),
                                   n_estimators=100,
                                   learning_rate=0.1,
                                   random_state=0)

lr_adaboost_model.fit(x_train_unscaled , y_train_unscaled)
lr_adaboost_train_score = lr_adaboost_model.score(x_train_unscaled , y_train_unscaled)

print("LogisticRegression AdaBoost Classifier Training Score: {:.3F}".format(lr_adaboost_train_score))


# AdaBoostClassifier evaluated using shuffle-split cross-validation 
lr_adaboost_shuffle_split = StratifiedShuffleSplit(train_size=0.8, test_size=0.2, n_splits=3, random_state=0)
lr_adaboost_val_scores = cross_val_score(lr_adaboost_model, x_train_unscaled , y_train_unscaled, cv=lr_adaboost_shuffle_split)
print("LogisticRegression AdaBoost Classifier Cross validation Score: {:.3F}".format(np.mean(lr_adaboost_val_scores)))

In [None]:
# AdaBoost over 100 small random forests with a learning rate of 0.1.
# Two compatibility points:
#   * the base learner is passed positionally, because its keyword was renamed from
#     `base_estimator` to `estimator` in newer scikit-learn releases;
#   * `algorithm='SAMME.R'` is no longer passed explicitly: it was the default in the
#     releases that supported it, and it has since been deprecated and removed, so
#     naming it breaks on current releases.
rf_adaboost_model = AdaBoostClassifier(RandomForestClassifier(max_depth=5,
                                                              max_leaf_nodes=10,
                                                              n_estimators=100,
                                                              n_jobs=-1,
                                                              random_state=0),
                                       n_estimators=100,
                                       learning_rate=0.1,
                                       random_state=0)

rf_adaboost_model.fit(x_train_preprocessed , y_train_preprocessed)
rf_adaboost_train_score = rf_adaboost_model.score(x_train_preprocessed , y_train_preprocessed)

print("RandomForestClassifier AdaBoost Classifier Training Score: {:.3F}".format(rf_adaboost_train_score))


# AdaBoostClassifier evaluated using shuffle-split cross-validation 
rf_adaboost_shuffle_split = StratifiedShuffleSplit(train_size=0.8, test_size=0.2, n_splits=3, random_state=0)
rf_adaboost_val_scores = cross_val_score(rf_adaboost_model, x_train_preprocessed , y_train_preprocessed, cv=rf_adaboost_shuffle_split)
print("RandomForestClassifier AdaBoost Classifier Cross validation Score: {:.3F}".format(np.mean(rf_adaboost_val_scores)))

### GradientBoostingClassifier

In [None]:
# Gradient boosting: 500 depth-5 trees with a small learning rate, on the unscaled data.
g_boosting_model = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=5,
    random_state=0,
).fit(x_train_unscaled, y_train_unscaled)
g_boosting_train_score = g_boosting_model.score(x_train_unscaled, y_train_unscaled)

print(f"Gradient Boosting Classifier Training Score: {g_boosting_train_score:.3F}")


# Three stratified 80/20 shuffle splits of the training data serve as validation folds.
g_boosting_shuffle_split = StratifiedShuffleSplit(n_splits=3, train_size=0.8, test_size=0.2, random_state=0)
g_boosting_val_scores = cross_val_score(estimator=g_boosting_model,
                                        X=x_train_unscaled,
                                        y=y_train_unscaled,
                                        cv=g_boosting_shuffle_split)
print(f"Gradient Boosting Classifier Cross validation Score: {g_boosting_val_scores.mean():.3F}")

# overfitting

# summarizing the models Accuracies

In [None]:
cross_val_scores = [kn_val_scores, lr_val_scores,
                    Linear_svc_val_scores, poly_kernel_val_scores,
                    rbf_kernel_val_scores, dt_val_scores,
                    rf_val_scores, hard_voting_val_scores,
                    soft_voting_val_scores, lr_baggign_val_scores,
                    rf_baggign_val_scores, rf_pasting_val_scores,
                    lr_pasting_val_scores, lr_adaboost_val_scores,
                    rf_adaboost_val_scores, g_boosting_val_scores]

In [None]:
mean_cross_val_scores = []

for i in cross_val_scores:
    mean_cross_val_scores.append(np.mean(i))

In [None]:
# Side-by-side comparison of every model: accuracy on the training split versus
# the mean shuffle-split cross-validation accuracy.
models = pd.DataFrame(data={
    'Model': [
        'KNeighbors Classifier',
        'Logistic Regression',
        'Linear SVC',
        'SVC polynomial kernel',
        'SVC RBF kernel',
        'Decision Tree Classifier',
        'Random Forest',
        'Hard Voting Classifiers',
        'Soft Voting Classifiers',
        'Bagging Classifier using LogisticRegression',
        'Bagging Classifier using RandomForestClassifier',
        'Pasting Classifier using RandomForestClassifier',
        'Pasting Classifier using LogisticRegression',
        'AdaBoost Classifier using LogisticRegression',
        'AdaBoost Classifier using RandomForestClassifier',
        'Gradient Boosting Classifier',
    ],
    'Training Accuracy': [
        kn_train_score,
        lr_train_score,
        Linear_svc_train_score,
        poly_kernel_train_score,
        rbf_kernel_train_score,
        dt_train_score,
        rf_train_score,
        hard_voting_train_score,
        soft_voting_train_score,
        lr_bagging_train_score,
        rf_bagging_train_score,
        rf_pasting_train_score,
        lr_pasting_train_score,
        lr_adaboost_train_score,
        rf_adaboost_train_score,
        g_boosting_train_score,
    ],
    'Cross Validation Accuracy': mean_cross_val_scores,
})


models

# Bagging Classifier using RandomForestClassifier, Soft Voting Classifiers, and AdaBoost Classifier using RandomForestClassifier will be used for further testing

## 1- Bagging Classifier using RandomForestClassifier

In [None]:
# Bare expression: only displays the two imported metric function objects as a tuple;
# nothing is computed or assigned in this cell.
average_precision_score, roc_auc_score

In [None]:
Bagging_Classifier_using_RandomForestClassifier_test_acc =  rf_bagging_model.score(x_test_unscaled, y_test_unscaled)
print("Bagging Classifiernusing RandomForestClassifier Testing Accuracy: {:.3F}".format(Bagging_Classifier_using_RandomForestClassifier_test_acc))

In [None]:
confusion_matrix(y_test_unscaled, rf_bagging_model.predict(x_test_unscaled))

In [None]:
True_Positive = 1
True_Negative = 36
False_Positive = 15
False_Negative = 8

In [None]:
Bagging_Classifiernusing_RandomForestClassifier_Precision_score = precision_score(y_test_unscaled,
                                                                                  rf_bagging_model.predict(x_test_unscaled))

print("Bagging Classifiernusing RandomForestClassifier Precision score: {}".format(Bagging_Classifiernusing_RandomForestClassifier_Precision_score))

In [None]:
Bagging_Classifiernusing_RandomForestClassifier_Recall_score = recall_score(y_test_unscaled,
                                                                            rf_bagging_model.predict(x_test_unscaled))

print("Bagging Classifiernusing RandomForestClassifier Recall score: {}".format(Bagging_Classifiernusing_RandomForestClassifier_Recall_score))

In [None]:
Bagging_Classifiernusing_RandomForestClassifier_F1_score = f1_score(y_test_unscaled,
                                                                    rf_bagging_model.predict(x_test_unscaled))

print("Bagging Classifiernusing RandomForestClassifier F1 score: {}".format(Bagging_Classifiernusing_RandomForestClassifier_F1_score))

In [None]:
Bagging_Classifiernusing_RandomForestClassifier_Average_Precision_score = average_precision_score(y_test_unscaled,
                                                                                    rf_bagging_model.predict_proba(x_test_unscaled)[:,1])

print("Bagging Classifiernusing RandomForestClassifier Average Precision score: {}".format(Bagging_Classifiernusing_RandomForestClassifier_Average_Precision_score))

In [None]:
Bagging_Classifiernusing_RandomForestClassifier_ROC_AUC_score = roc_auc_score(y_test_unscaled,
                                                                                rf_bagging_model.predict_proba(x_test_unscaled)[:,1])

print("Bagging Classifiernusing RandomForestClassifier ROC AUC Score: {}".format(Bagging_Classifiernusing_RandomForestClassifier_ROC_AUC_score))

## 2- Soft Voting Classifiers

In [None]:
soft_Voting_Classifiers_test_acc =  soft_voting_model.score(x_test_preprocessed, y_test_preprocessed)
print("Soft Voting Classifiers Testing Accuracy: {:.3F}".format(soft_Voting_Classifiers_test_acc))

In [None]:
confusion_matrix(y_test_preprocessed, soft_voting_model.predict(x_test_preprocessed))

In [None]:
True_Positive = 15
True_Negative = 35
False_Positive = 2
False_Negative = 8

In [None]:
Soft_Voting_Classifiers_Precision_score = precision_score(y_test_preprocessed,
                                                          soft_voting_model.predict(x_test_preprocessed))

print("Soft Voting Classifiers Precision score: {}".format(Soft_Voting_Classifiers_Precision_score))

In [None]:
Soft_Voting_Classifiers_Recall_score = recall_score(y_test_preprocessed,
                                                    soft_voting_model.predict(x_test_preprocessed))

print("Soft Voting Classifiers Recall score: {}".format(Soft_Voting_Classifiers_Recall_score))

In [None]:
Soft_Voting_Classifiers_F1_score = f1_score(y_test_preprocessed,
                                            soft_voting_model.predict(x_test_preprocessed))

print("Soft Voting Classifiers F1 score: {}".format(Soft_Voting_Classifiers_F1_score))

In [None]:
Soft_Voting_Classifiers_Average_Precision_score = average_precision_score(y_test_preprocessed,
                                                                   soft_voting_model.predict_proba(x_test_preprocessed)[:,1])

print("Soft Voting Classifiers Average Precision score: {}".format(Soft_Voting_Classifiers_Average_Precision_score))

In [None]:
Soft_Voting_Classifiers_ROC_AUC_score = roc_auc_score(y_test_preprocessed,
                                                      soft_voting_model.predict_proba(x_test_preprocessed)[:,1])

print("Soft Voting Classifiers ROC AUC Score: {}".format(Soft_Voting_Classifiers_ROC_AUC_score))

## 3- AdaBoost Classifier using RandomForestClassifier

In [None]:
AdaBoost_Classifier_using_RandomForestClassifier_test_acc =  rf_adaboost_model.score(x_test_preprocessed, y_test_preprocessed)
print("AdaBoost Classifier using RandomForestClassifier Testing Accuracy: {:.3F}".format(AdaBoost_Classifier_using_RandomForestClassifier_test_acc))

In [None]:
confusion_matrix(y_test_preprocessed, rf_adaboost_model.predict(x_test_preprocessed))

In [None]:
True_Positive = 14
True_Negative = 34
False_Positive = 3
False_Negative = 9

In [None]:
AdaBoost_Classifier_using_RandomForestClassifier_Precision_score = precision_score(y_test_preprocessed,
                                                                              rf_adaboost_model.predict(x_test_preprocessed))

print("AdaBoost Classifier using RandomForestClassifier Precision score: {}".format(AdaBoost_Classifier_using_RandomForestClassifier_Precision_score))

In [None]:
AdaBoost_Classifier_using_RandomForestClassifier_Recall_score = recall_score(y_test_preprocessed,
                                                                            rf_adaboost_model.predict(x_test_preprocessed))

print("AdaBoost Classifier using RandomForestClassifier Recall score: {}".format(AdaBoost_Classifier_using_RandomForestClassifier_Recall_score))

In [None]:
AdaBoost_Classifier_using_RandomForestClassifier_F1_score = f1_score(y_test_preprocessed,
                                                                   rf_adaboost_model.predict(x_test_preprocessed))

print("AdaBoost Classifier using RandomForestClassifier F1 score: {}".format(AdaBoost_Classifier_using_RandomForestClassifier_F1_score))

In [None]:
AdaBoost_Classifier_using_RandomForestClassifier_Average_Precision_score = average_precision_score(y_test_preprocessed,
                                                                   rf_adaboost_model.predict_proba(x_test_preprocessed)[:,1])

print("AdaBoost Classifier using RandomForestClassifier Average Precision score: {}".format(AdaBoost_Classifier_using_RandomForestClassifier_Average_Precision_score))

In [None]:
AdaBoost_Classifier_using_RandomForestClassifier_ROC_AUC_score = roc_auc_score(y_test_preprocessed,
                                                         rf_adaboost_model.predict_proba(x_test_preprocessed)[:,1])


print("AdaBoost Classifier using RandomForestClassifier ROC AUC Score: {}".format(AdaBoost_Classifier_using_RandomForestClassifier_ROC_AUC_score))

In [None]:
# Test-set metrics of the three shortlisted models, written as one record per model.
# The key order of the records fixes the column order of the table.
models = pd.DataFrame([
    {
        'Model': 'Soft Voting Classifiers',
        'Testset Accuracy': soft_Voting_Classifiers_test_acc,
        'Precision Score': Soft_Voting_Classifiers_Precision_score,
        'Recall': Soft_Voting_Classifiers_Recall_score,
        'F1 score': Soft_Voting_Classifiers_F1_score,
        'Average Precision Score': Soft_Voting_Classifiers_Average_Precision_score,
        'ROC AUC Score': Soft_Voting_Classifiers_ROC_AUC_score,
    },
    {
        'Model': 'Bagging Classifier using RandomForestClassifier',
        'Testset Accuracy': Bagging_Classifier_using_RandomForestClassifier_test_acc,
        'Precision Score': Bagging_Classifiernusing_RandomForestClassifier_Precision_score,
        'Recall': Bagging_Classifiernusing_RandomForestClassifier_Recall_score,
        'F1 score': Bagging_Classifiernusing_RandomForestClassifier_F1_score,
        'Average Precision Score': Bagging_Classifiernusing_RandomForestClassifier_Average_Precision_score,
        'ROC AUC Score': Bagging_Classifiernusing_RandomForestClassifier_ROC_AUC_score,
    },
    {
        'Model': 'AdaBoost Classifier using RandomForestClassifier',
        'Testset Accuracy': AdaBoost_Classifier_using_RandomForestClassifier_test_acc,
        'Precision Score': AdaBoost_Classifier_using_RandomForestClassifier_Precision_score,
        'Recall': AdaBoost_Classifier_using_RandomForestClassifier_Recall_score,
        'F1 score': AdaBoost_Classifier_using_RandomForestClassifier_F1_score,
        'Average Precision Score': AdaBoost_Classifier_using_RandomForestClassifier_Average_Precision_score,
        'ROC AUC Score': AdaBoost_Classifier_using_RandomForestClassifier_ROC_AUC_score,
    },
])


models

# Choosing the best Classifiers and manipulating threshold 

In this kind of project we want every positive case (a patient at risk) to be correctly classified,
even if some negative cases are classified as positive; those patients will simply go through more tests.

In [None]:
print(classification_report(y_test_unscaled,
                           rf_bagging_model.predict(x_test_unscaled)))

In [None]:
print(classification_report(y_test_unscaled,
                           rf_bagging_model.predict_proba(x_test_unscaled)[:,1] > 0.15))

In [None]:
confusion_matrix(y_test_unscaled, rf_bagging_model.predict_proba(x_test_unscaled)[:,1] > 0.15)

Setting the threshold to 0.15 gives us 100% recall: every positive case is correctly classified, but
14 negative cases are incorrectly classified as positive. That is acceptable in this kind of project, because they will go
through more tests.

But if these additional tests are expensive, you can make a trade-off between precision and recall by
changing the threshold.

And by tweaking the threshold we get an 88.3% test-set accuracy!

In [None]:
# Accuracy of the thresholded predictions (class-1 probability > 0.6) against the TRUE
# test labels. The previous call, model.score(X, thresholded_predictions), compared the
# model's default predictions with its own thresholded predictions, so it measured how
# often the two decision rules agree rather than how often the model is right.
thresholded_test_predictions = (rf_bagging_model.predict_proba(x_test_unscaled)[:,1] > 0.6).astype(int)
Bagging_Classifier_using_RandomForestClassifier_test_acc = np.mean(thresholded_test_predictions == y_test_unscaled)
print("Bagging Classifier using RandomForestClassifier Testing Accuracy: {:.3F}".format(Bagging_Classifier_using_RandomForestClassifier_test_acc))