## Ensembling

In [49]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import  accuracy_score,log_loss, r2_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder


import warnings
warnings.filterwarnings("ignore")


In [50]:
df = pd.read_csv(r'C:\Users\DAI.STUDENTSDC\Desktop\Machine Learning\Data Sets\Cases\Wisconsin\BreastCancer.csv', index_col=0)
df.head()

Unnamed: 0_level_0,Clump,UniCell_Size,Uni_CellShape,MargAdh,SEpith,BareN,BChromatin,NoemN,Mitoses,Class
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
61634,5,4,3,1,2,2,2,3,1,Benign
63375,9,1,2,6,4,10,7,7,2,Malignant
76389,10,4,7,2,2,8,6,1,1,Malignant
95719,6,10,10,10,8,10,7,10,7,Malignant
128059,1,1,1,1,2,5,5,1,1,Benign


In [51]:
df.Class.value_counts()

Class
Benign       458
Malignant    241
Name: count, dtype: int64

In [52]:
X = df.drop(columns=['Class'], axis=1)
y = df['Class']

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24, stratify=y)

In [54]:
dtc = DecisionTreeClassifier(random_state=24)
lr = LogisticRegression(random_state=24)
nb = GaussianNB()
vote = VotingClassifier([('DT', dtc), ('LR', lr), ('NB', nb)], voting='soft')

In [55]:
vote.fit(X_train, y_train)

In [56]:
type(vote)

sklearn.ensemble._voting.VotingClassifier

In [57]:
y_pred = vote.predict(X_test)
print("Accuracy Score: ", accuracy_score(y_test, y_pred))

Accuracy Score:  0.9666666666666667


In [58]:
y_pred_prob = vote.predict_proba(X_test)[:, 1] #predict_proba works with only voting='soft'
print("Accuracy Score: ", roc_auc_score(y_test, y_pred_prob))

Accuracy Score:  0.9951690821256038


Gridsearch

In [59]:
dtc = DecisionTreeClassifier(random_state=24)
lr = LogisticRegression(random_state=24)
nb = GaussianNB()
vote = VotingClassifier([('DT', dtc), ('LR', lr), ('NB', nb)], voting='hard')


params = { 
    'LR__C': np.linspace(0.001, 3, 5),
    'DT__max_depth': [None, 2, 3],
    'DT__min_samples_leaf':  [2, 10, 20],
    'DT__min_samples_split':  [1, 10, 20],
}

kfold = KFold(n_splits=5, shuffle=True, random_state=24)


gcv = GridSearchCV(
    estimator=vote,
    param_grid=params,
    cv=kfold
)



In [60]:
gcv.fit(X_train, y_train)

In [61]:
print(gcv.best_score_)
print(gcv.best_params_)

0.9611613717652009
{'DT__max_depth': None, 'DT__min_samples_leaf': 20, 'DT__min_samples_split': 10, 'LR__C': 0.75075}


In [62]:
best_model = gcv.best_estimator_
best_model

In [63]:
y_pred = best_model.predict(X_test)
print('Accuracy Score:', accuracy_score(y_test, y_pred))

Accuracy Score: 0.9761904761904762


In [None]:
#VotingClassifier.predict_proba() works with only voting='soft'
vote_soft = VotingClassifier([('DT', dtc), ('LR', lr), ('NB', nb)], voting='soft')
gcv = GridSearchCV(
    estimator=vote_soft,
    param_grid=params,
    cv=kfold
)

gcv.fit(X_train, y_train)

In [65]:
print(gcv.best_score_)
print(gcv.best_params_)

0.9611613717652009
{'DT__max_depth': None, 'DT__min_samples_leaf': 20, 'DT__min_samples_split': 10, 'LR__C': 0.001}


In [66]:
best_model = gcv.best_estimator_

In [67]:
y_pred = best_model.predict(X_test)
print('Accuracy Score:', accuracy_score(y_test, y_pred))

Accuracy Score: 0.9761904761904762


In [68]:
y_pred_prob = best_model.predict_proba(X_test)[:, 1] #predict_proba works with only voting='soft'
print('ROC AUC score: ', roc_auc_score(y_test, y_pred_prob))

ROC AUC score:  0.9930555555555556
