<a href="https://colab.research.google.com/github/samtam0714/ML/blob/master/EnsModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Load Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
#Load Dataset
# Read the CSV that sits next to the notebook and preview its first rows.
csv_path = './cancer.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,id,Clump Thickness,UofCSize,UofCShape,Marginal Adhesion,SECSize,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [None]:
#Remove the 'id' column from the Dataset
# 'id' is presumably a per-sample identifier rather than a measurement, so it is
# excluded before modelling.
# errors='ignore' keeps this cell idempotent: `data` is reassigned here, so a second
# run would otherwise raise KeyError because 'id' has already been dropped.
data = data.drop(columns='id', errors='ignore')
data.head()

Unnamed: 0,Clump Thickness,UofCSize,UofCShape,Marginal Adhesion,SECSize,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [None]:
#Show Key Statistics
# Per-column summary: count, mean, std, min, quartiles and max.
summary_stats = data.describe()
summary_stats

Unnamed: 0,Clump Thickness,UofCSize,UofCShape,Marginal Adhesion,SECSize,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
count,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0
mean,4.442167,3.150805,3.215227,2.830161,3.234261,3.544656,3.445095,2.869693,1.603221,2.699854
std,2.820761,3.065145,2.988581,2.864562,2.223085,3.643857,2.449697,3.052666,1.732674,0.954592
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [None]:
#Define x and y variable
# x holds every column except the label as a NumPy array; y holds the label column.
target_col = 'Class'
x = data.drop(columns=[target_col]).values
y = data[target_col].values

In [None]:
#Prepare for Models for Comparison
# Steps in this cell: stratified hold-out split -> SMOTE on the training partition
# only -> standardisation using statistics learned from the training partition.

#Load Library for Training
from sklearn.model_selection import train_test_split
# stratify=y keeps the class ratio identical in the train and test partitions.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,stratify=y,random_state = 100)

#Fix the imbalanced Classes
from imblearn.over_sampling import SMOTE
# Only the training partition is oversampled, so the test partition contains
# nothing but original samples.
smt=SMOTE(random_state=100)
x_train_smt,y_train_smt = smt.fit_resample(x_train,y_train)

#Scale the Data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train2 = sc.fit_transform(x_train_smt)
# Apply the mean/std learned from the training data. Calling fit_transform here
# would refit the scaler on the test set, so train and test would be scaled with
# different statistics and test information would leak into preprocessing.
x_test2 = sc.transform(x_test)

# Standardised copy of the full dataset (statistics computed over every row).
# A separate scaler instance is used so that `sc` keeps the training-set statistics.
x_2 = StandardScaler().fit_transform(x)



In [None]:
#Prepare Models - logistic regression baseline plus bagging / boosting tree ensembles
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn import model_selection
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Candidate classifiers to compare; every estimator that accepts a seed here uses
# random_state=100 so repeated runs give the same models.
# NOTE(review): recent xgboost releases expect class labels encoded as 0..n-1, while
# 'Class' in this dataset is 2/4 -- confirm against the installed xgboost version.
models = [
    LogisticRegression(),
    RandomForestClassifier(random_state=100),
    BaggingClassifier(random_state=100),
    AdaBoostClassifier(random_state =100),
    GradientBoostingClassifier(random_state=100),
    XGBClassifier(random_state=100),
    LGBMClassifier(objective='binary',random_state=100),
]

In [None]:
#Create Model Comparison
# Every candidate is scored with repeated, stratified 10-fold cross-validation on the
# raw features. Scaling and SMOTE are steps of the evaluated pipeline, so they are
# re-fitted on the training folds only and the validation folds never influence
# preprocessing or oversampling.
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

#Scoring Parameters
scoring = {'acc': 'accuracy',
           'prec_macro': 'precision_macro',
           'rec_macro': 'recall_macro'}

MLA_columns = ['Algorithm Name', 'Precision', 'Recall']

# Stratified folds keep the class ratio of y in every fold, matching the stratified
# hold-out split. With an integer seed the same folds are produced for every model.
rkf = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=100)

records = []
for alg in models:
    # cross_validate clones its estimator, so this fit does not affect the scores;
    # it only leaves `alg` fitted on the resampled training split for later use.
    rkfcv = alg.fit(x_train2,y_train_smt)

    # imblearn's pipeline applies SMOTE during fit only, never when predicting.
    eval_pipe = make_pipeline(StandardScaler(), SMOTE(random_state=100), alg)
    rKFcv = cross_validate(eval_pipe, x, y, scoring=scoring, cv=rkf)

    records.append({
        'Algorithm Name': alg.__class__.__name__,
        'Precision': round(rKFcv['test_prec_macro'].mean(),2),
        'Recall': round(rKFcv['test_rec_macro'].mean(),2),
    })

# Build the table once from the collected rows instead of growing it row by row,
# and sort without in-place mutation.
MLA_compare = pd.DataFrame(records, columns=MLA_columns).sort_values(by=['Precision'], ascending=False)
MLA_compare

Unnamed: 0,Algorithm Name,Precision,Recall
0,LogisticRegression,0.96,0.96
1,RandomForestClassifier,0.96,0.96
2,BaggingClassifier,0.96,0.95
3,AdaBoostClassifier,0.96,0.95
4,GradientBoostingClassifier,0.96,0.97
5,XGBClassifier,0.96,0.96
6,LGBMClassifier,0.96,0.96


In [None]:
#Create Voting Model
# Compares three base classifiers against a soft-voting ensemble built from them.
# Scaling and SMOTE are part of the cross-validated pipeline, so each is fitted on
# the training folds only.
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

model1 = LogisticRegression()
model2 = RandomForestClassifier(random_state=100)
model3 = BaggingClassifier(random_state=100)

estimators = [('LogReg', model1), ('Rf', model2), ('Bagging', model3)]

# voting='soft' selects soft voting over the listed estimators.
voting_clf=VotingClassifier(estimators,voting='soft')

scoring = {'acc': 'accuracy',
           'prec_macro': 'precision_macro',
           'rec_macro': 'recall_macro'}

# Stratified folds keep the class ratio of y in every fold; the integer seed gives
# every classifier the same folds.
ens_rkf1 = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=100)

for clf in (model1,model2,model3,voting_clf):
    # cross_validate clones its estimator, so this fit does not affect the scores;
    # it only leaves `clf` fitted on the resampled training split for later use.
    rkfcv= clf.fit(x_train2,y_train_smt)

    # imblearn's pipeline applies SMOTE during fit only, never when predicting.
    eval_pipe = make_pipeline(StandardScaler(), SMOTE(random_state=100), clf)
    rKFcv = cross_validate(eval_pipe, x, y, scoring=scoring, cv=ens_rkf1)
    print(clf.__class__.__name__,round(rKFcv['test_prec_macro'].mean(),2))

LogisticRegression 0.96
RandomForestClassifier 0.96
BaggingClassifier 0.96
VotingClassifier 0.97


In [None]:
#Create Stacking Model
# Compares three base classifiers against a stacked ensemble that feeds their
# predictions into a logistic-regression meta classifier. Scaling and SMOTE are part
# of the cross-validated pipeline, so each is fitted on the training folds only.
from mlxtend.classifier import StackingCVClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

#Identify Models
lr = LogisticRegression()
mod1 = RandomForestClassifier(random_state=100)
mod2 = BaggingClassifier(random_state=100)
mod3 = AdaBoostClassifier(random_state =100)

#Create Stacking Classifier
# The global NumPy seed is set because StackingCVClassifier is created without its
# own seed.
# NOTE(review): newer mlxtend releases appear to accept random_state= on
# StackingCVClassifier -- confirm the installed version before replacing the global seed.
np.random.seed(100)
stackmod=StackingCVClassifier(classifiers=[mod1,mod2,mod3],
                             meta_classifier=lr)

scoring = {'acc': 'accuracy',
           'prec_macro': 'precision_macro',
           'rec_macro': 'recall_macro'}

# Stratified folds keep the class ratio of y in every fold; the integer seed gives
# every classifier the same folds.
ens_rkf1 = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=100)

for clf in (mod1,mod2,mod3,stackmod):
    # cross_validate clones its estimator, so this fit does not affect the scores;
    # it only leaves `clf` fitted on the resampled training split for later use.
    rkfcv= clf.fit(x_train2,y_train_smt)

    # imblearn's pipeline applies SMOTE during fit only, never when predicting.
    eval_pipe = make_pipeline(StandardScaler(), SMOTE(random_state=100), clf)
    rKFcv = cross_validate(eval_pipe, x, y, scoring=scoring, cv=ens_rkf1)
    print(clf.__class__.__name__,round(rKFcv['test_prec_macro'].mean(),2))

RandomForestClassifier 0.96
BaggingClassifier 0.96
AdaBoostClassifier 0.96
StackingCVClassifier 0.96
