In [6]:
from sklearn.ensemble import VotingClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.metrics import roc_auc_score,accuracy_score,r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer 
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [22]:
# Load the breast-cancer dataset; the first CSV column is used as the row index.
df = pd.read_csv("BreastCancer.csv", index_col=0)

# Separate the target label ('Class') from the remaining predictor columns.
y = df['Class']
X = df.drop(columns='Class')

In [36]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0_level_0,Clump,UniCell_Size,Uni_CellShape,MargAdh,SEpith,BareN,BChromatin,NoemN,Mitoses,Class
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
61634,5,4,3,1,2,2,2,3,1,Benign
63375,9,1,2,6,4,10,7,7,2,Malignant
76389,10,4,7,2,2,8,6,1,1,Malignant
95719,6,10,10,10,8,10,7,10,7,Malignant
128059,1,1,1,1,2,5,5,1,1,Benign


In [23]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=24,
)

In [28]:
# Base learners for the ensemble: a depth-limited decision tree,
# logistic regression and Gaussian naive Bayes.
dtc = DecisionTreeClassifier(max_depth=3, random_state=24)
lr = LogisticRegression(random_state=24)
nb = GaussianNB()

# voting='soft' combines the members' predicted class probabilities rather
# than their hard votes. The labels 'DT', 'LR' and 'NB' are the names under
# which each member's parameters can be addressed (e.g. 'LR__C').
voting = VotingClassifier(
    estimators=[('DT', dtc), ('LR', lr), ('NB', nb)],
    voting='soft',
)
voting.fit(X_train, y_train)

# Hard class predictions on the held-out split, scored by accuracy.
y_pred = voting.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.9761904761904762


In [29]:
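# Soft voting: the ensemble combines its members' predicted class probabilities, so those probabilities can be scored with ROC AUC.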

In [27]:
# predict_proba returns one column per class (ordered as voting.classes_);
# column 1 is the probability of the second class, used as the ROC AUC score input.
test_proba = voting.predict_proba(X_test)
y_pred_prob = test_proba[:, 1]
print(roc_auc_score(y_test, y_pred_prob))

0.9951690821256038


In [30]:
# 5-fold stratified cross-validation, shuffled with a fixed seed so the
# fold assignment is repeatable.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# Keys follow the '<member name>__<parameter>' form, addressing the 'LR' and
# 'DT' members of the voting ensemble.
params = {
    'LR__C': np.linspace(0.001, 3, 5),
    'DT__min_samples_split': [2, 10, 20],
    'DT__min_samples_leaf': [1, 10, 20],
    'DT__max_depth': [None, 2, 3],
}

# Exhaustive search over 5 * 3 * 3 * 3 = 135 combinations, ranked by ROC AUC.
# Note the search is fitted on the full X, y rather than on the training split.
gcv = GridSearchCV(
    estimator=voting,
    param_grid=params,
    scoring='roc_auc',
    cv=kfold,
    verbose=3,
)
gcv.fit(X, y)

Fitting 5 folds for each of 135 candidates, totalling 675 fits
[CV 1/5] END DT__max_depth=None, DT__min_samples_leaf=1, DT__min_samples_split=2, LR__C=0.001;, score=0.997 total time=   0.0s
[CV 2/5] END DT__max_depth=None, DT__min_samples_leaf=1, DT__min_samples_split=2, LR__C=0.001;, score=0.995 total time=   0.0s
[CV 3/5] END DT__max_depth=None, DT__min_samples_leaf=1, DT__min_samples_split=2, LR__C=0.001;, score=0.994 total time=   0.0s
[CV 4/5] END DT__max_depth=None, DT__min_samples_leaf=1, DT__min_samples_split=2, LR__C=0.001;, score=0.990 total time=   0.0s
[CV 5/5] END DT__max_depth=None, DT__min_samples_leaf=1, DT__min_samples_split=2, LR__C=0.001;, score=0.993 total time=   0.0s
[CV 1/5] END DT__max_depth=None, DT__min_samples_leaf=1, DT__min_samples_split=2, LR__C=0.75075;, score=0.995 total time=   0.0s
[CV 2/5] END DT__max_depth=None, DT__min_samples_leaf=1, DT__min_samples_split=2, LR__C=0.75075;, score=0.995 total time=   0.0s
[CV 3/5] END DT__max_depth=None, DT__min_sam

In [31]:
# Best hyper-parameter combination, then its mean cross-validated ROC AUC,
# one per line.
print(gcv.best_params_, gcv.best_score_, sep="\n")

{'DT__max_depth': None, 'DT__min_samples_leaf': 1, 'DT__min_samples_split': 2, 'LR__C': 0.001}
0.9937986416496842


In [32]:
# gcv searched over the VotingClassifier, so best_estimator_ is the whole
# ensemble refit with the best parameters found -- not a single decision tree.
best_model = gcv.best_estimator_
# The original name is kept as an alias so any later cell that refers to
# best_tree keeps working.
best_tree = best_model
best_model