In [123]:
from sklearn import datasets
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

X = data_breast_cancer['data'][['mean texture', 'mean symmetry']]
y = data_breast_cancer['target']

In [124]:
from sklearn.model_selection import train_test_split

In [125]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [126]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [127]:
from sklearn.ensemble import VotingClassifier

In [128]:
tree_clf = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier()
reg = LogisticRegression()

In [129]:
eclf_hard = VotingClassifier(estimators=[('tree_clf', tree_clf), ('knn_clf', knn_clf), ('reg', reg)], voting='hard')

In [130]:
eclf_soft = VotingClassifier(estimators=[('tree_clf', tree_clf), ('knn_clf', knn_clf), ('reg', reg)], voting='soft')

In [131]:
eclf_hard.fit(X_train, y_train)

In [132]:
eclf_soft.fit(X_train, y_train)

In [133]:
tree_clf.fit(X_train, y_train)

In [134]:
knn_clf.fit(X_train, y_train)

In [135]:
reg.fit(X_train, y_train)

In [136]:
from sklearn.metrics import accuracy_score

In [137]:
classifiers = [eclf_hard, eclf_soft, tree_clf, knn_clf, reg]

In [138]:
acc = []

In [139]:
for classifier in classifiers:
    clf_train_acc = accuracy_score(y_train, classifier.predict(X_train))
    clf_test_acc = accuracy_score(y_test, classifier.predict(X_test))
    acc.append((clf_train_acc, clf_test_acc))

In [140]:
print(acc)

[(0.8527472527472527, 0.631578947368421), (0.967032967032967, 0.5877192982456141), (1.0, 0.543859649122807), (0.7802197802197802, 0.6403508771929824), (0.7098901098901099, 0.6228070175438597)]


In [141]:
import pickle

In [142]:
with open('acc_vote.pkl', 'wb') as f:
    pickle.dump(acc, f)

In [143]:
with open('vote.pkl', 'wb') as f:
    pickle.dump(classifiers, f)

In [144]:
from sklearn.ensemble import BaggingClassifier 

In [145]:
bg_clf = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=30)

In [146]:
bg_clf.fit(X_train, y_train)

In [147]:
bg_clf_50 = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=30, max_samples=0.5)

In [148]:
bg_clf_50.fit(X_train, y_train)

In [149]:
p_clf = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=30, bootstrap=False)

In [150]:
p_clf.fit(X_train, y_train)

In [151]:
p_clf_50 = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=30, max_samples=0.5, bootstrap=False)

In [152]:
p_clf_50.fit(X_train, y_train)

In [153]:
from sklearn.ensemble import RandomForestClassifier

In [154]:
rf_clf = RandomForestClassifier(n_estimators=30)

In [155]:
rf_clf.fit(X_train, y_train)

In [156]:
from sklearn.ensemble import AdaBoostClassifier

In [157]:
ab_clf = AdaBoostClassifier(n_estimators=30)

In [158]:
ab_clf.fit(X_train, y_train)



In [159]:
from sklearn.ensemble import GradientBoostingClassifier

In [160]:
gb_clf = GradientBoostingClassifier(n_estimators=30)

In [161]:
gb_clf.fit(X_train, y_train)

In [162]:
classifiers = [bg_clf, bg_clf_50, p_clf, p_clf_50, rf_clf, ab_clf, gb_clf]
print(classifiers)

[BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=30), BaggingClassifier(estimator=DecisionTreeClassifier(), max_samples=0.5,
                  n_estimators=30), BaggingClassifier(bootstrap=False, estimator=DecisionTreeClassifier(),
                  n_estimators=30), BaggingClassifier(bootstrap=False, estimator=DecisionTreeClassifier(),
                  max_samples=0.5, n_estimators=30), RandomForestClassifier(n_estimators=30), AdaBoostClassifier(n_estimators=30), GradientBoostingClassifier(n_estimators=30)]


In [163]:
acc = []

In [164]:
for classifier in classifiers:
    train_acc = accuracy_score(y_train, classifier.predict(X_train))
    test_acc = accuracy_score(y_test, classifier.predict(X_test))
    acc.append((train_acc, test_acc))

In [165]:
print(acc)

[(0.9956043956043956, 0.6578947368421053), (0.9186813186813186, 0.6666666666666666), (1.0, 0.5964912280701754), (0.9648351648351648, 0.6666666666666666), (0.9956043956043956, 0.6052631578947368), (0.8043956043956044, 0.6754385964912281), (0.8351648351648352, 0.6578947368421053)]


In [166]:
with open('acc_bag.pkl', 'wb') as f:
    pickle.dump(acc, f)

In [167]:
with open('bag.pkl', 'wb') as f:
    pickle.dump(classifiers, f)

In [168]:
import numpy as np

In [169]:
X = data_breast_cancer['data']

In [170]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [171]:
bagging_classifier = BaggingClassifier(n_estimators=30, 
                                        max_samples=0.5,
                                        bootstrap=False,
                                        max_features=2)

bagging_classifier.fit(X_train, y_train)

In [172]:
acc = [accuracy_score(y_train, bagging_classifier.predict(X_train)), accuracy_score(y_test, bagging_classifier.predict(X_test))]

In [173]:
print(acc)

[1.0, 0.9649122807017544]


In [174]:
with open('acc_fea.pkl', 'wb') as f:
    pickle.dump(acc, f)

In [175]:
with open('fea.pkl', 'wb') as f:
    pickle.dump([bagging_classifier], f)

In [176]:
print(bagging_classifier.estimators_)

[DecisionTreeClassifier(random_state=1058020768), DecisionTreeClassifier(random_state=508071529), DecisionTreeClassifier(random_state=2056129873), DecisionTreeClassifier(random_state=250387794), DecisionTreeClassifier(random_state=1488192377), DecisionTreeClassifier(random_state=438015681), DecisionTreeClassifier(random_state=1084049224), DecisionTreeClassifier(random_state=578060979), DecisionTreeClassifier(random_state=490734800), DecisionTreeClassifier(random_state=628634269), DecisionTreeClassifier(random_state=1286546163), DecisionTreeClassifier(random_state=1667351761), DecisionTreeClassifier(random_state=1429638794), DecisionTreeClassifier(random_state=227823612), DecisionTreeClassifier(random_state=156815140), DecisionTreeClassifier(random_state=902417656), DecisionTreeClassifier(random_state=721527811), DecisionTreeClassifier(random_state=621015138), DecisionTreeClassifier(random_state=416674543), DecisionTreeClassifier(random_state=735907140), DecisionTreeClassifier(random_st

In [177]:
print(bagging_classifier.estimators_features_)

[array([ 7, 20]), array([15, 14]), array([24, 15]), array([28,  0]), array([ 0, 24]), array([ 3, 16]), array([29,  0]), array([ 7, 16]), array([26, 12]), array([4, 8]), array([14,  1]), array([12, 10]), array([22, 21]), array([18, 25]), array([28,  6]), array([26,  2]), array([24,  1]), array([12, 24]), array([20,  2]), array([8, 5]), array([ 0, 25]), array([27, 28]), array([ 0, 19]), array([19,  6]), array([ 8, 22]), array([ 0, 27]), array([ 3, 12]), array([10, 15]), array([6, 0]), array([22,  0])]


In [178]:
import pandas as pd

In [179]:
train_accs = []
test_accs = []
feature_lists = []

for estimator, features in zip(bagging_classifier.estimators_, bagging_classifier.estimators_features_):
    estimator.fit(X_train, y_train)
    train_acc = accuracy_score(y_train, estimator.predict(X_train))
    test_acc = accuracy_score(y_test, estimator.predict(X_test))

    feature_names = X.columns[features].tolist()
    
    train_accs.append(train_acc)
    test_accs.append(test_acc)
    feature_lists.append(feature_names)

df = pd.DataFrame({
    "train_acc": train_accs,
    "test_acc": test_accs, 
    "features": feature_lists
})


In [180]:
df = df.sort_values(by=['train_acc', 'test_acc'], ascending=False)

In [181]:
df

Unnamed: 0,train_acc,test_acc,features
2,1.0,0.947368,"[worst smoothness, compactness error]"
4,1.0,0.947368,"[mean radius, worst smoothness]"
5,1.0,0.947368,"[mean area, concavity error]"
8,1.0,0.947368,"[worst concavity, perimeter error]"
9,1.0,0.947368,"[mean smoothness, mean symmetry]"
11,1.0,0.947368,"[perimeter error, radius error]"
14,1.0,0.947368,"[worst symmetry, mean concavity]"
16,1.0,0.947368,"[worst smoothness, mean texture]"
18,1.0,0.947368,"[worst radius, mean perimeter]"
24,1.0,0.947368,"[mean symmetry, worst perimeter]"


In [182]:
with open('acc_fea_rank.pkl', 'wb') as f:
    pickle.dump(df, f)