In [213]:
from sklearn import datasets
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

X = data_breast_cancer['data'][['mean texture', 'mean symmetry']]
y = data_breast_cancer['target']

In [214]:
from sklearn.model_selection import train_test_split

In [215]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [169]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [170]:
from sklearn.ensemble import VotingClassifier

In [171]:
tree_clf = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier()
reg = LogisticRegression()

In [172]:
eclf_hard = VotingClassifier(estimators=[('tree_clf', tree_clf), ('knn_clf', knn_clf), ('reg', reg)], voting='hard')

In [173]:
eclf_soft = VotingClassifier(estimators=[('tree_clf', tree_clf), ('knn_clf', knn_clf), ('reg', reg)], voting='soft')

In [174]:
eclf_hard.fit(X_train, y_train)

In [175]:
eclf_soft.fit(X_train, y_train)

In [176]:
tree_clf.fit(X_train, y_train)

In [177]:
knn_clf.fit(X_train, y_train)

In [178]:
reg.fit(X_train, y_train)

In [179]:
from sklearn.metrics import accuracy_score

In [180]:
classifiers = [eclf_hard, eclf_soft, tree_clf, knn_clf, reg]

In [181]:
acc = []

In [182]:
for classifier in classifiers:
    clf_train_acc = accuracy_score(y_train, classifier.predict(X_train))
    clf_test_acc = accuracy_score(y_test, classifier.predict(X_test))
    acc.append((clf_train_acc, clf_test_acc))

In [183]:
print(acc)

[(0.8417582417582418, 0.7368421052631579), (0.9582417582417583, 0.6929824561403509), (1.0, 0.6403508771929824), (0.7604395604395604, 0.7192982456140351), (0.701098901098901, 0.7017543859649122)]


In [184]:
import pickle

In [185]:
with open('acc_vote.pkl', 'wb') as f:
    pickle.dump(acc, f)

In [186]:
with open('vote.pkl', 'wb') as f:
    pickle.dump(classifiers, f)

In [187]:
from sklearn.ensemble import BaggingClassifier 

In [188]:
bg_clf = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=30)

In [189]:
bg_clf.fit(X_train, y_train)

In [190]:
bg_clf_50 = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=30, max_samples=0.5)

In [191]:
bg_clf_50.fit(X_train, y_train)

In [192]:
p_clf = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=30, bootstrap=False)

In [193]:
p_clf.fit(X_train, y_train)

In [194]:
p_clf_50 = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=30, max_samples=0.5, bootstrap=False)

In [195]:
p_clf_50.fit(X_train, y_train)

In [196]:
from sklearn.ensemble import RandomForestClassifier

In [197]:
rf_clf = RandomForestClassifier(n_estimators=30)

In [198]:
rf_clf.fit(X_train, y_train)

In [199]:
from sklearn.ensemble import AdaBoostClassifier

In [200]:
ab_clf = AdaBoostClassifier(n_estimators=30)

In [201]:
ab_clf.fit(X_train, y_train)

In [202]:
from sklearn.ensemble import GradientBoostingRegressor

In [203]:
gb_clf = GradientBoostingRegressor(n_estimators=30)

In [234]:
gb_clf.fit(X_train, y_train)

In [235]:
classifiers = [bg_clf, bg_clf_50, p_clf, p_clf_50, rf_clf, ab_clf, gb_clf]

In [206]:
acc = []

In [233]:
for classifier in classifiers:
    train_acc = accuracy_score(y_train, classifier.predict(X_train))
    test_acc = accuracy_score(y_test, classifier.predict(X_test))
    acc.append((train_acc, test_acc))

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [43]:
print(acc)

[(0.9978021978021978, 0.6666666666666666), (0.9120879120879121, 0.6754385964912281), (1.0, 0.631578947368421), (0.9472527472527472, 0.6666666666666666), (0.9956043956043956, 0.631578947368421), (0.8131868131868132, 0.7456140350877193)]


In [44]:
with open('acc_bag.pkl', 'wb') as f:
    pickle.dump(acc, f)

In [45]:
with open('bag.pkl', 'wb') as f:
    pickle.dump(classifiers, f)

In [46]:
import numpy as np

In [125]:
X = data_breast_cancer['data']

In [126]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [127]:
bagging_classifier = BaggingClassifier(base_estimator=DecisionTreeClassifier(), 
                                        n_estimators=30, 
                                        max_samples=0.5,
                                        bootstrap=False,
                                        max_features=2)

bagging_classifier.fit(X_train, y_train)



In [128]:
acc = (accuracy_score(y_train, bagging_classifier.predict(X_train)), accuracy_score(y_test, bagging_classifier.predict(X_test)))

In [129]:
print(acc)

(0.9978021978021978, 0.9736842105263158)


In [130]:
with open('acc_fea.pkl', 'wb') as f:
    pickle.dump(acc, f)

In [131]:
with open('fea.pkl', 'wb') as f:
    pickle.dump(bagging_classifier, f)

In [132]:
print(bagging_classifier.estimators_)

[DecisionTreeClassifier(random_state=2066532162), DecisionTreeClassifier(random_state=1362159568), DecisionTreeClassifier(random_state=337119540), DecisionTreeClassifier(random_state=1192344231), DecisionTreeClassifier(random_state=704962033), DecisionTreeClassifier(random_state=2002291383), DecisionTreeClassifier(random_state=480854749), DecisionTreeClassifier(random_state=1353383792), DecisionTreeClassifier(random_state=1810529906), DecisionTreeClassifier(random_state=1773893347), DecisionTreeClassifier(random_state=1151765569), DecisionTreeClassifier(random_state=144373594), DecisionTreeClassifier(random_state=985873361), DecisionTreeClassifier(random_state=849897426), DecisionTreeClassifier(random_state=111347471), DecisionTreeClassifier(random_state=672981911), DecisionTreeClassifier(random_state=1643240292), DecisionTreeClassifier(random_state=1067377575), DecisionTreeClassifier(random_state=462263190), DecisionTreeClassifier(random_state=759850378), DecisionTreeClassifier(random

In [133]:
print(bagging_classifier.estimators_features_)

[array([ 5, 22]), array([17,  1]), array([25, 11]), array([17, 16]), array([ 1, 11]), array([ 4, 18]), array([20,  2]), array([23,  9]), array([12, 21]), array([16, 20]), array([5, 9]), array([20, 18]), array([19, 18]), array([27,  1]), array([19, 14]), array([28,  5]), array([18,  3]), array([28, 14]), array([19,  4]), array([11,  8]), array([28,  7]), array([ 6, 20]), array([18, 23]), array([ 1, 26]), array([28, 14]), array([21,  2]), array([23,  1]), array([14,  9]), array([25, 27]), array([24, 26])]


In [152]:
import pandas as pd

In [153]:
train_accs = []
test_accs = []
feature_lists = []

for estimator, features in zip(bagging_classifier.estimators_, bagging_classifier.estimators_features_):
    estimator.fit(X_train, y_train)
    train_acc = accuracy_score(y_train, estimator.predict(X_train))
    test_acc = accuracy_score(y_test, estimator.predict(X_test))

    feature_names = X.columns[features].values.tolist()
    
    train_accs.append(train_acc)
    test_accs.append(test_acc)
    feature_lists.append(feature_names)

df = pd.DataFrame({
    "train_acc": train_accs,
    "test_acc": test_accs, 
    "features": feature_lists
})


In [157]:
df = df.sort_values(by=['train_acc', 'test_acc'])

In [162]:
with open('acc_fea_rank.pkl', 'wb') as f:
    pickle.dump(df, f)