In [None]:
"""
Methods such as decision trees can be prone to overfitting the training set, which can in turn lead to wrong predictions on new data. BOOTSTRAP AGGREGATION (Bagging) is an ensembling method that attempts to reduce overfitting in regression and classification problems.

Bagging's objective is to improve the accuracy and performance of ML algorithms by randomly drawing subsets of the original dataset, with replacement, and then fitting either a regressor (for regression) or a classifier (for classification) to each subset.
The predictions from the models fitted on these subsets are then aggregated, via majority voting for classification or averaging for regression, thereby increasing prediction accuracy.
"""

In [None]:
# Baseline: evaluate a single decision tree (the base classifier)

# Required modules
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Wine dataset, loaded with as_frame=True so features/target keep their labels
data = datasets.load_wine(as_frame=True)
X, y = data.data, data.target

# Hold out 25% of the rows so the model is scored on data it never saw
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=22
)

# Build the base classifier and train it in one expression
# (fit() returns the fitted estimator)
dt = DecisionTreeClassifier(random_state=22).fit(X_train, y_train)

# Predict the wine class for the unseen test rows, then score both splits
y_pred = dt.predict(X_test)
train_accuracy = accuracy_score(y_true=y_train, y_pred=dt.predict(X_train))
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Train data accuracy: ", train_accuracy)
print("Test data accuracy: ", test_accuracy)


In [None]:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier

# Load the wine data and build a 75/25 train/test split with a fixed seed
data = datasets.load_wine(as_frame=True)
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=22
)

# Ensemble sizes to compare: the even numbers from 2 through 16
estimator_range = list(range(2, 17, 2))

models = []
scores = []

for n_estimators in estimator_range:
    # Train a bagging ensemble of the current size
    clf = BaggingClassifier(n_estimators=n_estimators, random_state=22)
    clf.fit(X_train, y_train)
    models.append(clf)

    # Record its accuracy on the held-out test set
    test_predictions = clf.predict(X_test)
    scores.append(accuracy_score(y_true=y_test, y_pred=test_predictions))

# Plot test accuracy against ensemble size via the explicit Axes interface
fig, ax = plt.subplots(figsize=(9, 6))
ax.plot(estimator_range, scores)

# Enlarged axis labels and tick text so the figure stays readable
ax.set_xlabel("n_estimators", fontsize=18)
ax.set_ylabel("score", fontsize=18)
ax.tick_params(labelsize=16)

# Render the figure
plt.show()

In [None]:
# By iterating through different values for the number of estimators, we can see an increase in model performance from 82.2% to 95.5%. After 14 estimators the accuracy begins to drop. Again, if you set a different random_state, the values you see will vary; that is why it is best practice to use cross-validation to ensure stable results.

In [None]:
"""
Another Form of Evaluation
As bootstrapping chooses random subsets of observations to create classifiers, some observations are left out of the selection process. These "out-of-bag" observations can then be used to evaluate the model, much like a test set. Keep in mind that out-of-bag estimation can overestimate error in binary classification problems and should only be used as a complement to other metrics.

We saw in the last exercise that 12 estimators yielded the highest accuracy, so we will use that to create our model. This time we set the parameter oob_score to True to evaluate the model with the out-of-bag score.
"""

# Bagging model scored with the out-of-bag (OOB) estimate
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier

data = datasets.load_wine(as_frame=True)
X, y = data.data, data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=22
)

# oob_score=True asks the ensemble to compute oob_score_ during fit;
# fit() returns the fitted estimator, so construction and training chain
bagging_params = {"n_estimators": 12, "oob_score": True, "random_state": 22}
oob_model = BaggingClassifier(**bagging_params).fit(X_train, y_train)

print(oob_model.oob_score_)

In [None]:
# Since the samples used in OOB and the test set are different, and the dataset is relatively small, there is a difference in the accuracy. It is rare that they would be exactly the same. Again, OOB should be used as a quick means of estimating error, but it should not be the only evaluation metric.

In [None]:
# Generating decision tree from bagging classifier

"""
It is also possible to see the individual decision trees that went into the aggregated classifier. This helps us gain a more intuitive understanding of how the bagging model arrives at its predictions.

Note: This is only functional with smaller datasets, where the trees are relatively shallow and narrow making them easy to visualize.
"""

import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import plot_tree

# Load the dataset inside this cell so it does not depend on `data` having
# been left in the kernel by a previously executed cell.
data = datasets.load_wine(as_frame=True)

X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=22)
clf = BaggingClassifier(n_estimators=12, oob_score=True, random_state=22)

clf.fit(X_train, y_train)

# estimators_ holds the fitted base estimators; draw the first one on a
# large canvas so the node text stays legible.
plt.figure(figsize=(30, 20))
# Pass the column names as a plain list: some scikit-learn releases validate
# feature_names strictly and reject a pandas Index.
# NOTE(review): this assumes the sub-estimator saw the columns in X's order;
# confirm against clf.estimators_features_[0] if labels look wrong.
plot_tree(clf.estimators_[0], feature_names=list(X.columns))

# Render the figure explicitly instead of echoing plot_tree's return value
plt.show()