In [None]:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import math
import pandas as pd
from IPython.display import clear_output


In [None]:
# Load the iris dataset and keep only its first two feature columns
# (plotted further down as sepal length and sepal width).
iris = datasets.load_iris()
X, y = iris.data[:, :2], iris.target

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
def draw_points(X, y):
    """Scatter-plot the first two columns of ``X``, coloured by ``y``.

    The current cell output is cleared first (deferred until new output is
    available), and matplotlib figure number 2 is reused and wiped on each call.

    Args:
        X: 2-D array; column 0 goes on the x axis, column 1 on the y axis.
        y: per-point values handed to ``scatter`` as the colour argument.
    """
    clear_output(wait=True)

    horizontal, vertical = X[:, 0], X[:, 1]
    margin = .5

    # Reuse one numbered figure and clear it so repeated calls redraw in place.
    plt.figure(2, figsize=(8, 6))
    plt.clf()

    plt.scatter(horizontal, vertical, c=y, cmap=plt.cm.Set1, edgecolor='k')
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')

    # Pad the visible range by half a unit on every side and hide the ticks.
    plt.xlim(horizontal.min() - margin, horizontal.max() + margin)
    plt.ylim(vertical.min() - margin, vertical.max() + margin)
    plt.xticks(())
    plt.yticks(())
    plt.show()

# Visualise the training split.
draw_points(X_train, y_train)



In [None]:
# Baseline: a single decision tree fitted on the whole training split.
# The seed fixes the tree's internal randomness so the report below is
# reproducible, in line with the seeded train/test split.
clf = DecisionTreeClassifier(random_state=42)

clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))

In [None]:
def sample_new_X_y(X, y, with_replacement=True, random_state=None):
    """Draw a random subset of rows from ``X`` together with the matching labels.

    The target sample size is ``int(sqrt(len(X)) * 5)``. Sampling with
    replacement corresponds to bagging; without replacement, to pasting.

    Args:
        X: 2-D array-like of features, one row per instance.
        y: labels aligned positionally with the rows of ``X``; must support
            indexing by an array of integer positions (e.g. a NumPy array).
        with_replacement: whether the same row may be drawn more than once.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. ``None`` keeps the unseeded behaviour.

    Returns:
        ``(X_sample, y_sample)`` where ``X_sample`` is a NumPy array of the
        drawn rows and ``y_sample`` holds the labels at the same positions.
    """
    frame = pd.DataFrame(X)
    n_rows = len(frame)
    sample_size = int(math.sqrt(n_rows) * 5)
    if not with_replacement:
        # Without replacement pandas refuses to draw more rows than exist,
        # which the sqrt-based size requests for small inputs; cap it.
        sample_size = min(sample_size, n_rows)

    X_sample = frame.sample(sample_size, replace=with_replacement, random_state=random_state)
    # The frame was built with a default positional index, so the sampled
    # index values are the row positions to pick from y.
    y_sample = y[X_sample.index]
    return X_sample.to_numpy(), y_sample

# Draw one sample from the training split (with replacement, the default) and plot it.
X_sample, y_sample = sample_new_X_y(X_train, y_train)

draw_points(X_sample, y_sample)

In [None]:
import time
from sklearn.metrics import f1_score


def get_scores_and_clf_list(model=DecisionTreeClassifier, draw=True):
    """Fit ten models on random subsets of the training split and score each one.

    Every model is built with ``model()``, trained on a sample drawn without
    replacement from the notebook-level ``X_train``/``y_train``, and evaluated
    on ``X_test``/``y_test`` with the weighted F1 score.

    Args:
        model: zero-argument callable returning an estimator with ``fit`` and
            ``predict``.
        draw: when true, plot each sample, print the classification report and
            pause two seconds per round. The "Classifier number" line is
            printed regardless.

    Returns:
        ``(scores, classifier_list)``: the list of weighted F1 scores and a
        list of ``(label, fitted_estimator)`` tuples, in training order.
    """
    fitted_models = []
    weighted_f1_scores = []
    for round_idx in range(10):
        tag = f"Classifier number {round_idx + 1}"
        estimator = model()
        X_subset, y_subset = sample_new_X_y(X_train, y_train, False)
        estimator.fit(X_subset, y_subset)

        if draw:
            draw_points(X_subset, y_subset)
        print(tag)
        test_predictions = estimator.predict(X_test)
        if draw:
            print(classification_report(y_test, test_predictions))
        weighted_f1_scores.append(f1_score(y_test, test_predictions, average='weighted'))
        fitted_models.append((tag, estimator))
        if draw:
            # Leave the plot on screen long enough to be seen before it is cleared.
            time.sleep(2)
    return weighted_f1_scores, fitted_models

# Train the ensemble with the default DecisionTreeClassifier, plotting each round.
scores, classifier_list= get_scores_and_clf_list(draw=True)
    

In [None]:
# Mean weighted F1 score across the individually trained classifiers.
sum(scores)/len(scores)

## Bagging and Pasting

In [None]:
from collections import Counter

# Hard voting: every classifier casts one vote per test instance and the most
# frequent label wins (ties go to the label that was voted first).
y_pred = []
for row_idx in range(len(y_test)):
    sample = X_test[row_idx].reshape(1, -1)
    votes = Counter(model.predict(sample)[0] for _, model in classifier_list)
    y_pred.append(votes.most_common()[0][0])

print(classification_report(y_test, y_pred))


In [None]:
# Soft voting: average the class-probability vectors of all classifiers and
# predict the column with the highest mean probability.
import numpy as np

y_pred = []
for idx in range(len(y_test)):
    sample = X_test[idx].reshape(1, -1)
    stacked_probabilities = np.vstack(
        [model.predict_proba(sample)[0] for _, model in classifier_list]
    )
    mean_probability = stacked_probabilities.mean(axis=0)

    prediction = np.argmax(mean_probability)
    # Print every 20th instance as a spot check of the averaged probabilities.
    if idx % 20 == 0:
        print(f"Prediction was {prediction} and probabilities were{mean_probability}")
    y_pred.append(prediction)

print(classification_report(y_test, y_pred))

## Stacking

In [None]:
# Rebuild the ensemble with LogisticRegression base models; this rebinds
# both scores and classifier_list from the decision-tree run.
scores, classifier_list= get_scores_and_clf_list(model=LogisticRegression, draw=True)


In [None]:
# Mean weighted F1 score across the LogisticRegression base models.
sum(scores)/len(scores)

In [None]:
import numpy as np

def get_new_X_for_stacking(X, y, classifiers=None):
    """Build stacking features from the base classifiers' class probabilities.

    For each of the first ``len(y)`` rows of ``X``, the ``predict_proba``
    outputs of all base classifiers are concatenated, in classifier order,
    into one feature vector.

    Args:
        X: 2-D array of input features, one row per instance.
        y: labels for ``X``; only their count is used here.
        classifiers: optional list of ``(label, fitted_estimator)`` tuples.
            Defaults to the notebook-level ``classifier_list``.

    Returns:
        A list with one 1-D NumPy array per instance.

    Raises:
        ValueError: if ``classifiers`` is empty while ``y`` is not.
    """
    if classifiers is None:
        # Fall back to the ensemble built earlier in the notebook.
        classifiers = classifier_list

    n_rows = len(y)
    if n_rows == 0:
        return []

    rows = np.asarray(X)[:n_rows]
    # One batched predict_proba call per classifier instead of one per row.
    probability_blocks = [estimator.predict_proba(rows) for _, estimator in classifiers]
    return list(np.hstack(probability_blocks))

In [None]:
# Fit the stacking model: a decision tree trained on the base classifiers'
# probability outputs for the training split. Note that clf is rebound here.
# NOTE(review): the base classifiers were themselves fitted on samples of
# X_train, so these features are in-sample predictions — confirm this is intended.
new_X_train = get_new_X_for_stacking(X_train, y_train)
clf = DecisionTreeClassifier()
clf.fit(new_X_train, y_train)

In [None]:
# Turn the test split into stacking features and predict with the stacking model.
y_pred = clf.predict(get_new_X_for_stacking(X_test, y_test))

In [None]:
# Per-class precision/recall/F1 of the stacked predictions on the test split.
print(classification_report(y_test, y_pred))

## Boosting

In [None]:
from sklearn.datasets import load_boston

In [None]:
# Load the Boston housing dataset used for the boosting example.
# NOTE(review): scikit-learn deprecated load_boston in 1.0 and removed it in 1.2,
# so on current releases the import of this loader fails before this line runs —
# confirm the pinned scikit-learn version or switch to a dataset that still ships.
boston = load_boston()

In [None]:
# Rebinds X (previously the iris features) to the Boston feature matrix.
X = boston.data

In [None]:
# Rebinds y (previously the iris labels) to the Boston regression target.
y = boston.target

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the regression data 70/30. The seed matches the one used for the iris
# split so the MSE values reported below are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# First boosting stage: a depth-2 regression tree (rebinds clf).
clf = DecisionTreeRegressor(max_depth=2)

In [None]:
# Fit the first stage directly on the training targets.
clf.fit(X_train, y_train)

In [None]:
# Test-split predictions of the first stage alone.
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

# Test-split mean squared error of the first stage, the baseline for the boosted model.
print(f"MSE: {mean_squared_error(y_test, y_pred)}")

In [None]:
# Residuals of the first stage on the training split: what it failed to explain.
y_train_red = clf.predict(X_train)
y_red = y_train - y_train_red

In [None]:
# Second boosting stage: another depth-2 regression tree.
clf2 = DecisionTreeRegressor(max_depth=2)

In [None]:
# Fit the second stage on the first stage's residuals rather than the raw targets.
clf2.fit(X_train, y_red)

In [None]:
# Residual corrections predicted by the second stage for the test split.
# NOTE(review): y_pred_red is not read by the later cells shown here, which
# call clf2.predict again — confirm whether it is still needed.
y_pred_red = clf2.predict(X_test)

In [None]:
# Boosted prediction: first-stage output plus the second stage's residual correction.
y_pred = clf.predict(X_test) + clf2.predict(X_test)

In [None]:
# Test-split mean squared error of the two-stage model, to compare with the single-tree MSE.
print(f"MSE: {mean_squared_error(y_test, y_pred)}")