In [None]:
import time

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import matplotlib_inline
import seaborn as sns

from tqdm.notebook import tqdm

from src.backend import ensembles as ens

In [None]:
# Seed NumPy's global RNG up front so anything drawing from it is repeatable.
np.random.seed(42)

# Seaborn theme for all figures: 'paper' context, white grid, doubled font scale.
sns.set_theme(
    context='paper',
    style='whitegrid',
    font_scale=2,
)

# Request PDF and SVG as the inline figure formats.
matplotlib_inline.backend_inline.set_matplotlib_formats('pdf', 'svg')

# Data preparation

In [None]:
# Load the housing table, parsing 'date' as a datetime and discarding the 'id' column.
frame = pd.read_csv('kc_house_data.csv', parse_dates=['date'])
frame = frame.drop(columns='id')

# Turn the parsed dates into plain numbers so the whole feature matrix is numeric.
frame['date'] = pd.to_numeric(frame['date'])

# 'price' is the regression target; every remaining column is a feature.
target = frame['price'].to_numpy()
data = frame.drop(columns='price').to_numpy()

# 80/20 train/test split with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    train_size=0.8,
    random_state=42,
)

# Cell output: the shapes of the two feature matrices.
X_train.shape, X_test.shape

# Random Forest

In [None]:
# Tree depths to sweep: the integers 2 through 10.
depths = np.arange(start=2, stop=11)

# Five evenly spaced feature-subsample fractions between 0.1 and 1 (both included).
feature_subsample_sizes = np.linspace(start=0.1, stop=1, num=5)

In [None]:
# losses_rf[depth][feature_fraction] -> square root of the validation loss
# reported by a 100-tree RandomForestMSE fit.
# NOTE(review): the second value returned by fit() is assumed to be the validation
# loss (it is plotted against 'iteration' later) - confirm in src.backend.ensembles.
losses_rf = {}

for dpt in depths:
    per_depth = {}
    for ftr in feature_subsample_sizes:
        model = ens.RandomForestMSE(
            n_estimators=100,
            max_depth=dpt,
            feature_subsample_size=ftr,
        )
        # The first returned value is not needed here.
        _, val_loss = model.fit(X_train, y_train, X_test, y_test)
        per_depth[ftr] = np.sqrt(val_loss)
    losses_rf[dpt] = per_depth

In [None]:
# times_rf[depth][feature_fraction] -> list of wall-clock durations (seconds), one per
# ensemble size in range(5, 101, 5). Each duration covers model construction plus fit.
times_rf = {}
n_depths = len(depths)

for step, dpt in enumerate(depths, start=1):
    per_depth = {}
    for ftr in feature_subsample_sizes:
        elapsed = []
        for n_est in range(5, 101, 5):
            start_time = time.perf_counter()

            model = ens.RandomForestMSE(
                n_estimators=n_est,
                max_depth=dpt,
                feature_subsample_size=ftr,
            )
            model.fit(X_train, y_train)

            elapsed.append(time.perf_counter() - start_time)
        per_depth[ftr] = elapsed
    times_rf[dpt] = per_depth
    # Carriage return keeps the progress message on a single, overwritten line.
    print(f'iteration [{step}/{n_depths}] ended', end='\r')

In [None]:
# Grid of loss curves: one row per depth, one column per feature-subsample fraction.
n_rows, n_cols = len(depths), len(feature_subsample_sizes)
fig, ax = plt.subplots(n_rows, n_cols, figsize=(15, 18), sharex=True, sharey='row')

for row, dpt in enumerate(depths):
    for col, ftr_sub_size in enumerate(feature_subsample_sizes):
        panel = ax[row][col]
        panel.plot(losses_rf[dpt][ftr_sub_size])

        # Column titles go on the top row only.
        if row == 0:
            panel.set_title(f'ftr size = {ftr_sub_size}')

        # Row labels go on the right edge of the last column.
        if col == n_cols - 1:
            panel.yaxis.set_label_position("right")
            panel.set_ylabel(f'depth = {dpt}')

fig.supxlabel('iteration')
fig.supylabel('loss')
plt.tight_layout()
plt.show()

In [None]:
# Grid of fit-time curves: one row per depth, one column per feature-subsample fraction.
#
# Each series in times_rf holds one measurement per ensemble size, recorded in steps of
# 5 trees starting at 5 (see range(5, 101, 5) in the timing cell). The curves are drawn
# against the real ensemble sizes, so the x axis needs no hand-written tick labels and
# the cell no longer depends on `times_rf_no_dpt`, which is only created further down
# the notebook (that reference raised NameError on Restart & Run All).
n_est_step = 5

fig, ax = plt.subplots(len(depths), len(feature_subsample_sizes), figsize=(15, 20), sharex=True, sharey='row')
for i, ftr_sub_size in enumerate(feature_subsample_sizes):
    for j, dpt in enumerate(depths):
        fit_times = times_rf[dpt][ftr_sub_size]
        # k-th measurement (0-based) was taken with n_est_step * (k + 1) trees.
        n_estimators_axis = n_est_step * np.arange(1, len(fit_times) + 1)
        ax[j][i].plot(n_estimators_axis, fit_times)
        if j == 0:
            ax[j][i].set_title(f'ftr size = {ftr_sub_size}')
        if i + 1 == len(feature_subsample_sizes):
            ax[j][i].yaxis.set_label_position("right")
            ax[j][i].set_ylabel(f'depth = {dpt}')
# The x axis is shared, so setting ticks on the current axes applies to every panel.
plt.xticks(ticks=[5, 50, 100])
fig.supxlabel('iterations')
fig.supylabel('time (s)')
plt.tight_layout()
plt.show()

In [None]:
# losses_rf_no_dpt[feature_fraction] -> square root of the validation loss of a
# 100-tree RandomForestMSE built with max_depth=None.
losses_rf_no_dpt = {}

for ftr in feature_subsample_sizes:
    model = ens.RandomForestMSE(
        n_estimators=100,
        max_depth=None,
        feature_subsample_size=ftr,
    )
    # Only the second returned value (validation loss) is kept.
    _, val_loss = model.fit(X_train, y_train, X_test, y_test)
    losses_rf_no_dpt[ftr] = np.sqrt(val_loss)

In [None]:
# times_rf_no_dpt[feature_fraction] -> list of wall-clock durations (seconds) for
# constructing and fitting RandomForestMSE(max_depth=None), one entry per ensemble
# size in range(5, 101, 5).
times_rf_no_dpt = {}
n_settings = len(feature_subsample_sizes)

for step, ftr in enumerate(feature_subsample_sizes, start=1):
    iter_start = time.perf_counter()
    elapsed = []
    for n_est in range(5, 101, 5):
        start_time = time.perf_counter()

        model = ens.RandomForestMSE(
            n_estimators=n_est,
            max_depth=None,
            feature_subsample_size=ftr,
        )
        model.fit(X_train, y_train)

        elapsed.append(time.perf_counter() - start_time)
    times_rf_no_dpt[ftr] = elapsed

    # Total time spent on this feature fraction, reported on one overwritten line.
    iter_time = time.perf_counter() - iter_start
    print(f'iteration [{step}/{n_settings}] ended, time: {iter_time:.02f}', end='\r')

In [None]:
# One loss curve per feature-subsample fraction for the max_depth=None forests.
loss_fig, loss_ax = plt.subplots(figsize=(8, 5))

for ftr in feature_subsample_sizes:
    # Legend label is the fraction's string form cut to at most five characters.
    curve_label = str(ftr)[:5]
    loss_ax.plot(losses_rf_no_dpt[ftr], label=curve_label)

loss_ax.legend()
loss_ax.set_xlabel('iteration')
loss_ax.set_ylabel('loss')
plt.show()

In [None]:
# Fit-time curves for the max_depth=None forests, one per feature-subsample fraction.
time_fig, time_ax = plt.subplots(figsize=(8, 5))

for ftr in feature_subsample_sizes:
    # The final measurement of every series is left out of the plot.
    shown_times = times_rf_no_dpt[ftr][:-1]
    time_ax.plot(shown_times, label=str(ftr)[:5])

# Every second measurement index gets a tick, labelled 5, 15, ..., 95.
tick_positions = np.arange(0, len(times_rf_no_dpt[0.1]), 2)
tick_labels = np.arange(5, 101, 10)
plt.xticks(ticks=tick_positions, labels=tick_labels)

time_ax.legend()
time_ax.set_xlabel('iterations')
time_ax.set_ylabel('time (s)')
plt.show()

# Gradient Boosting

In [None]:
# Tree depths to sweep: the integers 2 through 10.
depths = np.arange(start=2, stop=11)

# Five evenly spaced feature-subsample fractions between 0.1 and 1 (both included).
feature_subsample_sizes = np.linspace(start=0.1, stop=1, num=5)

# Learning rates to sweep, one per decade from 1e-3 to 10.
lrs = [
    0.001,
    0.01,
    0.1,
    1,
    10,
]

In [None]:
# losses_gb[depth][feature_fraction][learning_rate] -> square root of the validation
# loss reported by a 100-estimator GradientBoostingMSE fit.
losses_gb = {}

for dpt in depths:
    per_depth = {}
    for ftr in feature_subsample_sizes:
        per_fraction = {}
        for lr in lrs:
            model = ens.GradientBoostingMSE(
                n_estimators=100,
                max_depth=dpt,
                feature_subsample_size=ftr,
                learning_rate=lr,
            )
            # Only the second returned value (validation loss) is kept.
            _, val_loss = model.fit(X_train, y_train, X_test, y_test)
            per_fraction[lr] = np.sqrt(val_loss)
        per_depth[ftr] = per_fraction
    losses_gb[dpt] = per_depth

In [None]:
# times_gb[depth][feature_fraction][learning_rate] -> list of wall-clock durations
# (seconds) for constructing and fitting a gradient-boosting model, one entry per
# ensemble size in range(5, 101, 5).
#
# The model timed here must be GradientBoostingMSE with the swept learning rate.
# Timing RandomForestMSE (and ignoring `lr`) would record random-forest timings under
# gradient-boosting keys, repeated once per learning rate.
times_gb = {}

for i, dpt in enumerate(depths):
    iter_start = time.perf_counter()
    times_gb[dpt] = {}
    for ftr in feature_subsample_sizes:
        times_gb[dpt][ftr] = {}
        for lr in lrs:
            times_gb[dpt][ftr][lr] = []
            for n_est in range(5, 101, 5):
                start_time = time.perf_counter()

                model = ens.GradientBoostingMSE(
                    n_estimators=n_est,
                    max_depth=dpt,
                    feature_subsample_size=ftr,
                    learning_rate=lr,
                )
                model.fit(X_train, y_train)

                times_gb[dpt][ftr][lr].append(time.perf_counter() - start_time)
    # Total time for this depth, reported on a single overwritten line.
    iter_time = time.perf_counter() - iter_start
    print(f'iteration [{i + 1}/{len(depths)}] ended, time: {iter_time:.02f}', end='\r')

In [None]:
# Grid of gradient-boosting loss curves: rows are depths, columns are feature-subsample
# fractions, and each panel has one line per learning rate (last entry of lrs skipped).
n_rows, n_cols = len(depths), len(feature_subsample_sizes)
shown_lrs = lrs[:-1]
fig, ax = plt.subplots(n_rows, n_cols, figsize=(15, 18), sharex=True, sharey='row')

for row, dpt in enumerate(depths):
    for col, ftr_sub_size in enumerate(feature_subsample_sizes):
        panel = ax[row][col]
        for lr in shown_lrs:
            panel.plot(losses_gb[dpt][ftr_sub_size][lr], label=str(lr)[:5])

        # Column titles go on the top row only.
        if row == 0:
            panel.set_title(f'ftr size = {ftr_sub_size}')

        # Row labels go on the right edge of the last column.
        if col == n_cols - 1:
            panel.yaxis.set_label_position("right")
            panel.set_ylabel(f'depth = {dpt}')

# Legend entries for the shared figure legend come from the bottom-right panel.
handles, labels = ax[n_rows - 1][n_cols - 1].get_legend_handles_labels()

fig.supxlabel('iteration')
fig.supylabel('loss')
fig.legend(handles, labels, loc='upper center', ncol=6, bbox_to_anchor=(0.5, 0), title='learning rate')
plt.tight_layout()
plt.show()

In [None]:
# Grid of gradient-boosting fit-time curves: rows are depths, columns are
# feature-subsample fractions, and each panel has one line per learning rate
# (the last entry of lrs is skipped).
#
# Each series in times_gb holds one measurement per ensemble size, recorded in steps of
# 5 estimators starting at 5 (see range(5, 101, 5) in the timing cell). The curves are
# drawn against the real ensemble sizes. Relabelling index ticks 0/10/20 as 5/52/100
# would mislabel the axis: index 10 is 55 estimators and index 20 does not exist.
n_est_step = 5

fig, ax = plt.subplots(len(depths), len(feature_subsample_sizes), figsize=(15, 18), sharex=True, sharey='row')
for i, ftr_sub_size in enumerate(feature_subsample_sizes):
    for j, dpt in enumerate(depths):
        for lr in lrs[:-1]:
            fit_times = times_gb[dpt][ftr_sub_size][lr]
            # k-th measurement (0-based) was taken with n_est_step * (k + 1) estimators.
            n_estimators_axis = n_est_step * np.arange(1, len(fit_times) + 1)
            ax[j][i].plot(n_estimators_axis, fit_times, label=str(lr)[:5])
        if j == 0:
            ax[j][i].set_title(f'ftr size = {ftr_sub_size}')
        if i + 1 == len(feature_subsample_sizes):
            ax[j][i].yaxis.set_label_position("right")
            ax[j][i].set_ylabel(f'depth = {dpt}')
        # After the loops these hold the entries of the bottom-right panel.
        handles, labels = ax[j][i].get_legend_handles_labels()
# The x axis is shared, so setting ticks on the current axes applies to every panel.
plt.xticks(ticks=[5, 50, 100])
fig.supxlabel('iteration')
fig.supylabel('time (s)')
fig.legend(handles, labels, loc='upper center', ncol=6, bbox_to_anchor=(0.5, 0), title='learning rate')
plt.tight_layout()
plt.show()

In [None]:
# losses_gb_no_dpt[feature_fraction][learning_rate] -> square root of the validation
# loss of a 50-estimator GradientBoostingMSE with max_depth=100.
# NOTE(review): max_depth=100 presumably stands in for "no depth limit" - confirm.
losses_gb_no_dpt = {}

for ftr in feature_subsample_sizes:
    per_fraction = {}
    for lr in lrs:
        model = ens.GradientBoostingMSE(
            n_estimators=50,
            max_depth=100,
            feature_subsample_size=ftr,
            learning_rate=lr,
        )
        # Only the second returned value (validation loss) is kept.
        _, val_loss = model.fit(X_train, y_train, X_test, y_test)
        per_fraction[lr] = np.sqrt(val_loss)
    losses_gb_no_dpt[ftr] = per_fraction

In [None]:
# times_gb_no_dpt[feature_fraction][learning_rate] -> list of wall-clock durations
# (seconds) for constructing and fitting GradientBoostingMSE(max_depth=100), one entry
# per ensemble size in range(5, 51, 5).
times_gb_no_dpt = {}
n_settings = len(feature_subsample_sizes)

for step, ftr in enumerate(feature_subsample_sizes, start=1):
    per_fraction = {}
    for lr in lrs:
        elapsed = []
        for n_est in range(5, 51, 5):
            start_time = time.perf_counter()

            model = ens.GradientBoostingMSE(
                n_estimators=n_est,
                max_depth=100,
                feature_subsample_size=ftr,
                learning_rate=lr,
            )
            model.fit(X_train, y_train)

            elapsed.append(time.perf_counter() - start_time)
        per_fraction[lr] = elapsed
    times_gb_no_dpt[ftr] = per_fraction
    # Carriage return keeps the progress message on a single, overwritten line.
    print(f'iteration [{step}/{n_settings}] ended', end='\r')

In [None]:
# Grid of loss curves for the max_depth=100 boosting runs: one row per learning rate
# (last entry of lrs skipped), one column per feature-subsample fraction.
shown_lrs = lrs[:-1]
n_rows, n_cols = len(lrs) - 1, len(feature_subsample_sizes)
fig, ax = plt.subplots(n_rows, n_cols, figsize=(12, 8), sharex=True, sharey='row')

for row, lr in enumerate(shown_lrs):
    for col, ftr_sub_size in enumerate(feature_subsample_sizes):
        panel = ax[row][col]
        panel.plot(losses_gb_no_dpt[ftr_sub_size][lr])

        # Column titles go on the top row only.
        if row == 0:
            panel.set_title(f'ftr size = {ftr_sub_size}')

        # Row labels go on the right edge of the last column.
        if col == n_cols - 1:
            panel.yaxis.set_label_position("right")
            panel.set_ylabel(f'lr = {lr}')

fig.supxlabel('iteration')
fig.supylabel('loss')
plt.tight_layout()
plt.show()

In [None]:
# Grid of fit-time curves for the max_depth=100 boosting runs: one row per learning
# rate, one column per feature-subsample fraction.
#
# Each series in times_gb_no_dpt holds one measurement per ensemble size, recorded in
# steps of 5 estimators starting at 5 (see range(5, 51, 5) in the timing cell). The
# curves are drawn against the real ensemble sizes. Relabelling index tick 10 as "50"
# would be off by one (50 estimators is index 9) and would place the tick past the data.
n_est_step = 5

fig, ax = plt.subplots(len(lrs), len(feature_subsample_sizes), figsize=(12, 8), sharex=True, sharey='row')
for i, ftr_sub_size in enumerate(feature_subsample_sizes):
    for j, lr in enumerate(lrs):
        fit_times = times_gb_no_dpt[ftr_sub_size][lr]
        # k-th measurement (0-based) was taken with n_est_step * (k + 1) estimators.
        n_estimators_axis = n_est_step * np.arange(1, len(fit_times) + 1)
        ax[j][i].plot(n_estimators_axis, fit_times)
        if j == 0:
            ax[j][i].set_title(f'ftr size = {ftr_sub_size}')
        if i + 1 == len(feature_subsample_sizes):
            ax[j][i].yaxis.set_label_position("right")
            ax[j][i].set_ylabel(f'lr = {lr}')
# The x axis is shared, so setting ticks on the current axes applies to every panel.
plt.xticks(ticks=[5, 50])
fig.supxlabel('iterations')
fig.supylabel('time (s)')
plt.tight_layout()
plt.show()