# **B**ootstrap **Agg**regation

In [44]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import random

from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [45]:
# Synthetic classification dataset; random_state pins the generated data.
n_points = 10000
X, y = make_classification(
  n_samples=n_points,
  random_state=23,
)
# Row positions 0..len(X)-1: the pool that the bootstrap samples are drawn from.
data_indeces = np.arange(X.shape[0])

## Bagging from scratch

In [46]:
def pick_a_num(X):
  """Return one element of the sequence ``X`` chosen uniformly at random.

  Every position has probability ``1 / len(X)``. The previous
  ``round(random.random() * (len(X) - 1))`` form was not uniform: rounding
  gives the first and last positions only half the probability of the
  interior positions.

  Args:
    X: an indexable sequence supporting ``len()`` (list, NumPy array, ...).

  Returns:
    The element ``X[i]`` for a uniformly drawn index ``i``.

  Raises:
    IndexError: if ``X`` is empty (same exception type as plain indexing).
  """
  if len(X) == 0:
    raise IndexError("cannot pick from an empty sequence")
  return X[random.randrange(len(X))]

def resample(X, n):
  """Draw ``n`` elements from ``X`` with replacement and return them as a list.

  Each element comes from its own ``pick_a_num(X)`` call, so the same element
  may appear more than once in the result.
  """
  return [pick_a_num(X) for _ in range(n)]

In [47]:
n_estimators = 1000 # no. of base models
m = 700 # no. of samples drawn from X,y for each base model

# Seed the stdlib RNG that resample() draws from, so Restart & Run All
# reproduces the same bootstrap samples (and the same accuracies below).
random.seed(23)

## Draw 'n_estimators' bootstrap samples (with replacement) of row indices and
## record, for each one, the out-of-bag indices that were never drawn.
base_samples_list = []; oob_samples_list = []
for k in range(n_estimators):
  base_samples = resample(data_indeces, m)
  # Set membership is O(1); testing against the list itself cost
  # len(data_indeces) * m comparisons for every estimator.
  in_bag = set(base_samples)
  oob_samples = [i for i in data_indeces if i not in in_bag]

  base_samples_list.append(base_samples)
  oob_samples_list.append(oob_samples)

## Train one Decision Tree per bootstrap sample and score it on its own
## in-bag (train) rows and on its out-of-bag rows.
train_acc_list, oob_acc_list = [], []
for k in range(n_estimators):
  in_bag_rows = base_samples_list[k]
  oob_rows = oob_samples_list[k]
  X_train, y_train = X[in_bag_rows, :], y[in_bag_rows]
  X_oob, y_oob = X[oob_rows, :], y[oob_rows]

  ## Fit a depth-3 tree on this resampled dataset
  clf = DecisionTreeClassifier(max_depth=3, random_state=12).fit(X_train, y_train)

  ## Accuracy on the rows used for fitting vs. the out-of-bag rows
  y_pred_train = clf.predict(X_train)
  y_pred_oob = clf.predict(X_oob)
  train_acc_list.append(accuracy_score(y_train, y_pred_train))
  oob_acc_list.append(accuracy_score(y_oob, y_pred_oob))

## These are averages of per-tree accuracies; the trees' predictions are not
## combined into a single ensemble vote here.
train_mean_accuracy = np.mean(train_acc_list)
oob_mean_accuracy = np.mean(oob_acc_list)

print(f" Train accuracy: {train_mean_accuracy}\n OOB accuracy: {oob_mean_accuracy}")

 Train accuracy: 0.9590642857142857
 OOB accuracy: 0.9413963496621757


## Comparison with Sklearn's RandomForestClassifier

In [48]:
from sklearn.ensemble import RandomForestClassifier

In [49]:
## Fit a random forest on the full dataset with out-of-bag scoring enabled.
# NOTE(review): the number of trees and the per-tree sample size are left at
# the library defaults, so this is not configured like the scratch version
# (1000 trees, 700 samples each) — confirm the comparison is meant that way.
clf = RandomForestClassifier(
  max_depth=3,
  oob_score=True,
  random_state=0,
).fit(X, y)

## Accuracy on the data the forest was fitted on, and the forest's OOB score
y_pred = clf.predict(X)
train_accuracy_sklearn = accuracy_score(y, y_pred)
oob_accuracy_sklearn = clf.oob_score_

print(f"sklearn's:\n Train accuracy: {train_accuracy_sklearn}\n OOB accuracy: {oob_accuracy_sklearn}")

sklearn's:
 Train accuracy: 0.9509
 OOB accuracy: 0.9499
