<a href="https://colab.research.google.com/github/rajdeepbanerjee-git/JNCLectures_Intro_to_ML/blob/main/Week10/Lec10_AdaBoost_M1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- data source [link](https://www.kaggle.com/datasets/uciml/iris)
- Algorithm: Adaboost.M1, reference: "Elements of statistical learning" - Hastie, Tibshirani [e-book](https://hastie.su.domains/Papers/ESLII.pdf)

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, f1_score

from sklearn import datasets

In [2]:
iris = pd.read_csv("Iris.csv")
data = iris[(iris['Species'] == 'Iris-versicolor') | (iris['Species'] == 'Iris-virginica')]
data['Label'] = data['Species'].replace(to_replace = ['Iris-versicolor','Iris-virginica'], value=[1,-1])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Label'] = data['Species'].replace(to_replace = ['Iris-versicolor','Iris-virginica'], value=[1,-1])


In [3]:
data.drop(["Id", "Species"], axis = 1, inplace = True)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(["Id", "Species"], axis = 1, inplace = True)


Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Label
50,7.0,3.2,4.7,1.4,1
51,6.4,3.2,4.5,1.5,1
52,6.9,3.1,4.9,1.5,1
53,5.5,2.3,4.0,1.3,1
54,6.5,2.8,4.6,1.5,1


In [4]:
X = data.drop(["Label"], axis = 1)
y = data["Label"]

In [5]:
# fit basic regression tree with depth 3
base_tree = DecisionTreeClassifier(max_depth=1, random_state=0)
base_tree.fit(X, y)
y_pred = base_tree.predict(X)

# metrics
f1_score(y, y_pred)

0.9423076923076924

#### Steps:
- get bootstrap data with weights initialized/updated
- train and predict
- use misclassified observation weights to compute alpha
- update weights of misclassified observations using alpha
- repeat M times

In [6]:
rng = np.random.RandomState(42)

def get_bootstrap_data(X_train, y_train, weights):
    # gets bootstrapped data using weights supplied
    train_indices = X_train.index
    bootstrap_indices = rng.choice(train_indices,
                                  len(train_indices),
                                  replace = True,
                                  p = weights)
    X_train_bootstrap = X_train.loc[bootstrap_indices]
    y_train_bootstrap = y_train.loc[bootstrap_indices]

    return X_train_bootstrap, y_train_bootstrap, bootstrap_indices

def get_updated_weights(y, y_pred, weights):
  # updates weights for the misclassified observations
  # note, y is the data with which you will check misclassification
  misclass_weights = weights[np.where(y_pred != y)]
  err = np.sum(misclass_weights)/np.sum(weights)
  alpha = np.log((1 - err)/err)
  weights[np.where(y_pred != y)] = weights[np.where(y_pred != y)]*np.exp(alpha)
  weights[np.where(y_pred == y)] = weights[np.where(y_pred == y)]*np.exp(-1*alpha)
  norm_weights = weights/np.sum(weights) # normalize so that weight sum to 1
  return norm_weights, alpha

In [11]:
M = 5 # number of boosting trees
weights = (1/len(X))*np.ones(len(X))
y_pred_list = []
alpha_list = []
weights_list = []
#bs_ind_list = []


for m in range(M):

  weights_list.append(weights)

  # get weighted bootstrapped data
  X_bs, y_bs, _ = get_bootstrap_data(X_train = X, y_train = y, weights = weights)

  # fit and predict
  tree = DecisionTreeClassifier(max_depth=1, random_state=0)
  tree.fit(X_bs, y_bs)
  y_pred = tree.predict(X) # you have to predict on X, not X_bs

  # get misclassified observations to update weights
  new_weights, alpha = get_updated_weights(y = y, y_pred = y_pred, weights = weights)
  weights = new_weights

  # save
  y_pred_list.append(y_pred)
  alpha_list.append(alpha)
  #bs_ind_list.append(bs_ind)



In [14]:
def get_result(alpha_list, y_pred_list, M):

  temp = [alpha_list[i]*y_pred_list[i] for i in range(M)]
  temp_res = np.sum(np.stack(temp), axis = 0)
  # the sign matters!
  res = np.where(temp_res >= 0, 1, -1)

  return res

In [15]:
res = get_result(alpha_list = alpha_list, y_pred_list = y_pred_list, M = M)
f1_score(y, res)

0.970873786407767

Boosting got me from 94.2% to 97% by just 5 iterations!