# 1. Bagging

In [562]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from scipy import stats

import pandas as pd
import numpy as np

class BaggingDecisionTree:
    def __init__(self, n_estimators=10, max_depth=3):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.estimators = []
        for i in range(n_estimators):
            learner = DecisionTreeClassifier(max_depth=self.max_depth, random_state=42)
            self.estimators.append(learner)

    def train(self, X_train, y_train):
        for learner in self.estimators:
            if self.n_estimators > 1:
                inds = np.random.randint(0,len(X_train), size=300+int(len(X_train)/self.n_estimators))
                X_train_sub = X_train[inds,:]
                y_train_sub = y_train[inds]
                #print(f'ind_1={inds[0]}')
                learner.fit(X_train_sub, y_train_sub)
            else:
                #print('Fitting single learning to entire training data')
                learner.fit(X_train, y_train)

    def predict(self, X_test):
        results = []
        for i,learner in enumerate(self.estimators):
            res = learner.predict(X_test)
            results.append(res)
        
        results = np.array(results)
        results_df = pd.DataFrame(results.T)
        results_df.to_csv('weak_learner_results.csv', index=False)
        return np.round(np.mean(results,axis=0))


In [563]:
# Load dataset
cars = pd.read_csv("car_evaluation.csv")
#print(cars.describe())

# Work on non-numeric data points
cars['number of doors'] = cars['number of doors'].replace('5more', '5')
cars['number of doors'] = cars['number of doors'].astype('float')
cars['number of persons'] = cars['number of persons'].replace('more', '5')
cars['number of persons'] = cars['number of persons'].astype('float')
#print(f'cars.shape={cars.shape}')
#print(cars['buying price'].unique())
#print(cars['maintenance cost'].unique())
#print(cars['lug_boot'].unique())
#print(cars['safety'].unique())
#print(cars['decision'].unique())
#print(cars.head())

# use encoding on categorical features
enc1 = OrdinalEncoder(categories=[['vhigh', 'high', 'med', 'low']])
enc2 = OrdinalEncoder(categories=[['big', 'med', 'small']])
enc3 = OrdinalEncoder(categories=[['vgood', 'good', 'acc', 'unacc']])
cars['bp_enc'] = enc1.fit_transform(cars[['buying price']])
cars['mc_enc'] = enc1.fit_transform(cars[['maintenance cost']])
cars['lb_enc'] = enc2.fit_transform(cars[['lug_boot']])
cars['sf_enc'] = enc1.fit_transform(cars[['safety']])
cars['dec_enc'] = enc3.fit_transform(cars[['decision']])
cars = cars.drop(['buying price', 'maintenance cost', 'lug_boot', 'safety', 'decision'], axis=1)
cars = cars.reset_index(drop=True)
#print(cars.head())
#print(cars.describe())
cars = cars.to_numpy()


In [564]:
## Train and test the decision trees

#print(cars.describe())
np.random.seed = 42
X = cars[:,:6]
#print(f'{X.shape}, {np.unique(X[:,0])}, {np.unique(X[:,1])}, {np.unique(X[:,2])}, {np.unique(X[:,3])}, {np.unique(X[:,4])}, {np.unique(X[:,5])}')
y = cars[:,6]
#print(f'{y.shape}, {np.unique(y)}')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8, stratify=y, random_state=42)

#print(f'X_train.shape={X_train.shape}')

# 

# Try different configurations of trees
for n_est, dep in zip([1, 10],[3, 5]):
    tree = BaggingDecisionTree(n_estimators=n_est, max_depth=dep)
    tree.train(X_train, y_train)
    results = tree.predict(X_test)
    accuracy = accuracy_score(results, y_test)
    r2_score1 = r2_score(results, y_test)
    rms_error = np.sqrt(mean_squared_error(results, y_test))
    print(f'n_estimators={n_est}, max_depth={dep}, accuracy={accuracy:.4f}, r2={r2_score1:.4f}, rms={rms_error:.4f}')

print(f'When we use n_estimators=1 (single tree for the entire dataset), the performance is not very good.')
print('As we increase the number of estimators and the depth of the tree, the performance of the ensemble improves.')





n_estimators=1, max_depth=3, accuracy=0.7977, r2=-0.7880, rms=0.5613
n_estimators=10, max_depth=5, accuracy=0.8873, r2=0.7272, rms=0.3914
When we use n_estimators=1 (single tree for the entire dataset), the performance is not very good.
As we increase the number of estimators and the depth of the tree, the performance of the ensemble improves.


# 2. Boosting

In [565]:

class BoostedDecisionTree:
    def __init__(self, n_estimators=10, max_depth=1):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.estimators = []
        self.alphas = []
        self.num_classes = None
        for i in range(n_estimators):
            learner = DecisionTreeClassifier(max_depth=self.max_depth, random_state=42)
            self.estimators.append(learner)

    def train(self, X_train, y_train):
        self.num_classes = len(np.unique(y_train))
        weights = np.ones_like(y_train)
        weights /= len(y_train)
        for learner in self.estimators:    
            learner.fit(X_train, y_train, sample_weight=weights)
            y_pred = learner.predict(X_train)
            errors = (y_pred != y_train).astype(int)
            #print(f'y_pred={y_pred[10:20]}')
            #print(f'y_train={y_train[10:20]}')
            #print(f'errors={errors[10:20]}')
            epsilon_t = np.sum(weights*errors)/np.sum(weights)
            alpha_t = 0.5*np.log((1-epsilon_t)/epsilon_t) + np.log(3)
            self.alphas.append(alpha_t)
            weights *= (1 + errors * np.exp(alpha_t))
            weights /= np.sum(weights)
            
    def predict(self, X_test):
        results = np.zeros(shape=(len(X_test), self.num_classes))
        
        for l, learner in enumerate(self.estimators):
            res = learner.predict(X_test)
            for i in range(len(X_test)):
                results[i, int(res[i])] += self.alphas[l]
        
        results_df = pd.DataFrame(results.T)
        results_df.to_csv('weak_learner_results.csv', index=False)
        return np.argmax(results, axis=1)


In [566]:
#print(cars.describe())
np.random.seed = 42
X = cars[:,:6]
#print(f'{X.shape}, {np.unique(X[:,0])}, {np.unique(X[:,1])}, {np.unique(X[:,2])}, {np.unique(X[:,3])}, {np.unique(X[:,4])}, {np.unique(X[:,5])}')
y = cars[:,6]
#print(f'{y.shape}, {np.unique(y)}')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8, stratify=y, random_state=42)

#print(f'X_train.shape={X_train.shape}')

# 
for n_est, dep in zip([10, 20, 20],[1, 1, 3]):
    tree = BoostedDecisionTree(n_estimators=n_est, max_depth=dep)
    tree.train(X_train, y_train)
    results = tree.predict(X_test)
    accuracy = accuracy_score(results, y_test)
    r2_score1 = r2_score(results, y_test)
    rms_error = np.sqrt(mean_squared_error(results, y_test))
    print(f'n_estimators={n_est}, max_depth={dep}, accuracy={accuracy:.4f}, r2={r2_score1:.4f}, rms={rms_error:.4f}')

print('As seen in the scores above, as we increase the number of estimators and the depth of trees, the accuracy of the ensemble improves')

n_estimators=10, max_depth=1, accuracy=0.8006, r2=-1.0284, rms=0.5840
n_estimators=20, max_depth=1, accuracy=0.7775, r2=0.3862, rms=0.5638
n_estimators=20, max_depth=3, accuracy=0.9711, r2=0.9161, rms=0.2150
As seen in the scores above, as we increase the number of estimators and the depth of trees, the accuracy of the ensemble improves
