## Boosting
**Aim:** Implement boosting using the AdaBoost and Gradient Boosting approaches.

In [None]:
#Importing libraries
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Load the bundled iris dataset and pull out the feature matrix and class labels
dataset = load_iris()
x, y = dataset.data, dataset.target

In [None]:
#Checking the dimensions of the feature matrix and the target vector before splitting
print(x.shape, y.shape)

(150, 4) (150,)


In [None]:
# Hold out 25% of the samples for testing; the fixed seed keeps the split repeatable
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=47,
)

In [None]:
#Verifying the sizes of the train/test partitions produced by the 75/25 split
print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)

(112, 4) (38, 4) (112,) (38,)


In [None]:
# AdaBoost classifier with 50 boosting rounds (estimators).
# random_state is pinned so the fitted ensemble and its reported accuracy are
# reproducible across Restart & Run All, consistent with the seeded
# train_test_split and GradientBoostingClassifier calls in this notebook.
model = AdaBoostClassifier(n_estimators = 50, random_state = 0)
# Display the full hyper-parameter set so the defaults in effect are visible
model.get_params()

{'algorithm': 'deprecated',
 'estimator': None,
 'learning_rate': 1.0,
 'n_estimators': 50,
 'random_state': None}

In [None]:
#Training the AdaBoost classifier on the iris training split
model.fit(xtrain, ytrain)

In [None]:
# Predict the held-out iris samples and report the fraction classified correctly
ypred = model.predict(xtest)
print("AdaBoost Model accuracy: ", accuracy_score(y_true=ytest, y_pred=ypred))

AdaBoost Model accuracy:  0.9473684210526315


### Gradient Boosting

In [None]:
# Load the train and test CSVs (expected in the working directory), train first
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))
# Preview the first rows of the test frame
print(test_data.head())

   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0      1      1  3101298  12.2875   NaN        S  


In [None]:
# Columns excluded from modelling; the same list is removed from both frames
# so train and test keep an identical feature layout
drop_columns = ['Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']
train_data.drop(columns=drop_columns, inplace=True)
test_data.drop(columns=drop_columns, inplace=True)

In [None]:
# Encode the categorical 'Sex' column as integers.
# The encoder is fitted on the training data only and then re-used for the
# test data, so both frames share exactly the same label -> integer mapping.
# Re-fitting on the test set could silently produce a different mapping if
# its set of categories differed; with transform() an unseen category raises
# a ValueError instead of being mis-encoded.
lb = LabelEncoder()
train_data['Sex'] = lb.fit_transform(train_data['Sex'])
test_data['Sex'] = lb.transform(test_data['Sex'])

In [None]:
# Separate the target from the features: pop() removes 'Survived' from
# train_data in place and returns it, leaving only feature columns behind
ytrain = train_data.pop('Survived')
xtrain = train_data

In [None]:
# Carve a 30% validation set out of the training rows (seeded for repeatability).
# NOTE(review): xtrain/ytrain are overwritten by their own subset here, so
# re-running this cell alone shrinks the training data again.
xtrain, xval, ytrain, yval = train_test_split(
    xtrain,
    ytrain,
    test_size=0.30,
    random_state=12,
)

In [None]:
# Candidate learning rates to compare
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

# Fit one small gradient-boosting ensemble per learning rate and report
# training vs. validation accuracy so the best setting can be picked
for lr in lr_list:
    gb = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=lr,
        max_features=2,
        max_depth=2,
        random_state=0,
    )
    gb.fit(xtrain, ytrain)
    print(f"Learning Rate:{lr:.3f}\tAccuracyScore (Training):{gb.score(xtrain, ytrain):.3f}\tAccuracyScore (Validation):{gb.score(xval, yval):.3f}")

Learning Rate:0.050	AccuracyScore (Training):0.804	AccuracyScore (Validation):0.739
Learning Rate:0.075	AccuracyScore (Training):0.822	AccuracyScore (Validation):0.731
Learning Rate:0.100	AccuracyScore (Training):0.815	AccuracyScore (Validation):0.761
Learning Rate:0.250	AccuracyScore (Training):0.841	AccuracyScore (Validation):0.757
Learning Rate:0.500	AccuracyScore (Training):0.865	AccuracyScore (Validation):0.795
Learning Rate:0.750	AccuracyScore (Training):0.878	AccuracyScore (Validation):0.780
Learning Rate:1.000	AccuracyScore (Training):0.881	AccuracyScore (Validation):0.746


In [None]:
# Final gradient-boosting model: same configuration as the sweep, with the
# learning rate (0.5) that gave the highest validation accuracy
gb2 = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.5,
    max_features=2,
    max_depth=2,
    random_state=0,
)
gb2.fit(xtrain, ytrain)

In [None]:
# Score the final model on the validation rows.
# NOTE(review): xval was also used to choose the learning rate, so these
# figures are likely optimistic compared with truly unseen data.
predictions = gb2.predict(xval)

# Report raw confusion counts first, then per-class precision/recall/F1
print("Confusion matrix", confusion_matrix(y_true=yval, y_pred=predictions))
print(classification_report(y_true=yval, y_pred=predictions))

Confusion matrix [[149  12]
 [ 43  64]]
              precision    recall  f1-score   support

           0       0.78      0.93      0.84       161
           1       0.84      0.60      0.70       107

    accuracy                           0.79       268
   macro avg       0.81      0.76      0.77       268
weighted avg       0.80      0.79      0.79       268

