# Exploring Classification using AdaBoost

In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the classification table and the airfare feature table from the
# shared Data directory (paths are relative to this notebook).
classification_data, airline_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../Data/classification_data.csv",
        "../../Data/new_airfare_features.csv",
    )
)

In [3]:
# Clean both tables and join them on calendar date.
# Every step rebinds a name instead of mutating in place, so re-running this
# cell does not raise KeyError on columns that an earlier run already dropped.

# Drop the "Unnamed: 0" column if it is present; errors="ignore" makes a
# second run a no-op.
classification_data = classification_data.drop(columns="Unnamed: 0", errors="ignore")

# Split DDATE into year / month / day columns so they can be matched against
# the PICKUP_DATE_* columns. The guard skips this once DDATE has been replaced.
if "DDATE" in airline_features.columns:
    ddate = pd.to_datetime(airline_features["DDATE"], format="%Y-%m-%d")
    airline_features = airline_features.assign(
        DDATE_YEAR=ddate.dt.year,
        DDATE_MONTH=ddate.dt.month,
        DDATE_DATE=ddate.dt.day,
    ).drop(columns="DDATE")

# The left join keeps every classification row; rows containing any missing
# value (including rows whose date has no airfare match) are then removed.
all_classification_data = pd.merge(
    classification_data,
    airline_features,
    how="left",
    left_on=["PICKUP_DATE_YEAR", "PICKUP_DATE_MONTH", "PICKUP_DATE_DATE"],
    right_on=["DDATE_YEAR", "DDATE_MONTH", "DDATE_DATE"],
).dropna()

In [4]:
# Preview the first five rows of the merged table.
all_classification_data.head(n=5)

Unnamed: 0,AVG_BEFORE_DAYS,HOLIDAY,HOLIDAY AFTER,HOLIDAY BEFORE,WEEKEND,CHANGE,PICKUP_DATE_YEAR,PICKUP_DATE_MONTH,PICKUP_DATE_DATE,OUTSIPP_economy,...,price_per_mile,total_fluc_50,total_fluc_70,total_fluc_120,avg_price,pct_change,trend,DDATE_YEAR,DDATE_MONTH,DDATE_DATE
37,3.137476,0,0,0,0,Decrease,2018,2,1,0,...,0.104127,0.0,0.0,0.0,0.104127,0.0,Decrease,2018.0,2.0,1.0
38,-0.16588,0,0,0,0,Decrease,2018,2,1,0,...,0.104127,0.0,0.0,0.0,0.104127,0.0,Decrease,2018.0,2.0,1.0
39,-1.185111,0,0,0,0,No change,2018,2,1,0,...,0.104127,0.0,0.0,0.0,0.104127,0.0,Decrease,2018.0,2.0,1.0
40,-1.144296,0,0,0,0,Increase,2018,2,1,0,...,0.104127,0.0,0.0,0.0,0.104127,0.0,Decrease,2018.0,2.0,1.0
41,0.25812,0,0,0,0,Decrease,2018,2,1,0,...,0.104127,0.0,0.0,0.0,0.104127,0.0,Decrease,2018.0,2.0,1.0


# Machine Learning

In [5]:
# Build the feature matrix / target and split the rows chronologically:
# the first TRAIN_FRACTION of rows train the model, the remainder test it.
TRAIN_FRACTION = 0.7

# Columns excluded from the model inputs. The DDATE_* join keys equal the
# PICKUP_DATE_* columns on every surviving row, so they carry no extra signal.
drop_columns = ["DDATE_YEAR", "DDATE_MONTH", "DDATE_DATE", "trend", "price_per_mile", "avg_price"]
ml_data = all_classification_data.drop(columns=drop_columns)
classification_columns = [col for col in ml_data.columns if col != "CHANGE"]
X = ml_data[classification_columns]
y = ml_data["CHANGE"]

# Compute the boundary once and slice by position with .iloc, so the split
# never depends on the (non-contiguous) integer index labels left by dropna.
split_at = int(len(X) * TRAIN_FRACTION)
X_train, X_test = X.iloc[:split_at], X.iloc[split_at:]
y_train, y_test = y.iloc[:split_at], y.iloc[split_at:]

# Checking correct sort order for train test splits
# Periods are encoded as YYYYMM integers (e.g. 201907).
train_period = X_train["PICKUP_DATE_YEAR"] * 100 + X_train["PICKUP_DATE_MONTH"]
test_period = X_test["PICKUP_DATE_YEAR"] * 100 + X_test["PICKUP_DATE_MONTH"]
print("Min train date:", train_period.min())
print("Max train date:", train_period.max())
print()
print("Min test date:", test_period.min())
print("Max test date:", test_period.max())

# Enforce the check instead of relying on someone reading the printout: a
# positional split is only a time-based split if the rows are in date order.
assert train_period.max() <= test_period.min(), "rows are not in chronological order"

Min train date: 201802
Max train date: 201907

Min test date: 201907
Max test date: 202001


In [6]:
# imports
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import (accuracy_score, roc_auc_score, balanced_accuracy_score, classification_report
                             ,roc_curve, auc)

In [7]:
# Baseline: a shallow random forest (100 trees, depth 4) with a fixed seed.
# max_features="sqrt" is the setting that "auto" selected for classifiers;
# "auto" was deprecated in scikit-learn 1.1 and removed in 1.3, so spelling it
# out keeps the same model while allowing the cell to run on newer versions.
best_model = RandomForestClassifier(random_state=1, criterion="gini", max_depth=4, max_features="sqrt",
                                    n_estimators=100)
best_model.fit(X_train, y_train)

# Predict once per split and reuse the result for both metrics.
train_pred = best_model.predict(X_train)
pred = best_model.predict(X_test)

print("Training Accuracy for Random Forest", accuracy_score(y_train, train_pred))
print("Test Accuracy for Random Forest", accuracy_score(y_test, pred))
print()
print("Training Classification report\n", classification_report(y_train, train_pred))
print()
print("Test Classification report\n", classification_report(y_test, pred))

Training Accuracy for Random Forest 0.5517594991364422
Test Accuracy for Random Forest 0.5159299836292658

Training Classification report
               precision    recall  f1-score   support

    Decrease       0.64      0.36      0.47      5689
    Increase       0.71      0.42      0.53      5682
   No change       0.48      0.81      0.60      7157

    accuracy                           0.55     18528
   macro avg       0.61      0.53      0.53     18528
weighted avg       0.60      0.55      0.54     18528


Test Classification report
               precision    recall  f1-score   support

    Decrease       0.53      0.16      0.25      2509
    Increase       0.47      0.29      0.36      1925
   No change       0.52      0.90      0.66      3507

    accuracy                           0.52      7941
   macro avg       0.51      0.45      0.42      7941
weighted avg       0.51      0.52      0.46      7941



In [8]:
# Print each feature's importance in the fitted random forest, rounded to two
# decimals. The loop names avoid `x` / `y` so the target Series `y` built in
# the train/test split cell is not overwritten by a float.
for feature_name, importance in zip(X_train.columns.tolist(), best_model.feature_importances_):
    print(feature_name, round(importance, 2))

AVG_BEFORE_DAYS 0.12
HOLIDAY 0.0
HOLIDAY AFTER 0.0
HOLIDAY BEFORE 0.0
WEEKEND 0.0
PICKUP_DATE_YEAR 0.03
PICKUP_DATE_MONTH 0.34
PICKUP_DATE_DATE 0.09
OUTSIPP_economy 0.01
OUTSIPP_luxury 0.01
OUTSIPP_midrange 0.0
OUTSIPP_premium 0.0
total_fluc_50 0.11
total_fluc_70 0.07
total_fluc_120 0.14
pct_change 0.07


In [9]:
%%time
ada =  AdaBoostClassifier(base_estimator = best_model)
ada.fit(X_train,y_train)
ada_pred = ada.predict(X_test)

print("Training Accuracy for AdaBoost",accuracy_score(y_train,ada.predict(X_train)))
print("Test Accuracy for AdaBoost",accuracy_score(y_test,ada_pred))
print()
print("Training Classification report\n", classification_report(y_train,ada.predict(X_train)))
print()
print("Test Classification report\n", classification_report(y_test, ada_pred))

Training Accuracy for AdaBoost 0.7290587219343696
Test Accuracy for AdaBoost 0.43029845107669057

Training Classification report
               precision    recall  f1-score   support

    Decrease       0.74      0.71      0.73      5689
    Increase       0.81      0.71      0.75      5682
   No change       0.67      0.76      0.71      7157

    accuracy                           0.73     18528
   macro avg       0.74      0.73      0.73     18528
weighted avg       0.74      0.73      0.73     18528


Test Classification report
               precision    recall  f1-score   support

    Decrease       0.49      0.22      0.31      2509
    Increase       0.32      0.64      0.42      1925
   No change       0.56      0.46      0.51      3507

    accuracy                           0.43      7941
   macro avg       0.46      0.44      0.41      7941
weighted avg       0.48      0.43      0.42      7941

CPU times: user 46.2 s, sys: 455 ms, total: 46.7 s
Wall time: 47 s


In [10]:
%%time
ada =  AdaBoostClassifier(base_estimator = best_model, learning_rate=0.05)
ada.fit(X_train,y_train)
ada_pred = ada.predict(X_test)

print("Training Accuracy for AdaBoost",accuracy_score(y_train,ada.predict(X_train)))
print("Test Accuracy for AdaBoost",accuracy_score(y_test,ada_pred))
print()
print("Training Classification report\n", classification_report(y_train,ada.predict(X_train)))
print()
print("Test Classification report\n", classification_report(y_test, ada_pred))

Training Accuracy for AdaBoost 0.5933182210708118
Test Accuracy for AdaBoost 0.5096335474121647

Training Classification report
               precision    recall  f1-score   support

    Decrease       0.66      0.47      0.55      5689
    Increase       0.74      0.49      0.59      5682
   No change       0.52      0.78      0.62      7157

    accuracy                           0.59     18528
   macro avg       0.64      0.58      0.59     18528
weighted avg       0.63      0.59      0.59     18528


Test Classification report
               precision    recall  f1-score   support

    Decrease       0.52      0.16      0.25      2509
    Increase       0.44      0.31      0.36      1925
   No change       0.52      0.87      0.65      3507

    accuracy                           0.51      7941
   macro avg       0.49      0.45      0.42      7941
weighted avg       0.50      0.51      0.45      7941

CPU times: user 46.2 s, sys: 433 ms, total: 46.7 s
Wall time: 47.4 s


In [11]:
%%time
ada_grid = AdaBoostClassifier(base_estimator = best_model)

# Setting grid parameters
# loss can only be deviance as exponential takes only 2 classes
learning_rate = [1, 0.5, 0.25, 0.1, 0.05, 0.01]
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200, 400]
algorithm = ["SAMME", "SAMME.R"]


parameters = dict(learning_rate = learning_rate, n_estimators = n_estimators,
                 algorithm = algorithm)

gridF = GridSearchCV(estimator = ada_grid, param_grid = parameters, cv = 5, verbose = 1, scoring = "accuracy",
                      n_jobs = -1)
gridF.fit(X_train, y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 33.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 84.4min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed: 354.9min finished


CPU times: user 5.43 s, sys: 605 ms, total: 6.03 s
Wall time: 5h 54min 52s


GridSearchCV(cv=5, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=RandomForestClassifier(bootstrap=True,
                                                                                ccp_alpha=0.0,
                                                                                class_weight=None,
                                                                                criterion='gini',
                                                                                max_depth=4,
                                                                                max_features='auto',
                                                                                max_leaf_nodes=None,
                                                                                max_samples=None,
                                                                                min_impurity_decrease=0.0,
             

In [12]:
# Hyper-parameter combination that achieved the best mean cross-validated
# accuracy in the grid search.
gridF.best_params_

{'algorithm': 'SAMME', 'learning_rate': 0.01, 'n_estimators': 1}

In [14]:
#Fitting the best model
best_ada = AdaBoostClassifier(base_estimator = best_model, algorithm="SAMME", learning_rate=0.01, n_estimators=1)
best_ada.fit(X_train,y_train)
best_ada_pred = best_ada.predict(X_test)

print("Training Accuracy for AdaBoost",accuracy_score(y_train,best_ada.predict(X_train)))
print("Test Accuracy for AdaBoost",accuracy_score(y_test,best_ada_pred))
print()
print("Training Classification report\n", classification_report(y_train,best_ada.predict(X_train)))
print()
print("Test Classification report\n", classification_report(y_test, best_ada_pred))

Training Accuracy for AdaBoost 0.5471178756476683
Test Accuracy for AdaBoost 0.477521722704949

Training Classification report
               precision    recall  f1-score   support

    Decrease       0.63      0.38      0.47      5689
    Increase       0.71      0.40      0.51      5682
   No change       0.48      0.80      0.60      7157

    accuracy                           0.55     18528
   macro avg       0.61      0.53      0.53     18528
weighted avg       0.60      0.55      0.53     18528


Test Classification report
               precision    recall  f1-score   support

    Decrease       0.50      0.18      0.26      2509
    Increase       0.26      0.09      0.13      1925
   No change       0.50      0.90      0.64      3507

    accuracy                           0.48      7941
   macro avg       0.42      0.39      0.35      7941
weighted avg       0.44      0.48      0.40      7941

