In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Car-rental classification table; its PICKUP_DATE_* columns are the join keys used later.
classification_data = pd.read_csv(filepath_or_buffer="../../Data/classification_data.csv")
# Airfare feature table; its DDATE column is split into year/month/day before the join.
airline_features = pd.read_csv(filepath_or_buffer="../../Data/new_airfare_features.csv")

In [8]:
# Old preprocessing from Car Rental Classification_v2
# classification_data.drop(columns = "Unnamed: 0", inplace = True)

# Break the airfare date into year / month / day columns so it can be joined to the
# PICKUP_DATE_* columns of the classification table.
# The guard keeps this cell re-runnable: once DDATE has been replaced by its parts,
# executing the cell a second time would otherwise raise a KeyError on the missing column.
if "DDATE" in airline_features.columns:
    ddate = pd.to_datetime(airline_features["DDATE"], format='%Y-%m-%d')
    airline_features = airline_features.assign(
        DDATE_YEAR=ddate.dt.year,
        DDATE_MONTH=ddate.dt.month,
        DDATE_DATE=ddate.dt.day,
    ).drop(columns="DDATE")

# Left join keeps every classification row; rows whose pickup date has no airfare match
# come back with NaN in the airfare columns. dropna() then removes every row that has a
# NaN in ANY column, so unmatched rows (and rows with other missing values) are discarded.
all_classification_data = pd.merge(
    classification_data,
    airline_features,
    how='left',
    left_on=['PICKUP_DATE_YEAR', 'PICKUP_DATE_MONTH', 'PICKUP_DATE_DATE'],
    right_on=['DDATE_YEAR', 'DDATE_MONTH', 'DDATE_DATE'],
).dropna()

In [9]:
# Preview the first five rows of the merged frame.
all_classification_data.head(n=5)

Unnamed: 0,AVG_BEFORE_DAYS,HOLIDAY,HOLIDAY AFTER,HOLIDAY BEFORE,WEEKEND,CHANGE,PICKUP_DATE_YEAR,PICKUP_DATE_MONTH,PICKUP_DATE_DATE,OUTSIPP_economy,...,price_per_mile,total_fluc_50,total_fluc_70,total_fluc_120,avg_price,pct_change,trend,DDATE_YEAR,DDATE_MONTH,DDATE_DATE
37,3.137476,0,0,0,0,Decrease,2018,2,1,0,...,0.104127,0.0,0.0,0.0,0.104127,0.0,Decrease,2018.0,2.0,1.0
38,-0.16588,0,0,0,0,Decrease,2018,2,1,0,...,0.104127,0.0,0.0,0.0,0.104127,0.0,Decrease,2018.0,2.0,1.0
39,-1.185111,0,0,0,0,No change,2018,2,1,0,...,0.104127,0.0,0.0,0.0,0.104127,0.0,Decrease,2018.0,2.0,1.0
40,-1.144296,0,0,0,0,Increase,2018,2,1,0,...,0.104127,0.0,0.0,0.0,0.104127,0.0,Decrease,2018.0,2.0,1.0
41,0.25812,0,0,0,0,Decrease,2018,2,1,0,...,0.104127,0.0,0.0,0.0,0.104127,0.0,Decrease,2018.0,2.0,1.0


In [11]:
# Column names of the merged frame as a plain Python list.
list(all_classification_data.columns)

['AVG_BEFORE_DAYS',
 'HOLIDAY',
 'HOLIDAY AFTER',
 'HOLIDAY BEFORE',
 'WEEKEND',
 'CHANGE',
 'PICKUP_DATE_YEAR',
 'PICKUP_DATE_MONTH',
 'PICKUP_DATE_DATE',
 'OUTSIPP_economy',
 'OUTSIPP_luxury',
 'OUTSIPP_midrange',
 'OUTSIPP_premium',
 'price_per_mile',
 'total_fluc_50',
 'total_fluc_70',
 'total_fluc_120',
 'avg_price',
 'pct_change',
 'trend',
 'DDATE_YEAR',
 'DDATE_MONTH',
 'DDATE_DATE']

# Machine Learning

In [13]:
# Dropping not required columns
drop_columns = ["DDATE_YEAR", "DDATE_MONTH", "DDATE_DATE", "trend", "price_per_mile", "avg_price"]
ml_data = all_classification_data.drop(columns=drop_columns)

# The hold-out split below is positional ("first 70% of rows"), so it is only a
# chronological split if the rows are in pickup-date order. Enforce that explicitly with a
# stable sort on a YYYYMMDD key: rows that are already in date order stay exactly where
# they are, and rows sharing a date keep their relative order.
pickup_day_key = (ml_data["PICKUP_DATE_YEAR"] * 10000
                  + ml_data["PICKUP_DATE_MONTH"] * 100
                  + ml_data["PICKUP_DATE_DATE"])
ml_data = ml_data.iloc[np.argsort(pickup_day_key.to_numpy(), kind="stable")]

# Every column except the target is used as a feature.
classification_columns = [x for x in ml_data.columns.to_list() if x != "CHANGE"]
X = ml_data[classification_columns]
y = ml_data["CHANGE"]

# Earliest 70% of rows -> train, remaining 30% -> test. The split index is computed once,
# and .iloc makes the positional intent explicit (the index labels are not contiguous
# after the earlier dropna).
train_fraction = 0.7
split_idx = int(X.shape[0] * train_fraction)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# Checking correct sort order for train test splits
# (year*100 + month gives a YYYYMM integer; the boundary month may appear in both splits)
train_year_month = X_train["PICKUP_DATE_YEAR"]*100 + X_train["PICKUP_DATE_MONTH"]
test_year_month = X_test["PICKUP_DATE_YEAR"]*100 + X_test["PICKUP_DATE_MONTH"]
print("Min train date:", train_year_month.min())
print("Max train date:", train_year_month.max())
print()
print("Min test date:", test_year_month.min())
print("Max test date:", test_year_month.max())

Min train date: 201802
Max train date: 201907

Min test date: 201907
Max test date: 202001


## Gradient Boost

In [40]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, roc_auc_score, balanced_accuracy_score, classification_report
                             ,roc_curve, auc)
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

In [69]:
# Baseline: gradient boosting with the library's default hyper-parameters.
# random_state is fixed so that re-running the cell reproduces the same model and scores.
grad = GradientBoostingClassifier(random_state=42)
grad.fit(X_train, y_train)

# Predict each split once and reuse the result for both the accuracy and the report.
grad_train_pred = grad.predict(X_train)
grad_pred = grad.predict(X_test)

print("Training Accuracy for default Gradient Boost ",accuracy_score(y_train,grad_train_pred))
print("Test Accuracy for default Gradient Boost ",accuracy_score(y_test,grad_pred))
print()
print("Training Classification report\n", classification_report(y_train,grad_train_pred))
print()
print("Test Classification report\n", classification_report(y_test, grad_pred))

Training Accuracy for default Gradient Boost  0.6546848013816926
Test Accuracy for default Gradient Boost  0.44289132351089283

Training Classification report
               precision    recall  f1-score   support

    Decrease       0.68      0.62      0.65      5689
    Increase       0.73      0.62      0.67      5682
   No change       0.60      0.71      0.65      7157

    accuracy                           0.65     18528
   macro avg       0.67      0.65      0.66     18528
weighted avg       0.66      0.65      0.66     18528


Test Classification report
               precision    recall  f1-score   support

    Decrease       0.41      0.26      0.32      2509
    Increase       0.35      0.56      0.43      1925
   No change       0.55      0.51      0.53      3507

    accuracy                           0.44      7941
   macro avg       0.44      0.44      0.43      7941
weighted avg       0.46      0.44      0.44      7941

CPU times: user 6.31 s, sys: 74.4 ms, total: 6.39

In [70]:
%%time
# Refer - https://medium.com/all-things-ai/in-depth-parameter-tuning-for-gradient-boosting-3363992e9bae
grad_grid = GradientBoostingClassifier()

# Setting grid parameters
# loss can only be deviance as exponential takes only 2 classes
learning_rate = [1, 0.5, 0.25, 0.1, 0.05, 0.01]
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200, 400]
min_samples_split = [2, 0.1, 0.3, 0.5, 0.8, 1.0]
min_samples_leaf = [1, 0.1, 0.2, 0.4, 0.5]
max_features = ["auto", "sqrt" ,"log2"]


parameters = dict(learning_rate = learning_rate, n_estimators = n_estimators,
                  min_samples_split = min_samples_split, min_samples_leaf = min_samples_leaf,
                max_features = max_features)

gridF = GridSearchCV(estimator = grad_grid, param_grid = parameters, cv = 5, verbose = 1, scoring = "accuracy",
                      n_jobs = -1)
gridF.fit(X_train, y_train)

Fitting 5 folds for each of 5400 candidates, totalling 27000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 15.4min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 18.6min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 21.8min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 26.2min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 33.4min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed: 40.8min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 46.3min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed: 51.9min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed: 63.0min
[Parallel(n_jobs=-1)]: Done 11234 tasks      |

CPU times: user 3min 6s, sys: 12 s, total: 3min 18s
Wall time: 2h 39min 54s


[Parallel(n_jobs=-1)]: Done 27000 out of 27000 | elapsed: 159.9min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_c...
                 

In [71]:
# Best parameters
# Parameter combination that achieved the best cross-validated score in the grid search
# (the search was configured with scoring = "accuracy").
gridF.best_params_

{'learning_rate': 1,
 'max_features': 'log2',
 'min_samples_leaf': 0.4,
 'min_samples_split': 2,
 'n_estimators': 2}

In [72]:
# Refit a model with the hyper-parameters selected by the grid search. They are read from
# gridF.best_params_ rather than hand-copied, so this cell cannot drift out of sync with
# the search result. random_state is fixed so the scores are reproducible.
best_grad_grid = GradientBoostingClassifier(random_state = 42, **gridF.best_params_)
best_grad_grid.fit(X_train, y_train)

# Predict each split once and reuse the result for both the accuracy and the report.
best_grad_grid_train_pred = best_grad_grid.predict(X_train)
best_grad_grid_pred = best_grad_grid.predict(X_test)

# Labels say "tuned": the earlier "default" wording was a copy-paste from the baseline cell.
print("Training Accuracy for tuned Gradient Boost ",accuracy_score(y_train,best_grad_grid_train_pred))
print("Test Accuracy for tuned Gradient Boost ",accuracy_score(y_test,best_grad_grid_pred))
print()
print("Training Classification report\n", classification_report(y_train,best_grad_grid_train_pred))
print()
print("Test Classification report\n", classification_report(y_test, best_grad_grid_pred))

Training Accuracy for default Gradient Boost  0.41294257340241797
Test Accuracy for default Gradient Boost  0.47676615035889686

Training Classification report
               precision    recall  f1-score   support

    Decrease       0.42      0.18      0.25      5689
    Increase       0.31      0.16      0.21      5682
   No change       0.44      0.80      0.56      7157

    accuracy                           0.41     18528
   macro avg       0.39      0.38      0.34     18528
weighted avg       0.39      0.41      0.36     18528


Test Classification report
               precision    recall  f1-score   support

    Decrease       0.76      0.10      0.18      2509
    Increase       0.29      0.05      0.09      1925
   No change       0.47      0.98      0.64      3507

    accuracy                           0.48      7941
   macro avg       0.51      0.38      0.30      7941
weighted avg       0.52      0.48      0.36      7941



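Not a considerable improvement in accuracy using the Gradient Boost Classifier: tuning raises test accuracy only from 0.44 to 0.48, and it does so mainly by predicting "No change" for almost every sample (test recall 0.98 for that class), while the test macro-average F1 drops from 0.43 to 0.30.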