# Table of Contents
* 1. Import and review data   
* 2. K-Nearest Neighbors
* 3. Random Forest
* 4. LightGBM
* 5. XBoost
* 6. LightGBM - Categorical Features

      

In [1]:
# Import packages
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from bayes_opt import BayesianOptimization
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import lightgbm as lgb
import xgboost as xgb

from sklearn.svm import SVR

In [2]:
# Confirm the disk location is correct for importing
os.chdir('C:/Users/steve/Documents/springboard/Capstone2/data')
cwd = os.getcwd()
print(cwd)

C:\Users\steve\Documents\springboard\Capstone2\data


In [3]:
%%time
# Import Data
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train_df = pd.read_csv('y_train_unscaled.csv')
y_test_df = pd.read_csv('y_test_unscaled.csv')

CPU times: total: 47.5 s
Wall time: 49.1 s


In [4]:
# Replace characters that are incompatible with certain models
X_train.columns = X_train.columns.str.replace(r"[^\w]", "_", regex=True)
X_test.columns = X_test.columns.str.replace(r"[^\w]", "_", regex=True)

# Reformat target variable to be readable by machine models
y_train = y_train_df.values.ravel()
y_test = y_test_df.values.ravel()

In [5]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5513783, 166)
(5513783,)
(1378446, 166)
(1378446,)


In [27]:
# Split the training and test data into a smaller sample
from sklearn.model_selection import train_test_split

sample_fraction = 0.2 #.02 for knn and random forest, .2 for lightgbm and xgboost

X_train_sample, _, y_train_sample, _ = train_test_split(X_train, y_train, test_size=1-sample_fraction, random_state=42)
X_test_sample, _, y_test_sample, _ = train_test_split(X_test, y_test, test_size=1-sample_fraction, random_state=42)


In [28]:
print(X_train_sample.shape)
print(y_train_sample.shape)
print(X_test_sample.shape)
print(y_test_sample.shape)

(1102756, 166)
(1102756,)
(275689, 166)
(275689,)


# K-Nearest Neighbors

In [8]:
# Define function for BayesianOptimization for KNN hyperparapeter search

def knn_eval(n_neighbors, leaf_size, p):
    """
    Trains a K-Nearest Neignbors model with given hyperparameters and returns -mse.
    """    
    # Convert parameters to appropriate types
    n_neighbors = int(n_neighbors)
    leaf_size = int(leaf_size)
    p = int(p)
    
    # Create model
    knn = KNeighborsRegressor(
        n_neighbors=n_neighbors,
        leaf_size=leaf_size,
        p=p)
    
    #train and test model
    knn.fit(X_train_sample, y_train_sample)
    y_pred_knn = knn.predict(X_test_sample)
    mse = mean_squared_error(y_test_sample, y_pred_knn)
    return -mse  


In [9]:
# Define the paramater bounds for testing
param_bounds_knn = {
    'n_neighbors': (1, 30),
    'leaf_size': (10, 50),
    'p': (1, 2)}


In [10]:
# Set up the optimizer
optimizer_knn = BayesianOptimization(
    f=knn_eval,  
    pbounds=param_bounds_knn, 
    random_state=42,
    verbose=2)

In [11]:
%%time
# Run optimization
optimizer_knn.maximize(init_points=5, n_iter=15)

# Print best parameters
print("Best hyperparameters found:", optimizer_knn.max)

|   iter    |  target   | leaf_size | n_neig... |     p     |
-------------------------------------------------------------
| [39m1        [39m | [39m-505.7   [39m | [39m24.98    [39m | [39m28.57    [39m | [39m1.732    [39m |
| [39m2        [39m | [39m-582.5   [39m | [39m33.95    [39m | [39m5.525    [39m | [39m1.156    [39m |
| [39m3        [39m | [39m-506.2   [39m | [39m12.32    [39m | [39m26.12    [39m | [39m1.601    [39m |
| [39m4        [39m | [39m-991.8   [39m | [39m38.32    [39m | [39m1.597    [39m | [39m1.97     [39m |
| [39m5        [39m | [39m-555.5   [39m | [39m43.3     [39m | [39m7.158    [39m | [39m1.182    [39m |
| [39m6        [39m | [39m-582.5   [39m | [39m34.1     [39m | [39m5.728    [39m | [39m1.234    [39m |
| [39m7        [39m | [39m-505.9   [39m | [39m18.64    [39m | [39m27.47    [39m | [39m1.447    [39m |
| [39m8        [39m | [39m-509.0   [39m | [39m22.66    [39m | [39m22.63    [39m | [

In [12]:
# Extract the best parameters
best_params_knn = optimizer_knn.max['params']
best_params_knn['n_neighbors'] = int(best_params_knn['n_neighbors'])
best_params_knn['leaf_size'] = int(best_params_knn['leaf_size'])
best_params_knn['p'] = int(best_params_knn['p'])

In [13]:
%%time
# Train the model with the best parameters on the sample data
knn_best = KNeighborsRegressor(**best_params_knn)
knn_best.fit(X_train_sample, y_train_sample)


CPU times: total: 875 ms
Wall time: 167 ms


In [14]:
%%time
# Make predictions
y_pred_knn = knn_best.predict(X_test_sample)

CPU times: total: 7min 41s
Wall time: 48 s


In [15]:
# Calculate evaluation metrics
mae_knn = mean_absolute_error(y_test_sample, y_pred_knn)
mse_knn = mean_squared_error(y_test_sample, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)
r2_knn = r2_score(y_test_sample, y_pred_knn)

# Print evaluation metrics
print(f'Mean Absolute Error (MAE): {mae_knn}')
print(f'Mean Squared Error (MSE): {mse_knn}')
print(f'Root Mean Squared Error (RMSE): {rmse_knn}')
print(f'R-squared (R2): {r2_knn}')

Mean Absolute Error (MAE): 13.934794464346469
Mean Squared Error (MSE): 505.1558146480125
Root Mean Squared Error (RMSE): 22.4756716172846
R-squared (R2): 0.032093685350367385


# Random Forest

In [16]:
# Define function for BayesianOptimization for random forest hyperparapeter search

def rf_eval(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    """
    Trains a Random Forest model with given hyperparameters and returns -mse.
    """
    # Convert parameters appripirate types 
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    min_samples_split = int(min_samples_split)
    min_samples_leaf = int(min_samples_leaf)
    
    
    # Create Model
    rfbo = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
        n_jobs=-1,)
    
    # Train and test model
    rfbo.fit(X_train_sample, y_train_sample)
    y_pred_rfbo = rfbo.predict(X_test_sample)
    mse = mean_squared_error(y_test_sample, y_pred_rfbo)
    return -mse  


In [17]:
# Define the paramater bounds for testing
pbounds_rf = {
    "n_estimators": (50, 200),  
    "max_depth": (5, 50), 
    "min_samples_split": (2, 10), 
    "min_samples_leaf": (1, 10),}

In [18]:
# Set up the optimizer
optimizer_rf = BayesianOptimization(
    f=rf_eval,  
    pbounds=pbounds_rf, 
    random_state=42,
    verbose=2)

In [19]:
%%time
# Run optimization
optimizer_rf.maximize(init_points=4, n_iter=8)

# Print best parameters
print("Best hyperparameters found:", optimizer_rf.max)

|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [39m1        [39m | [39m-493.6   [39m | [39m21.85    [39m | [39m9.556    [39m | [39m7.856    [39m | [39m139.8    [39m |
| [39m2        [39m | [39m-498.0   [39m | [39m12.02    [39m | [39m2.404    [39m | [39m2.465    [39m | [39m179.9    [39m |
| [39m3        [39m | [39m-495.3   [39m | [39m32.05    [39m | [39m7.373    [39m | [39m2.165    [39m | [39m195.5    [39m |
| [39m4        [39m | [39m-517.9   [39m | [39m42.46    [39m | [39m2.911    [39m | [39m3.455    [39m | [39m77.51    [39m |
| [39m5        [39m | [39m-524.4   [39m | [39m49.23    [39m | [39m1.063    [39m | [39m7.072    [39m | [39m160.8    [39m |
| [39m6        [39m | [39m-494.1   [39m | [39m24.78    [39m | [39m8.916    [39m | [39m4.992    [39m | [39m140.1    [39m |
| [39m7        [39m | [39m-505.5   [39m | [

In [20]:
# Extract the best parameters
best_params_rf = optimizer_rf.max['params']
best_params_rf['n_estimators'] = int(best_params_rf['n_estimators'])
best_params_rf['max_depth'] = int(best_params_rf['max_depth'])
best_params_rf['min_samples_split'] = int(best_params_rf['min_samples_split'])
best_params_rf['min_samples_leaf'] = int(best_params_rf['min_samples_leaf'])

In [24]:
%%time
# Train the final model with the best parameters
rf_best = RandomForestRegressor(**best_params_rf, random_state=42,n_jobs=-1, verbose=2)
rf_best.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


building tree 1 of 139
building tree 2 of 139
building tree 3 of 139
building tree 4 of 139
building tree 5 of 139
building tree 6 of 139
building tree 7 of 139
building tree 8 of 139
building tree 9 of 139
building tree 10 of 139
building tree 11 of 139
building tree 12 of 139
building tree 13 of 139
building tree 14 of 139
building tree 15 of 139building tree 16 of 139

building tree 17 of 139
building tree 18 of 139
building tree 19 of 139
building tree 20 of 139
building tree 21 of 139
building tree 22 of 139
building tree 23 of 139
building tree 24 of 139
building tree 25 of 139
building tree 26 of 139
building tree 27 of 139
building tree 28 of 139
building tree 29 of 139
building tree 30 of 139


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 15.0min


building tree 31 of 139
building tree 32 of 139
building tree 33 of 139
building tree 34 of 139
building tree 35 of 139
building tree 36 of 139
building tree 37 of 139
building tree 38 of 139
building tree 39 of 139
building tree 40 of 139
building tree 41 of 139
building tree 42 of 139
building tree 43 of 139
building tree 44 of 139
building tree 45 of 139
building tree 46 of 139
building tree 47 of 139
building tree 48 of 139
building tree 49 of 139
building tree 50 of 139
building tree 51 of 139
building tree 52 of 139
building tree 53 of 139
building tree 54 of 139
building tree 55 of 139
building tree 56 of 139
building tree 57 of 139
building tree 58 of 139
building tree 59 of 139
building tree 60 of 139
building tree 61 of 139
building tree 62 of 139
building tree 63 of 139
building tree 64 of 139
building tree 65 of 139
building tree 66 of 139
building tree 67 of 139
building tree 68 of 139
building tree 69 of 139
building tree 70 of 139
building tree 71 of 139
building tree 72

[Parallel(n_jobs=-1)]: Done 139 out of 139 | elapsed: 92.9min finished


CPU times: total: 15h 3min 51s
Wall time: 1h 32min 58s


In [25]:
%%time
# Make predictions
y_pred_rf_best= rf_best.predict(X_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    0.6s


CPU times: total: 58.8 s
Wall time: 5.84 s


[Parallel(n_jobs=12)]: Done 139 out of 139 | elapsed:    5.1s finished


In [26]:
# Calculate evaluation metrics
mae_rf = mean_absolute_error(y_test, y_pred_rf_best)
mse_rf = mean_squared_error(y_test, y_pred_rf_best)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf_best)

# Print evaluation metrics
print(f'Mean Absolute Error (MAE): {mae_rf}')
print(f'Mean Squared Error (MSE): {mse_rf}')
print(f'Root Mean Squared Error (RMSE): {rmse_rf}')
print(f'R-squared (R2): {r2_rf}')

Mean Absolute Error (MAE): 13.674630556613442
Mean Squared Error (MSE): 503.62348752929915
Root Mean Squared Error (RMSE): 22.44155715473637
R-squared (R2): 0.06760678522204433


# Gradient Boosting Machines - LightGBM

In [29]:
# Define function for BayesianOptimization for LightGBM hyperparapeter search

def lgb_eval(n_estimators, learning_rate, num_leaves, max_depth, subsample, colsample_bytree ):
    """
    Trains a LightGBM model with given hyperparameters and returns the returns -mse
    """
    # Convert parameters appripirate types
    n_estimators =  int(n_estimators)
    learning_rate = float(learning_rate)
    num_leaves = int(num_leaves)
    max_depth = int(max_depth)
    subsample = float(subsample)
    colsample_bytree = float(colsample_bytree)
        
    # Create Model
    lgbbo = lgb.LGBMRegressor(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    num_leaves=num_leaves,
    max_depth=max_depth,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    random_state=42)
    
    # Train and test model
    lgbbo.fit(X_train_sample, y_train_sample)
    y_pred_lgbbo = lgbbo.predict(X_test_sample)
    mse = mean_squared_error(y_test_sample, y_pred_lgbbo)
    return -mse  

In [30]:
# Define the paramater bounds for testing
param_bounds_lgb = {
    'n_estimators': (100, 500),
    'learning_rate': (0.01, 0.2),    
    'num_leaves': (20, 100),
    'max_depth': (5, 50),  
    'subsample': (0.6, 1.0),
    "colsample_bytree": (0.6, 1.0)}
    

In [31]:
# Set up the optimizer
optimizer_lgb = BayesianOptimization(
    f=lgb_eval,
    pbounds=param_bounds_lgb,
    random_state=42,
    verbose=2)

In [32]:
%%time
# Run optimization
optimizer_lgb.maximize(init_points=10, n_iter=50)

# Print best parameters
print("Best hyperparameters found:", optimizer_lgb.max)

|   iter    |  target   | colsam... | learni... | max_depth | n_esti... | num_le... | subsample |
-------------------------------------------------------------------------------------------------
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004628 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 583
[LightGBM] [Info] Number of data points in the train set: 1102756, number of used features: 165
[LightGBM] [Info] Start training from score 5.719183
| [39m1        [39m | [39m-493.2   [39m | [39m0.7498   [39m | [39m0.1906   [39m | [39m37.94    [39m | [39m339.5    [39m | [39m32.48    [39m | [39m0.6624   [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006946 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise



| [39m6        [39m | [39m-495.3   [39m | [39m0.843    [39m | [39m0.0424   [39m | [39m7.927    [39m | [39m479.6    [39m | [39m97.25    [39m | [39m0.9234   [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008870 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 583
[LightGBM] [Info] Number of data points in the train set: 1102756, number of used features: 165
[LightGBM] [Info] Start training from score 5.719183
| [39m7        [39m | [39m-499.9   [39m | [39m0.7218   [39m | [39m0.02856  [39m | [39m35.79    [39m | [39m276.1    [39m | [39m29.76    [39m | [39m0.7981   [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007541 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM]

| [39m16       [39m | [39m-491.2   [39m | [39m0.6687   [39m | [39m0.1021   [39m | [39m20.57    [39m | [39m348.7    [39m | [39m91.4     [39m | [39m0.9575   [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008945 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 583
[LightGBM] [Info] Number of data points in the train set: 1102756, number of used features: 165
[LightGBM] [Info] Start training from score 5.719183
| [39m17       [39m | [39m-490.9   [39m | [39m0.757    [39m | [39m0.1777   [39m | [39m15.89    [39m | [39m347.3    [39m | [39m91.9     [39m | [39m0.7009   [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008786 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM]

| [39m30       [39m | [39m-491.3   [39m | [39m0.6855   [39m | [39m0.132    [39m | [39m29.77    [39m | [39m347.4    [39m | [39m87.21    [39m | [39m0.6646   [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009573 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 583
[LightGBM] [Info] Number of data points in the train set: 1102756, number of used features: 165
[LightGBM] [Info] Start training from score 5.719183
| [39m31       [39m | [39m-491.8   [39m | [39m0.7006   [39m | [39m0.07743  [39m | [39m24.29    [39m | [39m345.0    [39m | [39m88.71    [39m | [39m0.916    [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006353 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM]

| [39m44       [39m | [39m-491.3   [39m | [39m0.9381   [39m | [39m0.1094   [39m | [39m39.66    [39m | [39m350.0    [39m | [39m91.72    [39m | [39m0.951    [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 583
[LightGBM] [Info] Number of data points in the train set: 1102756, number of used features: 165
[LightGBM] [Info] Start training from score 5.719183
| [39m45       [39m | [39m-491.2   [39m | [39m0.8137   [39m | [39m0.1144   [39m | [39m22.87    [39m | [39m357.6    [39m | [39m87.97    [39m | [39m0.9414   [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009387 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM]

| [39m58       [39m | [39m-491.0   [39m | [39m0.7266   [39m | [39m0.1852   [39m | [39m35.12    [39m | [39m340.1    [39m | [39m99.89    [39m | [39m0.636    [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006368 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 583
[LightGBM] [Info] Number of data points in the train set: 1102756, number of used features: 165
[LightGBM] [Info] Start training from score 5.719183
| [39m59       [39m | [39m-492.1   [39m | [39m0.9412   [39m | [39m0.05933  [39m | [39m42.57    [39m | [39m342.5    [39m | [39m94.67    [39m | [39m0.6863   [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007390 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM]

In [33]:
# Extract the best parameters
best_params_lgb = optimizer_lgb.max['params']
best_params_lgb['n_estimators'] = int(best_params_lgb['n_estimators'])
best_params_lgb['learning_rate'] = float(best_params_lgb['learning_rate'])
best_params_lgb['num_leaves'] = int(best_params_lgb['num_leaves'])
best_params_lgb['max_depth'] = int(best_params_lgb['max_depth'])
best_params_lgb['subsample'] = float(best_params_lgb['subsample'])
best_params_lgb['colsample_bytree'] = float(best_params_lgb['colsample_bytree'])


In [34]:
%%time
# Train the final model with the best parameters
lgb_best = lgb.LGBMRegressor(**best_params_lgb, random_state=42)
lgb_best.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019602 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 585
[LightGBM] [Info] Number of data points in the train set: 5513783, number of used features: 166
[LightGBM] [Info] Start training from score 5.711103
CPU times: total: 4min 38s
Wall time: 30.2 s


In [35]:
%%time
# Make predictions
y_pred_lgb = lgb_best.predict(X_test)

CPU times: total: 41.3 s
Wall time: 4.5 s


In [36]:
# Calculate evaluation metrics
mae_lgb = mean_absolute_error(y_test, y_pred_lgb)
mse_lgb = mean_squared_error(y_test, y_pred_lgb)
rmse_lgb = np.sqrt(mse_lgb)
r2_lgb = r2_score(y_test, y_pred_lgb)

# Print evaluation metrics
print(f'Mean Absolute Error (MAE): {mae_lgb}')
print(f'Mean Squared Error (MSE): {mse_lgb}')
print(f'Root Mean Squared Error (RMSE): {rmse_lgb}')
print(f'R-squared (R2): {r2_lgb}')

Mean Absolute Error (MAE): 13.49468480154234
Mean Squared Error (MSE): 493.12285554102556
Root Mean Squared Error (RMSE): 22.206369706483443
R-squared (R2): 0.08704733606842852


# XGBoost

In [37]:
# Define function for BayesianOptimization for XGBoost hyperparapeter search

def xgb_eval(n_estimators, learning_rate, max_depth, subsample, colsample_bytree, gamma, min_child_weight, reg_alpha, reg_lambda):
    """
    Trains XGBoot model with given hyperparameters and returns -mse.
    """
    # Convert parameters appripirate types
    n_estimators =  int(n_estimators)
    learning_rate = float(learning_rate)
    max_depth = int(max_depth)
    subsample = float(subsample)
    colsample_bytree = float(colsample_bytree)
    gamma = int(gamma)
    min_child_weight = int(min_child_weight)
    reg_alpha = int(reg_alpha)
    reg_lambda = int(reg_lambda)
    
    
    # Create Model
    xgbbo = xgb.XGBRegressor(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    max_depth=max_depth,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    gamma = gamma,
    min_child_weight = min_child_weight, 
    reg_alpha = reg_alpha,
    reg_lambda = reg_lambda,    
    random_state=42)
    
    #train and test model
    xgbbo.fit(X_train_sample, y_train_sample)
    y_pred_xgbbo = xgbbo.predict(X_test_sample)
    mse = mean_squared_error(y_test_sample, y_pred_xgbbo)
    return -mse  

In [38]:
# Define the paramater bounds for testing
param_bounds_xgb = {
    'n_estimators': (100, 500),
    'learning_rate': (0.01, 0.3),    
    'max_depth': (3, 10),  
    'subsample': (0.5, 1.0),
    "colsample_bytree": (0.5, 1.0),
    "gamma": (0, 10),
    "min_child_weight": (0, 10),
    "reg_alpha":(0, 10),
    "reg_lambda":(0, 10)}

In [39]:
# Set up the optimizer
optimizer_xgb = BayesianOptimization(
    f=xgb_eval,
    pbounds=param_bounds_xgb,
    random_state=42,
    verbose=2)

In [40]:
%%time
# Run optimization
optimizer_xgb.maximize(init_points=10, n_iter=50)

# Print best parameters
print("Best hyperparameters found:", optimizer_xgb.max)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m-493.2   [39m | [39m0.6873   [39m | [39m9.507    [39m | [39m0.2223   [39m | [39m7.191    [39m | [39m1.56     [39m | [39m162.4    [39m | [39m0.5808   [39m | [39m8.662    [39m | [39m0.8006   [39m |
| [39m2        [39m | [39m-493.7   [39m | [39m0.854    [39m | [39m0.2058   [39m | [39m0.2913   [39m | [39m8.827    [39m | [39m2.123    [39m | [39m172.7    [39m | [39m1.834    [39m | [39m3.042    [39m | [39m0.7624   [39m |
| [39m3        [39m | [39m-501.2   [39m | [39m0.716    [39m | [39m2.912    [39m | [39m0.1874   [39m | [39m3.976    [39m | [39m2.921    [39m | [39m246.5    [39m | [39m4.561    [39m | [39m7.852    [39m | [39m0.5998   [39m |


In [41]:
# Extract the best parameters
best_params_xgb = optimizer_xgb.max['params']
best_params_xgb['n_estimators'] = int(best_params_xgb['n_estimators'])
best_params_xgb['learning_rate'] = float(best_params_xgb['learning_rate'])
best_params_xgb['max_depth'] = int(best_params_xgb['max_depth'])
best_params_xgb['subsample'] = float(best_params_xgb['subsample'])
best_params_xgb['colsample_bytree'] = float(best_params_xgb['colsample_bytree'])
best_params_xgb['gamma'] = int(best_params_xgb['gamma'])
best_params_xgb['min_child_weight'] = int(best_params_xgb['min_child_weight'])
best_params_xgb['reg_alpha'] = int(best_params_xgb['reg_alpha'])
best_params_xgb['reg_lambda'] = int(best_params_xgb['reg_lambda'])


In [42]:
%%time
# Train the final model with the best parameters
xgb_best = xgb.XGBRegressor(**best_params_xgb, random_state=42)
xgb_best.fit(X_train, y_train)


CPU times: total: 29min 19s
Wall time: 2min 36s


In [43]:
%%time
# Make predictions
y_pred_xgb = xgb_best.predict(X_test)


CPU times: total: 32.8 s
Wall time: 3.28 s


In [44]:
# Calculate evaluation metrics
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

# Print evaluation metrics
print(f'Mean Absolute Error (MAE): {mae_xgb}')
print(f'Mean Squared Error (MSE): {mse_xgb}')
print(f'Root Mean Squared Error (RMSE): {rmse_xgb}')
print(f'R-squared (R2): {r2_xgb}')

Mean Absolute Error (MAE): 13.455708601999934
Mean Squared Error (MSE): 491.40541109839484
Root Mean Squared Error (RMSE): 22.16766589197868
R-squared (R2): 0.09022696049961398


Initial Conclusions:  
KNN model is too slow at predicting on a dataset this large. It also scored the worst on all evaluation metrics, so not a good model to use in our case.  
  
  
Random Forest model performs better than the KNN model on its metrics, but is slow on training, so may not be the best model to use.  
  
  
Both LightGBM and XGBoost perform better than Random Forest in all evaluation metrics. They also have a much faster training speed compared to the Random Forest, so these models would be better than either of the two previous models.  
  
  
LightGBM is able to work with categorical features, without the need for one-hot encoding, so we will try a second LightGBM model, this time without splitting the catigorical features. 

# LightGBM - Categorical

In [65]:
%%time
# Import train/test sets without one-hot encoding
X_train_cat = pd.read_csv('X_train_cat.csv')
X_test_cat = pd.read_csv('X_test_cat.csv')
y_train_catdf = pd.read_csv('y_train_cat.csv')
y_test_catdf = pd.read_csv('y_test_cat.csv')

CPU times: total: 4.06 s
Wall time: 4.15 s


In [66]:
# Reformat target variable to be readable by machine models
y_train_cat = y_train_catdf.values.ravel()
y_test_cat = y_test_catdf.values.ravel()

In [67]:
print(X_train_cat.shape)
print(y_train_cat.shape)
print(X_test_cat.shape)
print(y_test_cat.shape)

(5513433, 9)
(5513433,)
(1378359, 9)
(1378359,)


In [68]:
# Preview data
X_train_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5513433 entries, 0 to 5513432
Data columns (total 9 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Date             object 
 1   Airline          object 
 2   Origin           object 
 3   Dest             object 
 4   Distance         float64
 5   hour_depart      int64  
 6   airflightnumber  object 
 7   day              object 
 8   month            int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 378.6+ MB


In [69]:
# Rename day column to day_of_week column
X_train_cat.rename(columns={'day': 'day_of_week'}, inplace=True)
X_test_cat.rename(columns={'day': 'day_of_week'}, inplace=True)

In [70]:
# Change object type coumns to category columns to be compatible with the model
X_train_cat["Airline"] = X_train_cat["Airline"].astype("category")
X_train_cat["Origin"] = X_train_cat["Origin"].astype("category")
X_train_cat["Dest"] = X_train_cat["Dest"].astype("category")
X_train_cat["day_of_week"] = X_train_cat["day_of_week"].astype("category")
X_train_cat['airflightnumber'] = X_train_cat['airflightnumber'].astype("category")

X_test_cat["Airline"] = X_test_cat["Airline"].astype("category")
X_test_cat["Origin"] = X_test_cat["Origin"].astype("category")
X_test_cat["Dest"] = X_test_cat["Dest"].astype("category")
X_test_cat["day_of_week"] = X_test_cat["day_of_week"].astype("category")
X_test_cat['airflightnumber'] = X_test_cat['airflightnumber'].astype("category")

In [71]:
# Split Date column into day (numerical day in the month) and week of the year column
X_train_cat["Date"] = pd.to_datetime(X_train_cat["Date"])
X_train_cat["day"] = X_train_cat["Date"].dt.day
X_train_cat["week_of_year"] = X_train_cat["Date"].dt.isocalendar().week
X_train_cat.drop(columns=["Date"], inplace=True)

X_test_cat["Date"] = pd.to_datetime(X_test_cat["Date"])
X_test_cat["day"] = X_test_cat["Date"].dt.day
X_test_cat["week_of_year"] = X_test_cat["Date"].dt.isocalendar().week
X_test_cat.drop(columns=["Date"], inplace=True)

In [72]:
# Split month column into cyclical encoding (to deal with December to January rollover)
X_train_cat["month_sin"] = np.sin(2 * np.pi * X_test_cat["month"] / 12)
X_train_cat["month_cos"] = np.cos(2 * np.pi * X_train_cat["month"] / 12)

X_test_cat["month_sin"] = np.sin(2 * np.pi * X_test_cat["month"] / 12)
X_test_cat["month_cos"] = np.cos(2 * np.pi * X_test_cat["month"] / 12)

X_train_cat.drop(columns=["month"], inplace=True)
X_test_cat.drop(columns=["month"], inplace=True)


In [73]:
# Convert HHMM to separate hour & minute columns
X_train_cat["hour"] = X_train_cat["hour_depart"] // 100
X_train_cat["minute"] = X_train_cat["hour_depart"] % 100

X_test_cat["hour"] = X_test_cat["hour_depart"] // 100
X_test_cat["minute"] = X_test_cat["hour_depart"] % 100


# Convert to cyclical encoding (to deal with 2300 to 0000 rollover)
X_train_cat["hour_sin"] = np.sin(2 * np.pi * X_train_cat["hour"] / 24)
X_train_cat["hour_cos"] = np.cos(2 * np.pi * X_train_cat["hour"] / 24)
X_train_cat["minute_sin"] = np.sin(2 * np.pi * X_train_cat["minute"] / 60)
X_train_cat["minute_cos"] = np.cos(2 * np.pi * X_train_cat["minute"] / 60)

X_test_cat["hour_sin"] = np.sin(2 * np.pi * X_test_cat["hour"] / 24)
X_test_cat["hour_cos"] = np.cos(2 * np.pi * X_test_cat["hour"] / 24)
X_test_cat["minute_sin"] = np.sin(2 * np.pi * X_test_cat["minute"] / 60)
X_test_cat["minute_cos"] = np.cos(2 * np.pi * X_test_cat["minute"] / 60)


# Drop original columns
X_train_cat.drop(columns=["hour", "minute"], inplace=True)
X_train_cat.drop(columns=["hour_depart"], inplace=True)

X_test_cat.drop(columns=["hour", "minute"], inplace=True)
X_test_cat.drop(columns=["hour_depart"], inplace=True)

In [74]:
#drop Airflightnumber as it 
X_train_cat.drop(columns=["airflightnumber"], inplace=True)
X_test_cat.drop(columns=["airflightnumber"], inplace=True)

In [75]:
X_train_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5513433 entries, 0 to 5513432
Data columns (total 13 columns):
 #   Column        Dtype   
---  ------        -----   
 0   Airline       category
 1   Origin        category
 2   Dest          category
 3   Distance      float64 
 4   day_of_week   category
 5   day           int32   
 6   week_of_year  UInt32  
 7   month_sin     float64 
 8   month_cos     float64 
 9   hour_sin      float64 
 10  hour_cos      float64 
 11  minute_sin    float64 
 12  minute_cos    float64 
dtypes: UInt32(1), category(4), float64(7), int32(1)
memory usage: 373.3 MB


In [76]:
X_train_cat.head()

Unnamed: 0,Airline,Origin,Dest,Distance,day_of_week,day,week_of_year,month_sin,month_cos,hour_sin,hour_cos,minute_sin,minute_cos
0,9E,DTW,CWA,363.0,Sunday,11,10,-2.449294e-16,6.123234000000001e-17,1.224647e-16,-1.0,0.0,1.0
1,F9,PHL,IND,588.0,Friday,15,24,0.8660254,-1.0,-0.258819,-0.9659258,-0.9781476,0.2079117
2,EV,DSM,IAH,802.0,Friday,25,21,0.5,-0.8660254,1.0,6.123234000000001e-17,1.0,2.832769e-16
3,MQ,ORD,AVP,632.0,Wednesday,11,28,-2.449294e-16,-0.8660254,0.258819,-0.9659258,-0.8660254,0.5
4,MQ,DFW,TXK,181.0,Thursday,6,49,0.5,1.0,1.224647e-16,-1.0,5.665539e-16,-1.0


In [55]:
sample_fraction = 0.2

X_traincat_sample, _, y_traincat_sample, _ = train_test_split(X_train_cat, y_train_cat, test_size=1-sample_fraction, random_state=42)
X_testcat_sample, _, y_testcat_sample, _ = train_test_split(X_test_cat, y_test_cat, test_size=1-sample_fraction, random_state=42)


In [56]:
# Define the objective function to optimize
def lgbcat_eval(n_estimators, learning_rate, num_leaves, max_depth, subsample, colsample_bytree, reg_alpha, reg_lambda):
    """
    Trains a LightGBM with given hyperparameters and returns the returns -mse
    """
    # Convert parameters appripirate types
    n_estimators =  int(n_estimators)
    learning_rate = float(learning_rate)
    num_leaves = int(num_leaves)
    max_depth = int(max_depth)
    subsample = float(subsample)
    colsample_bytree = float(colsample_bytree)
    
        
    # Create Model
    lgbbocat = lgb.LGBMRegressor(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    num_leaves=num_leaves,
    max_depth=max_depth,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    reg_alpha = reg_alpha,
    reg_lambda = reg_lambda,
    random_state=42)
    
    # Train and test the model
    lgbbocat.fit(X_train_cat, y_train_cat, categorical_feature=categorical_features)
    y_pred_lgbbocat = lgbbocat.predict(X_test_cat)
    mse = mean_squared_error(y_test_cat, y_pred_lgbbocat)
    return -mse  


In [57]:
# Define the paramater bounds for testing
param_bounds_lgbcat = {
    'n_estimators': (100, 500),
    'learning_rate': (0.01, 0.2),    
    'num_leaves': (20, 100),
    'max_depth': (5, 50),  
    'subsample': (0.6, 1.0),
    "colsample_bytree": (0.6, 1.0)}    

In [58]:
#set up the optimizer
optimizer_lgbcat = BayesianOptimization(
    f=lgb_eval,
    pbounds=param_bounds_lgb,
    random_state=42,
    verbose=2)

In [59]:
%%time
# Run optimization
optimizer_lgbcat.maximize(init_points=10, n_iter=50)

# Print best parameters
print("Best hyperparameters found:", optimizer_lgbcat.max)

|   iter    |  target   | colsam... | learni... | max_depth | n_esti... | num_le... | subsample |
-------------------------------------------------------------------------------------------------
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005260 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 583
[LightGBM] [Info] Number of data points in the train set: 1102756, number of used features: 165
[LightGBM] [Info] Start training from score 5.719183
| [39m1        [39m | [39m-493.2   [39m | [39m0.7498   [39m | [39m0.1906   [39m | [39m37.94    [39m | [39m339.5    [39m | [39m32.48    [39m | [39m0.6624   [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008064 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise



| [39m6        [39m | [39m-495.3   [39m | [39m0.843    [39m | [39m0.0424   [39m | [39m7.927    [39m | [39m479.6    [39m | [39m97.25    [39m | [39m0.9234   [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009142 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 583
[LightGBM] [Info] Number of data points in the train set: 1102756, number of used features: 165
[LightGBM] [Info] Start training from score 5.719183
| [39m7        [39m | [39m-499.9   [39m | [39m0.7218   [39m | [39m0.02856  [39m | [39m35.79    [39m | [39m276.1    [39m | [39m29.76    [39m | [39m0.7981   [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006432 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM]

| [39m17       [39m | [39m-490.9   [39m | [39m0.757    [39m | [39m0.1777   [39m | [39m15.89    [39m | [39m347.3    [39m | [39m91.9     [39m | [39m0.7009   [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007329 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 583
[LightGBM] [Info] Number of data points in the train set: 1102756, number of used features: 165
[LightGBM] [Info] Start training from score 5.719183
| [39m18       [39m | [39m-491.1   [39m | [39m0.6124   [39m | [39m0.1321   [39m | [39m16.02    [39m | [39m343.3    [39m | [39m96.6     [39m | [39m0.8716   [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007084 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM]

| [39m31       [39m | [39m-491.8   [39m | [39m0.7006   [39m | [39m0.07743  [39m | [39m24.29    [39m | [39m345.0    [39m | [39m88.71    [39m | [39m0.916    [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 583
[LightGBM] [Info] Number of data points in the train set: 1102756, number of used features: 165
[LightGBM] [Info] Start training from score 5.719183
| [39m32       [39m | [39m-491.0   [39m | [39m0.9764   [39m | [39m0.162    [39m | [39m29.17    [39m | [39m341.2    [39m | [39m97.52    [39m | [39m0.8768   [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010226 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM]

| [39m45       [39m | [39m-491.2   [39m | [39m0.8137   [39m | [39m0.1144   [39m | [39m22.87    [39m | [39m357.6    [39m | [39m87.97    [39m | [39m0.9414   [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007197 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 583
[LightGBM] [Info] Number of data points in the train set: 1102756, number of used features: 165
[LightGBM] [Info] Start training from score 5.719183
| [39m46       [39m | [39m-493.4   [39m | [39m0.8412   [39m | [39m0.03587  [39m | [39m20.43    [39m | [39m363.6    [39m | [39m82.66    [39m | [39m0.7976   [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006335 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM]

| [39m59       [39m | [39m-492.1   [39m | [39m0.9412   [39m | [39m0.05933  [39m | [39m42.57    [39m | [39m342.5    [39m | [39m94.67    [39m | [39m0.6863   [39m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006397 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 583
[LightGBM] [Info] Number of data points in the train set: 1102756, number of used features: 165
[LightGBM] [Info] Start training from score 5.719183
| [39m60       [39m | [39m-492.0   [39m | [39m0.7522   [39m | [39m0.05823  [39m | [39m49.23    [39m | [39m408.4    [39m | [39m89.1     [39m | [39m0.6895   [39m |
Best hyperparameters found: {'target': -490.6941216113791, 'params': {'colsample_bytree': 0.819758663105755, 'learning_rate': 0.1071949436149192, 'max_depth': 49.23290923768107, 'n_estimators': 409.1520297877578, 'num_leaves': 95.066365

In [60]:
# Extract the best parameters
best_params_lgbcat = optimizer_lgbcat.max['params']
best_params_lgbcat['n_estimators'] = int(best_params_lgbcat['n_estimators'])
best_params_lgbcat['learning_rate'] = float(best_params_lgbcat['learning_rate'])
best_params_lgbcat['num_leaves'] = int(best_params_lgbcat['num_leaves'])
best_params_lgbcat['max_depth'] = int(best_params_lgbcat['max_depth'])
best_params_lgbcat['subsample'] = float(best_params_lgbcat['subsample'])
best_params_lgbcat['colsample_bytree'] = float(best_params_lgbcat['colsample_bytree'])


In [61]:
%%time
# Train the final model with the best parameters
lgbc_best = lgb.LGBMRegressor(**best_params_lgbcat, random_state=42)
lgbc_best.fit(X_train_cat, y_train_cat)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034215 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1025
[LightGBM] [Info] Number of data points in the train set: 5513433, number of used features: 13
[LightGBM] [Info] Start training from score 5.702085
CPU times: total: 6min
Wall time: 36.8 s


In [62]:
%%time
# Make predictions
y_pred_lgbc = lgbc_best.predict(X_test_cat)

CPU times: total: 1min
Wall time: 6.13 s


In [63]:
# Calculate evaluation metrics
mae_lgbc = mean_absolute_error(y_test_cat, y_pred_lgbc)
mse_lgbc = mean_squared_error(y_test_cat, y_pred_lgbc)
rmse_lgbc = np.sqrt(mse_lgbc)
r2_lgbc = r2_score(y_test_cat, y_pred_lgbc)

# Print evaluation metrics
print(f'Mean Absolute Error (MAE): {mae_lgbc}')
print(f'Mean Squared Error (MSE): {mse_lgbc}')
print(f'Root Mean Squared Error (RMSE): {rmse_lgbc}')
print(f'R-squared (R2): {r2_lgbc}')

Mean Absolute Error (MAE): 12.810996187940905
Mean Squared Error (MSE): 455.97570928808136
Root Mean Squared Error (RMSE): 21.353587738084702
R-squared (R2): 0.16012999489620583


Conclusion:  
This model performs better than XGBoost and the 1st LightGBM model in all metrics/ Based on this, the 2nd LightGBM model is the best model to use of the model's tested. That being said, there is much room for improvement. Currently, this model only explains approximately 16% of the variance in the data.

Possible improvements could be made by having more features, such as data across multiple years, the reason for each delay, weather information during each flight, or if the previous flight was delayed.
