# 0. Import Libraries

In [36]:
import pandas as pd
import numpy as np
from my_krml_25246568.data.sets import pop_target
from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.model_selection import RandomizedSearchCV, train_test_split
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from joblib import dump

# 1. Load Data

In [48]:
df = pd.read_csv('../data/processed/processed_data.csv', low_memory=False)

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13519999 entries, 0 to 13519998
Data columns (total 18 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   startingAirport           float64
 1   destinationAirport        float64
 2   totalTravelDistance       float64
 3   travelDurationMins        float64
 4   cabinTypeSegmentSegment1  float64
 5   cabinTypeSegmentSegment2  float64
 6   cabinTypeSegmentSegment3  float64
 7   cabinTypeSegmentSegment4  float64
 8   numStops                  float64
 9   flightYear                float64
 10  flightMonth               float64
 11  flightDay                 float64
 12  flightHour                float64
 13  flightMinute              float64
 14  searchYear                float64
 15  searchMonth               float64
 16  searchDay                 float64
 17  totalFare                 float64
dtypes: float64(18)
memory usage: 1.8 GB


In [50]:
df.head()

Unnamed: 0,startingAirport,destinationAirport,totalTravelDistance,travelDurationMins,cabinTypeSegmentSegment1,cabinTypeSegmentSegment2,cabinTypeSegmentSegment3,cabinTypeSegmentSegment4,numStops,flightYear,flightMonth,flightDay,flightHour,flightMinute,searchYear,searchMonth,searchDay,totalFare
0,0.964373,-1.603932,0.710709,0.244224,-0.050707,0.637703,-0.281603,-0.052332,0.377654,0.0,-0.621904,0.582158,0.025652,1.69509,0.0,-1.021327,0.38687,103.98
1,0.964373,-1.603932,1.025539,-0.193447,-0.050707,0.637703,-0.281603,-0.052332,0.377654,0.0,-0.621904,0.582158,-1.200963,1.589283,0.0,-1.021327,0.38687,216.58
2,0.964373,-1.603932,1.025539,0.578117,-0.050707,0.637703,-0.281603,-0.052332,0.377654,0.0,-0.621904,0.468623,1.497591,-1.108799,0.0,-1.021327,0.38687,216.58
3,0.964373,-1.603932,1.025539,-0.184423,-0.050707,0.637703,-0.281603,-0.052332,0.377654,0.0,-0.621904,0.582158,0.270975,1.589283,0.0,-1.021327,0.38687,237.58
4,0.964373,-1.603932,1.551857,1.958812,-0.050707,0.637703,-0.281603,-0.052332,0.377654,0.0,-0.621904,0.582158,0.393637,0.795729,0.0,-1.021327,0.38687,307.21


In [51]:
df.shape

(13519999, 18)

# 2. Split Dataset

## [2.1] Sampling dataset

Sampling the dataset based on the origin destination pair, so that all the pairs are included in the sample

In [52]:
# fraction of each group to sample
sample_fraction = 0.3

In [53]:
# Perform stratified sampling by 'startingAirport' and 'destinationAirport'
sampled_df = df.groupby(['startingAirport', 'destinationAirport'], group_keys=False).apply \
(lambda x: x.sample(frac=sample_fraction))

  sampled_df = df.groupby(['startingAirport', 'destinationAirport'], group_keys=False).apply(lambda x: x.sample(frac=sample_fraction))


In [54]:
# Reset index
sampled_df = sampled_df.reset_index(drop=True)

In [55]:
sampled_df.shape

(4055994, 18)

## [2.2] Splitting Sampled Data

### Extracting target variable

In [56]:
sampled_df, target = pop_target(sampled_df, 'totalFare')

### Splitting into training and validation

In [57]:
X_train, X_val, y_train, y_val = train_test_split(sampled_df, target, test_size=0.3, random_state=42)

### Splitting validation into validation and testing

In [58]:
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=42)

# 3. Baseline Model

In [59]:
y_mean = y_train.mean()
y_mean

np.float64(373.8322143248352)

In [60]:
y_base = np.full(y_train.shape, y_mean)
print("RMSE on Training Data:", rmse(y_train, y_base))

RMSE on Training Data: 207.50063992018602


In [61]:
y_mean_val = y_val.mean()
y_mean_val

np.float64(373.97909533053155)

In [62]:
y_val_base = np.full(y_val.shape, y_mean_val)
print("RMSE on Validation Data:", rmse(y_val, y_val_base))

RMSE on Validation Data: 208.48180773308704


In [63]:
y_mean_test = y_test.mean()
y_mean_test

np.float64(373.3327081360947)

In [64]:
y_test_base = np.full(y_test.shape, y_mean_test)
print("RMSE on Testing Data:", rmse(y_test, y_test_base))

RMSE on Testing Data: 206.88779210514693


# 4. Model Picking

In [65]:
def train_and_evaluate(models, X_train, y_train, X_val, y_val, X_test, y_test):
    results = {}
    
    for name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)
        
        # Predictions on training and validation sets
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)
        y_test_pred = model.predict(X_test)
        
        # Calculate RMSE for training and validation sets
        train_rmse = rmse(y_train, y_train_pred)
        val_rmse = rmse(y_val, y_val_pred)
        test_rmse = rmse(y_test, y_test_pred)
        
        # Store the results
        results[name] = {'Train RMSE': train_rmse, 'Validation RMSE': val_rmse, 'Test RMSE': test_rmse}
        print(f"{name} - Train RMSE: {train_rmse}, Validation RMSE: {val_rmse}, Test RMSE: {test_rmse}")
    
    return pd.DataFrame(results).T

In [66]:
models = {
    'Linear Regression': LinearRegression(),
    'ElasticNet': ElasticNet(),
    'Decision Tree': DecisionTreeRegressor(),
    'XGBoost': xgb.XGBRegressor(),
    'LightGBM': lgb.LGBMRegressor()
}

In [67]:
results = train_and_evaluate(models, X_train, y_train, X_val, y_val, X_test, y_test)
print(results)

Linear Regression - Train RMSE: 159.6910586835628, Validation RMSE: 160.69128026500124, Test RMSE: 159.41555335338293
ElasticNet - Train RMSE: 165.08119165908357, Validation RMSE: 166.12391507924224, Test RMSE: 164.56311253805904
Decision Tree - Train RMSE: 8.639020507192985, Validation RMSE: 99.38408597128749, Test RMSE: 98.63103229294242
XGBoost - Train RMSE: 114.28259842877618, Validation RMSE: 115.47380586440094, Test RMSE: 114.313406750339
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035113 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 725
[LightGBM] [Info] Number of data points in the train set: 2839195, number of used features: 15
[LightGBM] [Info] Start training from score 373.832215
LightGBM - Train RMSE: 126.71729095618693, Validation RMSE: 127.70910017514102, Test RMSE: 126.38486389973264
                   Train RMSE  V

# 5. Increasing Sample Size

In [41]:
# fraction of each group to sample
sample_fraction = 0.5

In [42]:
# Perform stratified sampling by 'startingAirport' and 'destinationAirport'
sampled_df_1 = df.groupby(['startingAirport', 'destinationAirport'], group_keys=False).apply(lambda x: x.sample(frac=sample_fraction))

  sampled_df_1 = df.groupby(['startingAirport', 'destinationAirport'], group_keys=False).apply(lambda x: x.sample(frac=sample_fraction))


In [43]:
sampled_df_1 = sampled_df_1.reset_index(drop=True)

In [44]:
sampled_df_1, target = pop_target(sampled_df_1, 'totalFare')

In [45]:
X_train, X_val, y_train, y_val = train_test_split(sampled_df_1, target, test_size=0.3, random_state=42)

In [46]:
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.3, random_state=42)

In [47]:
results = train_and_evaluate(models, X_train, y_train, X_val, y_val, X_test, y_test)
print(results)

Linear Regression - Train RMSE: 159.81528945924043, Validation RMSE: 160.2281542890805, Test RMSE: 160.29340489922322
ElasticNet - Train RMSE: 165.22311631429795, Validation RMSE: 165.55353156979874, Test RMSE: 165.82496274061714
Decision Tree - Train RMSE: 10.873027468363507, Validation RMSE: 87.97101502194292, Test RMSE: 88.09606686842487
XGBoost - Train RMSE: 114.52875358421866, Validation RMSE: 115.02437856514844, Test RMSE: 115.10356705204691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065354 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 725
[LightGBM] [Info] Number of data points in the train set: 4732003, number of used features: 15
[LightGBM] [Info] Start training from score 373.728531
LightGBM - Train RMSE: 126.92233282337213, Validation RMSE: 127.2175764959121, Test RMSE: 127.28277387712644
                   Train RMSE 