## Problem statement : Predict total fare for each flight for all carriers on a given day.

In [527]:
!pip install xgboost lightgbm




[notice] A new release of pip is available: 25.1.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [528]:
import pandas as pd
import numpy as np
import re
import warnings
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,OrdinalEncoder,StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_predict,cross_val_score
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,StackingRegressor,VotingRegressor,HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.linear_model import LinearRegression,SGDRegressor,Ridge,Lasso,ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.compose import TransformedTargetRegressor
import numpy as np


warnings.filterwarnings('ignore')

In [529]:
# Loading the datasets

#Train Dataframe
service_index_df=next(pd.read_csv("training\\training\\service_index.csv",index_col=0,chunksize=10000))
train_fares_df=next(pd.read_csv("training\\training\\train_fares.csv",index_col=0,on_bad_lines='skip', engine='python',chunksize=10000))
train_schedules_df=next(pd.read_csv("training\\training\\train_schedules.csv",index_col=0,chunksize=10000))

#Test Dataframe
test_fares_df=pd.read_csv("test\\test\\test_fares_data.csv",index_col=0, engine='python')
test_schedules_df=pd.read_csv("test\\test\\test_schedules.csv",index_col=0)

In [530]:
mean_fare = train_fares_df['total_fare'].mean()
mean_fare

np.float64(615.5043)

In [531]:
for df in [service_index_df,train_fares_df,train_schedules_df,test_fares_df,test_schedules_df]:
  print(df.duplicated().sum())
  df.drop_duplicates(inplace=True)

0
1080
0
0
0


In [532]:
#Basic testing and cleaning to make sure I dont face any joining issues afterwards
def data_check(df):
    for col in df.select_dtypes(exclude='number').columns:
        print(f'''\n================================{[ col ]}====================================''')
        print(df[col].unique())
        print(df[col].isnull().sum())
data_check(test_schedules_df)


['L1' 'OTH' 'U3' 'L2' 'U1' 'L3']
0

['Airport26' 'Airport17' 'Airport4' 'Airport20' 'Airport30' 'Airport31'
 'Airport60' 'Airport43']
0

['Airport30' 'Airport4' 'Airport17' 'Airport43' 'Airport31' 'Airport60'
 'Airport26' 'Airport20']
0

['2019-01-01' '2019-01-02' '2019-01-03' '2019-01-04' '2019-01-05'
 '2019-01-06' '2019-01-07']
0

['2019-01-01 18:28:00.0' '2019-01-01 07:50:00.0' '2019-01-01 11:55:00.0'
 ... '2019-01-07 19:49:00.0' '2019-01-07 07:48:00.0'
 '2019-01-07 09:56:00.0']
0

['2019-01-01 19:49:00.0' '2019-01-01 09:12:00.0' '2019-01-01 13:19:00.0'
 ... '2019-01-07 22:47:00.0' '2019-01-07 12:47:00.0'
 '2019-01-07 18:51:00.0']
0

['2019-01-02 00:28:00.0' '2019-01-01 13:50:00.0' '2019-01-01 17:55:00.0'
 ... '2019-01-08 01:49:00.0' '2019-01-07 13:48:00.0'
 '2019-01-07 15:56:00.0']
0

['2019-01-02 03:49:00.0' '2019-01-01 17:12:00.0' '2019-01-01 21:19:00.0'
 ... '2019-01-08 04:47:00.0' '2019-01-07 18:47:00.0'
 '2019-01-08 00:51:00.0']
0


In [533]:
#Make date consistent in all
def make_date_consistent(df,sample_size=500):
    for col in df.select_dtypes(include='object'):
        sample = df[col].head(sample_size)
        if sample.empty:
            continue

        parsed = pd.to_datetime(sample, errors='coerce')
        if parsed.notna().all():
            df[col] = pd.to_datetime(df[col], errors='coerce').dt.strftime('%Y-%m-%d %H-%M-%S')
    return df


for df in [service_index_df,train_fares_df,train_schedules_df,test_fares_df,test_schedules_df]:
    df=make_date_consistent(df)

In [534]:
# Joining fares to schedules
def process_schedule_data(df):
  df['flight_duration']=abs((pd.to_datetime(df['flt_departure_local_time'])-pd.to_datetime(df['flt_arrival_local_time'])).dt.total_seconds()/60)
  df['tz']=abs((pd.to_datetime(df['flt_departure_local_time'])-pd.to_datetime(df['flt_departure_gmt'])).dt.total_seconds()/60)
  df = (
    df
    .groupby(['carrier', 'flt_num', 'origin', 'destination', 'flt_departure_dt'])
    .agg(
        flight_duration_mean=('flight_duration', 'mean'),
        tz_mean=('tz', 'mean'),
        num_flights=('flt_departure_local_time', 'count')
    )
    .reset_index())
  return df

train_schedules_df_grouped=process_schedule_data(train_schedules_df.copy())
test_schedules_df_grouped=process_schedule_data(test_schedules_df.copy())
df_train=pd.merge(train_fares_df,train_schedules_df_grouped,how='left',on=['carrier','flt_num','origin','destination','flt_departure_dt'])
df_test=pd.merge(test_fares_df,test_schedules_df_grouped,how='left',on=['carrier','flt_num','origin','destination','flt_departure_dt'])
df_train['mo']=pd.to_datetime(df_train['flt_departure_dt']).dt.month
df_test['mo']=pd.to_datetime(df_test['flt_departure_dt']).dt.month



In [535]:
service_mapping=service_index_df.groupby(
    ['mo','origin','destination','carrier']
).agg(
    {'scaled_demand':'mean',
     'scaled_share':'mean'}
)
df_train = df_train.merge(
    service_mapping,
    on=['mo','origin','destination','carrier'],
    how='left',
    validate='many_to_one'
)

df_test = df_test.merge(
    service_mapping,
    on=['mo','origin','destination','carrier'],
    how='left',
    validate='many_to_one'
)

In [536]:
display(test_fares_df.tail(2))
display(test_schedules_df.tail(2))
display(service_index_df.tail(2))
display(df_train.tail(2))

Unnamed: 0,origin,destination,carrier,flt_num,flt_departure_dt,observation_date,origin_city,destination_city
29824,Airport17,Airport4,L1,18471,2019-01-07 00-00-00,2018-12-31 00-00-00,City17,City4
29825,Airport17,Airport4,L1,1487,2019-01-07 00-00-00,2018-12-31 00-00-00,City17,City4


Unnamed: 0,carrier,flt_num,origin,destination,flt_departure_dt,flt_departure_local_time,flt_arrival_local_time,flt_departure_gmt,flt_arrival_gmt
1233,L2,8018,Airport43,Airport4,2019-01-07 00-00-00,2019-01-07 09-56-00,2019-01-07 12-47-00,2019-01-07 15-56-00,2019-01-07 18-47-00
1234,L1,5823,Airport43,Airport4,2019-01-07 00-00-00,2019-01-07 15-55-00,2019-01-07 18-51-00,2019-01-07 21-55-00,2019-01-08 00-51-00


Unnamed: 0,yr,mo,origin,destination,carrier,scaled_demand,scaled_share
9999,2018,6,Airport30,Airport60,OTH,7255,245
10000,2018,6,Airport31,Airport4,L1,3336,210


Unnamed: 0,origin,destination,carrier,flt_num,flt_departure_dt,observation_date,total_fare,origin_city,destination_city,flight_duration_mean,tz_mean,num_flights,mo,scaled_demand,scaled_share
8918,Airport60,Airport30,L1,3120,2018-01-05 00-00-00,2017-12-01 00-00-00,802,City56,City27,,,,1,6579.0,262.0
8919,Airport60,Airport30,U3,3688,2018-01-05 00-00-00,2017-12-01 00-00-00,356,City56,City27,,,,1,,


In [539]:
df_train.isna().sum().sort_values(ascending=False)

'''
scaled_demand                100308
scaled_share                 100308
demand_x_weekend             100308
share_x_congestion           100308
flight_duration_log           99108
tz_abs                        99108
tz_mean                       99108
num_flights                   99108
flight_duration_mean          99108

'''

def custom_impute(df_train):
    grouped_data=df_train.groupby(['origin','destination','carrier','flt_num']).agg({'flight_duration_mean':'mean','tz_mean':'mean','num_flights':'mean'})
    df_train=df_train.merge(grouped_data,how='left',on=['origin','destination','carrier','flt_num'],suffixes=('', '_grp'))
    df_train['flight_duration_mean'].fillna(df_train['flight_duration_mean_grp'],inplace=True)
    df_train['tz_mean'].fillna(df_train['tz_mean_grp'],inplace=True)
    df_train['num_flights'].fillna(df_train['num_flights_grp'],inplace=True)
    df_train.drop(['flight_duration_mean_grp','tz_mean_grp','num_flights_grp'],axis=1,inplace=True)

    grouped_data=df_train.groupby(['origin','destination','carrier']).agg({'flight_duration_mean':'mean','tz_mean':'mean','num_flights':'mean'})
    df_train=df_train.merge(grouped_data,how='left',on=['origin','destination','carrier'],suffixes=('', '_grp'))
    df_train['flight_duration_mean'].fillna(df_train['flight_duration_mean_grp'],inplace=True)
    df_train['tz_mean'].fillna(df_train['tz_mean_grp'],inplace=True)
    df_train['num_flights'].fillna(df_train['num_flights_grp'],inplace=True)
    df_train.drop(['flight_duration_mean_grp','tz_mean_grp','num_flights_grp'],axis=1,inplace=True)

    grouped_data=df_train.groupby(['origin','destination']).agg({'flight_duration_mean':'mean','tz_mean':'mean','num_flights':'mean'})
    df_train=df_train.merge(grouped_data,how='left',on=['origin','destination'],suffixes=('', '_grp'))
    df_train['flight_duration_mean'].fillna(df_train['flight_duration_mean_grp'],inplace=True)
    df_train['tz_mean'].fillna(df_train['tz_mean_grp'],inplace=True)
    df_train['num_flights'].fillna(df_train['num_flights_grp'],inplace=True)
    df_train.drop(['flight_duration_mean_grp','tz_mean_grp','num_flights_grp'],axis=1,inplace=True)

    grouped_data=df_train.groupby(['origin','destination','carrier','mo']).agg({'scaled_demand':'mean','scaled_share':'mean'})
    df_train=df_train.merge(grouped_data,how='left',on=['origin','destination','carrier','mo'],suffixes=('', '_grp'))
    df_train['scaled_demand'].fillna(df_train['scaled_demand_grp'],inplace=True)
    df_train['scaled_share'].fillna(df_train['scaled_share_grp'],inplace=True)
    df_train.drop(['scaled_demand_grp','scaled_share_grp'],axis=1,inplace=True)

    grouped_data=df_train.groupby(['origin','destination','carrier']).agg({'scaled_demand':'mean','scaled_share':'mean'})
    df_train=df_train.merge(grouped_data,how='left',on=['origin','destination','carrier'],suffixes=('', '_grp'))
    df_train['scaled_demand'].fillna(df_train['scaled_demand_grp'],inplace=True)
    df_train['scaled_share'].fillna(df_train['scaled_share_grp'],inplace=True)
    df_train.drop(['scaled_demand_grp','scaled_share_grp'],axis=1,inplace=True)

    grouped_data=df_train.groupby(['origin','destination']).agg({'scaled_demand':'mean'})
    df_train=df_train.merge(grouped_data,how='left',on=['origin','destination'],suffixes=('', '_grp'))
    df_train['scaled_demand'].fillna(df_train['scaled_demand_grp'],inplace=True)
    df_train.drop(['scaled_demand_grp'],axis=1,inplace=True)

    grouped_data=df_train.groupby(['carrier']).agg({'scaled_share':'mean'})
    df_train=df_train.merge(grouped_data,how='left',on=['carrier'],suffixes=('', '_grp'))
    df_train['scaled_share'].fillna(df_train['scaled_share_grp'],inplace=True)
    df_train.drop(['scaled_share_grp'],axis=1,inplace=True)

    num_cols=df_train.select_dtypes(include=['number']).columns
    val_cols=df_train.select_dtypes(include=['object']).columns
    df_train[num_cols]=df_train[num_cols].fillna(df_train[num_cols].mean())
    df_train[val_cols]=df_train[val_cols].fillna(df_train[val_cols].mode().iloc[0])

    return df_train

df_train=custom_impute(df_train)
df_test=custom_impute(df_test)

In [540]:
def add_features(df):
    df = df.copy()

    dt = pd.to_datetime(df['flt_departure_dt'])
    ob_dt= pd.to_datetime(df['observation_date'])
    df['month'] = dt.dt.month
    df['day_of_week'] = dt.dt.weekday            # 0=Mon, 6=Sun
    df['week_of_year'] = dt.dt.isocalendar().week.astype(int)
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    df['is_month_start'] = dt.dt.is_month_start.astype(int)
    df['is_month_end'] = dt.dt.is_month_end.astype(int)

    df['route'] = df['origin'] + '_' + df['destination']
    df['route_len'] = df['route'].str.len()      # weak but harmless

    df['flights_per_route_day'] = (
        df.groupby(['route', 'flt_departure_dt'])['flt_num']
          .transform('count')
    )
    df['carrier_flights_route_day'] = (
        df.groupby(['carrier', 'route', 'flt_departure_dt'])['flt_num']
          .transform('count')
    )
    df['carrier_share_route_day'] = (
        df['carrier_flights_route_day'] /
        df['flights_per_route_day'].replace(0, np.nan)
    )
    df['days_until_departure']= abs((dt - ob_dt).dt.days)
    df['flight_duration_log'] = np.log1p(df['flight_duration_mean'])
    df['tz_abs'] = df['tz_mean'].abs()
    df['origin_freq'] = df['origin'].map(df['origin'].value_counts(normalize=True))
    df['destination_freq'] = df['destination'].map(df['destination'].value_counts(normalize=True))
    df['route_freq'] = df['route'].map(df['route'].value_counts(normalize=True))
    df['demand_x_weekend'] = df['scaled_demand'] * df['is_weekend']
    df['share_x_congestion'] = (df['scaled_share'] * df['flights_per_route_day'])
    return df


df_train = add_features(df_train,)
df_test = add_features(df_test, )

In [541]:
for col in df_train.select_dtypes(include='number').columns:
  print(f'''{col} ===> {df_train[col].corr(df_train['total_fare'])}''')

flt_num ===> 0.14527915274106765
total_fare ===> 0.9999999999999999
flight_duration_mean ===> 0.046910928649144736
tz_mean ===> nan
num_flights ===> nan
mo ===> -0.07696254170426657
scaled_demand ===> -0.09493968691895895
scaled_share ===> -0.07788487174383457
month ===> -0.07696254170426657
day_of_week ===> 0.049947095051116615
week_of_year ===> -0.1294833333403524
is_weekend ===> 0.10033598065851151
is_month_start ===> -0.001826596492673091
is_month_end ===> -0.018182400345895394
route_len ===> -0.1444208195632326
flights_per_route_day ===> -0.04923619787730573
carrier_flights_route_day ===> 0.00883793453093429
carrier_share_route_day ===> 0.18984000072905324
days_until_departure ===> -0.027824842427283374
flight_duration_log ===> 0.043368280116206755
tz_abs ===> nan
origin_freq ===> -0.055199696499253326
destination_freq ===> -0.18209467110916808
route_freq ===> -0.06193607960934564
demand_x_weekend ===> 0.1000572345149849
share_x_congestion ===> -0.08785073946261288


In [543]:
# Drop columns that won't be used for modeling
# Keep missing values for imputation instead of dropping rows

df_train = df_train.drop(['flt_departure_dt','observation_date'], axis=1)
df_test = df_test.drop(['flt_departure_dt','observation_date'], axis=1)

df_train.shape, df_test.shape


((8920, 32), (29825, 31))

In [544]:
y=df_train.pop('total_fare')
X=df_train

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [545]:
# Columns
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(exclude='number').columns

# Preprocess
numeric_pipe = Pipeline(steps=[
    ('scale',StandardScaler())
])

categorical_pipe = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', min_frequency=50, sparse_output=False))
])

preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_pipe, num_cols),
        ('cat', categorical_pipe, cat_cols),
    ],
    remainder='passthrough'
)


base_models = [
    ('elastic', ElasticNet(alpha=0.5, l1_ratio=0.5)),
    ('rf', RandomForestRegressor(
        n_estimators=600,
        random_state=42,
        n_jobs=-1
    )),
    ('hgb', HistGradientBoostingRegressor(
        max_iter=500,
        learning_rate=0.05,
        random_state=42
    ))
]

stack = StackingRegressor(
    estimators=base_models,
    final_estimator=Ridge(alpha=1.0),
    cv=5,
    n_jobs=-1,
    passthrough=False 
)


pipe = Pipeline(steps=[
    ('prep', preprocess),
    ('model', stack),
])

pipe


0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,50
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,estimators,"[('elastic', ...), ('rf', ...), ...]"
,final_estimator,Ridge()
,cv,5
,n_jobs,-1
,passthrough,False
,verbose,0

0,1,2
,alpha,0.5
,l1_ratio,0.5
,fit_intercept,True
,precompute,False
,max_iter,1000
,copy_X,True
,tol,0.0001
,warm_start,False
,positive,False
,random_state,

0,1,2
,n_estimators,600
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,loss,'squared_error'
,quantile,
,learning_rate,0.05
,max_iter,500
,max_leaf_nodes,31
,max_depth,
,min_samples_leaf,20
,l2_regularization,0.0
,max_features,1.0
,max_bins,255

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [546]:
X_train.isna().sum().sort_values(ascending=False)

origin                       0
destination                  0
carrier                      0
flt_num                      0
origin_city                  0
destination_city             0
flight_duration_mean         0
tz_mean                      0
num_flights                  0
mo                           0
scaled_demand                0
scaled_share                 0
month                        0
day_of_week                  0
week_of_year                 0
is_weekend                   0
is_month_start               0
is_month_end                 0
route                        0
route_len                    0
flights_per_route_day        0
carrier_flights_route_day    0
carrier_share_route_day      0
days_until_departure         0
flight_duration_log          0
tz_abs                       0
origin_freq                  0
destination_freq             0
route_freq                   0
demand_x_weekend             0
share_x_congestion           0
dtype: int64

In [547]:
# Train 
tmp = pipe.fit(X_train, y_train)


In [548]:
# Evaluate

y_pred = pipe.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mae


86.10109630152319