In [29]:
# Baseline model
# Idea: the estimated delay of a flight is the average delay of previous flights on the same flight route.

In [62]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

# Load the data set; the first CSV column is used as the DataFrame index.
df = pd.read_csv(
    'data/eda_data.csv',
    index_col=0,
)


In [56]:
def calc_average_delay_for_each_route(X_train, y_train):
    """Compute the mean training delay for every flight route.

    Args:
        X_train: Training features; must contain a 'flight_route' column.
        y_train: Training delays. It has to be named 'target', because the
            grouped column is looked up under that name after concatenation.

    Returns:
        DataFrame with one row per route and the columns 'flight_route'
        and 'target' (the mean delay of that route in the training data).
    """
    # Put features and target side by side so they can be grouped together.
    features_with_target = pd.concat((X_train, y_train), axis=1)

    delays_by_route = features_with_target.groupby('flight_route')['target']
    route_means = delays_by_route.mean()

    # Turn the route index back into a regular column so the result can be
    # merged on 'flight_route' later.
    return route_means.reset_index()

In [57]:
def estimate_delays(average_delays, X_test, default_delay=0):
    """Predict a delay for every row of X_test from per-route averages.

    Args:
        average_delays: DataFrame with the columns 'flight_route' and
            'target' (mean delay per route).
        X_test: Features to predict for; must contain a 'flight_route'
            column.
        default_delay: Value used for routes that do not appear in
            average_delays. Defaults to 0.

    Returns:
        Series named 'target' with one estimated delay per row of X_test,
        in the same row order. The left merge produces a fresh 0..n-1
        index, so the result lines up with X_test by position, not by
        index label.
    """
    # Left join keeps every test row; routes without a known average get NaN.
    merged_data = X_test.merge(average_delays, on=['flight_route'], how='left')

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the column selection: the chained in-place form warns on recent
    # pandas and does not update the frame under Copy-on-Write.
    merged_data['target'] = merged_data['target'].fillna(default_delay)

    return merged_data['target']

In [65]:
# A binary 'delayed' label is required for classification, but not for
# this regression baseline.
# df['delayed'] = df['target'] >0.1

# Idea is using flight routes to predict delays
# Creating a flight route column
df['flight_route'] = df['depstn'] + '-' + df['arrstn']

# Defining features and target
X = df.drop(columns=['target'])
y = df['target']

# Splitting data (fixed seed so the reported score is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Calculate average delays on each flight route
average_delays = calc_average_delay_for_each_route(X_train, y_train)

# Predict flight delays based on the average delay of each flight route
y_pred = estimate_delays(average_delays, X_test)

# Scoring the prediction using rmse.
# The square root is taken explicitly because the `squared=False` argument
# of mean_squared_error was deprecated and later removed from scikit-learn.
rmse_cv = np.sqrt(mean_squared_error(y_test, y_pred))

# Displaying the score
model_scores_df = pd.DataFrame({
    'Evaluation Metric': ["rmse"],
    'Model Score': [rmse_cv]
})

display(model_scores_df)


Unnamed: 0,Evaluation Metric,Model Score
0,rmse,116.980063


In [None]:
# To test the stability of the prediction:
# split the data set randomly N_SPLITS times and report the mean and the
# standard deviation of the rmse over all splits.
N_SPLITS = 1000

rmse_scores = []
for split_seed in range(N_SPLITS):
    # Loop-local names are used on purpose: the fixed-seed split
    # (X_train, X_test, y_test, y_pred) is still read by other cells and
    # must not be overwritten here.
    # Seeding each split with its loop counter keeps the run reproducible
    # while still producing a different split per iteration.
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.3, random_state=split_seed
    )

    # Calculate average delays on each flight route
    route_means = calc_average_delay_for_each_route(X_tr, y_tr)

    # Predict flight delays based on the average delay of each flight route
    y_te_pred = estimate_delays(route_means, X_te)

    # Scoring the prediction using rmse (explicit square root; the
    # `squared=False` argument is no longer available in scikit-learn).
    rmse_scores.append(np.sqrt(mean_squared_error(y_te, y_te_pred)))

# Displaying the aggregated score: the mean over all splits, not the value
# of the last split, plus the spread as a measure of stability.
model_scores_df = pd.DataFrame({
    'Evaluation Metric': ["rmse", "rmse_std"],
    'Model Score': [np.mean(rmse_scores), np.std(rmse_scores)]
})

display(model_scores_df)

In [63]:

# Other scoring methods, evaluated on the y_test / y_pred that are
# currently defined in the kernel.
r2_cv = r2_score(y_test, y_pred)
mse_cv = mean_squared_error(y_test, y_pred)
# rmse is the square root of the mse computed above; this avoids the
# `squared=False` argument, which is no longer available in scikit-learn,
# and the duplicated rmse computation.
rmse_cv = np.sqrt(mse_cv)
mae_cv = mean_absolute_error(y_test, y_pred)

model_scores_df = pd.DataFrame({
    'Evaluation Metric': ["r2","mse","rmse","mae"],
    'Model Score': [r2_cv, mse_cv, rmse_cv, mae_cv]
})

display(model_scores_df)


Unnamed: 0,Evaluation Metric,Model Score
0,r2,0.029222
1,mse,13684.335163
2,rmse,116.980063
3,mae,54.340054
