In [3]:
# Extending Baseline model(Notebook 7) based on Erics Idea
# Idea: ML fit for each route.

In [32]:

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import preprocessing
import gc  # Import the garbage collection module



In [5]:
df = pd.read_csv('data/eda_data.csv',index_col=0)
# Preprocessing

# Dropping some columns
df.drop(['id', 'std', 'sta', 'fltid','arr_iata','dep_iata','ac'], axis=1,inplace=True)

# Creating flight route column 
df['flight_route'] = df['depstn'] + '-' + df['arrstn']

# Label encoding
categorical_columns = ['depstn','arrstn','status','arr_country','dep_country','season','airline_code','international_flight']

# label_encoder object knows 
# can use ordinal encoder
# how to understand word labels
label_encoder = preprocessing.LabelEncoder()

# Encode labels in column 'species'.
for i in categorical_columns:
    df[i]= label_encoder.fit_transform(df[i])

In [6]:
# Defining features and target
X = df.drop(['target'],axis=1)
y = df['target'] 

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [33]:
# Combine X_train and y_train into a single DataFrame to group and model by flight route
train_data = pd.concat([X_train, y_train], axis=1)

# List of models for each route
models= {}

# Looping over the routes
for idx, (route, group) in enumerate(train_data.groupby('flight_route')):
    X_fr_train = group.drop(['target','flight_route'],axis=1)
    y_fr_train = group['target']
    # Model for each route
    model = RandomForestRegressor()
    model.fit(X_fr_train, y_fr_train)

    # Add models to list
    models[route] = model
    print("Progress: ",round(idx*100/len(train_data.groupby('flight_route')),0),"%")

Progress:  0.0 %
Progress:  0.0 %
Progress:  0.0 %
Progress:  0.0 %
Progress:  1.0 %
Progress:  1.0 %
Progress:  1.0 %
Progress:  1.0 %
Progress:  1.0 %
Progress:  1.0 %
Progress:  1.0 %
Progress:  2.0 %
Progress:  2.0 %
Progress:  2.0 %
Progress:  2.0 %
Progress:  2.0 %
Progress:  2.0 %
Progress:  2.0 %
Progress:  3.0 %
Progress:  3.0 %
Progress:  3.0 %
Progress:  3.0 %
Progress:  3.0 %
Progress:  3.0 %
Progress:  4.0 %
Progress:  4.0 %
Progress:  4.0 %
Progress:  4.0 %
Progress:  4.0 %
Progress:  4.0 %
Progress:  4.0 %
Progress:  5.0 %
Progress:  5.0 %
Progress:  5.0 %
Progress:  5.0 %
Progress:  5.0 %
Progress:  5.0 %
Progress:  5.0 %
Progress:  6.0 %
Progress:  6.0 %
Progress:  6.0 %
Progress:  6.0 %
Progress:  6.0 %
Progress:  6.0 %
Progress:  6.0 %
Progress:  7.0 %
Progress:  7.0 %
Progress:  7.0 %
Progress:  7.0 %
Progress:  7.0 %
Progress:  7.0 %
Progress:  7.0 %
Progress:  8.0 %
Progress:  8.0 %
Progress:  8.0 %
Progress:  8.0 %
Progress:  8.0 %
Progress:  8.0 %
Progress:  9.0

In [22]:
def predict_delay_for_a_single_route(features,model):
    delay = model.predict(features.drop(['flight_route'],axis=1))
    return delay

In [34]:
# Predict delays for the test set
# y_pred = pd.DataFrame(columns=["predicted_delay"])
y_pred = np.empty(len(X_test))
# print(len(X_test))
for i in range(len(X_test)):
    print("Progress: ",round(i*100/len(X_test),0),"%","idx: ",i)
    # try:
    # Attempt to make predictions
    if X_test.iloc[i]['flight_route'] in models:
        delay = predict_delay_for_a_single_route(X_test.iloc[[i]],models[X_test.iloc[i]['flight_route']])
    else:
        delay = 0
    y_pred[i] = delay
    
    # Explicitly release memory occupied by 'delay' and any other objects as needed
    del delay
    gc.collect()  # Trigger garbage collection
    # except Exception as e:
    #     # Catch and handle any exceptions that occur
    #     print(f"An error occurred: {str(e)}")
    

# print(y_pred.head())
# y_test

Progress:  0.0 % idx:  0
Progress:  0.0 % idx:  1
Progress:  0.0 % idx:  2
Progress:  0.0 % idx:  3
Progress:  0.0 % idx:  4
Progress:  0.0 % idx:  5
Progress:  0.0 % idx:  6
Progress:  0.0 % idx:  7
Progress:  0.0 % idx:  8
Progress:  0.0 % idx:  9
Progress:  0.0 % idx:  10
Progress:  0.0 % idx:  11
Progress:  0.0 % idx:  12
Progress:  0.0 % idx:  13
Progress:  0.0 % idx:  14
Progress:  0.0 % idx:  15
Progress:  0.0 % idx:  16
Progress:  0.0 % idx:  17
Progress:  0.0 % idx:  18
Progress:  0.0 % idx:  19
Progress:  0.0 % idx:  20
Progress:  0.0 % idx:  21
Progress:  0.0 % idx:  22
Progress:  0.0 % idx:  23
Progress:  0.0 % idx:  24
Progress:  0.0 % idx:  25
Progress:  0.0 % idx:  26
Progress:  0.0 % idx:  27
Progress:  0.0 % idx:  28
Progress:  0.0 % idx:  29
Progress:  0.0 % idx:  30
Progress:  0.0 % idx:  31
Progress:  0.0 % idx:  32
Progress:  0.0 % idx:  33
Progress:  0.0 % idx:  34
Progress:  0.0 % idx:  35
Progress:  0.0 % idx:  36
Progress:  0.0 % idx:  37
Progress:  0.0 % idx: 

In [35]:
y_pred_df = pd.DataFrame({'target': y_pred})

In [37]:
y_pred_df.head()

Unnamed: 0,target
0,20.61
1,0.0
2,102.48
3,0.0
4,136.88


In [38]:
y_test.head()

50752     13.0
33940      0.0
84915    455.0
70284      0.0
50655    239.0
Name: target, dtype: float64

In [36]:
# Scoring the prediction using rmse
rmse_cv = mean_squared_error(y_test, y_pred_df, squared=False)
print("RMSE score:",rmse_cv)

RMSE score: 122.07051047408655


In [28]:
# Scoring the prediction using rmse
rmse_cv = mean_squared_error(y_test, y_pred_df, squared=False)
print("RMSE score:",rmse_cv)

Iteation: 32349 RMSE score: 2056980384.1000657


In [67]:
# To test the stability of the prediction:
# Split the data set randomly 1000 times
# Calculate average rmse score

rmse_scores = []
for i in range(1,1000):
    # Splitting data
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

    # Calculate average delays on each flight route
    average_delays = calc_average_delay_for_each_route(X_train,y_train)

    # Predict flight delays based on average delay at each airport
    y_pred = estimate_delays(average_delays,X_test)

    # Scoring the prediction using rmse
    rmse_cv = mean_squared_error(y_test, y_pred, squared=False)
    rmse_scores.append(rmse_cv)
    print("Iteation:",i,"RMSE score:",rmse_cv)

mean_rmse_score = sum(rmse_scores)/len(rmse_scores)
# Displaying the score 
model_scores_df = pd.DataFrame({
    'Evaluation Metric': ["rmse"],
    'Mean RMSE Score': [mean_rmse_score]
})

display(model_scores_df)

Iteation: 1 RMSE score: 109.36161722031547
Iteation: 2 RMSE score: 117.24242325155274
Iteation: 3 RMSE score: 113.32845202890178
Iteation: 4 RMSE score: 116.13265937328882
Iteation: 5 RMSE score: 114.10617950595977
Iteation: 6 RMSE score: 115.29856604653432
Iteation: 7 RMSE score: 115.63738736901689
Iteation: 8 RMSE score: 115.78923499664398
Iteation: 9 RMSE score: 112.1970683110567
Iteation: 10 RMSE score: 118.38386732505033
Iteation: 11 RMSE score: 117.56445975231901
Iteation: 12 RMSE score: 115.52441924707333
Iteation: 13 RMSE score: 114.43428685594418
Iteation: 14 RMSE score: 111.9881347602403
Iteation: 15 RMSE score: 114.23919472094678
Iteation: 16 RMSE score: 115.13960044180764
Iteation: 17 RMSE score: 116.22251676875716
Iteation: 18 RMSE score: 115.59325722963095
Iteation: 19 RMSE score: 115.303339058245
Iteation: 20 RMSE score: 114.42239279903768
Iteation: 21 RMSE score: 115.76165210239483
Iteation: 22 RMSE score: 119.66190421343698
Iteation: 23 RMSE score: 115.2818952624563
It

Unnamed: 0,Evaluation Metric,Mean RMSE Score
0,rmse,115.519887


In [56]:
def calc_average_delay_for_each_route(X_train, y_train):
    # return average_delays
    # Combine X_train and y_train into a single DataFrame
    train_data = pd.concat([X_train, y_train], axis=1)

    # Calculate average delays for each route
    average_delays = train_data.groupby('flight_route')['target'].mean().reset_index()

    return average_delays

In [57]:
def estimate_delays(average_delays,X_test):
    # Merge X_test with average_delays on 'route'
    # merged_data = X_test.merge(average_delays, on='route', how='left')
    merged_data = X_test.merge(average_delays, on=['flight_route'], how='left')

    # Replace missing values with a default delay value (e.g., 0)
    merged_data['target'].fillna(0, inplace=True)

    # Return the estimated delays
    estimated_delays = merged_data['target']

    return estimated_delays

In [65]:
# Delay is req for classification - but not req. for the baseline model
# df['delayed'] = df['target'] >0.1

# Idea is using flight routes to predict delays
# Creating alight route column 
df['flight_route'] = df['depstn'] + '-' + df['arrstn']

# Defining features and target
X = df.drop(['target'],axis=1)
y = df['target'] 

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

# Calculate average delays on each flight route
average_delays = calc_average_delay_for_each_route(X_train,y_train)

# Predict flight delays based on average delay at each airport
y_pred = estimate_delays(average_delays,X_test)

# Scoring the prediction using rmse
rmse_cv = mean_squared_error(y_test, y_pred, squared=False)

# Displaying the score 
model_scores_df = pd.DataFrame({
    'Evaluation Metric': ["rmse"],
    'Model Score': [rmse_cv]
})

display(model_scores_df)


Unnamed: 0,Evaluation Metric,Model Score
0,rmse,116.980063


In [None]:
# To test the stability of the prediction:
# Split the data set randomly 1000 times
# Calculate average rmse score

rmse_scores = []
for i in range(1,1000):
    # Splitting data
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

    # Calculate average delays on each flight route
    average_delays = calc_average_delay_for_each_route(X_train,y_train)

    # Predict flight delays based on average delay at each airport
    y_pred = estimate_delays(average_delays,X_test)

    # Scoring the prediction using rmse
    rmse_cv = mean_squared_error(y_test, y_pred, squared=False)
    rmse_scores.append(rmse_cv)
    print("Iteation:",i,"RMSE score:",rmse_cv)

mean_rmse_score = sum(rmse_scores)/len(rmse_scores)
# Displaying the score 
model_scores_df = pd.DataFrame({
    'Evaluation Metric': ["rmse"],
    'Mean RMSE Score': [mean_rmse_score]
})

display(model_scores_df)

Iteation: 1 RMSE score: 109.36161722031547
Iteation: 2 RMSE score: 117.24242325155274
Iteation: 3 RMSE score: 113.32845202890178
Iteation: 4 RMSE score: 116.13265937328882
Iteation: 5 RMSE score: 114.10617950595977
Iteation: 6 RMSE score: 115.29856604653432
Iteation: 7 RMSE score: 115.63738736901689
Iteation: 8 RMSE score: 115.78923499664398
Iteation: 9 RMSE score: 112.1970683110567
Iteation: 10 RMSE score: 118.38386732505033
Iteation: 11 RMSE score: 117.56445975231901
Iteation: 12 RMSE score: 115.52441924707333
Iteation: 13 RMSE score: 114.43428685594418
Iteation: 14 RMSE score: 111.9881347602403
Iteation: 15 RMSE score: 114.23919472094678
Iteation: 16 RMSE score: 115.13960044180764
Iteation: 17 RMSE score: 116.22251676875716
Iteation: 18 RMSE score: 115.59325722963095
Iteation: 19 RMSE score: 115.303339058245
Iteation: 20 RMSE score: 114.42239279903768
Iteation: 21 RMSE score: 115.76165210239483
Iteation: 22 RMSE score: 119.66190421343698
Iteation: 23 RMSE score: 115.2818952624563
It

Unnamed: 0,Evaluation Metric,Mean RMSE Score
0,rmse,115.519887


In [None]:
# To test the stability of the prediction:
# Split the data set randomly 1000 times
# Calculate average rmse score

rmse_scores = []
for i in range(1,1000):
    # Splitting data
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

    # Calculate average delays on each flight route
    average_delays = calc_average_delay_for_each_route(X_train,y_train)

    # Predict flight delays based on average delay at each airport
    y_pred = estimate_delays(average_delays,X_test)

    # Scoring the prediction using rmse
    rmse_cv = mean_squared_error(y_test, y_pred, squared=False)
    rmse_scores.append(rmse_cv)
    print("Iteation:",i,"RMSE score:",rmse_cv)

mean_rmse_score = sum(rmse_scores)/len(rmse_scores)
# Displaying the score 
model_scores_df = pd.DataFrame({
    'Evaluation Metric': ["rmse"],
    'Mean RMSE Score': [mean_rmse_score]
})

display(model_scores_df)

Iteation: 1 RMSE score: 109.36161722031547
Iteation: 2 RMSE score: 117.24242325155274
Iteation: 3 RMSE score: 113.32845202890178
Iteation: 4 RMSE score: 116.13265937328882
Iteation: 5 RMSE score: 114.10617950595977
Iteation: 6 RMSE score: 115.29856604653432
Iteation: 7 RMSE score: 115.63738736901689
Iteation: 8 RMSE score: 115.78923499664398
Iteation: 9 RMSE score: 112.1970683110567
Iteation: 10 RMSE score: 118.38386732505033
Iteation: 11 RMSE score: 117.56445975231901
Iteation: 12 RMSE score: 115.52441924707333
Iteation: 13 RMSE score: 114.43428685594418
Iteation: 14 RMSE score: 111.9881347602403
Iteation: 15 RMSE score: 114.23919472094678
Iteation: 16 RMSE score: 115.13960044180764
Iteation: 17 RMSE score: 116.22251676875716
Iteation: 18 RMSE score: 115.59325722963095
Iteation: 19 RMSE score: 115.303339058245
Iteation: 20 RMSE score: 114.42239279903768
Iteation: 21 RMSE score: 115.76165210239483
Iteation: 22 RMSE score: 119.66190421343698
Iteation: 23 RMSE score: 115.2818952624563
It

Unnamed: 0,Evaluation Metric,Mean RMSE Score
0,rmse,115.519887
