In [16]:
# Pre-requisites
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Store the classifier models to save time
import joblib

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Classifiers from scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Performance metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,f1_score,mean_absolute_error,mean_squared_error,r2_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.datasets import make_classification
from scipy.stats import pointbiserialr

In [17]:
df = pd.read_csv('/Users/biraj/Desktop/Github/flight-prediction-api/datasets/Flights_2021/final_dataset-add.csv')
df.shape

(538040, 37)

In [18]:
df.columns

Index(['Unnamed: 0', 'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
       'FlightDate', 'Marketing_Airline_Network', 'Operating_Airline ',
       'OriginAirportID', 'Origin', 'DestAirportID', 'Dest', 'CRSDepTime',
       'DepTime', 'DepDelayMinutes', 'DepDel15', 'TaxiIn', 'CRSArrTime',
       'ArrTime', 'ArrDelayMinutes', 'ArrDel15', 'DistanceGroup',
       'CombinedDateTime', 'ScaledCRSDepTime', 'temperature_2m',
       'relative_humidity_2m', 'dew_point_2m', 'precipitation', 'rain',
       'snowfall', 'weather_code', 'surface_pressure', 'cloud_cover',
       'wind_speed_10m', 'wind_direction_10m', 'wind_direction_100m'],
      dtype='object')

In [19]:
df.drop(columns=["FlightDate",
                 "OriginAirportID",
                 "DestAirportID",
                 "CRSArrTime",
                 "Unnamed: 0",
                 "CombinedDateTime",
                 "ScaledCRSDepTime"],
        inplace=True)

In [20]:
df.columns

Index(['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
       'Marketing_Airline_Network', 'Operating_Airline ', 'Origin', 'Dest',
       'CRSDepTime', 'DepTime', 'DepDelayMinutes', 'DepDel15', 'TaxiIn',
       'ArrTime', 'ArrDelayMinutes', 'ArrDel15', 'DistanceGroup',
       'temperature_2m', 'relative_humidity_2m', 'dew_point_2m',
       'precipitation', 'rain', 'snowfall', 'weather_code', 'surface_pressure',
       'cloud_cover', 'wind_speed_10m', 'wind_direction_10m',
       'wind_direction_100m'],
      dtype='object')

In [21]:
df = df[df['ArrDel15'].notna()]

In [22]:
for col in df.columns:
    if df[col].isna().sum()>0:
        print(col)

In [23]:
airportLabel = [
    {"id": 0, "code": "BWI"},
    {"id": 1, "code": "CLT"},
    {"id": 2, "code": "DEN"},
    {"id": 3, "code": "DFW"},
    {"id": 4, "code": "DTW"},
    {"id": 5, "code": "IAH"},
    {"id": 6, "code": "LAX"},
    {"id": 7, "code": "MDW"},
    {"id": 8, "code": "ORD"},
    {"id": 9, "code": "PHL"},
    {"id": 10, "code": "PHX"},
    {"id": 11, "code": "SEA"},
    {"id": 12, "code": "SFO"},
    {"id": 13, "code": "SLC"}
]

airport_mapping = {airport['code']: airport['id'] for airport in airportLabel}


# Replace Origin and Dest columns with corresponding ids
df['Origin'] = df['Origin'].map(airport_mapping)
df['Dest'] = df['Dest'].map(airport_mapping)


# labelEncoder = LabelEncoder()
# df["Origin"] = labelEncoder.fit_transform(df["Origin"])
# df["Dest"] = labelEncoder.fit_transform(df["Dest"])



In [24]:
df['Origin']

0          3
1          3
2          3
3          3
4          3
          ..
538035     5
538036     9
538037    12
538038    12
538039    10
Name: Origin, Length: 536982, dtype: int64

In [25]:
X = df[["Quarter", "DayofMonth", "Origin", "Dest", "CRSDepTime", "DepDel15", 'temperature_2m', 'relative_humidity_2m', 'dew_point_2m',
       'precipitation', 'rain', 'snowfall', 'weather_code', 'surface_pressure','cloud_cover', 'wind_speed_10m','wind_direction_100m']]
y =  np.array(df.loc[:, df.columns == "ArrDel15"]["ArrDel15"])


In [26]:
print(f"\nDataset shape: {df.shape}")
X_train, X_test, y_train, y_test = train_test_split= train_test_split(X, y, test_size=0.3, random_state=42)
print(f"X train shape: {X_train.shape} | Y train shape: {y_train.shape}")
print(f"X test shape: {X_test.shape} | Y test shape: {y_test.shape}")


Dataset shape: (536982, 30)
X train shape: (375887, 17) | Y train shape: (375887,)
X test shape: (161095, 17) | Y test shape: (161095,)


In [27]:
perf_df = pd.DataFrame(columns=["Regressors", "MSE", "RMSE", "MAE", "R2"])
def print_metrics(labels_test, model_pred, regressor_name, perf_df):
    
    mse = mean_squared_error(labels_test, model_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(labels_test, model_pred)
    r2 = r2_score(labels_test, model_pred)
    
    print(f"MSE      : {mse}", end="\n\n")
    print(f"RMSE     : {rmse}", end="\n\n")
    print(f"MAE      : {mae}", end="\n\n")
    print(f"R2 Score : {r2}", end="\n\n")
    
    perf_df = perf_df.append({"Regressors": regressor_name,
                                        "MSE": mse, 
                                        "RMSE": rmse,
                                        "MAE": mae,
                                        "R2": r2}, ignore_index=True)
    return perf_df

In [28]:
model = LinearRegression(n_jobs=-1)
model.fit(X_train, y_train)
# joblib.dump(model, "./Regressors/LogisticRegression.joblib")
yhat = model.predict(X_test)
perf_df = print_metrics(y_test, yhat, "LinearRegression", perf_df)
del model
del yhat

MSE      : 0.07223886825120059

RMSE     : 0.26877289344575017

MAE      : 0.14383828976455756

R2 Score : 0.49537819281347983



  perf_df = perf_df.append({"Regressors": regressor_name,


In [29]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
# joblib.dump(model, "./Regressors/DecisionTreeRegressor.joblib")
yhat = model.predict(X_test)
perf_df = print_metrics(y_test, yhat, "DecisionTreeRegressor", perf_df)
del model
del yhat

MSE      : 0.13781533325746367

RMSE     : 0.37123487613297285

MAE      : 0.13796103748305863

R2 Score : 0.03729634461378695



  perf_df = perf_df.append({"Regressors": regressor_name,


In [30]:
model = GradientBoostingRegressor()
model.fit(X_train, y_train)
# joblib.dump(model, "./Regressors/GradientBoostingRegressor.joblib")
# model = joblib.load("./Regressors/GradientBoostingRegressor.joblib")
yhat = model.predict(X_test)
perf_df = print_metrics(y_test, yhat, "GradientBoostingRegressor", perf_df)
del model
del yhat

MSE      : 0.07118261664502697

RMSE     : 0.2668007058555636

MAE      : 0.1427326265170415

R2 Score : 0.5027565973656876



  perf_df = perf_df.append({"Regressors": regressor_name,


In [31]:
model = RandomForestRegressor(n_jobs=-1)
model.fit(X_train, y_train)
# joblib.dump(model, "./Regressors/RandomForestRegressor.joblib")
# model = joblib.load("./Regressors/RandomForestRegressor.joblib")
yhat = model.predict(X_test)
perf_df = print_metrics(y_test, yhat, "RandomForestRegressor", perf_df)
del model
del model_pred

MSE      : 0.07623027647194512

RMSE     : 0.27609830943333413

MAE      : 0.14111868603116468

R2 Score : 0.46749636578142684



  perf_df = perf_df.append({"Regressors": regressor_name,


In [33]:
model = ExtraTreesRegressor(n_jobs=-1)
model.fit(X_train, y_train)
# joblib.dump(model, "./Regressors/ExtraTreesRegressor.joblib")
# model = joblib.load("./Regressors/ExtraTreesRegressor.joblib")
yhat = model.predict(X_test)
perf_df = print_metrics(y_test, yhat, "ExtraTreesRegressor", perf_df)
del model
del yhat


MSE      : 0.08196820333757514

RMSE     : 0.28630089650152185

MAE      : 0.13879062044135448

R2 Score : 0.4274143006199176



  perf_df = perf_df.append({"Regressors": regressor_name,


In [34]:
# Set name of the regressors as index labels
perf_df.set_index("Regressors", inplace=True)
perf_df

Unnamed: 0_level_0,MSE,RMSE,MAE,R2
Regressors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LinearRegression,0.072239,0.268773,0.143838,0.495378
DecisionTreeRegressor,0.137815,0.371235,0.137961,0.037296
GradientBoostingRegressor,0.071183,0.266801,0.142733,0.502757
RandomForestRegressor,0.07623,0.276098,0.141119,0.467496
ExtraTreesRegressor,0.081936,0.286244,0.138707,0.427642
ExtraTreesRegressor,0.081968,0.286301,0.138791,0.427414
