In [1]:
# IMPORT PACKAGES
import pandas as pd
import numpy as np
import matplotlib.pyplot as pl

from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder, PolynomialFeatures, MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.linear_model import ElasticNet, RidgeClassifier, Ridge, Lasso, LinearRegression, SGDRegressor, HuberRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR

from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.metrics import accuracy_score

import xgboost as xgb

from sklearn.model_selection import GridSearchCV

----------------

# FLIGHTS

-------------

In [2]:
# LOAD DATAFRAMES
df_flights = pd.read_csv('flights_250k.csv')

In [3]:
# ENCODE AIRPORTS AND TAILNUM
encoder = OrdinalEncoder()
df_flights['mkt_carrier'] = encoder.fit_transform(df_flights[['mkt_carrier']])

In [4]:
# GRAB RELEVANT COLUMNS
X = df_flights[[
    'fl_date',
    'mkt_carrier',
    'origin_airport_id',
    'dest_airport_id',
    'crs_dep_time',
    'crs_arr_time',
    'crs_elapsed_time',
    'distance',
    
    'dep_delay',
    'nas_delay',
    'carrier_delay'
]].copy()

y = df_flights['arr_delay'].copy()

In [5]:
# FEATURE ENGINEERING

# TURNS DATE INTO MONTHS
X['fl_date'] = X['fl_date'].str[5:7]
X['fl_date'] = X['fl_date'].astype(int)

# TURNS TIME INTO HOURS
X['crs_dep_time'] = X['crs_dep_time'] // 100
X['crs_arr_time'] = X['crs_arr_time'] // 100
X['crs_dep_time'].replace(24.0, 0.0, inplace=True)
X['crs_arr_time'].replace(24.0, 0.0, inplace=True)

X.fillna(0, inplace=True)
y.fillna(0, inplace=True)

In [6]:
# TRAIN TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.2)
X_train = X_train.copy()
X_test = X_test.copy()
y_train = y_train.copy()
y_test = y_test.copy()

In [7]:
# HISTORICAL DATA

# ORIGIN AIRPORT NAS DELAY
origin_airport_data = X_train.groupby(by=['fl_date', 'origin_airport_id'])['nas_delay'].mean().to_dict()

def get_origin_delay(x):
    if (x['fl_date'], x['origin_airport_id']) in origin_airport_data:
        return origin_airport_data[(x['fl_date'], x['origin_airport_id'])]
    else:
        return 0

X_train['origin_delay'] = X_train.apply(get_origin_delay, axis=1)

# DEST AIRPORT NAS DELAY
dest_airport_data = X_train.groupby(by=['fl_date', 'dest_airport_id'])['nas_delay'].mean().to_dict()

def get_dest_delay(x):
    if (x['fl_date'], x['dest_airport_id']) in dest_airport_data:
        return dest_airport_data[(x['fl_date'], x['dest_airport_id'])]
    else:
        return 0
    
X_train['dest_delay'] = X_train.apply(get_dest_delay, axis=1)

# CARRIER DELAY
carrier_data = X_train.groupby(by=['fl_date', 'mkt_carrier'])['carrier_delay'].mean().to_dict()

def get_carrier_delay(x):
    if (x['fl_date'], x['mkt_carrier']) in carrier_data:
        return carrier_data[(x['fl_date'], x['mkt_carrier'])]
    else:
        return 0

X_train['carr_delay'] = X_train.apply(get_carrier_delay, axis=1)

# BIN HISTORICAL DATA
origin_minmax = MinMaxScaler(feature_range=(1,5))
dest_minmax = MinMaxScaler(feature_range=(1,5))
carrier_minmax = MinMaxScaler(feature_range=(1,5))

X_train['origin_delay'] = origin_minmax.fit_transform(X_train[['origin_delay']])
X_train['dest_delay'] = dest_minmax.fit_transform(X_train[['dest_delay']])
X_train['carr_delay'] = carrier_minmax.fit_transform(X_train[['carr_delay']])

X_train['origin_delay'] = X_train['origin_delay'].round(0)
X_train['dest_delay'] = X_train['dest_delay'].round(0)
X_train['carr_delay'] = X_train['carr_delay'].round(0)

In [8]:
# APPLY HISTORICAL DATA TO TEST
X_test['origin_delay'] = X_test.apply(get_origin_delay, axis=1)
X_test['dest_delay'] = X_test.apply(get_dest_delay, axis=1)
X_test['carr_delay'] = X_test.apply(get_carrier_delay, axis=1)

#BIN HISTORICAL DATA
X_test['origin_delay'] = origin_minmax.transform(X_test[['origin_delay']])
X_test['dest_delay'] = dest_minmax.transform(X_test[['dest_delay']])
X_test['carr_delay'] = carrier_minmax.transform(X_test[['carr_delay']])

X_test['origin_delay'] = X_test['origin_delay'].round(0)
X_test['dest_delay'] = X_test['dest_delay'].round(0)
X_test['carr_delay'] = X_test['carr_delay'].round(0)

In [9]:
# DROP HISTORICAL DATA
X_train.drop(columns=['carrier_delay', 'nas_delay', 'dep_delay', 'origin_airport_id', 'dest_airport_id', 'mkt_carrier'], inplace=True)
X_test.drop(columns=['carrier_delay', 'nas_delay', 'dep_delay', 'origin_airport_id', 'dest_airport_id', 'mkt_carrier'], inplace=True)

In [10]:
# # MODEL CLASSIFICATION
# scaler = StandardScaler()
# poly = PolynomialFeatures(2)

# # create function for apply
# def relabel_y(x):
#     if x > 0:
#         return 1
#     else:
#         return 0

# #random forest for classification
# ran_forest = RandomForestClassifier(
#     n_estimators=300,
#     n_jobs=2
# )

# #ridge classifier
# # ran_forest = RidgeClassifier()

# # relabel y for classification
# y_train_class = y_train.apply(relabel_y)
# y_test_class = y_test.apply(relabel_y)

# # apply fit, predict
# ran_forest.fit(
#     scaler.fit_transform(poly.fit_transform(X_train)),
#     y_train_class
# )
# y_train_class_pred = ran_forest.predict(
#     scaler.fit_transform(poly.fit_transform(X_train))
# )
# y_test_class_pred =  ran_forest.predict(
#     scaler.fit_transform(poly.fit_transform(X_test))
# )

# # add delay feature to X
# X_train['delay'] = y_train_class_pred
# X_test['delay'] = y_test_class_pred

# #show prediction accuracy based on available data
# accuracy_score(y_test_class, y_test_class_pred)

In [11]:
# MODEL REGRESSION SELECTION :)
# model = RandomForestRegressor(
#     n_estimators=200,
#     n_jobs=2
# )

# model = ElasticNet(
#      alpha=1,
#      l1_ratio=0.5
# )

# model = SGDRegressor()

# model = HuberRegressor()

# model = GradientBoostingRegressor()

# model = AdaBoostRegressor()

# model = xgb.XGBRegressor()

# BEST SO FAR
# model = LinearRegression()

In [12]:
# MODEL REGRESSION
model = LinearRegression()

scaler = StandardScaler()
pca = PCA(n_components=4)
poly = PolynomialFeatures(2)

# WITH PCA
# model.fit(
#     pca.fit_transform(scaler.fit_transform(poly.fit_transform(X_train))), y_train
# )

#WITH SCALING
# model.fit(
#     scaler.fit_transform(X_train), y_train
# )

# WITH POLY AND SCALING
model.fit(
    scaler.fit_transform(poly.fit_transform(X_train)), y_train
)

# WITH POLY
# model.fit(
#     poly.fit_transform(X_train), y_train
# )

# NO SCALING
# model.fit(
#     X_train, y_train
# )

LinearRegression()

In [13]:
# MODEL PREDICTION AND SCORE

# WITH PCA
# y_pred = model.predict(
#     pca.transform(scaler.transform(poly.fit_transform(X_test)))
# )

#WITH SCALING
# y_pred = model.predict(
#     scaler.transform(X_test)
# )

# WITH POLY AND SCALING
y_pred = model.predict(
    scaler.fit_transform(poly.fit_transform(X_test))
)

# WITH POLY
# y_pred = model.predict(
#     poly.fit_transform(X_test)
# )

# NO SCALING
# y_pred = model.predict(
#     X_test
# )

print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

23.9533851539187
0.013070254232745993


In [14]:
# MODEL PERSISTENCE
import pickle
filename = 'linear_1.5.sav'

# SAVE MODEL
# pickle.dump(model, open(filename, 'wb'))

# LOAD MODEL
# model = pickle.load(open(filename, 'rb'))

------------------

# FLIGHTS TEST

---------------

In [None]:
# LOAD DATAFRAMES
df_test = pd.read_csv('flights_test_first_week.csv')

# ENCODE AIRPORTS AND TAILNUM
df_test['mkt_carrier'] = encoder.transform(df_test[['mkt_carrier']])

# GRAB RELEVANT COLUMNS
X_ = df_test[[
    'fl_date',
    'mkt_carrier',
    'origin_airport_id',
    'dest_airport_id',
    'crs_dep_time',
    'crs_arr_time',
    'crs_elapsed_time',
    'distance'
]].copy()

# FEATURE ENGINEERING
X_['fl_date'] = X_['fl_date'].str[5:7]
X_['fl_date'] = X_['fl_date'].astype(int)

X_['crs_dep_time'] = X_['crs_dep_time'] // 100
X_['crs_arr_time'] = X_['crs_arr_time'] // 100
X_['crs_dep_time'].replace(24.0, 0.0, inplace=True)
X_['crs_arr_time'].replace(24.0, 0.0, inplace=True)

X_.fillna(0, inplace=True)

# APPLY HISTORICAL DATA TO TEST
X_['origin_delay'] = X_.apply(get_origin_delay, axis=1)
X_['dest_delay'] = X_.apply(get_dest_delay, axis=1)
X_['carr_delay'] = X_.apply(get_carrier_delay, axis=1)

X_['origin_delay'] = origin_minmax.transform(X_[['origin_delay']])
X_['dest_delay'] = dest_minmax.transform(X_[['dest_delay']])
X_['carr_delay'] = carrier_minmax.transform(X_[['carr_delay']])

X_['origin_delay'] = X_['origin_delay'].round(0)
X_['dest_delay'] = X_['dest_delay'].round(0)
X_['carr_delay'] = X_['carr_delay'].round(0)

X_.drop(columns=['origin_airport_id', 'dest_airport_id', 'mkt_carrier'], inplace=True)

# WITH POLY
y_pred_ = model.predict(
    scaler.fit_transform(poly.fit_transform(X_))
)

In [None]:
# GRAB RELEVANT COLUMNS
X_final = df_test[[
    'fl_date',
    'mkt_carrier',
    'mkt_carrier_fl_num',
    'origin',
    'dest'
]].copy()

X_final['predicted_delay'] = y_pred_

In [None]:
X_final.to_csv('prediction.csv', index=False)