In [1]:
import modules.preprocessing_functions as pre
import warnings
warnings.filterwarnings('ignore')


In [146]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor


def split_data(X, y, test_size=0.2, random_state=99):
    """Split features and target into train and test partitions.

    Thin wrapper around ``train_test_split`` that forwards ``test_size`` and
    ``random_state`` unchanged.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` in that order.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, random_state=random_state, test_size=test_size
    )
    return features_train, features_test, target_train, target_test

def decision_tree(X_train, X_test, y_train, y_test, random_state=None):
    """Fit a default ``DecisionTreeRegressor`` and report its test-set MSE.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``fit``.
    X_test, y_test
        Held-out features passed to ``predict`` and the matching target used
        for the printed mean squared error.
    random_state : int or None, default=None
        Seed forwarded to the estimator so a run can be reproduced. ``None``
        leaves the estimator unseeded, which is what this function did before
        the parameter existed.

    Returns
    -------
    tuple
        ``(fitted_estimator, y_pred)``.
    """
    dec_tree = DecisionTreeRegressor(random_state=random_state)
    dec_tree.fit(X_train, y_train)
    y_pred = dec_tree.predict(X_test)
    print('Mean Squared Error: {0:0.4f}'.format(mean_squared_error(y_test, y_pred)))

    return dec_tree, y_pred

def random_forest(X_train, X_test, y_train, y_test, random_state=99):
    """Fit a ``RandomForestRegressor`` and print its test-set MSE and RMSE.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``fit``.
    X_test, y_test
        Held-out features passed to ``predict`` and the matching target used
        for the printed metrics.
    random_state : int or None, default=99
        Seed forwarded to the forest so repeated runs give the same model.

    Returns
    -------
    tuple
        ``(fitted_estimator, y_pred)``.
    """
    rand_for = RandomForestRegressor(
        max_depth=100,
        min_samples_split=3,
        min_samples_leaf=3,
        # The seed was accepted by this function but never reached the estimator.
        random_state=random_state,
    )
    rand_for.fit(X_train, y_train)
    y_pred = rand_for.predict(X_test)

    # Compute the MSE once and take its square root directly instead of
    # calling mean_squared_error(..., squared=False), which newer scikit-learn
    # releases no longer accept. For a single-output target the two agree.
    mse = mean_squared_error(y_test, y_pred)
    print('Mean Squared Error: {0:0.4f}'.format(mse))
    print('RSME: {0:0.4f}'.format(mse ** 0.5))

    return rand_for, y_pred


def cleaning(df, year='2019', month='01', day='07', delay_max = 30, delay_min = -16, greater=False):
    """Filter flight rows by date, completion status and arrival-delay range.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``fl_date``, ``cancelled``, ``diverted`` and
        ``arr_delay``. The frame is not modified.
    year, month, day : str
        Joined as ``'<year>-<month>-<day>'`` and compared against ``fl_date``
        with ordinary comparison operators, so the parts should be
        zero-padded strings when ``fl_date`` holds ISO date strings.
    delay_max, delay_min : number
        Exclusive bounds: only rows with ``delay_min < arr_delay < delay_max``
        are kept. Rows whose ``arr_delay`` is missing fail both comparisons
        and are therefore dropped.
    greater : bool
        When truthy keep rows with ``fl_date >= cut-off``; otherwise keep rows
        with ``fl_date <= cut-off``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows (original index labels
        preserved) with an extra float column ``is_delay`` that is 1.0 where
        ``arr_delay > 0`` and 0.0 elsewhere.
    """
    # str.join raises TypeError for non-string parts, just like '+' would.
    cut_off = '-'.join((year, month, day))

    if greater:
        date_ok = df['fl_date'] >= cut_off
    else:
        date_ok = df['fl_date'] <= cut_off

    completed = (df['cancelled'] == 0) & (df['diverted'] == 0)

    # NaN compares False on both sides, so missing delays are filtered out
    # here; no fillna is needed afterwards.
    arr_delay = df['arr_delay']
    in_range = (arr_delay < delay_max) & (arr_delay > delay_min)

    # Copy once so the new column is written to an independent frame rather
    # than to a slice of the caller's data.
    cleaned = df.loc[date_ok & completed & in_range].copy()
    cleaned['is_delay'] = (cleaned['arr_delay'] > 0).astype(float)

    return cleaned
 

In [3]:
# Load the January 2019 flight records.
# NOTE: the CSV is kept on the author's local disk and is not pushed to GitHub,
# so adjust the path below to wherever your copy lives.

flights = pd.read_csv('data/2019-01.csv', low_memory=False)


In [46]:
# Mean and standard deviation of the raw arrival delay, before any filtering.
flights.arr_delay.agg(['mean', 'std'])

mean     4.714467
std     52.089993
Name: arr_delay, dtype: float64

In [74]:
# Selected quantiles of the raw arrival delay.
# NOTE(review): presumably used to choose the delay_min / delay_max cut-offs passed to cleaning() — confirm.
flights.arr_delay.quantile([0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.98])

0.10    -24.0
0.25    -16.0
0.50     -7.0
0.75      7.0
0.90     37.0
0.95     75.0
0.98    141.0
Name: arr_delay, dtype: float64

In [7]:
# Shape of the subset dated before 2019-01-08 (fl_date is compared against a date string).
flights[flights.fl_date < '2019-01-08'].shape

(146485, 42)

In [147]:
# Earlier experiments, kept for reference:
# df_1 = cleaning(flights, '2019', '01', '08')
# df_2 = cleaning(flights_2, '2019', '12', '19', greater=True)
# df = cleaning(flights, '2019', '01', '32', delay_max=37, delay_min=0)
# Active run: rows with fl_date <= '2019-01-31' that were neither cancelled nor
# diverted and whose arrival delay lies strictly between -7 and 60.
df = cleaning(flights, '2019', '01', '31', delay_max=60, delay_min=-7)

# df = pd.concat([df_1, df_2])

In [148]:
df.shape

(266130, 43)

In [149]:
# Regression target: the arrival-delay column of the cleaned frame.
y = df['arr_delay']

In [150]:

# Raw columns that the delay lookups below are keyed on.
keep = ['crs_dep_time', 'fl_date', 'mkt_carrier', 'origin_city_name', 'dest_city_name', 'tail_num']
# Take an explicit copy: later cells add columns to df_new, and writing to a
# column-slice of df triggers SettingWithCopyWarning.
df_new = df[keep].copy()



In [151]:
df_new.head(1)

Unnamed: 0,crs_dep_time,fl_date,mkt_carrier,origin_city_name,dest_city_name,tail_num
2,1345,2019-01-01,AS,"Las Vegas, NV","Portland, OR",N524VA


In [80]:
import delaytable as dt
# Build the helper that produces the historical delay lookup tables.
# NOTE(review): the positional arguments (2018, 2018, 1) are presumably
# start year, end year and month — confirm against delaytable.DelayTables.
delaytables = dt.DelayTables(2018, 2018, 1)

In [81]:
# Unpack the six lookup tables; the order here must match what delay_tables() returns.
hour_delay, daily_delay, carrier_delay, origin_city_delay, dest_city_delay, tail_delay = delaytables.delay_tables()

In [82]:
# daily_delay

In [83]:
# hour_delay

In [152]:
size = len(df_new)


def _lookup_average(keys, table, fallback=None):
    """Return a float array holding ``table[key]`` for every key in ``keys``.

    With ``fallback=None`` a key missing from ``table`` raises ``KeyError``;
    otherwise ``fallback`` is stored for every key that is not found.
    """
    averages = np.empty(len(keys))
    for i, key in enumerate(keys):
        try:
            averages[i] = table[key]
        except KeyError:
            if fallback is None:
                raise
            averages[i] = fallback
    return averages


def _scheduled_hour(hhmm):
    """Return the hour part of an ``hhmm``-style time such as 1345, 800 or 5."""
    digits = str(hhmm)
    if not 1 <= len(digits) <= 4:
        raise ValueError("Found invalid data entry")
    # Everything before the last two characters is the hour; one- and
    # two-character values have no hour part and map to hour 0.
    return int(digits[:-2] or 0)


# Average delay for the scheduled departure hour.
hourly_average = _lookup_average(
    [_scheduled_hour(entry) for entry in df_new.crs_dep_time], hour_delay
)

# Average delay for the day of month: the last two characters of fl_date.
# int() already accepts a leading zero, so '07' becomes 7.
daily_average = _lookup_average(
    [int(entry[-2:]) for entry in df_new.fl_date], daily_delay
)

# Average delay per marketing carrier; an unknown carrier raises KeyError.
carrier_average = _lookup_average(df_new.mkt_carrier, carrier_delay)

# Origin cities missing from the table fall back to the mean over all origin cities.
origin_city_mean = sum(origin_city_delay.values())/len(origin_city_delay.values())
origin_city_average = _lookup_average(
    df_new.origin_city_name, origin_city_delay, fallback=origin_city_mean
)

# Destination cities missing from the table fall back to the mean over all
# destination cities (dest_city_mean, not the origin-city mean).
dest_city_mean = sum(dest_city_delay.values())/len(dest_city_delay.values())
dest_city_average = _lookup_average(
    df_new.dest_city_name, dest_city_delay, fallback=dest_city_mean
)

# Tail numbers missing from the table are assigned a delay of 0.
tail_average = _lookup_average(df_new.tail_num, tail_delay, fallback=0)

In [153]:
# Attach each lookup array to the feature frame as a new column, in this order.
lookup_columns = {
    'hour_del': hourly_average,
    'daily_del': daily_average,
    'carrier_del': carrier_average,
    'origin_city_del': origin_city_average,
    'dest_city_del': dest_city_average,
    'tail_del': tail_average,
}
for column_name, column_values in lookup_columns.items():
    df_new[column_name] = column_values

In [154]:
df_new.head()

Unnamed: 0,crs_dep_time,fl_date,mkt_carrier,origin_city_name,dest_city_name,tail_num,hour_del,daily_del,carrier_del,origin_city_del,dest_city_del,tail_del
2,1345,2019-01-01,AS,"Las Vegas, NV","Portland, OR",N524VA,4.208772,14.457773,-4.704022,0.738841,-4.38833,-4.138889
5,2055,2019-01-01,AS,"San Jose, CA","Seattle, WA",N623VA,5.174974,14.457773,-4.704022,-0.165883,-3.149996,8.070588
7,800,2019-01-01,AS,"San Jose, CA","Seattle, WA",N625VA,0.627441,14.457773,-4.704022,-0.165883,-3.149996,8.14
10,952,2019-01-01,AS,"Seattle, WA","San Jose, CA",N521VA,0.77823,14.457773,-4.704022,-3.953294,-2.746781,-5.2
11,1730,2019-01-01,AS,"Seattle, WA","San Jose, CA",N526VA,6.761881,14.457773,-4.704022,-3.953294,-2.746781,-6.781609


In [155]:
features = ['hour_del','daily_del','carrier_del','origin_city_del','dest_city_del','tail_del']


In [156]:
X = df_new[features]

In [157]:
# Sanity check: count missing values per feature column.
# Tail numbers not found in the tail_delay lookup were assigned 0 when the
# features were built (they were not dropped), so no nulls are expected here.

X.isnull().sum()

hour_del           0
daily_del          0
carrier_del        0
origin_city_del    0
dest_city_del      0
tail_del           0
dtype: int64

In [158]:
X.shape, y.shape

((266130, 6), (266130,))

In [159]:
X.head()

Unnamed: 0,hour_del,daily_del,carrier_del,origin_city_del,dest_city_del,tail_del
2,4.208772,14.457773,-4.704022,0.738841,-4.38833,-4.138889
5,5.174974,14.457773,-4.704022,-0.165883,-3.149996,8.070588
7,0.627441,14.457773,-4.704022,-0.165883,-3.149996,8.14
10,0.77823,14.457773,-4.704022,-3.953294,-2.746781,-5.2
11,6.761881,14.457773,-4.704022,-3.953294,-2.746781,-6.781609


In [160]:
X_train, X_test, y_train, y_test = split_data(X, y, test_size=0.2, random_state=99)

In [161]:
# Scale numeric features
# scaler = MinMaxScaler()
# col_num = ['hour_del','daily_del','carrier_del','origin_city_del','dest_city_del','tail_del']
# for col in col_num:
#     scaled_train = scaler.fit_transform(X_train[col].to_numpy().reshape(-1,1))
#     X_train[col] = scaled_train

# for col in col_num:
#     scaled_test = scaler.transform(X_test[col].to_numpy().reshape(-1,1))
#     X_test[col] = scaled_test

In [162]:
# from sklearn.model_selection import GridSearchCV

# parameter_candidates = {'n_estimators': [10, 50, 100, 250], 'criterion': ['squared_error', 'absolute_error','poisson'] }

# clf = GridSearchCV(estimator=RandomForestRegressor(), param_grid=parameter_candidates, n_jobs=-1)
# clf.fit(X_train, y_train)


In [163]:
# print('Best score:', clf.best_score_) 

In [164]:
# clf.best_estimator_

#### Copy and rerun my Random Forest Function with the best estimator from grid search

In [165]:
# def random_forest_best(X_train, X_test, y_train, y_test, random_state=99):
#     rand_for = RandomForestRegressor(max_depth=250, min_samples_split=3, min_samples_leaf=3, criterion='absolute_error')
#     rand_for.fit(X_train, y_train)
#     y_pred = rand_for.predict(X_test)
#     print('Mean Squared Error: {0:0.4f}'.format(mean_squared_error(y_test, y_pred)))
#     print('RSME: {0:0.4f}'.format(mean_squared_error(y_test, y_pred, squared=False)))

    
#     return rand_for, y_pred

In [166]:
# tree, y_pred = random_forest_best(X_train, X_test, y_train, y_test)

#### Marginally better result than before

In [167]:
tree, y_pred = random_forest(X_train, X_test, y_train, y_test)

Mean Squared Error: 221.7549
RSME: 14.8914


In [168]:
max(y_pred), min(y_pred)

(43.44569652569652, -3.902520202020203)

In [40]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [41]:
# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2.
# The documented equivalent is to scale with StandardScaler(with_mean=False)
# and, for Ridge, multiply the original alpha by the number of training samples.
# `ridgereg` is now a Pipeline; the fitted Ridge step is `ridgereg[-1]`.
ridgereg = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=0.001 * len(X_train)),
)
ridgereg.fit(X_train, y_train)
y_pred = ridgereg.predict(X_test)

In [42]:
y_pred

array([-1.43221212, -1.07363296, -5.0410336 , ..., -2.75997236,
       -2.60533694, -3.22470534])

In [43]:
mean_squared_error(y_test, y_pred)

180.32427559339402

In [44]:
# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2.
# The documented equivalent is to scale with StandardScaler(with_mean=False)
# and, for Lasso, multiply the original alpha by sqrt(number of training samples).
# `lassoreg` is now a Pipeline; the fitted Lasso step is `lassoreg[-1]`.
lassoreg = make_pipeline(
    StandardScaler(with_mean=False),
    Lasso(alpha=0.001 * np.sqrt(len(X_train))),
)
lassoreg.fit(X_train, y_train)
y_pred = lassoreg.predict(X_test)

In [45]:
mean_squared_error(y_test, y_pred)

180.46351187254822

In [80]:
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.linear_model import LinearRegression
# from sklearn.pipeline import Pipeline
# import numpy as np
# model = Pipeline([('poly', PolynomialFeatures(degree=3)),
#                   ('linear', LinearRegression(fit_intercept=False))])
# # fit to an order-3 polynomial data
# # x = np.arange(5)
# # y = 3 - 2 * x + x ** 2 - x ** 3
# model = model.fit(X, y)
# model.named_steps['linear'].coef_

In [81]:
# model.predict(X)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=10)
poly.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test =split_data(X, y, test_size=0.2, random_state=99)

In [None]:
# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2.
# The documented equivalent is to scale with StandardScaler(with_mean=False)
# and, for Ridge, multiply the original alpha by the number of training samples.
# `ridgereg` is now a Pipeline; the fitted Ridge step is `ridgereg[-1]`.
ridgereg = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=0.2 * len(X_train)),
)
ridgereg.fit(X_train, y_train)
y_pred = ridgereg.predict(X_test)

In [None]:
mean_squared_error(y_test, y_pred)