# Models on Integrated Features

In [4]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV

In [5]:
# load data
df = pd.read_csv('data/model_df_weather.csv', index_col=0)

  mask |= (ar1 == a)


In [6]:
df.head(1)

Unnamed: 0,arr_delay,precip,snow,windgust,cloudcover,carrier_speed_rank,flight_num_speed_rank,month,month_rank,dep_hour,...,snow_cat,windgust_cat,cloud_cat,fl_month,airline_delay,haul_length,dep_timeday,arr_timeday,busy_origin,busy_dest
0,-2.0,0.0,0.0,40.7,74.3,0,1,1,0,19,...,0,2,2,1,1,0,3,3,4,3.0


## Model 4: Simple LinReg

In [7]:
df.columns

Index(['arr_delay', 'precip', 'snow', 'windgust', 'cloudcover',
       'carrier_speed_rank', 'flight_num_speed_rank', 'month', 'month_rank',
       'dep_hour', 'arr_hour', 'hour_rank', 'dep_hour_rank', 'arr_hour_rank',
       'precip_cat', 'snow_cat', 'windgust_cat', 'cloud_cat', 'fl_month',
       'airline_delay', 'haul_length', 'dep_timeday', 'arr_timeday',
       'busy_origin', 'busy_dest'],
      dtype='object')

In [8]:
x_cols = ['carrier_speed_rank', 'flight_num_speed_rank','month_rank',
       'dep_hour_rank', 'arr_hour_rank', 'precip_cat', 'snow_cat',
          'windgust_cat', 'cloud_cat', 'fl_month',
       'airline_delay', 'haul_length', 'dep_timeday', 'arr_timeday',
       'busy_origin', 'busy_dest']

X = df[x_cols].to_numpy()
y= df.arr_delay.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [9]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [None]:
# filename = 'models/lr4.sav'
# pickle.dump(lr, open(filename, 'wb'))

In [10]:
lr.score(X_test, y_test)

0.07284327944179692

In [14]:
# filename = 'models/lr_0.07.sav'
# pickle.dump(lr, open(filename, 'wb'))

# Model5: Hail Mary Random Forest - defaults

In [12]:
rf1 = RandomForestRegressor()
rf1.fit(X_train, y_train)

# filename = 'models/rf1.sav'
# pickle.dump(rf1, open(filename, 'wb'))

RandomForestRegressor()

In [15]:
pickle.load( open( "'models/rf1.sav'", "rb" ))

TypeError: file must have 'read' and 'readline' attributes

In [13]:
y_pred = rf1.predict(X_test)
print(r2_score(y_test, y_pred))

0.00611263205182766


# Model 6: Random Forest with 500 estimators

In [None]:
rf2 = RandomForestRegressor(n_estimators=500)
rf2.fit(X_train, y_train)

filename = 'models/rf2.sav'
pickle.dump(rf2, open(filename, 'wb'))

In [None]:
y_pred = rf2.predict(X_test)
print(r2_score(y_test, y_pred))

In [None]:
y_trainpred = rf2.predict(X_train)
print(r2_score(y_train, y_trainpred))

# Model 7: Polynomial regression

In [None]:
poly = PolynomialFeatures(2)
X_poly = poly.fit_transform(X)

X_poly_train, X_poly_test, y_poly_train, y_poly_test = train_test_split(X_poly, y, test_size=0.2)

In [None]:
plr = LinearRegression()
plr.fit(X_poly_train, y_poly_train)

filename = 'models/plr01.sav'
pickle.dump(plr, open(filename, 'wb'))

In [None]:
plr.score(X_poly_test, y_poly_test)

In [None]:
plr.score(X_poly_train, y_poly_train)

This doesn't seem to be overfitting. Maybe try higher-order polynomial features

# Model 8: Polynomial regression, third-order polynomials

In [None]:
poly3 = PolynomialFeatures(3)
X_poly3 = poly.fit_transform(X)

X_poly3_train, X_poly3_test, y_poly3_train, y_poly3_test = train_test_split(X_poly3, y, test_size=0.2)

In [None]:
plr2 = LinearRegression()
plr2.fit(X_poly_train, y_poly_train)

filename = 'models/plr02.sav'
pickle.dump(plr2, open(filename, 'wb'))

In [None]:
plr.score(X_poly3_test, y_poly3_test)

In [None]:
plr.score(X_poly3_train, y_poly3_train)

Still not overfitting, but isn't doing any better. Try a different approach.

# Model 9: Random forest grid search with cross-validation: All features

In [None]:
df = pd.read_csv('data/model_df_weather.csv', index_col=0)
cols = ['carrier_speed_rank', 'flight_num_speed_rank', 'month_rank',
       'dep_hour_rank', 'arr_hour_rank', 'precip_cat', 'snow_cat',
        'windgust_cat', 'cloud_cat', 'fl_month', 'airline_delay', 
        'haul_length', 'dep_timeday', 'arr_timeday', 'busy_origin', 'busy_dest']
X = df[cols].to_numpy()
y = df['arr_delay'].to_numpy()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
grid = {'n_estimators': [10, 50],
       'max_depth': [None, 5, 20],
#        'min_samples_leaf': [1, 10, 100, 1000]
       }

gs = GridSearchCV(estimator=RandomForestRegressor(), param_grid=grid)
gs.fit(X_train, y_train)
print(gs.best_params_)
print(gs.best_score_)
model = gs.best_estimator_

In [None]:
filename = 'models/rf3.sav'
pickle.dump(model, open(filename, 'wb'))

In [None]:
filename = 'models/gs1.sav'
pickle.dump(gs, open(filename, 'wb'))

# Model 10: Logistic Regression - Baseline

In [3]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()