In [16]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score
import pandas as pd
import joblib

In [2]:
# Load the flights table from a CSV path relative to the working directory.
data = pd.read_csv(filepath_or_buffer='./datasets/flights_preprocessed.csv')

In [6]:
# Show the (rows, columns) tuple first, then the per-column dtype / non-null summary.
row_count, column_count = data.shape
display((row_count, column_count))
data.info()

(77909, 23)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77909 entries, 0 to 77908
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Month                           77909 non-null  float64
 1   Day                             77909 non-null  float64
 2   Day Of Week                     77909 non-null  float64
 3   Origin Airport Delay Rate       77909 non-null  float64
 4   Destination Airport Delay Rate  77909 non-null  float64
 5   Scheduled Time                  77909 non-null  float64
 6   Distance                        77909 non-null  float64
 7   Scheduled Departure Hour        77909 non-null  float64
 8   Scheduled Departure Minute      77909 non-null  float64
 9   Arrival Delay                   77909 non-null  float64
 10  Airline_AS                      77909 non-null  int64  
 11  Airline_B6                      77909 non-null  int64  
 12  Airline_DL                      

In [5]:
# Preview the first five rows (5 is the DataFrame.head default, spelled out here).
display(data.head(n=5))

Unnamed: 0,Month,Day,Day Of Week,Origin Airport Delay Rate,Destination Airport Delay Rate,Scheduled Time,Distance,Scheduled Departure Hour,Scheduled Departure Minute,Arrival Delay,...,Airline_EV,Airline_F9,Airline_HA,Airline_MQ,Airline_NK,Airline_OO,Airline_UA,Airline_US,Airline_VX,Airline_WN
0,0.0,-0.956289,0.425688,-0.607571,-0.538144,1.064249,0.906818,-1.000305,1.551139,-13.0,...,0,0,0,0,0,0,0,0,0,1
1,0.0,0.14467,1.379526,-0.607571,2.050538,0.370303,0.425303,-1.000305,0.18396,-12.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.14467,1.379526,-0.607571,-0.538144,-1.279458,-1.156818,-2.66461,0.18396,189.0,...,0,0,0,0,0,0,0,1,0,0
3,0.0,-0.40581,0.902607,-0.607571,-0.538144,0.213183,0.145238,-0.792267,-0.089476,-7.0,...,0,0,0,0,0,0,0,0,0,1
4,0.0,0.695149,-1.481987,1.870459,-0.538144,-0.978311,-1.18466,-1.624419,1.058954,-4.0,...,1,0,0,0,0,0,0,0,0,0


In [9]:
# 'Arrival Delay' is the regression target; every other column is a predictor.
target = data['Arrival Delay']
features = data.drop(columns=['Arrival Delay'])

In [10]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
features_train, features_valid, target_train, target_valid = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=12345,
)

In [12]:
# Baseline: ordinary least squares with default settings, fitted on the training split.
# The fit call stays as the cell's last expression so the estimator repr is displayed.
linear_regression_model = LinearRegression()
linear_regression_model.fit(X=features_train, y=target_train)

LinearRegression()

In [13]:
# R^2 of the linear baseline on the validation split
# (a regressor's .score is r2_score applied to its predictions).
print(r2_score(target_valid, linear_regression_model.predict(features_valid)))

0.09992799031301902


In [20]:
%%time

forest_param_grid = {'max_depth': [deep for deep in range(1, 16, 2)],
                    'n_estimators': [10, 15]}
forest_gs = GridSearchCV(RandomForestRegressor(random_state=12345), param_grid=forest_param_grid)

forest_gs.fit(features_train, target_train)

print(forest_gs.best_params_)

{'max_depth': 9, 'n_estimators': 15}
CPU times: user 59.8 s, sys: 194 ms, total: 1min
Wall time: 1min


In [21]:
# Fit the final forest at the depth selected by the grid search (forest_gs)
# instead of a hand-copied literal, so this cell stays in sync if the data or
# the search grid changes.
# NOTE(review): n_estimators=100 is larger than the 10/15 values the search
# tried, so the selected depth was tuned for smaller ensembles — confirm this
# is intended.
best_max_depth = forest_gs.best_params_['max_depth']
random_forest_model = RandomForestRegressor(random_state=12345, max_depth=best_max_depth, n_estimators=100)
random_forest_model.fit(features_train, target_train)

RandomForestRegressor(max_depth=9, random_state=12345)

In [22]:
# R^2 on the training split, then on the validation split, one per line;
# the gap between the two indicates how much the forest overfits.
print(
    random_forest_model.score(features_train, target_train),
    random_forest_model.score(features_valid, target_valid),
    sep='\n',
)

0.2665963341918417
0.15518627091098214


In [None]:
# Serialize the forest model to disk in the current working directory.
joblib.dump(value=random_forest_model, filename='best_r2.joblib')