## PreProcessing and Modeling

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# bring in testing data
test_df = pd.read_csv('DB/test_sample.csv', index_col='Unnamed: 0')
test_df = test_df.dropna()

# bring in flights data
flights_df = pd.read_csv('DB/flights_data.csv', index_col='Unnamed: 0')

In [3]:
# find testing data features
feature_cols = list(test_df.columns)
feature_cols.append('arr_delay')

# create base training features from existing testing features
X = flights_df[feature_cols]
X = X.dropna()

In [4]:
# find numeric and categorical features
cols = X.columns
num_cols = X._get_numeric_data().columns
cat_cols = list(set(cols) - set(num_cols))

# remove redundant numeric columns
final_num_cols = list(num_cols)
final_num_cols.remove('op_carrier_fl_num')
final_num_cols.remove('flights')

# remove redundant categorical columns
final_cat_cols = ['mkt_unique_carrier', 'fl_date', 'tail_num', 'branded_code_share']

# combine final features
final_features = final_num_cols + final_cat_cols

X = X[final_features]

# convert fl_date feature into datetime
X['fl_date'] = pd.to_datetime(X['fl_date'])

# separate datetime into date features
X['year'] = X['fl_date'].dt.year
X['month'] = X['fl_date'].dt.month
X['week'] = X['fl_date'].dt.isocalendar().week
X['day'] = X['fl_date'].dt.day
X['day_of_week'] = X['fl_date'].dt.dayofweek

# reset index for collaborative data sorting structure
X = X.reset_index()
X.index.name = 'order'
X = X.drop(columns=['index'])

# drop original fl_date and arr_delay columns
X = X.drop(columns=['fl_date'])

***

## Join New Features

In [5]:
# bring in feature data
taxi_and_delay = pd.read_csv('James/features_created.csv')
avg_monthly_pas = pd.read_csv('Riley/avg_monthly_pas.csv')


In [6]:
taxi_and_delay.head()

Unnamed: 0.1,Unnamed: 0,order,distance,crs_dep_time,crs_arr_time,mkt_unique_carrier,mean_taxi_out/time,mean_taxi_in/time,mean_dep_delay/time,mean_arr_delay/time,mean_dep_delay/distance,mean_arr_delay/distance,mean_dep_delay/carrier,mean_arr_delay/carrier
0,0,0,733.0,1300,1444,UA,17.372642,8.39823,9.557783,16.20354,17.384397,13.644793,12.628894,8.687159
1,1,1,1075.0,630,854,UA,16.813246,9.232143,4.561346,5.321429,8.236994,0.011561,12.628894,8.687159
2,2,2,488.0,1500,1709,UA,18.789406,8.639053,14.756792,10.242604,13.277929,7.323288,12.628894,8.687159
3,3,3,199.0,2041,2159,UA,21.924528,7.333333,11.867925,18.743961,14.227513,6.284946,12.628894,8.687159
4,4,4,224.0,2140,2257,UA,17.312977,8.170455,14.675573,3.227273,10.140485,8.314578,12.628894,8.687159


In [7]:
# drop extra column
taxi_and_delay = taxi_and_delay.drop(columns=['Unnamed: 0'])

In [8]:
# merge taxi and delay features onto base training dataset
merge_cols = ['order',
              'mean_taxi_out/time',
              'mean_taxi_in/time',
              'mean_dep_delay/time',
              'mean_arr_delay/time',
              'mean_dep_delay/distance',
              'mean_arr_delay/distance',
              'mean_dep_delay/carrier',
              'mean_arr_delay/carrier']

X = pd.merge(X, taxi_and_delay[merge_cols], how='left', on=['order'])

In [9]:
# merge onto training DataFrame
final = pd.merge(X, avg_monthly_pas, how='left', on=['origin_airport_id','month'])

In [10]:
final = final.dropna()

In [11]:
final = final.drop(['order'], axis=1)
final.head()

Unnamed: 0,mkt_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,mkt_unique_carrier,tail_num,...,day_of_week,mean_taxi_out/time,mean_taxi_in/time,mean_dep_delay/time,mean_arr_delay/time,mean_dep_delay/distance,mean_arr_delay/distance,mean_dep_delay/carrier,mean_arr_delay/carrier,avg_monthly_pas
0,3501,12953,13930,1300,1444,164.0,733.0,-28.0,UA,N744YX,...,0,17.372642,8.39823,9.557783,16.20354,17.384397,13.644793,12.628894,8.687159,66470.5
1,3502,11433,12266,630,854,204.0,1075.0,1.0,UA,N640RW,...,0,16.813246,9.232143,4.561346,5.321429,8.236994,0.011561,12.628894,8.687159,108812.5
2,3503,11618,11433,1500,1709,129.0,488.0,18.0,UA,N641RW,...,0,18.789406,8.639053,14.756792,10.242604,13.277929,7.323288,12.628894,8.687159,94693.5
3,3504,11618,11278,2041,2159,78.0,199.0,32.0,UA,N722YX,...,0,21.924528,7.333333,11.867925,18.743961,14.227513,6.284946,12.628894,8.687159,94693.5
4,3505,12266,11298,2140,2257,77.0,224.0,-1.0,UA,N855RW,...,0,17.312977,8.170455,14.675573,3.227273,10.140485,8.314578,12.628894,8.687159,155988.0


In [12]:
final.shape

(156741, 25)

In [13]:
final.to_csv('final_training.csv', index=False)

***

## ML Setup

In [2]:
df = pd.read_csv('final_training.csv')

In [3]:
# set X & y
X = df.drop(columns=['arr_delay'])
y = df['arr_delay']

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 101)

In [5]:
import category_encoders as ce

encoder = ce.OrdinalEncoder(cols=['mkt_unique_carrier', 'tail_num', 'branded_code_share'])

X_train = encoder.fit_transform(X_train)

X_test = encoder.transform(X_test)

  elif pd.api.types.is_categorical(cols):


In [6]:
X_train.shape

(109718, 24)

In [9]:
# importing scalers and PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA

# instantiating
ss = StandardScaler()
mm = MinMaxScaler()
rs = RobustScaler()
pca = PCA(n_components=0.95, svd_solver='full', random_state=101)
# pca above will pick components to get to 95% of variance explained

# creating scaled train/test for each

# standard
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)

# minmax
X_train_mm = mm.fit_transform(X_train)
X_test_mm = mm.transform(X_test)

# robust
X_train_rs = rs.fit_transform(X_train)
X_test_rs = rs.transform(X_test)

# PCA (using MinMax scaled data)
X_train_pca = pca.fit_transform(X_train_mm)
X_test_pca = pca.transform(X_test_mm)

In [10]:
# importing models to work with
from xgboost import XGBRegressor # XGBoost Regression
from sklearn.neighbors import KNeighborsRegressor # KNeighbours Regression
from sklearn.tree import DecisionTreeRegressor # Decision Tree Regression
from sklearn.model_selection import GridSearchCV # GridSearch for optimizing
from sklearn import metrics as me # metrics for evaluation

In [25]:
def score_model(model, X_train, y_train, X_test, y_test, scaling='Base'):
    '''
    function to score given model with given data and scaling label
    
    dependencies: import numpy as np
                  from sklearn import metrics as me
    
    model = model instantiated with optimal params set
    X_train, y_train = training data split used as np.array
    X_test, y_test = data split to test as np.array
    scaling = scaling/preprocessing used on data as str ex. 'RobustScale' or 'PCA'
              default = 'Base' for no scaling/reduction used
    '''
    # fit model
    model.fit(X_train, y_train)

    # predict Train set results
    y_pred_train = model.predict(X_train)

    # predict Test set results
    y_pred = model.predict(X_test)

    # computing RMSE + R2
    rmse_train = np.sqrt(me.mean_squared_error(y_train, y_pred_train))
    rmse_test = np.sqrt(me.mean_squared_error(y_test, y_pred))
    r2_train = me.r2_score(y_train, y_pred_train)
    r2_test = me.r2_score(y_test, y_pred)
    print(f"{scaling} data score:")
    print("Train RMSE: %f" % (rmse_train))
    print("Train R2: %f" % (r2_train))
    print("Test RMSE: %f" % (rmse_test))
    print("Test R2: %f" % (r2_test))

### XGBoost Regressor

In [28]:
# instantiating classifier
xgb = XGBRegressor()

# create params for GridSearch
param_grid = {'objective':['reg:squarederror'],
              'learning_rate': [0.03, 0.05, 0.07], 
              'max_depth': [5, 6, 7],
              'min_child_weight': [2, 3, 4],
              'subsample': [0.7],
              'colsample_bytree': [0.5, 0.7],
              'n_estimators': [100, 500, 1000]}



# instantiate GridSearch
gscv = GridSearchCV(estimator=xgb, param_grid=param_grid, n_jobs=-1)

# fit model
xgb_base = gscv.fit(X_train, y_train)

# get best params
print('Best parameters for Base:', xgb_base.best_params_)

Best parameters for Base: {'colsample_bytree': 0.5, 'learning_rate': 0.03, 'max_depth': 7, 'min_child_weight': 4, 'n_estimators': 500, 'objective': 'reg:squarederror', 'subsample': 0.7}


In [37]:
# setting optimal params to model
xgb1 = XGBRegressor(
    colsample_bytree=0.5, 
    learning_rate=0.03,
    max_depth=7,
    min_child_weight=4,
    n_estimators=500,
    objective='reg:squarederror',
    subsample=0.7
)

# fit model
xgb1.fit(X_train, y_train)

# predict Train set results
y_pred_train = xgb1.predict(X_train)

# predict Test set results
y_pred_base = xgb1.predict(X_test)

# computing RMSE + R2
rmse_train = np.sqrt(me.mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(me.mean_squared_error(y_test, y_pred_base))
r2_train = me.r2_score(y_train, y_pred_train)
r2_test = me.r2_score(y_test, y_pred_base)
print("Train RMSE: %f" % (rmse_train))
print("Train R2: %f" % (r2_train))
print("Test RMSE: %f" % (rmse_test))
print("Test R2: %f" % (r2_test))

Train RMSE: 41.779740
Train R2: 0.230241
Test RMSE: 49.048042
Test R2: 0.036613


In [11]:
# setting optimal params to model
xgb1 = XGBRegressor(
    colsample_bytree=0.5, 
    learning_rate=0.03,
    max_depth=7,
    min_child_weight=4,
    n_estimators=500,
    objective='reg:squarederror',
    subsample=0.7
)

# fit model
xgb1.fit(X_train_ss, y_train)

# predict Train set results
y_pred_train = xgb1.predict(X_train_ss)

# predict Test set results
y_pred_base = xgb1.predict(X_test_ss)

# computing RMSE + R2
rmse_train = np.sqrt(me.mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(me.mean_squared_error(y_test, y_pred_base))
r2_train = me.r2_score(y_train, y_pred_train)
r2_test = me.r2_score(y_test, y_pred_base)
print('StandardScale data scores:')
print("Train RMSE: %f" % (rmse_train))
print("Train R2: %f" % (r2_train))
print("Test RMSE: %f" % (rmse_test))
print("Test R2: %f" % (r2_test))

StandardScale data scores:
Train RMSE: 41.837357
Train R2: 0.228117
Test RMSE: 49.058476
Test R2: 0.036203


In [13]:
# setting optimal params to model
xgb1 = XGBRegressor(
    colsample_bytree=0.5, 
    learning_rate=0.03,
    max_depth=7,
    min_child_weight=4,
    n_estimators=500,
    objective='reg:squarederror',
    subsample=0.7
)

# fit model
xgb1.fit(X_train_mm, y_train)

# predict Train set results
y_pred_train = xgb1.predict(X_train_mm)

# predict Test set results
y_pred_base = xgb1.predict(X_test_mm)

# computing RMSE + R2
rmse_train = np.sqrt(me.mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(me.mean_squared_error(y_test, y_pred_base))
r2_train = me.r2_score(y_train, y_pred_train)
r2_test = me.r2_score(y_test, y_pred_base)
print('MinMaxScale data scores:')
print("Train RMSE: %f" % (rmse_train))
print("Train R2: %f" % (r2_train))
print("Test RMSE: %f" % (rmse_test))
print("Test R2: %f" % (r2_test))

MinMaxScale data scores:
Train RMSE: 41.771884
Train R2: 0.230531
Test RMSE: 49.067973
Test R2: 0.035830


In [14]:
# setting optimal params to model
xgb1 = XGBRegressor(
    colsample_bytree=0.5, 
    learning_rate=0.03,
    max_depth=7,
    min_child_weight=4,
    n_estimators=500,
    objective='reg:squarederror',
    subsample=0.7
)

# fit model
xgb1.fit(X_train_rs, y_train)

# predict Train set results
y_pred_train = xgb1.predict(X_train_rs)

# predict Test set results
y_pred_base = xgb1.predict(X_test_rs)

# computing RMSE + R2
rmse_train = np.sqrt(me.mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(me.mean_squared_error(y_test, y_pred_base))
r2_train = me.r2_score(y_train, y_pred_train)
r2_test = me.r2_score(y_test, y_pred_base)
print('RobustScale data scores:')
print("Train RMSE: %f" % (rmse_train))
print("Train R2: %f" % (r2_train))
print("Test RMSE: %f" % (rmse_test))
print("Test R2: %f" % (r2_test))

RobustScale data scores:
Train RMSE: 41.768656
Train R2: 0.230650
Test RMSE: 49.056157
Test R2: 0.036294


In [15]:
# setting optimal params to model
xgb1 = XGBRegressor(
    colsample_bytree=0.5, 
    learning_rate=0.03,
    max_depth=7,
    min_child_weight=4,
    n_estimators=500,
    objective='reg:squarederror',
    subsample=0.7
)

# fit model
xgb1.fit(X_train_pca, y_train)

# predict Train set results
y_pred_train = xgb1.predict(X_train_pca)

# predict Test set results
y_pred_base = xgb1.predict(X_test_pca)

# computing RMSE + R2
rmse_train = np.sqrt(me.mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(me.mean_squared_error(y_test, y_pred_base))
r2_train = me.r2_score(y_train, y_pred_train)
r2_test = me.r2_score(y_test, y_pred_base)
print('PCA data scores:')
print("Train RMSE: %f" % (rmse_train))
print("Train R2: %f" % (r2_train))
print("Test RMSE: %f" % (rmse_test))
print("Test R2: %f" % (r2_test))

PCA data scores:
Train RMSE: 43.116130
Train R2: 0.180210
Test RMSE: 49.415006
Test R2: 0.022144


### Decision Tree

In [40]:
# Create Decision Tree classifer object
dtree = DecisionTreeRegressor()

# create params for GridSearch
param_grid = {'criterion':['squared_error'],
              'max_depth': [None, 2, 3, 5, 7], 
              'min_samples_split': [2, 3, 4, 5],
              'max_features':  ['auto', 'sqrt', 'log2'],
              'splitter': ['best','random']}



# instantiate GridSearch
gscv1 = GridSearchCV(estimator=dtree, param_grid=param_grid, n_jobs=-1)

# fit model
tree_base = gscv1.fit(X_train, y_train)
tree_ss = gscv1.fit(X_train_ss, y_train)
tree_mm = gscv1.fit(X_train_mm, y_train)
tree_rs = gscv1.fit(X_train_rs, y_train)

# get best params
print('Best parameters for Base:', tree_base.best_params_)
print('Best parameters for StandardScale:', tree_ss.best_params_)
print('Best parameters for MinMaxScale:', tree_mm.best_params_)
print('Best parameters for RobustScale:', tree_rs.best_params_)

Best parameters for Base: {'criterion': 'squared_error', 'max_depth': 5, 'max_features': 'auto', 'min_samples_split': 4, 'splitter': 'random'}
Best parameters for StandardScale: {'criterion': 'squared_error', 'max_depth': 5, 'max_features': 'auto', 'min_samples_split': 4, 'splitter': 'random'}
Best parameters for MinMaxScale: {'criterion': 'squared_error', 'max_depth': 5, 'max_features': 'auto', 'min_samples_split': 4, 'splitter': 'random'}
Best parameters for RobustScale: {'criterion': 'squared_error', 'max_depth': 5, 'max_features': 'auto', 'min_samples_split': 4, 'splitter': 'random'}


In [42]:
# setting optimal params to model
dtree1 = DecisionTreeRegressor(
    criterion= 'squared_error', 
    max_depth= 5, 
    max_features='auto', 
    min_samples_split= 4, 
    splitter='random'
)
# Base data
# fit model
dtree1.fit(X_train, y_train)

# predict Train set results
y_pred_train = dtree1.predict(X_train)

# predict Test set results
y_pred_base = dtree1.predict(X_test)

# computing RMSE + R2
rmse_train = np.sqrt(me.mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(me.mean_squared_error(y_test, y_pred_base))
r2_train = me.r2_score(y_train, y_pred_train)
r2_test = me.r2_score(y_test, y_pred_base)
print('Base data scores:')
print("Train RMSE: %f" % (rmse_train))
print("Train R2: %f" % (r2_train))
print("Test RMSE: %f" % (rmse_test))
print("Test R2: %f" % (r2_test))

Base data scores:
Train RMSE: 47.273290
Train R2: 0.014504
Test RMSE: 49.657232
Test R2: 0.012533


In [46]:
# StandardScale data
# fit model
dtree1.fit(X_train_ss, y_train)

# predict Train set results
y_pred_train = dtree1.predict(X_train_ss)

# predict Test set results
y_pred_base = dtree1.predict(X_test_ss)

# computing RMSE + R2
rmse_train = np.sqrt(me.mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(me.mean_squared_error(y_test, y_pred_base))
r2_train = me.r2_score(y_train, y_pred_train)
r2_test = me.r2_score(y_test, y_pred_base)
print('StandardScale data scores:')
print("Train RMSE: %f" % (rmse_train))
print("Train R2: %f" % (r2_train))
print("Test RMSE: %f" % (rmse_test))
print("Test R2: %f" % (r2_test))

StandardScale data scores:
Train RMSE: 47.245917
Train R2: 0.015645
Test RMSE: 49.635784
Test R2: 0.013386


In [47]:
# MinMaxScale data
# fit model
dtree1.fit(X_train_mm, y_train)

# predict Train set results
y_pred_train = dtree1.predict(X_train_mm)

# predict Test set results
y_pred_base = dtree1.predict(X_test_mm)

# computing RMSE + R2
rmse_train = np.sqrt(me.mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(me.mean_squared_error(y_test, y_pred_base))
r2_train = me.r2_score(y_train, y_pred_train)
r2_test = me.r2_score(y_test, y_pred_base)
print('MinMaxScale data scores:')
print("Train RMSE: %f" % (rmse_train))
print("Train R2: %f" % (r2_train))
print("Test RMSE: %f" % (rmse_test))
print("Test R2: %f" % (r2_test))

MinMaxScale data scores:
Train RMSE: 47.138246
Train R2: 0.020127
Test RMSE: 49.695846
Test R2: 0.010997


In [48]:
# RobustScale data
# fit model
dtree1.fit(X_train_rs, y_train)

# predict Train set results
y_pred_train = dtree1.predict(X_train_rs)

# predict Test set results
y_pred_base = dtree1.predict(X_test_rs)

# computing RMSE + R2
rmse_train = np.sqrt(me.mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(me.mean_squared_error(y_test, y_pred_base))
r2_train = me.r2_score(y_train, y_pred_train)
r2_test = me.r2_score(y_test, y_pred_base)
print('RobustScale data scores:')
print("Train RMSE: %f" % (rmse_train))
print("Train R2: %f" % (r2_train))
print("Test RMSE: %f" % (rmse_test))
print("Test R2: %f" % (r2_test))

RobustScale data scores:
Train RMSE: 47.251834
Train R2: 0.015399
Test RMSE: 49.603000
Test R2: 0.014689


In [17]:
# PCA data
# setting optimal params to model
dtree1 = DecisionTreeRegressor(
    criterion= 'squared_error', 
    max_depth= 5, 
    max_features='auto', 
    min_samples_split= 4, 
    splitter='random'
)

# fit model
dtree1.fit(X_train_pca, y_train)

# predict Train set results
y_pred_train = dtree1.predict(X_train_pca)

# predict Test set results
y_pred_base = dtree1.predict(X_test_pca)

# computing RMSE + R2
rmse_train = np.sqrt(me.mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(me.mean_squared_error(y_test, y_pred_base))
r2_train = me.r2_score(y_train, y_pred_train)
r2_test = me.r2_score(y_test, y_pred_base)
print('PCA data scores:')
print("Train RMSE: %f" % (rmse_train))
print("Train R2: %f" % (r2_train))
print("Test RMSE: %f" % (rmse_test))
print("Test R2: %f" % (r2_test))

PCA data scores:
Train RMSE: 47.392818
Train R2: 0.009514
Test RMSE: 49.789242
Test R2: 0.007276


### KNN Regression

In [18]:
# Create KNRegressor classifer object
knr = KNeighborsRegressor()

In [25]:
# fit model
knr_base = knr.fit(X_train_ss, y_train)

# predict Train set results
y_pred_train = knr.predict(X_train_ss)

# predict Test set results
y_pred_base = knr.predict(X_test_ss)

# computing RMSE + R2
rmse_train = np.sqrt(me.mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(me.mean_squared_error(y_test, y_pred_base))
r2_train = me.r2_score(y_train, y_pred_train)
r2_test = me.r2_score(y_test, y_pred_base)
print('StandardScale data scores:')
print("Train RMSE: %f" % (rmse_train))
print("Train R2: %f" % (r2_train))
print("Test RMSE: %f" % (rmse_test))
print("Test R2: %f" % (r2_test))

StandardScale data scores:
Train RMSE: 41.648881
Train R2: 0.235056
Test RMSE: 52.970414
Test R2: -0.123632


In [26]:
# fit model
knr_base = knr.fit(X_train_rs, y_train)

# predict Train set results
y_pred_train = knr.predict(X_train_rs)

# predict Test set results
y_pred_base = knr.predict(X_test_rs)

# computing RMSE + R2
rmse_train = np.sqrt(me.mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(me.mean_squared_error(y_test, y_pred_base))
r2_train = me.r2_score(y_train, y_pred_train)
r2_test = me.r2_score(y_test, y_pred_base)
print('RobustScale data scores:')
print("Train RMSE: %f" % (rmse_train))
print("Train R2: %f" % (r2_train))
print("Test RMSE: %f" % (rmse_test))
print("Test R2: %f" % (r2_test))

RobustScale data scores:
Train RMSE: 41.827172
Train R2: 0.228493
Test RMSE: 53.286869
Test R2: -0.137098


In [27]:
# fit model
knr_base = knr.fit(X_train_mm, y_train)

# predict Train set results
y_pred_train = knr.predict(X_train_mm)

# predict Test set results
y_pred_base = knr.predict(X_test_mm)

# computing RMSE + R2
rmse_train = np.sqrt(me.mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(me.mean_squared_error(y_test, y_pred_base))
r2_train = me.r2_score(y_train, y_pred_train)
r2_test = me.r2_score(y_test, y_pred_base)
print('MinMaxScale data scores:')
print("Train RMSE: %f" % (rmse_train))
print("Train R2: %f" % (r2_train))
print("Test RMSE: %f" % (rmse_test))
print("Test R2: %f" % (r2_test))

MinMaxScale data scores:
Train RMSE: 40.771489
Train R2: 0.266946
Test RMSE: 52.198470
Test R2: -0.091121


In [19]:
# fit model
knr_base = knr.fit(X_train_pca, y_train)

# predict Train set results
y_pred_train = knr.predict(X_train_pca)

# predict Test set results
y_pred_base = knr.predict(X_test_pca)

# computing RMSE + R2
rmse_train = np.sqrt(me.mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(me.mean_squared_error(y_test, y_pred_base))
r2_train = me.r2_score(y_train, y_pred_train)
r2_test = me.r2_score(y_test, y_pred_base)
print('PCA data scores:')
print("Train RMSE: %f" % (rmse_train))
print("Train R2: %f" % (r2_train))
print("Test RMSE: %f" % (rmse_test))
print("Test R2: %f" % (r2_test))

PCA data scores:
Train RMSE: 40.757392
Train R2: 0.267452
Test RMSE: 52.247074
Test R2: -0.093154


### Comparing results

##### Obtaining top two results from models:

In [22]:
# compiling lists

models = ['KNRegressor_minmax',
          'KNRegressor_pca',
          'DecisionTree_standard',
          'DecisionTree_robust',
          'XGBoost',
          'XGBoost_robust']

train_r2 = [0.266946,
            0.267452,
            0.015645,
            0.015399,
            0.230241,
            0.230650]

train_RMSE = [40.771489,
              40.757392,
              47.245917,
              47.251834,
              41.779740,
              41.768656]

test_r2 = [-0.091121,
           -0.093154,
           0.013386,
           0.014689,
           0.036613,
           0.036294]

test_RMSE = [52.198470,
             52.247074,
             49.635784,
             49.603000,
             49.048042,
             49.056157]

# zipping into dictionary

results_dict = {k: (v1, v2, v3, v4) for k, v1, v2, v3, v4 in zip(models, train_r2, train_RMSE, test_r2, test_RMSE)}

In [23]:
# creating df
results = pd.DataFrame.from_dict(results_dict, orient='index',
                       columns=['Train r2', 'Train RMSE', 'Test r2', 'Test RMSE'])

# checking
results

Unnamed: 0,Train r2,Train RMSE,Test r2,Test RMSE
KNRegressor_minmax,0.266946,40.771489,-0.091121,52.19847
KNRegressor_pca,0.267452,40.757392,-0.093154,52.247074
DecisionTree_standard,0.015645,47.245917,0.013386,49.635784
DecisionTree_robust,0.015399,47.251834,0.014689,49.603
XGBoost,0.230241,41.77974,0.036613,49.048042
XGBoost_robust,0.23065,41.768656,0.036294,49.056157


In [24]:
# saving to csv
results.to_csv('results.csv')