# INTRODUCTION
This notebook fits regularized linear regression models (Lasso, Ridge, and ElasticNet) to the pre-processed data produced by "1. daily_import_merge_engineer.ipynb".

## Libraries

In [14]:
import os

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Directory containing the daily parquet data; it is joined with the file name
# when the merged flights/weather table is loaded. The path is relative, so it
# presumably resolves against the notebook's working directory (verify).
DAILY_DATA_PATH = "data.v3/daily"

## Import data & column groups

In [3]:
# Load the merged daily flights + weather table from the daily data directory
daily_merged_path = os.path.join(DAILY_DATA_PATH, "daily_flights_and_weather_merged.parquet")
df = pd.read_parquet(daily_merged_path)

# Flights column groups
# Terminal-level columns: the `arr` columns for terminals A-E first, then the
# `dep` columns for terminals A-E.
flights_terminal_cols = [
    f'flights_{direction}_{terminal}'
    for direction in ('arr', 'dep')
    for terminal in 'ABCDE'
]

# Non-terminal columns; used (together with the percentage columns) as targets.
flights_non_terminal_cols = [
    'flights_total',
    'flights_cancel',
    'flights_delay',
    'flights_ontime',
    'flights_arr_ontime',
    'flights_arr_delay',
    'flights_arr_cancel',
    'flights_dep_ontime',
    'flights_dep_delay',
    'flights_dep_cancel',
]

# Percentage columns. The order is irregular on purpose: it is kept exactly as
# listed because it determines the order in which targets are processed.
flights_percentage_cols = [
    'flights_cancel_pct',
    'flights_delay_pct',
    'flights_ontime_pct',
    'flights_arr_delay_pct',
    'flights_arr_ontime_pct',
    'flights_arr_cancel_pct',
    'flights_dep_delay_pct',
    'flights_dep_ontime_pct',
    'flights_dep_cancel_pct',
]

# Date column groups
date_cols = [
    'date', 'covid', 'ordinal_date', 'year', 'month', 'day_of_month',
    'day_of_week', 'season', 'holiday', 'halloween', 'xmas_eve',
    'new_years_eve', 'jan_2', 'jan_3', 'day_before_easter',
    'days_until_xmas', 'days_until_thanksgiving', 'days_until_july_4th',
    'days_until_labor_day', 'days_until_memorial_day',
]

# Weather column groups
weather_cols = [
    'wx_temperature_max', 'wx_temperature_min', 'wx_apcp', 'wx_prate',
    'wx_asnow', 'wx_frozr', 'wx_vis', 'wx_gust', 'wx_maxref', 'wx_cape',
    'wx_lftx', 'wx_wind_speed', 'wx_wind_direction',
]

# Lag column groups: lags 1-7 of flights_total, followed by lags 1-7 of flights_cancel
lag_cols = [
    f'flights_{metric}_lag_{lag}'
    for metric in ('total', 'cancel')
    for lag in range(1, 8)
]

# DATA PREPROCESSING

## Train Test Split

In [4]:
# Feature set: the 'random' column plus the date, weather and lag groups.
train_features = ['random'] + date_cols + weather_cols + lag_cols

# X holds every selected feature except 'date'; y holds all count and percentage targets.
X = df[train_features].drop(columns='date')
y = df[flights_non_terminal_cols + flights_percentage_cols]

print(X.columns.tolist())
print("\nTarget columns\n", y.head())

# NOTE(review): both splits below shuffle rows at random. The features include
# lagged values of the flight counts, so neighbouring days can land on opposite
# sides of a split. Confirm whether a chronological split is wanted instead.

# Hold out 10% of the rows as the test set
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Carve a validation set (10%) out of the remaining training rows
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.1, random_state=42
)

# Print shapes
for label, frame in (
    ("X_train_full", X_train_full),
    ("y_train_full", y_train_full),
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_Test", X_test),
):
    print(f"{label} shape:", frame.shape)


['random', 'covid', 'ordinal_date', 'year', 'month', 'day_of_month', 'day_of_week', 'season', 'holiday', 'halloween', 'xmas_eve', 'new_years_eve', 'jan_2', 'jan_3', 'day_before_easter', 'days_until_xmas', 'days_until_thanksgiving', 'days_until_july_4th', 'days_until_labor_day', 'days_until_memorial_day', 'wx_temperature_max', 'wx_temperature_min', 'wx_apcp', 'wx_prate', 'wx_asnow', 'wx_frozr', 'wx_vis', 'wx_gust', 'wx_maxref', 'wx_cape', 'wx_lftx', 'wx_wind_speed', 'wx_wind_direction', 'flights_total_lag_1', 'flights_total_lag_2', 'flights_total_lag_3', 'flights_total_lag_4', 'flights_total_lag_5', 'flights_total_lag_6', 'flights_total_lag_7', 'flights_cancel_lag_1', 'flights_cancel_lag_2', 'flights_cancel_lag_3', 'flights_cancel_lag_4', 'flights_cancel_lag_5', 'flights_cancel_lag_6', 'flights_cancel_lag_7']

Target columns
             flights_total  flights_cancel  flights_delay  flights_ontime  \
2018-07-20         1898.0            24.0          430.0          1444.0   
2018-07-21 

## Column transformers

In [5]:
# Categorical features are one-hot encoded. handle_unknown='ignore' because
# some observed holidays may not be in the training data.
categorical_tranformer = make_pipeline(OneHotEncoder(handle_unknown='ignore'))
# Numeric features are standardized.
numeric_transformer = make_pipeline(StandardScaler())

# print value counts of unique data types in X
print(X.dtypes.value_counts())

# Identify categorical and numeric columns in X.
# 'number' matches every numeric dtype, so a column stored with another width
# (e.g. int16 or float16) is not silently left out of the numeric group.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include='number').columns.tolist()

# Check that all columns are accounted for
print(f"categorical columns: {categorical_cols}")
print(f"numeric columns: {numeric_cols}")
print(len(categorical_cols) + len(numeric_cols) == X_train_full.shape[1])

# ColumnTransformer drops any column not listed in a transformer (default
# remainder='drop'), so an unassigned feature would vanish from the models
# without any error. Fail loudly instead.
assigned_cols = set(categorical_cols) | set(numeric_cols)
unassigned_cols = [col for col in X.columns if col not in assigned_cols]
if unassigned_cols:
    raise ValueError(
        f"Columns not assigned to a transformer (unsupported dtype?): {unassigned_cols}"
    )

# Linear regression transformer
LR__transformer = ColumnTransformer(
    transformers=[
        ('cat', categorical_tranformer, categorical_cols),
        ('num', numeric_transformer, numeric_cols)
    ])

float64    23
object     11
int64       7
float32     4
int32       2
Name: count, dtype: int64
categorical columns: ['covid', 'month', 'day_of_week', 'season', 'holiday', 'halloween', 'xmas_eve', 'new_years_eve', 'jan_2', 'jan_3', 'day_before_easter']
numeric columns: ['random', 'ordinal_date', 'year', 'day_of_month', 'days_until_xmas', 'days_until_thanksgiving', 'days_until_july_4th', 'days_until_labor_day', 'days_until_memorial_day', 'wx_temperature_max', 'wx_temperature_min', 'wx_apcp', 'wx_prate', 'wx_asnow', 'wx_frozr', 'wx_vis', 'wx_gust', 'wx_maxref', 'wx_cape', 'wx_lftx', 'wx_wind_speed', 'wx_wind_direction', 'flights_total_lag_1', 'flights_total_lag_2', 'flights_total_lag_3', 'flights_total_lag_4', 'flights_total_lag_5', 'flights_total_lag_6', 'flights_total_lag_7', 'flights_cancel_lag_1', 'flights_cancel_lag_2', 'flights_cancel_lag_3', 'flights_cancel_lag_4', 'flights_cancel_lag_5', 'flights_cancel_lag_6', 'flights_cancel_lag_7']
True


## Lasso regression

We'll start with Lasso, without cross-validation, for a single target: `flights_ontime`.

In [6]:
# Single-target baseline: shared preprocessing followed by Lasso with a fixed alpha
lasso_ontime = make_pipeline(LR__transformer, Lasso(alpha=10))
lasso_ontime.fit(X_train, y_train['flights_ontime'])

# Score the fitted pipeline on the validation split
y_pred_ontime = lasso_ontime.predict(X_val)
print("R2 score:", r2_score(y_val['flights_ontime'], y_pred_ontime))

# Transformed feature names and the coefficient fitted for each of them
lasso_ontime_features = lasso_ontime.named_steps['columntransformer'].get_feature_names_out()
lasso_ontime_coef = lasso_ontime.named_steps['lasso'].coef_

# One row per feature, ordered by absolute coefficient (largest first),
# keeping only rows whose absolute coefficient exceeds 0.1
lasso_ontime_df = (
    pd.DataFrame({'features': lasso_ontime_features, 'coefficients': lasso_ontime_coef})
    .assign(coefficients_abs=lambda frame: frame['coefficients'].abs())
    .sort_values(by='coefficients_abs', ascending=False)
    .loc[lambda frame: frame['coefficients_abs'] > .1]
)

print("Lasso coefficients:\n", lasso_ontime_df)

R2 score: 0.6236520201173014
Lasso coefficients:
                      features  coefficients  coefficients_abs
70             num__wx_maxref    -85.302991         85.302991
81   num__flights_total_lag_7     77.050106         77.050106
82  num__flights_cancel_lag_1    -68.795517         68.795517
75   num__flights_total_lag_1     33.323642         33.323642
77   num__flights_total_lag_3     26.461647         26.461647
78   num__flights_total_lag_4     24.954123         24.954123
71               num__wx_cape    -14.507070         14.507070
69               num__wx_gust    -13.824874         13.824874
64               num__wx_apcp    -10.026545         10.026545
67              num__wx_frozr     -8.735262          8.735262
66              num__wx_asnow     -7.313045          7.313045
79   num__flights_total_lag_5      6.592167          6.592167
80   num__flights_total_lag_6      4.076124          4.076124
88  num__flights_cancel_lag_7     -1.637139          1.637139
65              num_

Lasso regression on all targets, using `GridSearchCV` to tune alpha

In [29]:
# Candidate regularization strengths for the Lasso step
param_grid = {'lasso__alpha': [.01, .1, 1, 10, 20]}

# max_iter is raised above scikit-learn's default of 1000 because the
# coordinate-descent solver emitted repeated convergence warnings during the
# grid search; a fit that has not converged gives unreliable CV scores.
lasso_pipeline = make_pipeline(
    LR__transformer,
    Lasso(max_iter=10_000)
)

grid_search = GridSearchCV(
    lasso_pipeline,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1, # n_jobs=-1 means use all available CPU cores
    verbose=0
    )

# Tuned pipeline per target, keyed as "lasso_<target>"
lasso_models = {}

# Fit lasso models for all targets. Every fit() produces a fresh
# best_estimator_, so the stored pipelines are independent objects.
for target in y.columns.tolist():
    grid_search.fit(X_train, y_train[target])
    lasso_models[f"lasso_{target}"] = grid_search.best_estimator_

# Print best parameters for all lasso models
for model_name, model in lasso_models.items():
    print(f"{model_name} best parameters: {model.named_steps['lasso'].alpha}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso_flights_total best parameters: 0.1
lasso_flights_cancel best parameters: 1
lasso_flights_delay best parameters: 0.1
lasso_flights_ontime best parameters: 0.1
lasso_flights_arr_ontime best parameters: 0.1
lasso_flights_arr_delay best parameters: 0.1
lasso_flights_arr_cancel best parameters: 0.1
lasso_flights_dep_ontime best parameters: 0.1
lasso_flights_dep_delay best parameters: 0.1
lasso_flights_dep_cancel best parameters: 0.1
lasso_flights_cancel_pct best parameters: 0.1
lasso_flights_delay_pct best parameters: 0.1
lasso_flights_ontime_pct best parameters: 0.1
lasso_flights_arr_delay_pct best parameters: 0.1
lasso_flights_arr_ontime_pct best parameters: 0.1
lasso_flights_arr_cancel_pct best parameters: 0.1
lasso_flights_dep_delay_pct best parameters: 0.1
lasso_flights_dep_ontime_pct best parameters: 0.1
lasso_flights_dep_cancel_pct best parameters: 0.1


R-squared on validation set for lasso regression models

In [21]:
# Validation-set predictions keyed as "y_pred_<target>", and R2 scores keyed by target
lasso_predictions = {}
lasso_r2_scores = {}

# Get R2 scores for Lasso regression
for target in y.columns.tolist():
    target_pred = lasso_models[f'lasso_{target}'].predict(X_val)
    lasso_predictions[f'y_pred_{target}'] = target_pred

    # Keep the score so it can be compared with other models later,
    # instead of only printing it
    lasso_r2_scores[target] = r2_score(y_val[target], target_pred)

    print(f"R2 score for Lasso_{target}: {lasso_r2_scores[target]}")

R2 score for Lasso_flights_total: 0.9158765478432008
R2 score for Lasso_flights_cancel: 0.823228661772087
R2 score for Lasso_flights_delay: 0.335181738403064
R2 score for Lasso_flights_ontime: 0.656340419596772
R2 score for Lasso_flights_arr_ontime: 0.6710188460023864
R2 score for Lasso_flights_arr_delay: 0.29620587454594227
R2 score for Lasso_flights_arr_cancel: 0.8303846950282266
R2 score for Lasso_flights_dep_ontime: 0.6381976521098794
R2 score for Lasso_flights_dep_delay: 0.39153803558508626
R2 score for Lasso_flights_dep_cancel: 0.7974032012874714
R2 score for Lasso_flights_cancel_pct: 0.8045508986693678
R2 score for Lasso_flights_delay_pct: 0.2476434187485096
R2 score for Lasso_flights_ontime_pct: 0.5763573546788832
R2 score for Lasso_flights_arr_delay_pct: 0.22145506523234193
R2 score for Lasso_flights_arr_ontime_pct: 0.5447154882678514
R2 score for Lasso_flights_arr_cancel_pct: 0.808523550420089
R2 score for Lasso_flights_dep_delay_pct: 0.28123190013845245
R2 score for Lasso_fl

## Ridge regression

In [9]:
# Single-target baseline: shared preprocessing followed by Ridge with a fixed alpha
ridge_pipeline = make_pipeline(LR__transformer, Ridge(alpha=10))
ridge_pipeline.fit(X_train, y_train['flights_ontime'])

# Score the fitted pipeline on the validation split
y_pred_ontime = ridge_pipeline.predict(X_val)
print("R2 score:", r2_score(y_val['flights_ontime'], y_pred_ontime))

# Transformed feature names and the coefficient fitted for each of them
ridge_ontime_features = ridge_pipeline.named_steps['columntransformer'].get_feature_names_out()
ridge_ontime_coef = ridge_pipeline.named_steps['ridge'].coef_

# One row per feature, ordered by absolute coefficient (largest first),
# keeping only rows whose absolute coefficient exceeds 0.1
ridge_ontime_df = (
    pd.DataFrame({'features': ridge_ontime_features, 'coefficients': ridge_ontime_coef})
    .assign(coefficients_abs=lambda frame: frame['coefficients'].abs())
    .sort_values(by='coefficients_abs', ascending=False)
    .loc[lambda frame: frame['coefficients_abs'] > .1]
)

print("Ridge coefficients:\n", ridge_ontime_df)

R2 score: 0.6561574193569667
Ridge coefficients:
                       features  coefficients  coefficients_abs
38   cat__holiday_Thanksgiving   -176.156278        176.156278
82   num__flights_cancel_lag_1    -85.173769         85.173769
70              num__wx_maxref    -82.796434         82.796434
16   cat__day_of_week_Saturday    -66.255361         66.255361
44           cat__xmas_eve_yes    -64.502099         64.502099
..                         ...           ...               ...
25  cat__holiday_Christmas Day      1.750882          1.750882
74      num__wx_wind_direction      1.731846          1.731846
46      cat__new_years_eve_yes      0.767014          0.767014
45       cat__new_years_eve_no     -0.767014          0.767014
68                 num__wx_vis      0.721992          0.721992

[89 rows x 3 columns]


## Ridge regression on all targets using grid search CV to tune alpha

In [10]:
# Candidate regularization strengths for the Ridge step
param_grid = {'ridge__alpha': [.01, .1, 1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}

# GridSearchCV clones ridge_pipeline before fitting, so the single-target
# pipeline fitted earlier is not modified by the search.
grid_search = GridSearchCV(
    ridge_pipeline,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1, # n_jobs=-1 means use all available CPU cores
    verbose=0
    )

# Tuned pipeline per target, keyed as "Ridge_<target>" (same layout as the
# lasso_models / elastic_net_models dictionaries).
ridge_models = {}

for target in y.columns.tolist():
    grid_search.fit(X_train, y_train[target])
    ridge_models[f"Ridge_{target}"] = grid_search.best_estimator_
    # Also exposed as a global named Ridge_<target>: the Ridge scoring cells
    # look the tuned models up through globals().
    globals()[f"Ridge_{target}"] = ridge_models[f"Ridge_{target}"]
    print(f"Best parameters for Ridge_{target}:\n{grid_search.best_params_}")


Best parameters for Ridge_flights_total:
{'ridge__alpha': 0.1}
Best parameters for Ridge_flights_cancel:
{'ridge__alpha': 100}
Best parameters for Ridge_flights_delay:
{'ridge__alpha': 10}
Best parameters for Ridge_flights_ontime:
{'ridge__alpha': 1}
Best parameters for Ridge_flights_arr_ontime:
{'ridge__alpha': 1}
Best parameters for Ridge_flights_arr_delay:
{'ridge__alpha': 10}
Best parameters for Ridge_flights_arr_cancel:
{'ridge__alpha': 100}
Best parameters for Ridge_flights_dep_ontime:
{'ridge__alpha': 1}
Best parameters for Ridge_flights_dep_delay:
{'ridge__alpha': 10}
Best parameters for Ridge_flights_dep_cancel:
{'ridge__alpha': 100}
Best parameters for Ridge_flights_cancel_pct:
{'ridge__alpha': 100}
Best parameters for Ridge_flights_delay_pct:
{'ridge__alpha': 20}
Best parameters for Ridge_flights_ontime_pct:
{'ridge__alpha': 30}
Best parameters for Ridge_flights_arr_delay_pct:
{'ridge__alpha': 30}
Best parameters for Ridge_flights_arr_ontime_pct:
{'ridge__alpha': 30}
Best pa

In [11]:
# Get R2 scores for Ridge regression
models = {}
predictions = {}

# Collect the tuned pipelines (preprocessing + Ridge) that the grid-search cell
# registered as globals named Ridge_<target>. A bare Ridge() cannot be fitted
# on X_train directly because it still contains unencoded categorical columns.
for target in y.columns.tolist():
    models[f"Ridge_{target}"] = globals()[f"Ridge_{target}"]

for target in y.columns.tolist():
    predictions[f"y_pred_{target}"] = models[f"Ridge_{target}"].predict(X_val)
    # Keep the y_pred_<target> globals available for any cell that reads them
    globals()[f"y_pred_{target}"] = predictions[f"y_pred_{target}"]
    print(f"R2 score for Ridge_{target}: {r2_score(y_val[target], predictions[f'y_pred_{target}'])}")

R2 score for Ridge_flights_total: 0.9151877612263515


R2 score for Ridge_flights_cancel: 0.818514495419842
R2 score for Ridge_flights_delay: 0.35007346641511805
R2 score for Ridge_flights_ontime: 0.6576534831096172
R2 score for Ridge_flights_arr_ontime: 0.6660449773742638
R2 score for Ridge_flights_arr_delay: 0.30389208037446114
R2 score for Ridge_flights_arr_cancel: 0.8341571188025663
R2 score for Ridge_flights_dep_ontime: 0.6392973214567113
R2 score for Ridge_flights_dep_delay: 0.3945615774886526
R2 score for Ridge_flights_dep_cancel: 0.7895087731546115
R2 score for Ridge_flights_cancel_pct: 0.8092271742765116
R2 score for Ridge_flights_delay_pct: 0.2581214024444216
R2 score for Ridge_flights_ontime_pct: 0.5729818307428827
R2 score for Ridge_flights_arr_delay_pct: 0.23283859083345892
R2 score for Ridge_flights_arr_ontime_pct: 0.5404683797598525
R2 score for Ridge_flights_arr_cancel_pct: 0.8177503796239907
R2 score for Ridge_flights_dep_delay_pct: 0.28822744344601947
R2 score for Ridge_flights_dep_ontime_pct: 0.5923636010440227
R2 score 

In [15]:
# Get MAE scores for Ridge regression
# The tuned Ridge pipelines are read from the Ridge_<target> globals, and the
# validation predictions are written back to the y_pred_<target> globals.
for target in y.columns.tolist():
    pred_name = f"y_pred_{target}"
    ridge_model = globals()[f"Ridge_{target}"]
    globals()[pred_name] = ridge_model.predict(X_val)
    ridge_mae = mean_absolute_error(y_val[target], globals()[pred_name])
    print(f"MAE score for Ridge_{target}: {ridge_mae}")

MAE score for Ridge_flights_total: 61.29455178164893
MAE score for Ridge_flights_cancel: 31.497056062823084
MAE score for Ridge_flights_delay: 96.7885015784389
MAE score for Ridge_flights_ontime: 125.96024053129943
MAE score for Ridge_flights_arr_ontime: 62.57233555122675
MAE score for Ridge_flights_arr_delay: 49.9781958626491
MAE score for Ridge_flights_arr_cancel: 14.8164764791897
MAE score for Ridge_flights_dep_ontime: 64.88053476094524
MAE score for Ridge_flights_dep_delay: 49.68656515818731
MAE score for Ridge_flights_dep_cancel: 17.056655567857643
MAE score for Ridge_flights_cancel_pct: 2.036425957410606
MAE score for Ridge_flights_delay_pct: 5.40532495414463
MAE score for Ridge_flights_ontime_pct: 6.245727429623
MAE score for Ridge_flights_arr_delay_pct: 5.595246585487878
MAE score for Ridge_flights_arr_ontime_pct: 6.448872649783835
MAE score for Ridge_flights_arr_cancel_pct: 1.931121884292273
MAE score for Ridge_flights_dep_delay_pct: 5.576355484978591
MAE score for Ridge_fligh

In [26]:
# ElasticNet is imported in the notebook's top import cell together with Lasso and Ridge.

# ElasticNet pipeline. alpha and l1_ratio here are placeholders that the grid
# search overrides. max_iter is raised above scikit-learn's default of 1000
# because the coordinate-descent solver emitted repeated convergence warnings.
elastic_net_pipeline = make_pipeline(
    LR__transformer,
    ElasticNet(alpha=10, l1_ratio=0.5, max_iter=10_000))

# Grid search for ElasticNet
param_grid = {'elasticnet__alpha': [.01, .1, 1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
              'elasticnet__l1_ratio': [.1, .3, .5, .7, .9]}

grid_search = GridSearchCV(
    elastic_net_pipeline,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1, # n_jobs=-1 means use all available CPU cores
    verbose=0
    )

# Tuned pipeline per target, keyed as "elastic_net_<target>"
elastic_net_models = {}

for target in y.columns.tolist():
    grid_search.fit(X_train, y_train[target])
    elastic_net_models[f"elastic_net_{target}"] = grid_search.best_estimator_
    # Report the selected hyper-parameters, as the Ridge grid-search cell does
    print(f"Best parameters for elastic_net_{target}:\n{grid_search.best_params_}")

Best parameters for elastic_net_flights_total:
{'elasticnet__alpha': 0.01, 'elasticnet__l1_ratio': 0.9}


  model = cd_fast.enet_coordinate_descent(


Best parameters for elastic_net_flights_cancel:
{'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.1}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best parameters for elastic_net_flights_delay:
{'elasticnet__alpha': 0.01, 'elasticnet__l1_ratio': 0.3}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best parameters for elastic_net_flights_ontime:
{'elasticnet__alpha': 0.01, 'elasticnet__l1_ratio': 0.9}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best parameters for elastic_net_flights_arr_ontime:
{'elasticnet__alpha': 0.01, 'elasticnet__l1_ratio': 0.9}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best parameters for elastic_net_flights_arr_delay:
{'elasticnet__alpha': 0.01, 'elasticnet__l1_ratio': 0.1}
Best parameters for elastic_net_flights_arr_cancel:
{'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.1}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best parameters for elastic_net_flights_dep_ontime:
{'elasticnet__alpha': 0.01, 'elasticnet__l1_ratio': 0.9}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best parameters for elastic_net_flights_dep_delay:
{'elasticnet__alpha': 0.01, 'elasticnet__l1_ratio': 0.5}


  model = cd_fast.enet_coordinate_descent(


Best parameters for elastic_net_flights_dep_cancel:
{'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.1}
Best parameters for elastic_net_flights_cancel_pct:
{'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.3}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best parameters for elastic_net_flights_delay_pct:
{'elasticnet__alpha': 0.01, 'elasticnet__l1_ratio': 0.7}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best parameters for elastic_net_flights_ontime_pct:
{'elasticnet__alpha': 0.01, 'elasticnet__l1_ratio': 0.1}


  model = cd_fast.enet_coordinate_descent(


Best parameters for elastic_net_flights_arr_delay_pct:
{'elasticnet__alpha': 0.01, 'elasticnet__l1_ratio': 0.3}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best parameters for elastic_net_flights_arr_ontime_pct:
{'elasticnet__alpha': 0.01, 'elasticnet__l1_ratio': 0.1}
Best parameters for elastic_net_flights_arr_cancel_pct:
{'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.3}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best parameters for elastic_net_flights_dep_delay_pct:
{'elasticnet__alpha': 0.01, 'elasticnet__l1_ratio': 0.7}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best parameters for elastic_net_flights_dep_ontime_pct:
{'elasticnet__alpha': 0.01, 'elasticnet__l1_ratio': 0.1}
Best parameters for elastic_net_flights_dep_cancel_pct:
{'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.3}


In [27]:
# Validation-set predictions of the tuned ElasticNet pipelines, keyed as "y_pred_<target>"
elastic_net_predictions = {}

# Get R2 scores for ElasticNet regression
for target in y.columns.tolist():
    tuned_model = elastic_net_models[f'elastic_net_{target}']
    val_pred = tuned_model.predict(X_val)
    elastic_net_predictions[f'y_pred_{target}'] = val_pred
    val_r2 = r2_score(y_val[target], val_pred)
    print(f"R2 score for ElasticNet_{target}: {val_r2}")

R2 score for ElasticNet_flights_total: 0.9212738567722281
R2 score for ElasticNet_flights_cancel: 0.8176442739314613
R2 score for ElasticNet_flights_delay: 0.34997013226812146
R2 score for ElasticNet_flights_ontime: 0.6583849828508679
R2 score for ElasticNet_flights_arr_ontime: 0.6675301310622083
R2 score for ElasticNet_flights_arr_delay: 0.30517269769528454
R2 score for ElasticNet_flights_arr_cancel: 0.8344536556158904
R2 score for ElasticNet_flights_dep_ontime: 0.639379510449414
R2 score for ElasticNet_flights_dep_delay: 0.39398289604738035
R2 score for ElasticNet_flights_dep_cancel: 0.7877726817877178
R2 score for ElasticNet_flights_cancel_pct: 0.8104453670693246
R2 score for ElasticNet_flights_delay_pct: 0.2536369027644553
R2 score for ElasticNet_flights_ontime_pct: 0.567183323471081
R2 score for ElasticNet_flights_arr_delay_pct: 0.2273549642330861
R2 score for ElasticNet_flights_arr_ontime_pct: 0.5326997167921952
R2 score for ElasticNet_flights_arr_cancel_pct: 0.8193519616023064
R