In [None]:
# Import packages relevant for all models
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor

# Read the modeling dataset from the local data/ directory.
# index_col=False asks pandas not to promote the first CSV column to the index.
modeling_df = pd.read_csv(
    filepath_or_buffer='data/weather_with_avg_hourly_flight_delay.csv',
    index_col=False,
)

In [None]:
# Split the modeling data into training (80%) and testing (20%) subsets.
#
# The feature matrix is every column except the ones listed below; the
# outcome is `avg_delay`, which is therefore also removed from the features.
# NOTE(review): the other dropped columns are presumably identifiers,
# non-numeric fields, or values that would leak the outcome - confirm.
non_feature_columns = [
    'DATE',
    'HourlySkyConditions',
    'REPORT_TYPE',
    'join_time',
    'actual_weather_delay',
    'avg_delay',
]

# A fixed random_state makes the split (and every metric computed from it)
# identical across "Restart Kernel -> Run All" executions.
train_features, test_features, train_outcome, test_outcome = train_test_split(
    modeling_df.drop(columns=non_feature_columns),
    modeling_df.avg_delay,
    test_size=0.20,
    random_state=42
)

# Explore sizes of resulting training and testing sets
print('Training features shape: ' + str(train_features.shape))
print('Testing features shape: ' + str(test_features.shape))
print('Training outcomes shape: ' + str(train_outcome.shape))
print('Testing outcomes shape: ' + str(test_outcome.shape))

# List the names of the columns that are used as features
print(list(train_features))

In [None]:
# K-nearest-neighbours regression: scale -> select features -> KNN,
# tuned with a cross-validated grid search and evaluated on the test set.

# construct pipeline
pipe_knn = make_pipeline(
    MinMaxScaler(), # used to normalize data onto a similar scale
    # used to filter out features that add noise; f_regression is the
    # univariate scorer for a continuous outcome (the SelectPercentile default,
    # f_classif, is meant for classification targets)
    SelectPercentile(score_func=f_regression),
    KNeighborsRegressor()
)

# create the parameter grid for hyperparameter tuning
param_grid_knn = {
    'selectpercentile__percentile':range(10, 30, 5), # what upper percentile of features to take (10, 15, 20, 25)
    'kneighborsregressor__n_neighbors':range(1, 20), # the number of neighbors to take (1..19)
    'kneighborsregressor__weights':["uniform", "distance"] # how to weight the connections between neighbors
}

# perform grid search of pipeline
knn_grid = GridSearchCV(pipe_knn, param_grid_knn)

# use results to create model on training data
knn_grid.fit(train_features, train_outcome)

# find the best parameters from the grid search
knn_best_params = knn_grid.best_params_

# find the score of our model on the test data
# (no `scoring` was given, so this is the pipeline's default score)
knn_grid_score = knn_grid.score(test_features, test_outcome)

# predict once and reuse the predictions for every metric below
knn_test_predictions = knn_grid.predict(test_features)

# find the mean absolute error of our model on the test data
# (scikit-learn metrics take the true values first, then the predictions)
knn_mae = mean_absolute_error(test_outcome, knn_test_predictions)

# find the explained variance score of our model on the test data
# (argument order matters here: the score is normalised by the variance of
# the first argument, which must be the true outcomes)
knn_evs = explained_variance_score(test_outcome, knn_test_predictions)

In [None]:
# Decision-tree regression: scale -> select features -> tree,
# tuned with a cross-validated grid search and evaluated on the test set.

# construct pipeline
pipe_dt = make_pipeline(
    MinMaxScaler(), # used to normalize data onto a similar scale
    # used to filter out features that add noise; f_regression is the
    # univariate scorer for a continuous outcome (the SelectPercentile default,
    # f_classif, is meant for classification targets)
    SelectPercentile(score_func=f_regression),
    # fixed seed so the random feature sub-sampling done for
    # max_features="sqrt"/"log2" is reproducible between runs
    DecisionTreeRegressor(random_state=0)
)

# create the parameter grid for hyperparameter tuning
param_grid_dt = {
    'selectpercentile__percentile':range(5, 30, 5), # what upper percentile of features to take (5, 10, 15, 20, 25)
    # the number of features to consider when splitting; "auto" is omitted
    # because for DecisionTreeRegressor it was an alias for None (all
    # features) and is rejected by scikit-learn >= 1.3
    'decisiontreeregressor__max_features':["sqrt", "log2", None],
    'decisiontreeregressor__max_depth':range(1, 10), # maximum depth of the decision tree (1..9)
    'decisiontreeregressor__min_samples_leaf':range(1, 4) # minimum number of samples required to be at a leaf node (1..3)
}

# perform grid search of pipeline
dt_grid = GridSearchCV(pipe_dt, param_grid_dt)

# use results to create model on training data
dt_grid.fit(train_features, train_outcome)

# find the best parameters from the grid search
dt_best_params = dt_grid.best_params_

# find the score of our model on the test data
# (no `scoring` was given, so this is the pipeline's default score)
dt_grid_score = dt_grid.score(test_features, test_outcome)

# predict once and reuse the predictions for every metric below
dt_test_predictions = dt_grid.predict(test_features)

# find the mean absolute error of our model on the test data
# (scikit-learn metrics take the true values first, then the predictions)
dt_mae = mean_absolute_error(test_outcome, dt_test_predictions)

# find the explained variance score of our model on the test data
# (argument order matters here: the score is normalised by the variance of
# the first argument, which must be the true outcomes)
dt_evs = explained_variance_score(test_outcome, dt_test_predictions)

In [None]:
# Report the test-set metrics of both models, one metric per pair of lines
# (KNN first, then Decision Tree) so the two models can be compared directly.
metric_report = (
    ('KNN R-squared: ', knn_grid_score),
    ('Decision Tree R-squared: ', dt_grid_score),
    ('KNN Mean Absolute Error: ', knn_mae),
    ('Decision Tree Mean Absolute Error: ', dt_mae),
    ('KNN Explained Variance Score: ', knn_evs),
    # label previously read "Decision Explained Variance Score"
    ('Decision Tree Explained Variance Score: ', dt_evs),
)
for metric_label, metric_value in metric_report:
    print(metric_label + str(metric_value))

In [None]:
# Show the winning hyperparameter combination of each grid search.
# end='\n\n' leaves one blank line after each entry.
print('Best KNN hyperparameters: ', knn_best_params, sep='', end='\n\n')
print('Best Decision Tree hyperparameters: ', dt_best_params, sep='', end='\n\n')

In [None]:
# Predicted vs. actual delay on the test set for both tuned models.
# (matplotlib.pyplot is already imported as `plt` in the notebook's import cell.)

# Predict once per model and reuse the arrays below.
dt_test_predictions = dt_grid.predict(test_features)
knn_test_predictions = knn_grid.predict(test_features)

# End points of the y = x reference line, chosen so that it covers both the
# actual and the predicted delays. Two points are enough for a straight line;
# there is no need to draw it through every (unsorted) test sample.
diagonal_min = min(test_outcome.min(), dt_test_predictions.min(), knn_test_predictions.min())
diagonal_max = max(test_outcome.max(), dt_test_predictions.max(), knn_test_predictions.max())

fig, ax = plt.subplots()
ax.scatter(dt_test_predictions, test_outcome, alpha=0.5, label='DecisionTree')
ax.scatter(knn_test_predictions, test_outcome, alpha=0.5, label='KNN')
# points on this line are perfect predictions
ax.plot([diagonal_min, diagonal_max], [diagonal_min, diagonal_max], label='y=x', c='green', alpha=0.2)
ax.legend()
ax.set_ylabel('Actual Delay')
ax.set_xlabel('Predicted Delay')
ax.set_title('Predicted v.s. Actual Delay')
plt.show()