In [36]:
!pip install stumpy
!pip install xgboost



In [37]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.io as pio
pio.templates.default = "plotly_white+xgridoff"

In [67]:
import os
print(os.listdir())

df = pd.read_csv('model/cs551.csv', parse_dates=['date_time'])
df['weekday'] = df.date_time.map(lambda x: x.weekday())
df['year'] = df.date_time.map(lambda x: x.year)
df['month'] = df.date_time.map(lambda x: x.month)

['backend.py', 'model.pkl', 'model', 'app.py', '.idea']


In [63]:
pio.templates

Templates configuration
-----------------------
    Default template: 'plotly_white+xgridoff'
    Available templates:
        ['ggplot2', 'seaborn', 'simple_white', 'plotly',
         'plotly_white', 'plotly_dark', 'presentation', 'xgridoff',
         'ygridoff', 'gridon', 'none']

In [40]:
df.corr()

Unnamed: 0,temp,rain_1h,snow_1h,clouds_all,traffic_volume,weekday,year,month
temp,1.0,0.009069,-0.019755,-0.101976,0.130299,-0.007708,0.134945,0.223738
rain_1h,0.009069,1.0,-9e-05,0.004818,0.004714,-0.00692,0.000442,0.001298
snow_1h,-0.019755,-9e-05,1.0,0.027931,0.000733,-0.014928,-0.003519,0.020412
clouds_all,-0.101976,0.004818,0.027931,1.0,0.067054,-0.039715,-0.072861,-0.009133
traffic_volume,0.130299,0.004714,0.000733,0.067054,1.0,-0.149544,0.004753,-0.002533
weekday,-0.007708,-0.00692,-0.014928,-0.039715,-0.149544,1.0,-0.012313,0.010741
year,0.134945,0.000442,-0.003519,-0.072861,0.004753,-0.012313,1.0,-0.158688
month,0.223738,0.001298,0.020412,-0.009133,-0.002533,0.010741,-0.158688,1.0


In [68]:
avgs = df.groupby('holiday').traffic_volume.mean()
fig = px.bar(avgs.sort_values(ascending=False))
fig.update_layout(yaxis_title='Traffic Volume', xaxis_title='Holiday')
fig.show()

In [69]:
fig = px.bar(df.groupby('weekday').traffic_volume.mean())
fig.update_layout(yaxis_title='Traffic Volume', xaxis_title='Weekday', 
                  xaxis={
                      'ticktext': ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'],
                      'tickvals': [0, 1, 2, 3, 4, 5, 6],
                      'tickmode': 'array'
                  })
fig.show()

In [70]:
fig = px.bar(df.groupby('month').traffic_volume.mean())
fig.update_layout(yaxis_title='Traffic Volume', xaxis_title='Month', 
                  xaxis={
                      'ticktext': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
                      'tickvals': np.arange(1, 13),
                      'tickmode': 'array'
                  })
fig.show()

In [71]:
fig = px.bar(df.groupby('year').traffic_volume.mean())
fig.update_layout(yaxis_title='Traffic Volume', xaxis_title='Year')
fig.show()

In [72]:
temp = pd.get_dummies(df.weekday)
temp.rename(columns=dict(zip(np.arange(7), ['sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday'])), inplace=True)
df = df.join(temp)

In [73]:
df['is_holiday'] = df.holiday.map(lambda x: x != 'None')
df[df.is_holiday][['holiday', 'is_holiday']]

Unnamed: 0,holiday,is_holiday
126,Columbus Day,True
1123,Veterans Day,True
1370,Thanksgiving Day,True
2360,Christmas Day,True
2559,New Years Day,True
...,...,...
44441,Memorial Day,True
45547,Independence Day,True
46936,State Fair,True
47330,Labor Day,True


In [74]:
def random_day():
    base = pd.to_datetime('2012-10-03 00:00:00')
    days_to_add = np.random.randint(0, 2187)
    start = base + pd.Timedelta(days=days_to_add)
    end = start + pd.Timedelta(days=1)
    fig = px.line(df[(df.date_time > start) & (df.date_time < end)], x='date_time', y='traffic_volume')
    mapper = dict(zip(np.arange(7), ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']))
    fig.update_layout(
        font=dict(size=20),
        yaxis_range=[0, 6000],
        xaxis_title=mapper[start.weekday()]
    )
    return fig

In [75]:
random_day()

In [76]:
df.weather_main.value_counts()

Clouds          15164
Clear           13391
Mist             5950
Rain             5672
Snow             2876
Drizzle          1821
Haze             1360
Thunderstorm     1034
Fog               912
Smoke              20
Squall              4
Name: weather_main, dtype: int64

In [77]:
fig = px.bar(df.groupby('weather_main').traffic_volume.mean())
fig.update_layout(yaxis_title='Traffic Volume', xaxis_title='Weather Type')
fig.show()

In [78]:
fig = px.line(df.groupby('clouds_all').traffic_volume.mean())
fig.update_layout(yaxis_title='Average Traffic Volume', xaxis_title='Cloud Density')
fig.show()

In [79]:
def tod(x):
    if x < 5:
        return 'early_morning'
    elif x < 9:
        return 'morning'
    elif x < 12:
        return 'late_morning'
    elif x < 15:
        return 'afternoon'
    elif x < 18:
        return 'evening'
    else:
        return 'night'

time_of_day = df.date_time.map(lambda x: tod(x.hour))
temp = pd.get_dummies(time_of_day)
df = df.join(temp)

In [80]:
df

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume,weekday,...,thursday,friday,saturday,is_holiday,afternoon,early_morning,evening,late_morning,morning,night
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545,1,...,0,0,0,False,0,0,0,1,0,0
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516,1,...,0,0,0,False,0,0,0,1,0,0
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767,1,...,0,0,0,False,0,0,0,1,0,0
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026,1,...,0,0,0,False,1,0,0,0,0,0
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918,1,...,0,0,0,False,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48199,,283.45,0.0,0.0,75,Clouds,broken clouds,2018-09-30 19:00:00,3543,6,...,0,0,1,False,0,0,0,0,0,1
48200,,282.76,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 20:00:00,2781,6,...,0,0,1,False,0,0,0,0,0,1
48201,,282.73,0.0,0.0,90,Thunderstorm,proximity thunderstorm,2018-09-30 21:00:00,2159,6,...,0,0,1,False,0,0,0,0,0,1
48202,,282.09,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 22:00:00,1450,6,...,0,0,1,False,0,0,0,0,0,1


In [81]:
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
#
# df[['temp', 'clouds_all']] = scaler.fit_transform(df[['temp', 'clouds_all']])
df['is_winter'] = df.month.map(lambda x: x in [1, 2, 11, 12])
df

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume,weekday,...,friday,saturday,is_holiday,afternoon,early_morning,evening,late_morning,morning,night,is_winter
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545,1,...,0,0,False,0,0,0,1,0,0,False
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516,1,...,0,0,False,0,0,0,1,0,0,False
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767,1,...,0,0,False,0,0,0,1,0,0,False
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026,1,...,0,0,False,1,0,0,0,0,0,False
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918,1,...,0,0,False,1,0,0,0,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48199,,283.45,0.0,0.0,75,Clouds,broken clouds,2018-09-30 19:00:00,3543,6,...,0,1,False,0,0,0,0,0,1,False
48200,,282.76,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 20:00:00,2781,6,...,0,1,False,0,0,0,0,0,1,False
48201,,282.73,0.0,0.0,90,Thunderstorm,proximity thunderstorm,2018-09-30 21:00:00,2159,6,...,0,1,False,0,0,0,0,0,1,False
48202,,282.09,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 22:00:00,1450,6,...,0,1,False,0,0,0,0,0,1,False


In [82]:
features = ['temp', 'clouds_all'] + list(df.columns[12:]) + ['traffic_volume']
feature_df = df[features]
feature_df

Unnamed: 0,temp,clouds_all,sunday,monday,tuesday,wednesday,thursday,friday,saturday,is_holiday,afternoon,early_morning,evening,late_morning,morning,night,is_winter,traffic_volume
0,288.28,40,0,1,0,0,0,0,0,False,0,0,0,1,0,0,False,5545
1,289.36,75,0,1,0,0,0,0,0,False,0,0,0,1,0,0,False,4516
2,289.58,90,0,1,0,0,0,0,0,False,0,0,0,1,0,0,False,4767
3,290.13,90,0,1,0,0,0,0,0,False,1,0,0,0,0,0,False,5026
4,291.14,75,0,1,0,0,0,0,0,False,1,0,0,0,0,0,False,4918
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48199,283.45,75,0,0,0,0,0,0,1,False,0,0,0,0,0,1,False,3543
48200,282.76,90,0,0,0,0,0,0,1,False,0,0,0,0,0,1,False,2781
48201,282.73,90,0,0,0,0,0,0,1,False,0,0,0,0,0,1,False,2159
48202,282.09,90,0,0,0,0,0,0,1,False,0,0,0,0,0,1,False,1450


In [85]:
from sklearn.model_selection import train_test_split
feature_df['is_holiday'] = feature_df['is_holiday'].astype(int)
feature_df['is_winter'] = feature_df['is_winter'].astype(int)
feature_df.to_csv('./training_data.csv', index=False)

X = feature_df[features[:-1]]
y = feature_df.traffic_volume
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
print(f'Train size: {len(X_train)} hours\nTest Size: {len(X_test)} hours')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Train size: 38563 hours
Test Size: 9641 hours


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
lr = LinearRegression()
lr.fit(X_train, y_train)
preds = lr.predict(X_test)
result_df = pd.DataFrame({'Predicted': preds, 'True': y_test})
result_df.sort_values(by='True', inplace=True)
result_df.reset_index(drop=True, inplace=True)
fig = px.line(result_df)
fig.update_layout(
    xaxis_title='',
    yaxis_title='Traffic Volume')
result_df['error'] = result_df.Predicted - result_df['True']
result_df['abs_error'] = np.abs(result_df.error)
result_df['squared_error'] = result_df.error ** 2
print(f'MAE: {result_df.abs_error.mean()}')
print(f'RMSE: {np.sqrt(np.mean(result_df.squared_error))}')
fig.show()

In [None]:
from sklearn.linear_model import Ridge
rr = Ridge(alpha=5)
rr.fit(X_train, y_train)
preds = rr.predict(X_test)
result_df = pd.DataFrame({'Predicted': preds, 'True': y_test})
result_df.sort_values(by='True', inplace=True)
result_df.reset_index(drop=True, inplace=True)
fig = px.line(result_df)
fig.update_layout(
    xaxis_title='',
    yaxis_title='Traffic Volume')
result_df['error'] = result_df.Predicted - result_df['True']
result_df['abs_error'] = np.abs(result_df.error)
result_df['squared_error'] = result_df.error ** 2
print(f'MAE: {result_df.abs_error.mean()}')
print(f'RMSE: {np.sqrt(np.mean(result_df.squared_error))}')
fig.show()

In [None]:
importance = dict(zip(X_train.columns, list(rr.coef_)))
fig = px.bar(x=importance.keys(), y=importance.values())
fig.update_layout(xaxis_title='feature', yaxis_title='coefficient')

In [None]:
import xgboost as xgb
gbr = xgb.XGBRegressor(n_estimators=100, max_depth=25, learning_rate=.1)
gbr.fit(X_train, y_train)
preds = gbr.predict(X_test)
result_df = pd.DataFrame({'Predicted': preds, 'True': y_test})
result_df.sort_values(by='True', inplace=True)
result_df.reset_index(drop=True, inplace=True)
fig = px.line(result_df)
fig.update_layout(
    xaxis_title='',
    yaxis_title='Traffic Volume')
result_df['error'] = result_df.Predicted - result_df['True']
result_df['abs_error'] = np.abs(result_df.error)
result_df['squared_error'] = result_df.error ** 2
print(f'MAE: {result_df.abs_error.mean()}')
print(f'RMSE: {np.sqrt(np.mean(result_df.squared_error))}')
fig.show()

In [None]:
importance = dict(zip(X_train.columns, list(gbr.feature_importances_)))
fig = px.bar(x=importance.keys(), y=importance.values())
fig.update_layout(xaxis_title='feature', yaxis_title='importance')

In [None]:
from scipy.spatial.distance import cdist

def knn_predict(*kwargs, neighbors=10):
    neighbors_x = X_train.copy()
    neighbor_y = y_train.copy()
    neighbor_df = neighbors_x.join(neighbor_y)
    x=pd.DataFrame(dict(zip(neighbor_df.columns[:-1], list(kwargs))), index=[0])
    neighbor_df['distance'] = cdist(x, neighbor_df[neighbor_df.columns[:-1]])[0]
    neighbor_df.sort_values(by='distance', ascending=True, inplace=True)
    neighbor_df.reset_index(drop=True, inplace=True)
    closest = neighbor_df[:neighbors]
    return closest.traffic_volume.mean()

tqdm.pandas()
preds = X_test.progress_apply(lambda x: knn_predict(*x.values, 69), axis=1)

In [None]:
result_df = pd.DataFrame({'Predicted': preds, 'True': y_test})
result_df.sort_values(by='True', inplace=True)
result_df.reset_index(drop=True, inplace=True)
fig = px.line(result_df)
fig.update_layout(
    xaxis_title='',
    yaxis_title='Traffic Volume')
result_df['error'] = result_df.Predicted - result_df['True']
result_df['abs_error'] = np.abs(result_df.error)
result_df['squared_error'] = result_df.error ** 2
print(f'MAE: {result_df.abs_error.mean()}')
print(f'RMSE: {np.sqrt(np.mean(result_df.squared_error))}')
fig.show()

In [None]:
!pip install tensorflow
!pip install -q -U keras-tuner
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
import keras_tuner as kt

def build_model(hp):
    model = keras.Sequential(
        [
            layers.Input(shape=[17]),
            layers.Dense(hp.Int('Dense_1', min_value=17, max_value=68, step=17), activation='relu'),
            layers.Dense(hp.Int('Dense_2', min_value=4, max_value=16, step=4), activation='relu'),
            layers.Dense(2, activation='relu'),
            layers.Dense(1, activation='relu')
        ]
    )
    model.build()
    model.compile(
        optimizer=keras.optimizers.Adam(
            learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
        loss='mean_squared_error',
        metrics=['mean_absolute_error', 'root_mean_squared_error']
    )
    return model

tuner2 = kt.Hyperband(build_model,
                    objective='val_loss',
                    max_epochs=100,
                    factor=3,
                    directory='./new/')

tuner2.search(X_train, y_train, epochs=50, validation_split=.2)

In [None]:
from matplotlib import pyplot as plt
from IPython.display import clear_output

class PlotLosses(keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.i = 0
        self.x = []
        self.losses = []
        self.val_losses = []
        
        self.fig = plt.figure()
        
        self.logs = []

    def on_epoch_end(self, epoch, logs={}):
        
        self.logs.append(logs)
        self.x.append(self.i)
        self.losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))
        self.i += 1
        
        clear_output(wait=True)
        plt.plot(self.x, self.losses, label="loss")
        plt.plot(self.x, self.val_losses, label="val_loss")
        plt.legend()
        plt.show();
        
plot_losses = PlotLosses()

In [None]:
model = keras.Sequential(
        [
            layers.Input(shape=[17]),
            layers.Dense(500, activation='relu'),
            layers.Dense(250, activation='relu'),
            layers.Dense(50, activation='relu'),
            layers.Dense(25, activation='relu'),
            layers.Dense(1)
        ]
)
model.summary()
model.build(None)
model.compile(
    optimizer=keras.optimizers.Adam(
        learning_rate=.0005),
    loss='mean_squared_error',
    metrics=['mean_absolute_error', keras.metrics.RootMeanSquaredError()]
)
model.fit(X_train, y_train, epochs=50, validation_split=.2, batch_size=1000, verbose=1, callbacks=[plot_losses])

In [None]:
preds = [i[0] for i in model.predict(X_test)]
result_df = pd.DataFrame({'Predicted': preds, 'True': y_test})
result_df.sort_values(by='True', inplace=True)
result_df.reset_index(drop=True, inplace=True)
fig = px.line(result_df)
fig.update_layout(
    xaxis_title='',
    yaxis_title='Traffic Volume')
result_df['error'] = result_df.Predicted - result_df['True']
result_df['abs_error'] = np.abs(result_df.error)
result_df['squared_error'] = result_df.error ** 2
print(f'MAE: {result_df.abs_error.mean()}')
print(f'RMSE: {np.sqrt(np.mean(result_df.squared_error))}')
fig.show()

In [None]:
!pip install tpot

In [None]:
from tpot import TPOTRegressor
tpot = TPOTRegressor(generations=2, population_size=25, verbosity=2)
tpot.fit(X_train, y_train)

In [None]:
preds = tpot.predict(X_test)
result_df = pd.DataFrame({'Predicted': preds, 'True': y_test})
result_df.sort_values(by='True', inplace=True)
result_df.reset_index(drop=True, inplace=True)
fig = px.line(result_df)
fig.update_layout(
    xaxis_title='',
    yaxis_title='Traffic Volume')
result_df['error'] = result_df.Predicted - result_df['True']
result_df['abs_error'] = np.abs(result_df.error)
result_df['squared_error'] = result_df.error ** 2
print(f'MAE: {result_df.abs_error.mean()}')
print(f'RMSE: {np.sqrt(np.mean(result_df.squared_error))}')
fig.show()

In [None]:
tpot.score(X_test, y_test)

In [None]:
tpot.export('tpot_model.py')

In [None]:
df.date_time.map(lambda x: x.strftime('%Y %m %d')).nunique() / float((df.date_time.max() - df.date_time.min()) / pd.Timedelta(days=1)) * 100

In [None]:
maes = [774, 586, 568, 515, 465]
rmses = [1029, 832, 830, 812, 770]
models = ['LR', 'DNN', 'KNN', 'XGBoost', 'TPOT']
comp_df = pd.DataFrame({'model': models, 'mae': maes, 'rmse': rmses})
fig = px.bar(comp_df, x='model', y=['mae', 'rmse'], barmode='group', text_auto=True)
fig.update_layout(xaxis_title='Model Type', yaxis_title='Error (in Vehicles)', title='Model Performance Comparison')
