# Traffic Volume Prediction

Given a set of attributes, we have to predict the traffic volume.

**There are 9 attributes, which represent the following -**

-    **date_time** - DateTime Hour of the data collected in local CST time
-    **holiday** - Categorical US National holidays plus regional holiday, Minnesota State Fair
-    **temp** - Numeric Average temp in kelvin
-    **rain_1h** - Numeric Amount in mm of rain that occurred in the hour
-    **snow_1h** - Numeric Amount in mm of snow that occurred in the hour
-    **clouds_all** - Numeric Percentage of cloud cover
-    **weather_main** - Categorical Short textual description of the current weather
-    **weather_description** - Categorical Longer textual description of the current weather
-    **traffic_volume** - Numeric Hourly I-94 ATR 301 reported westbound traffic volume (Target)

In [None]:
# import all required libraries for reading, analysing and visualizing data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt

## Data Analysis

In [None]:
# load the Metro Interstate Traffic Volume CSV into a DataFrame
train_df = pd.read_csv('../input/metro-traffic-volume/Metro_Interstate_Traffic_Volume.csv')

In [None]:
# report the (rows, columns) shape of the loaded data
print('Dataset shape: ', train_df.shape)

In [None]:
# column names, dtypes and non-null counts
train_df.info()

In [None]:
# preview the first rows of the data
train_df.head()

In [None]:
# summary statistics for every column, numeric and non-numeric alike
train_df.describe(include = 'all')

In [None]:
# number of missing values in each column
train_df.isnull().sum()

## Data Visualization

In [None]:
# convert the date_time column to datetime type so that the .dt accessor
# can be used to extract hour, month, year and weekday below
train_df['date_time'] = pd.to_datetime(train_df['date_time'])

### Time vs Traffic Volume

In [None]:
# hour of the day taken from the timestamp
train_df['time'] = train_df['date_time'].dt.hour

In [None]:
# top: number of records per hour; bottom: traffic volume against hour of day
fig, (axis1,axis2) = plt.subplots(2, 1, figsize = (20,12))
sns.countplot(x = 'time', data = train_df, ax = axis1)
sns.lineplot(x = 'time', y = 'traffic_volume', data = train_df, ax = axis2);

Thus we can infer that traffic is much higher in the morning and around noon, and gradually decreases as the night progresses.

### Month vs Traffic Volume

In [None]:
# calendar month number taken from the timestamp
train_df['month'] = train_df['date_time'].dt.month

In [None]:
# top: number of records per month; bottom: traffic volume against month
fig, (axis1,axis2) = plt.subplots(2, 1, figsize = (20,12))
sns.countplot(x = 'month', data = train_df, ax = axis1)
sns.lineplot(x = 'month', y = 'traffic_volume', data = train_df, ax = axis2);

Thus we can see that in the months when it's cold, the traffic volume is slightly lower.

### Year vs Traffic Volume

In [None]:
# calendar year taken from the timestamp
train_df['year'] = train_df['date_time'].dt.year

In [None]:
# left: number of records per year; right: traffic volume against year
fig, (axis1,axis2) = plt.subplots(1, 2, figsize = (20,6))
sns.countplot(x = 'year', data = train_df, ax = axis1)
sns.lineplot(x = 'year', y = 'traffic_volume', data = train_df, ax = axis2);

### Day vs Traffic Volume

In [None]:
# weekday name (a string) taken from the timestamp
train_df['day'] = train_df['date_time'].dt.day_name()

In [None]:
# left: number of records per weekday; right: traffic volume against weekday
fig, (axis1,axis2) = plt.subplots(1, 2, figsize = (20,6))
sns.countplot(x = 'day', data = train_df, ax = axis1)
sns.lineplot(x = 'day', y = 'traffic_volume', data = train_df, ax = axis2);

### Holiday vs Traffic Volume

In [None]:
# how often each distinct value occurs in the holiday column
train_df['holiday'].value_counts()

In [None]:
def z(x):
    """Return True when ``x`` names a holiday and False for a regular day.

    Regular days are stored in the CSV as the string 'None'. Since pandas 2.0
    ``read_csv`` treats that string as a missing value by default, so a
    regular day may arrive here either as 'None' or as NaN; both must map to
    False, otherwise every row would be flagged as a holiday.
    """
    return not (pd.isna(x) or x == 'None')
# replace each holiday entry with the boolean flag returned by z
train_df['holiday'] = train_df['holiday'].apply(z)

In [None]:
# left: number of holiday vs non-holiday records; right: traffic volume for each group
fig, (axis1,axis2) = plt.subplots(1, 2, figsize = (20,6))
sns.countplot(x = 'holiday', data = train_df, ax = axis1)
sns.barplot(x = 'holiday', y = 'traffic_volume', data = train_df, ax = axis2);

Thus from the above plots we can see that traffic on holidays is usually less than traffic on non holidays.

### Temperature vs Traffic Volume

In [None]:
# count the rows whose temperature reading is exactly 0
(train_df['temp'] == 0).sum()

Since the temperature cannot be 0 kelvin, these readings are outliers.

In [None]:
# keep only the rows with a non-zero temperature reading
train_df = train_df[train_df['temp'] != 0]

In [None]:
# traffic volume against temperature
sns.scatterplot(x = 'temp', y = 'traffic_volume', data = train_df);

### Rain vs Traffic Volume

In [None]:
# count the rows reporting more than 100 mm of rain within the hour
(train_df['rain_1h'] > 100).sum()

Therefore there is an outlier and we have to remove it.

In [None]:
# keep only the rows with less than 100 mm of hourly rain
# (note: strict '<', so a reading of exactly 100 would be dropped as well)
train_df = train_df[train_df.rain_1h < 100]

In [None]:
# traffic volume against hourly rainfall
sns.scatterplot(x = 'rain_1h', y = 'traffic_volume', data = train_df);

### Snow vs Traffic Volume

In [None]:
# traffic volume against hourly snowfall
sns.scatterplot(x = 'snow_1h', y = 'traffic_volume', data = train_df);

### Clouds vs Traffic Volume

In [None]:
# traffic volume against cloud cover
sns.scatterplot(x = 'clouds_all', y = 'traffic_volume', data = train_df);

### Short Weather Description vs Traffic Volume

In [None]:
# top: number of records per weather category; bottom: traffic volume per category
fig, (axis1,axis2) = plt.subplots(2, 1, figsize = (16,12))
sns.countplot(x = 'weather_main', data = train_df, ax = axis1)
sns.lineplot(x = 'weather_main', y = 'traffic_volume', data = train_df, ax = axis2);

### Long Weather Description vs Traffic Volume

In [None]:
# how often each detailed weather description occurs
train_df['weather_description'].value_counts()

In [None]:
# traffic volume for each detailed weather description (wide figure to fit the labels)
plt.figure(figsize = (20,6))
sns.lineplot(x = 'weather_description', y = 'traffic_volume', data = train_df);

From the values of this column we can see that it is just a longer description of what the weather_main column already contains, so we can drop it.

### Correlation between features

In [None]:
# Heatmap of pairwise correlations between the numeric/boolean columns.
# The frame still holds string and datetime columns at this point, and
# DataFrame.corr() no longer drops those silently in pandas >= 2.0 (it raises
# instead), so the numeric and bool columns are selected explicitly.
plt.figure(figsize=(8, 5))
plt.title('Correlation between features')
sns.heatmap(train_df.select_dtypes(include = ['number', 'bool']).corr(), annot = True);

We can see from the above heatmap that the features are not correlated.

## Preprocessing of data

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# drop the columns that are no longer required: date_time has been split into
# time/month/year/day above, and weather_description is a longer form of weather_main
train_df.drop(['date_time', 'weather_description'], axis = 1, inplace = True)

In [None]:
# convert values of day column to numerical format
# NOTE(review): LabelEncoder presumably numbers the labels in sorted order, so the
# codes would follow alphabetical rather than weekday order - confirm if order matters
encoder = LabelEncoder()
train_df['day'] = encoder.fit_transform(train_df['day'])

In [None]:
# subtract 242 from the temp column as there is no temperature below it
# NOTE(review): the 242 K floor comes from the author's inspection of the data - confirm
train_df['temp'] = train_df['temp'] - 242

In [None]:
# convert the values of weather_main column to numerical format
# (a fresh encoder is bound to `encoder`, replacing the one fitted on the day column)
encoder = LabelEncoder()
train_df['weather_main'] = encoder.fit_transform(train_df['weather_main'])

## Machine Learning

In [None]:
# import the required modules
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor

In [None]:
# separate the feature matrix from the target vector
X = train_df.drop(['traffic_volume'], axis = 1).values
Y = train_df['traffic_volume'].values

# hold out a validation set; the fixed seed makes the split (and therefore the
# reported scores) reproducible between runs
(X_train, X_val, Y_train, Y_val) = train_test_split(X, Y, random_state = 42)

# Scale the values
# The scaler is fitted on the training rows only, so that no statistics of the
# validation rows leak into training; the same transform is then applied to the
# validation rows and to the full matrix X.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X = scaler.transform(X)

print("X_train shape:" + str(X_train.shape))
print("Y_train shape:" + str(Y_train.shape))
print("X_val shape:" + str(X_val.shape))
print("Y_val shape:" + str(Y_val.shape))

In [None]:
# DataFrame to store the RMSE scores of various algorithms
# (one row per model name, written by evaluate_model)
results = pd.DataFrame(columns = ['RMSE'])

In [None]:
# helper function to evaluate a model
def evaluate_model(regressor, name):
    """Print validation metrics for a fitted regressor and record its RMSE.

    Uses the module-level ``X_train``/``Y_train`` and ``X_val``/``Y_val``
    splits. The RMSE on the validation split is stored in the module-level
    ``results`` frame under the row label ``name``, and a scatter plot of
    predicted against true validation values is drawn.

    Parameters
    ----------
    regressor : fitted estimator exposing ``score`` and ``predict``
    name : str
        Label used in the printed output and as the row index in ``results``.
    """
    # train and test scores (whatever regressor.score reports; R^2 for
    # scikit-learn regressors), rounded to two decimals
    train_score = round(regressor.score(X_train, Y_train), 2)
    val_score = round(regressor.score(X_val, Y_val), 2)
    # predicted output
    Y_pred = regressor.predict(X_val)
    # compute the RMSE once and reuse it for both printing and storing
    rmse = sqrt(mean_squared_error(Y_val, Y_pred))

    print(name + ' Train score: ', train_score)
    print(name + ' Test score: ', val_score)
    print('Root Mean Squared error: ', rmse)
    print('Coefficient of determination: ', r2_score(Y_val, Y_pred))

    # add the current RMSE to the scores list
    results.loc[name] = rmse

    # plot predicted vs true values; the red diagonal marks a perfect prediction
    x_points = np.linspace(0, 8e3)
    plt.figure(figsize=(12, 5))
    plt.plot(x_points, x_points, color='r')
    plt.scatter(Y_val, Y_pred)
    plt.xlabel('True Values')
    plt.ylabel('Predicted Values')
    plt.title('True Values Vs Predicted Values')

### Using Linear Regression

In [None]:
# fit a linear regression model on the training split
lireg = LinearRegression()
lireg.fit(X_train, Y_train)

In [None]:
# evaluate the Regressor: print its scores and store its RMSE in `results`
evaluate_model(lireg, 'Linear Regression')

### Using Decision Tree Regressor

In [None]:
# fit a decision tree limited to a depth of 12 on the training split
dtreg = DecisionTreeRegressor(max_depth = 12)
dtreg.fit(X_train, Y_train)

In [None]:
# evaluate the Regressor: print its scores and store its RMSE in `results`
evaluate_model(dtreg, 'Decision Tree')

### Using Random Forest Regressor

In [None]:
# n_estimators - The number of trees in the forest.
# max_depth - The maximum depth of each tree.
# min_samples_split - The minimum number of samples required to split an internal node
rfreg = RandomForestRegressor(n_estimators = 50, max_depth = 12, min_samples_split = 5)
rfreg.fit(X_train, Y_train)

In [None]:
# evaluate the Regressor: print its scores and store its RMSE in `results`
evaluate_model(rfreg, 'Random Forest')

### Using Gradient Boosting

In [None]:
# n_estimators - The number of boosting stages to perform.
# max_depth - maximum depth of the individual regression estimators.
# NOTE(review): 500 stages of depth-10 trees can take a while to fit
gbreg = GradientBoostingRegressor(n_estimators=500, max_depth=10)
gbreg.fit(X_train, Y_train)

In [None]:
# evaluate the Regressor: print its scores and store its RMSE in `results`
evaluate_model(gbreg, 'Gradient Boosting')

### Using AdaBoost

In [None]:
# n_estimators - The maximum number of boosting rounds (estimators fitted one after another).
# learning_rate - Weight applied to each regressor at each boosting iteration.
# The base learner (the depth-12 decision tree) is passed positionally because its
# keyword was renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and
# the old name was removed in 1.4; the positional form works on both.
adareg = AdaBoostRegressor(dtreg, n_estimators=50, learning_rate=0.01)
adareg.fit(X_train, Y_train)

In [None]:
# evaluate the Regressor: print its scores and store its RMSE in `results`
evaluate_model(adareg, 'Ada Boost')

### Comparison between all the above algorithms

In [None]:
# show the validation RMSE recorded for each model
results

Thus we can see that **Gradient Boosting** provides the lowest RMSE, therefore we will use it to compute the outputs.

In [None]:
# feature importances of the gradient boosting model; the x axis is the
# position of each feature in X (column order of train_df without traffic_volume)
plt.plot(gbreg.feature_importances_)

The above plot shows that time and day are the most important features.

### Neural Networks using Keras

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline

In [None]:
def nn_model(input_dim=10):
    """Build and compile the fully connected regression network.

    The network has three ReLU hidden layers (128, 256 and 256 units) and a
    single linear output unit. It is compiled with mean squared error loss and
    the Adam optimizer.

    Parameters
    ----------
    input_dim : int, default 10
        Number of input features. The default matches the ten feature columns
        used in this notebook; pass another value if the feature set changes.

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    model.add(Dense(256, kernel_initializer='normal', activation='relu'))
    model.add(Dense(256, kernel_initializer='normal', activation='relu'))
    # no activation on the output layer: the target is an unbounded numeric value
    model.add(Dense(1, kernel_initializer='normal'))

    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [None]:
# wrap the Keras model builder in a scikit-learn style regressor
# (10 epochs, mini-batches of 5 samples, silent training)
estimator = KerasRegressor(build_fn=nn_model, epochs=10, batch_size=5, verbose=0)
# NOTE(review): kfold is not used by any of the cells visible here
kfold = KFold(n_splits=10)

In [None]:
# train the neural network on the training split
estimator.fit(X_train, Y_train)

In [None]:
# predicted output of the neural network on the validation split
Y_pred_nn = estimator.predict(X_val)

# same metrics as for the other models; note the value is not added to `results`
print('Root Mean Squared error: ', sqrt(mean_squared_error(Y_val, Y_pred_nn)))
print('Coefficient of determination: ', r2_score(Y_val, Y_pred_nn))