## Simple Regression Model for Traffic Flow Prediction
This task aims to implement simple regression models (Polynomial, Decision Tree, Random Forest) to predict the traffic flow based on Mcs data and INRIX data

## 0. Setting

### 0.1 Import Libraries

In [None]:
from google.colab import drive
import os
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn


### 0.2 Mount Google drive, Set working directory

In [None]:
drive.mount('/content/gdrive')
print(os.path.abspath('.'))
root_path = '/content/gdrive/My Drive/degree_project/' 
os.chdir(root_path + 'Data')
print(os.getcwd())

NameError: ignored

## 1. Read the data from Mcs, Inrix from .csv

In [None]:
mcsNorth = pd.read_csv("OutData/mcsNorth.csv", delimiter = ';')
mcsSouth = pd.read_csv("OutData/mcsSouth.csv", delimiter= ';')
inrixNorth = pd.read_csv("OutData/inrixNorth.csv", delimiter= ';')
inrixSouth = pd.read_csv("OutData/inrixSouth.csv", delimiter= ';')

## 2. Discover the data to gain insight

### 2.1 Schema and dimension

In [None]:
mcsNorth.info()
print(" ")
mcsSouth.info()
print(" ")
inrixSouth.info()

### 2.2 Look at the data

In [None]:
mcsNorth.head(5)

In [None]:
mcsSouth.head(5)

In [None]:
inrixNorth.head(5)

### 2.3 statistical summary

In [None]:
mcsNorth.describe()

In [None]:
mcsSouth.describe()

In [None]:
inrixNorth.describe()

## 2.4 make plots

In [None]:
def plotAttr(days, dataFrameNorth, dataFrameSouth, attribute, sensor):
  """Plot one attribute against time for the north and south frames of a sensor.

  Both frames are drawn into the current figure: north in red, south in blue.
  The x axis is read from the column named 'Epoch_' + sensor.

  days: number shown in the title only; it does not select any rows.
  dataFrameNorth, dataFrameSouth: frames holding the epoch column and `attribute`.
  attribute: name of the column drawn on the y axis.
  sensor: sensor prefix, used for the epoch column name and the legend labels.
  """
  plt.style.use('default')
  epochColumn = 'Epoch_' + sensor
  traces = (
      (dataFrameNorth, 'r', 'North'),
      (dataFrameSouth, 'b', 'South'),
  )
  for frame, colour, direction in traces:
    plt.plot(frame[epochColumn], frame[attribute], colour, linewidth=0.4, label=sensor + direction)
  plt.title(attribute + ' vs. time (' + str(days) + ' days)')
  plt.xlabel('Epochs(sec)')
  plt.ylabel(attribute)
  plt.legend()
  plt.show()

In [None]:
plt.figure(figsize = (20, 7))
plotAttr(31, mcsNorth, mcsSouth, 'flow', 'mcs')

plt.figure(figsize = (20, 7))
mcsNorth4days = mcsNorth[: round(4*len(mcsNorth)/31.0)][:]
mcsSouth4days = mcsSouth[: round(4*len(mcsSouth)/31.0)][:]
plotAttr(4, mcsNorth4days, mcsSouth4days, 'flow', 'mcs')

plt.figure(figsize = (20, 7))
mcsNorth1days = mcsNorth[: round(len(mcsNorth)/31.0)][:]
mcsSouth1days = mcsSouth[: round(len(mcsSouth)/31.0)][:]
plotAttr(1, mcsNorth1days, mcsSouth1days, 'flow', 'mcs')


In [None]:
plt.figure(figsize = (20, 7))
plotAttr(31, mcsNorth, mcsSouth,'speed', 'mcs')
plt.figure(figsize = (20, 7))
plotAttr(4, mcsNorth4days, mcsSouth4days,'speed', 'mcs')
plt.figure(figsize = (20, 7))
plotAttr(1, mcsNorth1days, mcsSouth1days,'speed', 'mcs')


Note: The relation between flow and speed is non-linear. We cannot only use speed to predict flow, but need additional features to predict the flow together with speed.

In [None]:
def plotAttr2(days, dataFrame1, dataFrame2, dataFrame3, attribute, sensor1, sensor2, sensor3, loc1, loc2, loc3):
  """Plot the same attribute against time for three sensor/road-segment frames.

  Each frame supplies its own epoch column ('Epoch_' + its sensor name) and is
  drawn in a fixed colour (blue, black, green) labelled sensor + location.
  Before plotting, the last epoch entry of every frame is printed.

  days: number shown in the title only.
  """
  traces = (
      (dataFrame1, sensor1, loc1, 'b'),
      (dataFrame2, sensor2, loc2, 'k'),
      (dataFrame3, sensor3, loc3, 'g'),
  )
  # echo where each series ends so differing time coverage is visible in the output
  for frame, sensor, _loc, _colour in traces:
    print(frame['Epoch_' + sensor][-1:])
  for frame, sensor, loc, colour in traces:
    plt.plot(frame['Epoch_' + sensor], frame[attribute], linewidth=0.4, color=colour, label=sensor + loc)
  plt.title(attribute + ' vs. time (' + str(days) + ' days)')
  plt.xlabel('Epochs(sec)')
  plt.ylabel(attribute)
  plt.legend()
  plt.show()

In [None]:
inrixNorth1days = inrixNorth[: round(len(inrixNorth)/31.0)][:]
inrixSouth1days = inrixSouth[: round(len(inrixSouth)/31.0)][:]
plt.figure(figsize=(20, 7))
plotAttr2(1, mcsSouth1days, inrixNorth1days, inrixSouth1days, 'speed', 'mcs', 'inrix', 'inrix', 'South', 'North', 'South')

In [None]:
# mcsSouthHalfdays = mcsSouth[round(len(inrixNorth)/62.0): round(len(mcsSouth)/31.0)][:]
# inrixNorthHalfdays = inrixNorth[round(len(inrixNorth)/62.0): round(len(inrixNorth)/31.0)][:]
# inrixSouthHalfdays = inrixSouth[round(len(inrixNorth)/62.0): round(len(inrixSouth)/31.0)][:]
# plotAttr2(1, mcsSouthHalfdays, inrixNorthHalfdays, inrixSouthHalfdays, 'speed', 'mcs', 'inrix', 'inrix', 'South', 'Norht', 'South')

In [None]:
plt.figure(figsize=(20, 7))
plotAttr(1, inrixNorth1days, inrixSouth1days, 'travel_time_secs', 'inrix')

### 2.4.1 Contour plot of speeds in one day for 4 datasets (Inrix/Mcs/South/North)

In [None]:
def preContour(DF1, DF2, DF3, DF4, attribute1, attribute2):
  """Stack one column from each of four frames into a single 1-D array.

  DF1 and DF2 contribute column `attribute1`, DF3 and DF4 contribute column
  `attribute2`; the pieces are concatenated in that order with a fresh index.

  Returns the concatenated values as a numpy array.
  """
  # pd.concat replaces Series.append, which pandas 2.0 removed
  pieces = [DF1[attribute1], DF2[attribute1], DF3[attribute2], DF4[attribute2]]
  return pd.concat(pieces, ignore_index=True).values

In [None]:
# Contour plot for speeds in InrixSouth, InrixNorth, McsSouth, McsNorth
contour_inrix_North = inrixNorth.iloc[:1440][['Epoch_inrix', 'speed']].copy()
contour_inrix_North['type'] = 1

contour_inrix_South = inrixSouth.iloc[:1440][['Epoch_inrix', 'speed']].copy()
contour_inrix_South['type'] = 2

contour_mcs_North = mcsNorth.iloc[:1426][['Epoch_mcs', 'speed']].copy()
contour_mcs_North['type'] = 3

contour_mcs_South = mcsSouth.iloc[:1424][['Epoch_mcs', 'speed']].copy()
contour_mcs_South['type'] = 4

# Prepare X, Y, Z for contour plot
X = preContour(contour_inrix_North, contour_inrix_South, contour_mcs_North, contour_mcs_South, 'Epoch_inrix', 'Epoch_mcs')
#print(X.shape)
#print(type(X))
#print(X)

Y = preContour(contour_inrix_North, contour_inrix_South, contour_mcs_North, contour_mcs_South, 'type', 'type')
#print(Y.shape)
#print(type(Y))
#print(Y)
                                            
Z = preContour(contour_inrix_North, contour_inrix_South, contour_mcs_North, contour_mcs_South, 'speed', 'speed')
#print(Z.shape)
#print(type(Z))
#print(Z)

In [None]:
plt.figure(figsize=(12, 5))
plt.style.use('default')
plt.xlabel('Epoch(sec) in a day', fontsize = 14)
plt.ylabel('4 sensors', fontsize = 16)
plt.title('Speed Contour Plot (Original Data)', fontsize = 17)
# plt.ylim(bottom = 0.5, top = 4.5)
plt.tricontourf(X, Y, Z, 25, cmap = 'RdYlGn')
plt.colorbar()
plt.yticks(np.arange(1.0, 5.0, 1.0), ('InrixNorth', 'InrixSouth', 'McsNorth', 'McsSouth'))
loc, labels = plt.yticks()
print(loc)
print(labels)

### 2.4.2 Distribution of flow

In [None]:
 plt.figure(figsize=(7, 5))
seaborn.distplot(mcsNorth1days['flow'], label='Flow distribution(mcsNorth)')
plt.legend()

## 2.5 Correlation among attributes

### 2.5.1 Correlation of Speed and Flow within McsSouth and INRIX




In [None]:
def plotScatter(data1, data2, attribute1, attribute2, Label):
  """Scatter data2[attribute2] (y axis) against data1[attribute1] (x axis).

  The axis labels are the two attribute names, the title is
  '<attribute2>  vs. <attribute1>', and `Label` is the legend entry.
  Draws into the current figure and shows it.
  """
  xValues = data1[attribute1]
  yValues = data2[attribute2]
  plt.scatter(xValues, yValues, s=1.5, label=Label)
  plt.title(attribute2 + '  vs. ' + attribute1, fontsize=20)
  plt.xlabel(attribute1, fontsize=15)
  plt.ylabel(attribute2, fontsize=15)
  for setTicks in (plt.xticks, plt.yticks):
    setTicks(fontsize=14)
  plt.legend()
  plt.show()

In [None]:
mcsSouthCorr = mcsSouth[['speed', 'flow']].copy()
print(mcsSouthCorr.corr())

plt.figure(figsize=(7,5))
plotScatter(mcsSouth, mcsSouth, 'speed', 'flow', 'mcsSouth')

### 2.5.2 Correlation among McsNorth and McsSouth

In [None]:
spMcsNorth = mcsNorth['speed']
spMcsSouth = mcsSouth['speed']
merge = {'NorthSpeed': spMcsNorth, 'SouthSpeed': spMcsSouth}
speedDf = pd.DataFrame(merge)

speedDf.corr()

In [None]:
flMcsNorth = mcsNorth['flow']
flMcsSouth = mcsSouth['flow']
merge = {'Northflow': flMcsNorth, 'Southflow': flMcsSouth}
flowDf = pd.DataFrame(merge)
flowDf.corr()

As correlation of flow among north and south Mcs is high, it might be possible to use the regression model trained in mcs north to predict the speed/flow in mcs south. 

### 2.5.3 Correlation of Speed and travel time in INRIX

In [None]:
spInrix = inrixSouth['speed']
travelTimeInrix = inrixSouth['travel_time_secs']
merge = {'inrixSpeed': spInrix, 'inrixTravelTime': travelTimeInrix}
inrixSouthCorr = pd.DataFrame(merge)
inrixSouthCorr.corr()

High negative correlation between speed and travel time, which means they contain similar information

## 3. Prepare the data for machine learning

Polynomial linear regression model for prediction of flow in 9 days based on Epoch.

In [None]:
def makePoly(deg, X, bias = True):
  """Expand a single feature into polynomial feature columns.

  deg: highest power to generate.
  X: the feature values; any array-like (numpy array, pandas Series, list)
     that can be viewed as one column of samples.
  bias: when True the output starts with a constant column of ones.

  Returns a 2-D array with one row per sample, as produced by
  PolynomialFeatures.fit_transform.
  """
  # np.asarray lets a Series or a plain list be passed as well as an ndarray
  column = np.asarray(X).reshape(-1, 1)
  poly = PolynomialFeatures(degree=deg, include_bias=bias)
  return poly.fit_transform(column)

### 3.1 Prepare label column and feature columns




In [None]:
# prepare the feature and label columns from mcsSouth
mcsSouth9days = mcsSouth[:12856]
# X = mcsSouth9days['speed'].values
X = mcsSouth9days['Epoch_mcs'].values
Y = mcsSouth9days['flow'].values.reshape(-1, 1)
print(X.shape)
print(Y.shape)
print(X[-1:])
print(Y[:3])

# turn Epocs_mcs into polynomial features: 1, x, x^2, x^3, ...
X = makePoly(20, X)
print(X.shape)
dfRawFeatures = pd.DataFrame(X)
dfRawFeatures.describe()

### 3.2 Scaling the feature columns

In [None]:
def normalize(X, Scaler=None):
  """Scale every feature column except the first, which is rebuilt as ones.

  Column 0 of X is dropped unconditionally (callers are expected to pass a
  matrix whose first column is the bias column), the remaining columns are
  scaled, and a column of ones is put back in front.

  X: 2-D numpy array of features.
  Scaler: an already fitted scaler to reuse (only `transform` is called on it).
          When None, a new StandardScaler is fitted on this data.

  Returns (X_scaled, scaler), where scaler is the object that did the scaling.
  """
  features = np.delete(X, 0, 1)

  # identity test: `== None` would invoke the scaler's own __eq__
  if Scaler is None:
    scaler = StandardScaler()
    scaled = scaler.fit_transform(features)
  else:
    scaler = Scaler
    scaled = scaler.transform(features)

  # put the bias column back in front of the scaled features
  ones = np.ones((len(scaled), 1))
  X_scaled = np.append(ones, scaled, 1)

  return X_scaled, scaler

In [None]:
# Scale the feature columns
X, scaler1month = normalize(X)
dfScaled = pd.DataFrame(X)
dfScaled.describe()

### 3.3 Split training and testing datasets

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

## 4. Make the model

In [None]:
def model(X_train, Y_train):
  """Fit and return a LinearRegression on the given training data.

  The estimator is created with fit_intercept=False, so any intercept has to
  come from a constant column already present in X_train.
  """
  # fit() hands back the fitted estimator itself
  return LinearRegression(fit_intercept=False).fit(X_train, Y_train)

### 4.1 Build and train the regression model

In [None]:
regressor = model(X_train, Y_train)
print('Coefficients: ', regressor.coef_)

### 4.2 Evaluation of the model on training dataset 

In [None]:
# Score the fitted model on the training split
Y_predict_train = regressor.predict(X_train)
# RMSE is taken as sqrt(MSE): newer scikit-learn releases no longer accept squared=False
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_train, Y_predict_train)))
print("R2 Score: ", r2_score(Y_train, Y_predict_train))

### 4.3 Evaluation of the model on test dataset

In [None]:
# Score the fitted model on the held-out test split
Y_predict_test = regressor.predict(X_test)
# RMSE is taken as sqrt(MSE): newer scikit-learn releases no longer accept squared=False
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_test, Y_predict_test)))
print("R2 Score: ", r2_score(Y_test, Y_predict_test))

### 4.4 Plot the result of the prediction

In [None]:
def plotResult(xTrain, yTrain, yPredTrain, xTest, yTest, yPredTest, timeRange, xlabel):
  """Plot real versus predicted flow into the current figure and show it.

  With a test set (xTest is not None) four scatter layers are drawn: real and
  predicted flow for the training set, then real and predicted flow for the
  test set. Without one (xTest is None) the prediction is drawn as a line
  underneath the real values, and yTest / yPredTest are not used.

  timeRange: text appended to the title.
  xlabel: label of the x axis.
  """
  if xTest is None:
    # single data set: prediction as a line (zorder 0) beneath the observed points (zorder 10)
    plt.plot(xTrain, yPredTrain, linewidth=0.5, color='blue', label='predicted flow', zorder=0)
    plt.scatter(xTrain, yTrain, s=1.5, color='red', label='real flow', zorder=10)
  else:
    layers = (
        (xTrain, yTrain, dict(s=2, color='black', label='real flow(train_set)')),
        (xTrain, yPredTrain, dict(s=30, color='blue', marker='s', label='predicted flow(train_set)')),
        (xTest, yTest, dict(color='red', s=2, label='real flow(test_set)')),
        (xTest, yPredTest, dict(color='green', s=30, marker='s', label='predicted flow(test_set)')),
    )
    for xValues, yValues, options in layers:
      plt.scatter(xValues, yValues, **options)
  plt.title('Regression model for prediction of flow in ' + timeRange, fontsize=20)
  plt.xlabel(xlabel, fontsize=16)
  plt.ylabel('flow', fontsize=16)
  plt.xticks(fontsize=14)
  plt.yticks(fontsize=14)
  plt.legend(fontsize=15)
  plt.show()

In [None]:
# Plot real and predicted flow from X_train
plt.figure(figsize=(20, 10))
plotResult(X_train[:, 1], Y_train[:, 0], Y_predict_train[:, 0], 
           X_test[:, 1], Y_test[:, 0], Y_predict_test[:, 0], '9 days', 'epoch')


Summary for the prediction of flow based on the epoch (time) over a 9-day range: 
1. The accuracy of flow prediction over 9 days is very low (R2: 0.25, RMSE: 450) when using polynomial regression with time Epoch (x, x^2, x^3, ...) as features.
2. It is difficult to use linear regression to capture the behavior of flow over a month as it is not a linear function of time.
3. We might be able to use polynomial regression to predict the flow within a one-day range by using a high polynomial degree and get good accuracy, but it would not be a useful model. Since the flow behavior is dynamic, a model trained on one day probably will not work well on another day, i.e., it doesn't generalize well. (Proof to be continued.)

## 5. Polynomial Regression for prediction in one day

### 5.1 Polynomial regression model trained on first day's Epoch data (1st. Oct) for prediction of flow.

In [None]:
# prepare the feature and label columns from mcsSouth
mcsSouth24Hour = mcsSouth[:round(len(mcsSouth)/31)-4][:]
print(mcsSouth24Hour[:][-1:])
X = mcsSouth24Hour['Epoch_mcs'].values
Y = mcsSouth24Hour['flow'].values.reshape(-1, 1)

# turn Epocs_mcs into polynomial features: 1, x, x^2, x^3, ...
X = makePoly(14, X)
dfRawFeatures = pd.DataFrame(X)
dfRawFeatures.describe()

In [None]:
# Scale the feature columns
X, scaler = normalize(X)
dfScaled = pd.DataFrame(X)
dfScaled.describe()

In [None]:
# Hold out 20% of the samples for testing (fixed seed so the split is repeatable)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Fit the regression on the training split
regressor = model(X_train, Y_train)

# Score the fit on the training split
Y_predict_train = regressor.predict(X_train)
# RMSE is taken as sqrt(MSE): newer scikit-learn releases no longer accept squared=False
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_train, Y_predict_train)))
print("R2 Score: ", r2_score(Y_train, Y_predict_train))

In [None]:
# Score the fit on the held-out test split
Y_predict_test = regressor.predict(X_test)
# RMSE is taken as sqrt(MSE): newer scikit-learn releases no longer accept squared=False
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_test, Y_predict_test)))
print("R2 Score: ", r2_score(Y_test, Y_predict_test))

In [None]:
# Plot real and predicted flow from X_train
plt.figure(figsize=(12, 6))
plotResult(X_train[:, 1], Y_train[:, 0], Y_predict_train[:, 0], 
           X_test[:, 1], Y_test[:, 0], Y_predict_test[:, 0], '1st. day', 'epoch')

### 5.2 Test the prediction model on 7th of Oct.

In [None]:
mcsSouth3rdDay = mcsSouth[8562:9998][:]
mcsSouth3rdDay

In [None]:
# Build features and labels for the held-out day.
# NOTE(review): the `3rd` in these names looks like a leftover - the section heading says
# 7th of Oct. and the offset below is 518400 s = 6 * 86400 s, i.e. six whole days.
X_3rd = mcsSouth3rdDay[:]['Epoch_mcs'].values
Y_3rd = mcsSouth3rdDay[:]['flow'].values.reshape(-1, 1)
# move the epochs back by six days; presumably so they fall in the first-day range the model was fitted on
X_3rd = X_3rd - 518400.0
# same degree-14 expansion as for the first-day model
X_3rd = makePoly(14, X_3rd)
# reuse the existing `scaler` (transform only, no refit); the second return value is that same object
X_3rd, scalerdummy = normalize(X_3rd, scaler)
dfX_3rd = pd.DataFrame(X_3rd)
dfX_3rd.describe()

In [None]:
# Score the existing regressor on the other day's data (no refit happens here)
Y_predict_3rd = regressor.predict(X_3rd)
# RMSE is taken as sqrt(MSE): newer scikit-learn releases no longer accept squared=False
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_3rd, Y_predict_3rd)))
print("R2 Score: ", r2_score(Y_3rd, Y_predict_3rd))

In [None]:
# Plot real and predicted flow for X_3rd (column 1 is the scaled first-order epoch term);
# no test set is passed, so the prediction is drawn as a line under the real values
plt.figure(figsize=(12, 6))
plotResult(X_3rd[:, 1], Y_3rd[:, 0], Y_predict_3rd[:, 0], 
           None, None, None, '7th day', 'epoch')

The model trained by 1st.Oct's data performs badly when predicting flow on 7th Oct, which means that the model doesn't generalize well. The 2nd reason why it is difficult to use linear regression model to predict flow is because the traffic flow is very dynamic and its behavior changes every day.

## 6. Add a new feature "speed" into regression model

### 6.1 Include "Speed" as a new feature in feature columns

In [None]:
# prepare the feature and label columns from mcsSouth's 9days data
# print(mcsSouth9days[:][-1:])
X_epoch = mcsSouth9days['Epoch_mcs'].values
X_speed = mcsSouth9days['speed'].values
Y_label = mcsSouth9days['flow'].values.reshape(-1, 1)

# turn Epocs_mcs into polynomial features: 1, t, t^2, t^3, ...
X_epoch_poly = makePoly(20, X_epoch) # output: np Array
X_epoch_poly, scaler_epoch_9 = normalize(X_epoch_poly)
# dfRawFeatures = pd.DataFrame(X_epoch_poly)
# dfRawFeatures.describe()


In [None]:
# turn speed into polynomial features: 1, s, s^2, s^3, ...
X_speed_poly = makePoly(20, X_speed)
X_speed_poly, scaler_speed_9 = normalize(X_speed_poly)
# dfRawFeatures_speed = pd.DataFrame(X_speed_poly)
# dfRawFeatures_speed.describe()

In [None]:
# Only the speed polynomial (bias column included) is used as the feature matrix here;
# merging it with the epoch polynomial is disabled - see the commented-out lines below.
# X_speed_poly = np.delete(X_speed_poly, 0, 1)
print(X_speed_poly.shape)
# print(X_epoch_poly.shape)
# X_features = np.append(X_epoch_poly, X_speed_poly,axis=1)
X_features = X_speed_poly
dfRawFeatures = pd.DataFrame(X_features)
dfRawFeatures.describe()

In [None]:
# split training and test data
X_train, X_test, Y_train, Y_test = train_test_split(X_features, Y_label, test_size=0.2, random_state=0)

In [None]:
# Fit the regression on the training split
regressor = model(X_train, Y_train)

# Score the fit on the training split
Y_predict_train = regressor.predict(X_train)
# RMSE is taken as sqrt(MSE): newer scikit-learn releases no longer accept squared=False
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_train, Y_predict_train)))
print("R2 Score: ", r2_score(Y_train, Y_predict_train))

In [None]:
# Score the fit on the held-out test split
Y_predict_test = regressor.predict(X_test)
# RMSE is taken as sqrt(MSE): newer scikit-learn releases no longer accept squared=False
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_test, Y_predict_test)))
print("R2 Score: ", r2_score(Y_test, Y_predict_test))

The test set's R2 is larger than the training set's R2, so the model is not overfitting.

In [None]:
# Plot real and predicted flow from X_train, X_test
plt.figure(figsize=(20, 9))
plotResult(X_train[:, 1], Y_train[:, 0], Y_predict_train[:, 0], 
           X_test[:, 1], Y_test[:, 0], Y_predict_test[:, 0], '9 days (with speed as feature)', 'speed')

In [None]:
# Predict over all samples with the current regressor (this cell does not refit it)
Y_predict = regressor.predict(X_features)
# RMSE is taken as sqrt(MSE): newer scikit-learn releases no longer accept squared=False
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_label, Y_predict)))
print("R2 Score: ", r2_score(Y_label, Y_predict))

In [None]:
1440 * 3

In [None]:
inrixSouth.iloc[4300:4350]

In [None]:
# Plot real and predicted flow vs. X_features (speed_mcs)
plt.figure(figsize=(20, 9))
plotResult(X_features[:, 1], Y_label[:, 0], Y_predict[:, 0], 
           X_features[:, 1],  Y_label[:, 0], Y_predict[:, 0], '9 days', 'speed')

# Plot real and predicted flow vs. Epoch (display 3 days' result)
plt.figure(figsize=(20, 9))
plotResult(X_epoch_poly[:4270, 1], Y_label[:4270, 0], Y_predict[:4270, 0], 
           None, None, None, '9 days', 'epoch')

# Plot corresponding mcs speed (display 3 days' data)
plt.figure(figsize=(19, 9))
plotAttr(3, mcsSouth9days.iloc[:4270], mcsSouth9days.iloc[:4270],'speed', 'mcs')

# Plot corresponding inrix speed (display 3 days' data)
inrixSouth3days = inrixSouth.iloc[:4312]
plt.figure(figsize=(19, 9))
plotAttr(3, inrixSouth3days, inrixSouth3days,'speed', 'inrix')

Conclusion: By introducing “speed” as a feature, we can effectively increase accuracy of prediction, because the relation between flow and speed, i.e., f(speed) = flow, is closer to a linear function. However, speed itself still cannot provide all information about flow, e.g., in the morning rush hours flow can increase really fast while speed still keeps the similar value. That is to say, we need to find other features to provide more information about flow and make more accurate predictions.

## 7. Using microscopic data in Inrix to predict macroscopic data in Mcs

### 7.1 Data processing for missing timesteps
There are many timesteps missing in mcs and inrix because data with low confidence were filtered out, e.g., {Null, Null, Null, Null} was filtered out in mcs. We need to match the Inrix timesteps to mcs timesteps for further processing, such as using 'speed' and 'travel time' from Inrix as features to predict mcs flow.




In [None]:
def matchTimeStep (mcsDf, inrixDf):
  """Align inrix rows to mcs rows by epoch and merge the matched rows side by side.

  Both frames are walked in row order with one cursor each. Two rows are a
  match when their epochs differ by strictly less than 26 seconds; otherwise
  the cursor of the frame that is behind in time is advanced and a
  'missing ...' line is printed for the skipped step.
  NOTE(review): this walk assumes both frames are sorted by ascending epoch -
  confirm against the data export.

  mcsDf: frame with an 'Epoch_mcs' column.
  inrixDf: frame with an 'Epoch_inrix' column; its 'speed' column is renamed
           to 'speed_inrix' in the result so it does not clash with mcs 'speed'.

  Returns a new frame with the matched mcs columns followed by the matched
  inrix columns, re-indexed from 0. Column dtypes of the inputs are kept.
  Prints the number of matched rows.
  """
  # read the epochs once instead of doing a row lookup on every iteration
  mcsEpochs = mcsDf['Epoch_mcs'].to_numpy()
  inrixEpochs = inrixDf['Epoch_inrix'].to_numpy()

  # collect row positions and slice once at the end; appending to a DataFrame
  # row by row is quadratic and DataFrame.append no longer exists in pandas 2
  mcsRows = []
  inrixRows = []
  i = 0
  j = 0
  while j < len(inrixEpochs) and i < len(mcsEpochs):
    diff = inrixEpochs[j] - mcsEpochs[i]
    if abs(diff) < 26.0:
      # same time step
      mcsRows.append(i)
      inrixRows.append(j)
      i += 1
      j += 1
    elif diff < 0.0:
      # inrix is behind: no mcs row exists for this inrix step
      j += 1
      print('missing mcs', j)
    else:
      # mcs is behind: no inrix row exists for this mcs step
      # (a NaN epoch also lands here, so the loop always makes progress)
      i += 1
      print('missing inrix', i)

  mcsMatched = mcsDf.iloc[mcsRows].reset_index(drop=True)
  inrixMatched = inrixDf.iloc[inrixRows].reset_index(drop=True)
  print('length of mcsMatched: ', len(mcsMatched))
  print('length of inrixMatched: ', len(inrixMatched))

  inrixMatched = inrixMatched.rename(columns = {'speed': 'speed_inrix'})

  combinedDf = pd.concat([mcsMatched, inrixMatched], axis=1)
  print('length of combined df:', len(combinedDf))

  return combinedDf
  

In [None]:
# Match the timesteps between mcsSouth and inrixSouth dataframe,
# and merge them into a datafram
combinedSouth = matchTimeStep(mcsSouth, inrixSouth)

### 7.2 Analyse correlation between mcsSouth and inrixSouth after timestep matching

In [None]:
# Check the datatype after combining, cast the datatypes to the original types
print(combinedSouth.info())
print(mcsSouth.info())
print(inrixSouth.info())
combinedSouth = combinedSouth.astype({'fk_id':'int64', 'flow':'int64', 'segmentid':'int64', 'Epoch_inrix':'int64'})
combinedSouth.info()
# combinedSouth.head(3)

In [None]:
# draw plot of epoch_mcs vs. epoch_inrix after timestep matching
plt.figure(figsize = (7, 7))
combinedSouth1day = combinedSouth.iloc[:1424]
plotScatter(combinedSouth1day, combinedSouth1day, 'Epoch_mcs', 'Epoch_inrix','First day Epoch (sec) alignment')

In [None]:
combinedSouth.iloc[38517: 38520]
# index for end of 27th Oct = 38518

In [None]:
# draw plot of Speed_mcs vs. Speed_inrix after timestep matching
combinedCorr1 = combinedSouth.iloc[:38519][['speed_inrix', 'speed']].copy()
print(combinedCorr1.corr())

plt.figure(figsize=(10,7))
plotScatter(combinedCorr1, combinedCorr1, 'speed_inrix', 'speed', 'mcsSouth-inrixSouth')

Low correlation between mcs speed and inrix speed probably because of the time lag in speed. It should be higher because the pattern of 2 speeds' behaviors are similar.

In [None]:
combinedSouth.iloc[38500: 38550]

In [None]:
# draw flow_mcs vs. Speed_inrix after time step matched
combinedCorr2 = combinedSouth.iloc[:38519][['speed_inrix', 'flow']].copy()
print(combinedCorr2.corr())

plt.figure(figsize=(10,7))
plotScatter(combinedCorr2, combinedCorr2, 'speed_inrix', 'flow', 'mcsSouth-inrixSouth')

The relation between flow and (raw)speed_inrix is very non-linear and does not follow the traffic flow theory.

### 7.3 Using (raw) inrix_speed as feature to predict mcs_flow in 9 days range

In [None]:
# pick up 9 days' data from combined dataframe (1st OCT to 9th OCT)
combinedSouth9days = combinedSouth[:12839] 
combinedSouth9days.iloc[-3:]

In [None]:
# prepare the feature and label columns from combinedSouth's 9days data
X_epoch = combinedSouth9days['Epoch_mcs'].values.reshape(-1, 1)
X_speed = combinedSouth9days['speed_inrix'].values
Y_label = combinedSouth9days['flow'].values.reshape(-1, 1)
print(X_epoch.shape)

# turn speed_inrix into polynomial features: 1, s, s^2, s^3, ...
X_speed_poly = makePoly(20, X_speed)
X_speed_poly, scaler_speed_9 = normalize(X_speed_poly)
print(X_speed_poly.shape)
X_speed_epoch = np.append(X_speed_poly, X_epoch, axis=1)
dfRawFeatures_speed = pd.DataFrame(X_speed_epoch)
print(dfRawFeatures_speed.head(5))
X_features = X_speed_epoch

# split training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X_features, Y_label, test_size=0.2, random_state=0)

In [None]:
X_train[:, :21].shape
type(X_train[:, :21])
dfTest = pd.DataFrame(X_train[:, :21])
dfTest.head(5)

In [None]:
# Features and label from the time-matched 9-day frame: inrix speed -> mcs flow
X_epoch = combinedSouth9days['Epoch_mcs'].values.reshape(-1, 1)
X_speed = combinedSouth9days['speed_inrix'].values
Y_label = combinedSouth9days['flow'].values.reshape(-1, 1)

# turn speed_inrix into polynomial features: 1, s, s^2, s^3, ... and scale them
X_speed_poly = makePoly(20, X_speed)
X_speed_poly, scaler_speed_9 = normalize(X_speed_poly)
# the raw epoch is appended as the last column so it is split together with the
# rows; it is excluded from the fit below
X_speed_epoch = np.append(X_speed_poly, X_epoch, axis=1)
dfRawFeatures_speed = pd.DataFrame(X_speed_epoch)
print(dfRawFeatures_speed.head(5))
X_features = X_speed_epoch

# split training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X_features, Y_label, test_size=0.2, random_state=0)

# fit on the first 21 columns only (bias + 20 speed powers)
regressor = model(X_train[:, :21], Y_train)

# Score the fit on the training split
Y_predict_train = regressor.predict(X_train[:, :21])
# RMSE is taken as sqrt(MSE): newer scikit-learn releases no longer accept squared=False
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_train, Y_predict_train)))
print("R2 Score: ", r2_score(Y_train, Y_predict_train))

# Score the fit on the held-out test split
Y_predict_test = regressor.predict(X_test[:, :21])
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_test, Y_predict_test)))
print("R2 Score: ", r2_score(Y_test, Y_predict_test))

In [None]:
# Plot real and predicted flow from X_train, X_test
plt.figure(figsize=(20, 9))
plotResult(X_train[:, 1], Y_train[:, 0], Y_predict_train[:, 0], 
           X_test[:, 1], Y_test[:, 0], Y_predict_test[:, 0], '9 days (with speed_inrix as feature)', 'speed')

In [None]:
# Predict over all samples with the current regressor (this cell does not refit it)
Y_predict = regressor.predict(X_features[:,:21])
# RMSE is taken as sqrt(MSE): newer scikit-learn releases no longer accept squared=False
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_label, Y_predict)))
print("R2 Score: ", r2_score(Y_label, Y_predict))

In [None]:
# Plot real and predicted flow from X_features
plt.figure(figsize=(10, 9))
plotResult(X_features[:, 1], Y_label[:, 0], Y_predict[:, 0], 
           X_features[:, 1], Y_label[:, 0], Y_predict[:, 0], '9 days (with speed_inrix as feature)', 'speed')

plt.figure(figsize=(20, 9))
plotResult(X_features[:, -1], Y_label[:, 0], Y_predict[:, 0], 
           None, None, None, '9 days (with speed_inrix as feature)', 'epoch')

plt.figure(figsize=(20, 9))
plotAttr(9, combinedSouth9days, combinedSouth9days,'speed_inrix', 'inrix')

Summary: The accuracy (R2 = 0.2) is really low when we directly use inrix speed to predict mcs flow. This is probably because the relation between inrix speed and mcs flow does not follow the traffic flow theory (fundamental diagram) due to the time lag between inrix speed and mcs speed. Therefore, in order to achieve higher accuracy by using inrix speed as a feature, we need to either shift the inrix speed to eliminate the effect of the time lag, or train a neural network which can learn to mitigate the effect of the time lag by itself.

### 7.4 Shifting the time-lag between inrix speed and mcs speed


#### 7.4.1 Decide how many time steps we should shift to match 2 speeds

In [None]:
def plotAttr3(days, dataFrame, attribute1, attribute2, location):
  """Plot two attributes of one combined frame against its 'Epoch_mcs' column.

  attribute1 is drawn in red and labelled 'mcs_' + location, attribute2 in
  blue and labelled 'inrix_' + location. The y-axis label and the title use
  attribute1 only.

  days: value shown in the title only.
  """
  epochs = dataFrame['Epoch_mcs']
  curves = (
      (attribute1, 'r', 'mcs_'),
      (attribute2, 'b', 'inrix_'),
  )
  for column, colour, prefix in curves:
    plt.plot(epochs, dataFrame[column], colour, linewidth=0.8, label=prefix + location)
  plt.title(attribute1 + ' vs. time (' + str(days) + ' day)', fontsize=14)
  plt.xlabel('Epochs(sec)', fontsize=14)
  plt.ylabel(attribute1, fontsize=14)
  plt.xticks(fontsize=14)
  plt.yticks(fontsize=14)
  plt.legend()
  plt.show()

In [None]:
combinedSouthT = combinedSouth.iloc[1424:2833] 
plt.figure(figsize=(10, 6))
plotAttr3('2nd', combinedSouthT, 'speed', 'speed_inrix', 'South')

combinedSouth3days = combinedSouth.iloc[:4260] # 4260 for 3 days
plt.figure(figsize=(20, 7))
plotAttr3(3, combinedSouth3days, 'speed', 'speed_inrix', 'South')

combinedSouthLast1days = combinedSouth.iloc[-400:]
plt.figure(figsize=(20, 7))
plotAttr3(-1, combinedSouthLast1days, 'speed', 'speed_inrix', 'South')

We should shift inrix by approximately 114 min to eliminate the time-lag.

#### 7.4.2 Shifting the Inrix by 114 mins (time-lag)

In [None]:
# Shift Epoch_inrix by adding 114 mins in order to eliminate the time-gap  
inrixSouth_shifted = inrixSouth.copy()
print(inrixSouth_shifted)
inrixSouth_shifted['Epoch_inrix'] = inrixSouth_shifted.Epoch_inrix + (114 * 60)
print(inrixSouth_shifted)

Shift 110 mins: Corr Coefficient = 0.674 <br>
Shift 112 mins: Corr Coefficient =  <br>
Shift 114 mins: Corr Coefficient = 0.69 <br>
Shift 116 mins: Corr Coefficient =  <br>
Shift 118 mins: Corr Coefficient = 0.690 <br>
Shift 120 mins: Corr Coefficient = 0.684 <br>


In [None]:
# match the shifted inrix with mcs dataframe and combined them
combinedSouth_shifted = matchTimeStep(mcsSouth, inrixSouth_shifted)

In [None]:
print(combinedSouth_shifted[['date', 'speed', 'timestamputc', 'speed_inrix']])

In [None]:
# Check the datatype after combining, cast the datatypes to the original types
print(combinedSouth_shifted.info())
print(mcsSouth.info())
print(inrixSouth.info())
combinedSouth_shifted = combinedSouth_shifted.astype({'fk_id':'int64', 'flow':'int64', 'segmentid':'int64', 'Epoch_inrix':'int64'})
combinedSouth_shifted.info()
combinedSouth_shifted.head(3)

In [None]:
combinedSouth_shifted.iloc[32500:32960]

In [None]:
# One-day window of the shifted frame (rows 1311-2720), titled as the 2nd day.
# NOTE(review): the trailing "4148 for 3 days" remark does not match this slice - confirm or remove.
combinedSouthT_shifted = combinedSouth_shifted.iloc[1311:2721] # 4148 for 3 days
plt.figure(figsize=(10, 6))
plotAttr3('2nd', combinedSouthT_shifted, 'speed', 'speed_inrix', 'South')

plt.figure(figsize=(20, 7))
# NOTE(review): despite the `3days` name this slice holds 7200 rows and is titled '2nd-6th day'
combinedSouth3days_shifted = combinedSouth_shifted.iloc[1311 :1311+7200]
plotAttr3('2nd-6th day', combinedSouth3days_shifted, 'speed', 'speed_inrix', 'South')

Speed_inrix and speed_mcs now match each other after shifting.

### 7.4.3 Analyse correlation between mcsSouth and inrixSouth after timestep matching (after shifting)

In [None]:
# draw plot of epoch_mcs vs. epoch_inrix after shifting
plt.figure(figsize = (7, 7))
combinedSouth1day_shifted = combinedSouth_shifted.iloc[:1424]
plotScatter(combinedSouth1day_shifted, combinedSouth1day_shifted, 'Epoch_mcs', 'Epoch_inrix','First day Epoch (sec) alignment')

In [None]:
combinedSouth_shifted[38406: 38408]

In [None]:
# draw plot of Speed_mcs vs. Speed_inrix after timestep matching
combinedCorr1 = combinedSouth_shifted.iloc[:38407][['speed_inrix', 'speed']].copy() # until 27th
print(combinedCorr1.corr())

plt.figure(figsize=(10,7))
plotScatter(combinedCorr1, combinedCorr1, 'speed_inrix', 'speed', 'mcsSouth-inrixSouth')

Correlation between the two speeds is higher after shifting.

In [None]:
# draw flow_mcs vs. Speed_inrix after time step matched
combinedCorr2 = combinedSouth_shifted.iloc[:38407][['speed_inrix', 'flow']].copy() # until 27th Oct
print(combinedCorr2.corr())

plt.figure(figsize=(10,7))
plotScatter(combinedCorr2, combinedCorr2, 'speed_inrix', 'flow', 'mcsSouth-inrixSouth')

Shifting doesn’t improve the characterization of flow-speed(inrix) relation much.

### 7.4.4 Using (shifted) inrix_speed as feature to predict mcs_flow in 9 days range

In [None]:
# pick up 9 days' data from combined dataframe (1st OCT to 9th OCT)
combinedSouth9days_shifted = combinedSouth_shifted[:12727] 
combinedSouth9days_shifted.iloc[-3:]

In [None]:
# Features and label from the shifted, time-matched 9-day frame: inrix speed -> mcs flow
X_epoch = combinedSouth9days_shifted['Epoch_mcs'].values.reshape(-1, 1)
X_speed = combinedSouth9days_shifted['speed_inrix'].values
Y_label = combinedSouth9days_shifted['flow'].values.reshape(-1, 1)

# turn speed_inrix into polynomial features: 1, s, s^2, s^3, ... and scale them
X_speed_poly = makePoly(20, X_speed)
X_speed_poly, scaler_speed_9 = normalize(X_speed_poly)
# the raw epoch is appended as the last column so it is split together with the
# rows; it is excluded from the fit below
X_speed_epoch = np.append(X_speed_poly, X_epoch, axis=1)
dfRawFeatures_speed = pd.DataFrame(X_speed_epoch)
print(dfRawFeatures_speed.head(5))
X_features = X_speed_epoch

# split training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X_features, Y_label, test_size=0.2, random_state=0)

# fit on the first 21 columns only (bias + 20 speed powers)
regressor = model(X_train[:, :21], Y_train)

# Score the fit on the training split
Y_predict_train = regressor.predict(X_train[:, :21])
# RMSE is taken as sqrt(MSE): newer scikit-learn releases no longer accept squared=False
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_train, Y_predict_train)))
print("R2 Score: ", r2_score(Y_train, Y_predict_train))

# Score the fit on the held-out test split
Y_predict_test = regressor.predict(X_test[:, :21])
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_test, Y_predict_test)))
print("R2 Score: ", r2_score(Y_test, Y_predict_test))

In [None]:
# Plot real and predicted flow from X_train, X_test
plt.figure(figsize=(20, 9))
plotResult(X_train[:, 1], Y_train[:, 0], Y_predict_train[:, 0], 
           X_test[:, 1], Y_test[:, 0], Y_predict_test[:, 0], '9 days (with speed_inrix as feature)', 'speed')

In [None]:
  # Use all data to train and predict the flow
# refit on every sample (no hold-out), then score on that same data
regressor = model(X_features[:, :21], Y_label)
Y_predict = regressor.predict(X_features[:,:21])
# RMSE is taken as sqrt(MSE): newer scikit-learn releases no longer accept squared=False
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_label, Y_predict)))
print("R2 Score: ", r2_score(Y_label, Y_predict))

In [None]:
# Plot real and predicted flow for all samples against the scaled first-order speed term
plt.figure(figsize=(10, 9))
plotResult(X_features[:, 1], Y_label[:, 0], Y_predict[:, 0], 
           X_features[:, 1], Y_label[:, 0], Y_predict[:, 0], '9 days (speed_inrix as feature)', 'speed')

# Same prediction against the raw epoch, which is the last feature column
plt.figure(figsize=(20, 9))
plotResult(X_features[:, -1], Y_label[:, 0], Y_predict[:, 0], 
           None, None, None, '9 days (speed_inrix as feature)', 'epoch')

# inrix speed over the first third of the 9-day frame; the title therefore says 3 days, not 9
firstThird = combinedSouth9days_shifted.iloc[:round(len(combinedSouth9days_shifted)/3)]
plt.figure(figsize=(20, 9))
plotAttr(3, firstThird, firstThird, 'speed_inrix', 'inrix')

## 8. Including "Travel Time" as feature in regression model

Use travel time in InrixSouth as a new feature to predict the flow.

In [None]:
plt.figure(figsize=(9, 6))
plt.xlim(left= 20.0, right = 350.0)
plt.ylim(top = 2000)
plotScatter(combinedSouth_shifted.iloc[:38407], combinedSouth_shifted.iloc[:38407], 'travel_time_secs', 'flow', 'mcs flow vs. inrix travel time')

In [None]:
combinedSouth9days_shifted.info()

### 8.1 Use travel time alone to predict the flow

In [None]:
# Features and label from the shifted 9-day frame: inrix travel time -> mcs flow
X_travel_time = combinedSouth9days_shifted['travel_time_secs'].values
X_epoch = combinedSouth9days_shifted['Epoch_mcs'].values.reshape(-1, 1)
Y_label = combinedSouth9days_shifted['flow'].values.reshape(-1, 1)
#X_speed = combinedSouth9days_shifted['speed_inrix'].values

# turn travel_time into polynomial features: 1, t, t^2, t^3, ... and scale them
X_travel_time_poly = makePoly(15, X_travel_time)
# X_speed_poly = makePoly(20, X_speed)
X_travel_time_poly, scaler_travel_time_9 = normalize(X_travel_time_poly)
# X_speed_poly, scaler_speed_9 = normalize(X_speed_poly)

# Travel time alone forms the feature matrix (bias column kept); combining it
# with the speed features is disabled - see the commented-out lines.
dfRawFeatures_travel = pd.DataFrame(X_travel_time_poly)
print(dfRawFeatures_travel.head(10))
# X_travel_time_poly = np.delete(X_travel_time_poly, 0, 1)
# X_speed_travel = np.append(X_speed_poly, X_travel_time_poly, axis=1)
X_features = X_travel_time_poly

# split training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X_features, Y_label, test_size=0.2, random_state=0)

# build and train the polynomial regression model
regressor = model(X_train, Y_train)

# Score the fit on the training split
Y_predict_train = regressor.predict(X_train)
# RMSE is taken as sqrt(MSE): newer scikit-learn releases no longer accept squared=False
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_train, Y_predict_train)))
print("R2 Score: ", r2_score(Y_train, Y_predict_train))

# Score the fit on the held-out test split
Y_predict_test = regressor.predict(X_test)
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_test, Y_predict_test)))
print("R2 Score: ", r2_score(Y_test, Y_predict_test))
# cell output: summary of the travel-time features built in this cell
dfRawFeatures_travel.describe()

travel time order: R2 <br>
5: 0.225 <br>
7: 0.226  <br>
10:0.227 <br>
13: 0.23 <br>
15: 0.237 <br>
16: over-fitting

In [None]:
# Plot real and predicted flow from X_train, X_test w.r.t. travel_time.
# The features went through normalize() before the split, so the x-limits below are
# in that scaled space rather than in seconds.
plt.figure(figsize=(20, 9))
plt.xlim(left= -1.5, right = 5.0)
plotResult(X_train[:, 1], Y_train[:, 0], Y_predict_train[:, 0], 
           X_test[:, 1], Y_test[:, 0], Y_predict_test[:, 0], '9 days (with travel_time_inrix as feature)', 'travel_time')

In [None]:
# Use all data to train and predict the flow (no hold-out split here, so the
# scores below are in-sample).
regressor = model(X_features, Y_label)
Y_predict = regressor.predict(X_features)
# RMSE is computed as sqrt(MSE): the `squared` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_label, Y_predict)))
print("R2 Score: ", r2_score(Y_label, Y_predict))

R2 score = 0.24, not better than using inrix speed to predict (R2: 0.26)

In [None]:
# Flattened views shared by every plot in this cell.
travel_feature = X_features[:, 1]
flow_measured = Y_label[:, 0]
flow_predicted = Y_predict[:, 0]
epoch_axis = X_epoch[:, 0]

# Measured vs. predicted flow against the travel-time feature (all rows, used for both series).
plt.figure(figsize=(10, 6))
plt.xlim(left=-1.5, right=4)
plt.ylim(bottom=0, top=2000)
plotResult(travel_feature, flow_measured, flow_predicted,
           travel_feature, flow_measured, flow_predicted,
           '9 days', 'travel time')

# Measured vs. predicted flow over time, full range.
plt.figure(figsize=(20, 9))
plotResult(epoch_axis, flow_measured, flow_predicted,
           None, None, None,
           '9 days (with travel time as feature)', 'epoch')

# Raw travel time over time, full range, for visual comparison with the plot above.
plt.figure(figsize=(16, 6))
plt.ylim(top=200)
plotAttr(9, combinedSouth9days_shifted, combinedSouth9days_shifted, 'travel_time_secs', 'inrix')

# Zoomed view: one third of the rows, starting at row START.
START = 4000
epoch_window = slice(START, START + round(len(X_epoch)/3))
frame_window = slice(START, START + round(len(combinedSouth9days_shifted)/3))

plt.figure(figsize=(20, 9))
plotResult(epoch_axis[epoch_window], flow_measured[epoch_window], flow_predicted[epoch_window],
           None, None, None,
           '3 days (with travel time as feature)', 'epoch')

plt.figure(figsize=(16, 6))
plt.ylim(top=200)
zoomed_frame = combinedSouth9days_shifted.iloc[frame_window]
plotAttr(3, zoomed_frame, zoomed_frame, 'travel_time_secs', 'inrix')


### 8.2 Use 'Speed' and 'Travel Time' together as features in the polynomial regression model

In [None]:
# Prepare the feature and label columns from combinedSouth's 9-day data.
# X_epoch is not a model input; it is kept as the x-axis for time plots.
X_travel_time = combinedSouth9days_shifted['travel_time_secs'].values
X_epoch = combinedSouth9days_shifted['Epoch_mcs'].values.reshape(-1, 1)
Y_label = combinedSouth9days_shifted['flow'].values.reshape(-1, 1)
X_speed = combinedSouth9days_shifted['speed_inrix'].values

# Turn speed_inrix (order 15) and travel_time (order 10) into polynomial
# features: 1, s, s^2, s^3, ... and normalize each block separately.
X_speed_poly = makePoly(15, X_speed)
X_travel_time_poly = makePoly(10, X_travel_time)
X_speed_poly, scaler_speed_9 = normalize(X_speed_poly)
X_travel_time_poly, scaler_travel_time_9 = normalize(X_travel_time_poly)

# Drop column 0 (the intercept term) of the travel_time features so the combined
# matrix carries a single bias column, then append them to the speed features.
X_travel_time_poly = np.delete(X_travel_time_poly, 0, 1)
X_speed_travel = np.append(X_speed_poly, X_travel_time_poly, axis=1)
dfRawFeatures_speed_travel = pd.DataFrame(X_speed_travel)
X_features = X_speed_travel

# Split training and testing data (80/20, fixed seed so runs are comparable).
X_train, X_test, Y_train, Y_test = train_test_split(X_features, Y_label, test_size=0.2, random_state=0)

# Build and train the polynomial regression model.
regressor = model(X_train, Y_train)

# Evaluate performance on the training set.
# RMSE is computed as sqrt(MSE): the `squared` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
Y_predict_train = regressor.predict(X_train)
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_train, Y_predict_train)))
print("R2 Score: ", r2_score(Y_train, Y_predict_train))

# Evaluate performance on the testing set.
Y_predict_test = regressor.predict(X_test)
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_test, Y_predict_test)))
print("R2 Score: ", r2_score(Y_test, Y_predict_test))

# Summary of the combined feature matrix, shown as the cell output.
dfRawFeatures_speed_travel.describe()

(S, T): R2 <br>
(15, 10): 0.256 <br>
(20, 10): overfitting <br>
(15,15): overfitting <br>
(10, 15): overfitting <br>
==> Not much improvement from using the 2 features at the same time



In [None]:
# Plot real and predicted flow from X_train, X_test against the 'speed' feature
# (column 1 of the combined feature matrix).
plt.figure(figsize=(20, 9))
plt.ylim(top = 2000)
plotResult(X_train[:, 1], Y_train[:, 0], Y_predict_train[:, 0], 
           X_test[:, 1], Y_test[:, 0], Y_predict_test[:, 0], '9 days (with speed and travel time as feature)', 'speed')

# Plot real and predicted flow from X_train, X_test against the 'travel time' feature.
# NOTE(review): column 16 assumes the speed block is 16 columns wide (bias + order 15),
# so the first travel_time term follows it — update this index if the speed order changes.
plt.figure(figsize=(20, 9))
plt.ylim(top = 2000)
plt.xlim(left = -1.5, right = 5.0)
plotResult(X_train[:,16], Y_train[:, 0], Y_predict_train[:, 0], 
           X_test[:, 16], Y_test[:, 0], Y_predict_test[:, 0], '9 days (with speed and travel time as feature)', 'travel time')

In [None]:
# Use all data to train and predict the flow (no hold-out split here, so the
# scores below are in-sample).
regressor = model(X_features, Y_label)
Y_predict = regressor.predict(X_features)
# RMSE is computed as sqrt(MSE): the `squared` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_label, Y_predict)))
print("R2 Score: ", r2_score(Y_label, Y_predict))

## 9. Decision Tree Regression

### 9.0 Import decision tree library

In [None]:
from sklearn.tree import DecisionTreeRegressor

### 9.1 Use Inrix speed and travel time as features

In [None]:
# Prepare the features [speed, travel_time] and the label [flow].
# Raw (unscaled, non-polynomial) values are used for the decision tree.
X = combinedSouth9days_shifted[['speed_inrix', 'travel_time_secs']].values
# Kept as a pandas Series; used only as the x-axis of the time plots.
X_epoch = combinedSouth9days_shifted['Epoch_mcs']
Y_label = combinedSouth9days_shifted['flow'].values

# Quick sanity check of the arrays handed to the model.
print(type(X))
print(X.shape)
print(X)
print(Y_label.shape)

# Split the train and test datasets (80/20, fixed seed so runs are comparable).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y_label, test_size=0.2, random_state=0)


In [None]:
def quickSplitTuning(model, x_train, y_train, x_test, y_test, split):
  """Fit one estimator per candidate max_depth and tabulate its train/test scores.

  model   -- estimator class, called as model(max_depth=..., random_state=0)
  x_train, y_train -- data each estimator is fitted on and scored against
  x_test, y_test   -- held-out data each fitted estimator is scored against
  split   -- sequence of max_depth values to try

  Returns a DataFrame with one row per candidate and the columns
  'split', 'Train_R2' and 'Test_R2' (the values come from estimator.score()).
  """
  def scoreAtDepth(depth):
    # A fresh estimator per depth, seeded identically, so only the depth varies.
    candidate = model(max_depth=depth, random_state=0)
    candidate.fit(x_train, y_train)
    return candidate.score(x_train, y_train), candidate.score(x_test, y_test)

  scorePairs = [scoreAtDepth(depth) for depth in split]
  trainScores = [pair[0] for pair in scorePairs]
  testScores = [pair[1] for pair in scorePairs]

  return pd.DataFrame({'split': split, 'Train_R2': trainScores, 'Test_R2': testScores})


In [None]:
# Temporarily tune the max_depth hyperparameter.
# Candidate depths are 2..19; quickSplitTuning fits one tree per depth on the train
# split and reports its score on both the train and the test split.
split = list(range(2, 20))
print(split)
result = quickSplitTuning(DecisionTreeRegressor, X_train, Y_train, X_test, Y_test, split)
print(result)

Best max_depth: 6 ~ 7

In [None]:
# Train the model on the train dataset and check the accuracy.
# max_depth = 7 follows the quick depth tuning above; the seed is fixed for reproducibility.
DTRegressor = DecisionTreeRegressor(max_depth = 7, random_state=0)
DTRegressor.fit(X_train, Y_train)

# Evaluate performance on the training set.
# RMSE is computed as sqrt(MSE): the `squared` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
Y_predict_train = DTRegressor.predict(X_train)
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_train, Y_predict_train)))
# r2_score on the predictions already computed gives the same value as
# DTRegressor.score(X_train, Y_train) without predicting a second time.
print("R2 Score: ", r2_score(Y_train, Y_predict_train))

# Evaluate performance on the testing set.
Y_predict_test = DTRegressor.predict(X_test)
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_test, Y_predict_test)))
print("R2 Score: ", r2_score(Y_test, Y_predict_test))


Short conclusion:
Using a decision tree to predict Mcs flow from Inrix data slightly improves the accuracy, to 0.64.

In [None]:
# Sanity check of the split sizes before plotting.
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)

In [None]:
# Plot real and predicted flow from X_train, X_test against speed (feature column 0).
# Labels and predictions are 1-D in the decision tree section, so no [:, 0] is needed.
plt.figure(figsize=(20, 9))
# plt.ylim(top = 2000)
plotResult(X_train[:, 0], Y_train, Y_predict_train, X_test[:, 0], Y_test, Y_predict_test, '9 days (Decision Tree)', 'speed')

# Plot real and predicted flow from X_train, X_test against travel time (feature column 1).
# The features are raw values here, so the x-limits are in seconds.
plt.figure(figsize=(20, 9))
# plt.ylim(top = 2000)
plt.xlim(left = 20, right = 100)
plotResult(X_train[:,1], Y_train, Y_predict_train, 
           X_test[:, 1], Y_test, Y_predict_test, '9 days (Decision Tree)', 'travel time')

In [None]:
# Use all data to train and predict the flow.
# This refits the existing DTRegressor in place, so after this cell it no longer
# holds the model trained on the train split only; the scores below are in-sample.
DTRegressor.fit(X, Y_label)
Y_predict = DTRegressor.predict(X)
# RMSE is computed as sqrt(MSE): the `squared` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
print("Root Mean Square Error: ", np.sqrt(mean_squared_error(Y_label, Y_predict)))
print("R2 Score: ", r2_score(Y_label, Y_predict))

In [None]:
# Check the structure of the fitted decision tree: its depth and its number of leaves.
print(DTRegressor.get_depth())
print(DTRegressor.get_n_leaves())

In [None]:
# Plot real and predicted flow vs. travel time (feature column 1).
# The full data set is passed for both series of plotResult.
plt.figure(figsize=(10, 6))
plt.xlim(left = 25, right = 200)
# plt.ylim(bottom = 0, top = 2000)
plotResult(X[:, 1], Y_label, Y_predict, 
           X[:, 1], Y_label, Y_predict, '9 days (Decision Tree)', 'travel time')

# Plot real and predicted flow vs. speed (feature column 0).
plt.figure(figsize=(10, 6))
# plt.xlim(left = -1.5, right = 4)
# plt.ylim(bottom = 0, top = 2000)
plotResult(X[:, 0], Y_label, Y_predict, 
           X[:, 0], Y_label, Y_predict, '9 days (Decision Tree)', 'inrix_speed')

# Plot flow vs. epoch over the 9-day range.
plt.figure(figsize=(20, 9))
plotResult(X_epoch, Y_label, Y_predict, 
           None, None, None, '9 days (Decision Tree)', 'epoch')


# Plot travel_time vs. epoch over the 9-day range.
plt.figure(figsize=(16, 6))
plt.ylim(top= 200)
plotAttr(9, combinedSouth9days_shifted, combinedSouth9days_shifted,'travel_time_secs', 'inrix')

# Plot speed vs. epoch over the 9-day range.
plt.figure(figsize=(16, 6))
plotAttr(9, combinedSouth9days_shifted, combinedSouth9days_shifted,'speed_inrix', 'inrix')