In [48]:
# Import basic data libraries
import numpy as np
import pandas as pd

#### Load, Inspect, and Merge Datasets

In [49]:
# Read the semicolon-separated charging-session export and preview the first rows
ev_charging_reports = pd.read_csv('EV charging reports.csv', sep=';')
ev_charging_reports.head()

Unnamed: 0,session_ID,Garage_ID,User_ID,User_type,Shared_ID,Start_plugin,Start_plugin_hour,End_plugout,End_plugout_hour,El_kWh,Duration_hours,month_plugin,weekdays_plugin,Plugin_category,Duration_category
0,1,AdO3,AdO3-4,Private,,21.12.2018 10:20,10,21.12.2018 10:23,10.0,3,5,Dec,Friday,late morning (9-12),Less than 3 hours
1,2,AdO3,AdO3-4,Private,,21.12.2018 10:24,10,21.12.2018 10:32,10.0,87,136666667,Dec,Friday,late morning (9-12),Less than 3 hours
2,3,AdO3,AdO3-4,Private,,21.12.2018 11:33,11,21.12.2018 19:46,19.0,2987,8216388889,Dec,Friday,late morning (9-12),Between 6 and 9 hours
3,4,AdO3,AdO3-2,Private,,22.12.2018 16:15,16,23.12.2018 16:40,16.0,1556,2441972222,Dec,Saturday,late afternoon (15-18),More than 18 hours
4,5,AdO3,AdO3-2,Private,,24.12.2018 22:03,22,24.12.2018 23:02,23.0,362,970555556,Dec,Monday,late evening (21-midnight),Less than 3 hours


In [50]:
# Truncate each Start_plugin timestamp ('DD.MM.YYYY HH:MM') to the top of its
# hour: keep the first 14 characters ('DD.MM.YYYY HH:') and append '00' for the
# minutes, giving the same text format as the hourly Date_from keys in the
# traffic data.
# Vectorised .str slicing replaces the per-row `series[i]` lookups, which were
# slow and only correct while the frame still had its default 0..n-1 index.
Start_plugin_hour = (ev_charging_reports['Start_plugin'].str[:14] + '00').tolist()

In [51]:
# Overwrite Start_plugin_hour with the hour-truncated timestamp strings, then
# re-seat the column at position 6
ev_charging_reports['Start_plugin_hour'] = Start_plugin_hour
ev_charging_reports.insert(
    6, 'Start_plugin_hour', ev_charging_reports.pop('Start_plugin_hour')
)

In [52]:
# Encode User_type as a numeric flag: 1.0 for 'Private' sessions, 0.0 for
# every other value. A vectorised comparison replaces the per-row `series[i]`
# loop, which was slow and relied on the default 0..n-1 index.
User_private = (
    (ev_charging_reports['User_type'] == 'Private').astype(float).tolist()
)

In [53]:
# Attach the numeric flag and retire the text column it was derived from
ev_charging_reports = (
    ev_charging_reports
    .assign(User_private=User_private)
    .drop(columns=['User_type'])
)

In [54]:
# Move User_private from the end of the frame to column position 3
ev_charging_reports.insert(
    3, 'User_private', ev_charging_reports.pop('User_private')
)

In [55]:
# One-hot encode the month and weekday labels into 0/1 integer indicator columns
ev_charging_reports = pd.get_dummies(
    ev_charging_reports,
    columns=['month_plugin', 'weekdays_plugin'],
    dtype=int,
)

In [56]:
# Preview the first rows to check the re-ordered and one-hot encoded columns
ev_charging_reports.head()

Unnamed: 0,session_ID,Garage_ID,User_ID,User_private,Shared_ID,Start_plugin,Start_plugin_hour,End_plugout,End_plugout_hour,El_kWh,...,month_plugin_Nov,month_plugin_Oct,month_plugin_Sep,weekdays_plugin_Friday,weekdays_plugin_Monday,weekdays_plugin_Saturday,weekdays_plugin_Sunday,weekdays_plugin_Thursday,weekdays_plugin_Tuesday,weekdays_plugin_Wednesday
0,1,AdO3,AdO3-4,1.0,,21.12.2018 10:20,21.12.2018 10:00,21.12.2018 10:23,10.0,3,...,0,0,0,1,0,0,0,0,0,0
1,2,AdO3,AdO3-4,1.0,,21.12.2018 10:24,21.12.2018 10:00,21.12.2018 10:32,10.0,87,...,0,0,0,1,0,0,0,0,0,0
2,3,AdO3,AdO3-4,1.0,,21.12.2018 11:33,21.12.2018 11:00,21.12.2018 19:46,19.0,2987,...,0,0,0,1,0,0,0,0,0,0
3,4,AdO3,AdO3-2,1.0,,22.12.2018 16:15,22.12.2018 16:00,23.12.2018 16:40,16.0,1556,...,0,0,0,0,0,1,0,0,0,0
4,5,AdO3,AdO3-2,1.0,,24.12.2018 22:03,24.12.2018 22:00,24.12.2018 23:02,23.0,362,...,0,0,0,0,1,0,0,0,0,0


In [57]:
# Read the semicolon-separated hourly traffic counts and preview the first rows
traffic_reports = pd.read_csv('Local traffic distribution.csv', sep=';')
traffic_reports.head()

Unnamed: 0,Date_from,Date_to,KROPPAN BRU,MOHOLTLIA,SELSBAKK,MOHOLT RAMPE 2,Jonsvannsveien vest for Steinanvegen
0,01.12.2018 00:00,01.12.2018 01:00,639,0,0,4,144
1,01.12.2018 01:00,01.12.2018 02:00,487,153,115,21,83
2,01.12.2018 02:00,01.12.2018 03:00,408,85,75,10,69
3,01.12.2018 03:00,01.12.2018 04:00,282,89,56,8,39
4,01.12.2018 04:00,01.12.2018 05:00,165,64,34,3,25


Merging the `ev_charging_reports` and `traffic_reports` datasets together into a Dataframe named `ev_charging_traffic` using the columns:

- `Start_plugin_hour` in `ev_charging_reports`
- `Date_from` in `traffic_reports`

In [58]:
# Inner-join each charging session to the traffic counts of the hour in which
# it was plugged in
ev_charging_traffic = ev_charging_reports.merge(
    traffic_reports,
    how='inner',
    left_on='Start_plugin_hour',
    right_on='Date_from',
)

In [59]:
# Summarise the merged frame: column names, non-null counts and dtypes
ev_charging_traffic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6878 entries, 0 to 6877
Data columns (total 39 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   session_ID                            6878 non-null   int64  
 1   Garage_ID                             6878 non-null   object 
 2   User_ID                               6878 non-null   object 
 3   User_private                          6878 non-null   float64
 4   Shared_ID                             1412 non-null   object 
 5   Start_plugin                          6878 non-null   object 
 6   Start_plugin_hour                     6878 non-null   object 
 7   End_plugout                           6844 non-null   object 
 8   End_plugout_hour                      6844 non-null   float64
 9   El_kWh                                6878 non-null   object 
 10  Duration_hours                        6844 non-null   object 
 11  Plugin_category  

#### Data Cleaning and Preparation

In [60]:
# Columns that will not be fed to the models: identifiers, raw timestamps,
# text categories and the two join keys
unused_columns = [
    'session_ID', 'Garage_ID', 'User_ID',
    'Shared_ID',
    'Plugin_category', 'Duration_category',
    'Start_plugin', 'Start_plugin_hour', 'End_plugout', 'End_plugout_hour',
    'Date_from', 'Date_to',
]
ev_charging_traffic = ev_charging_traffic.drop(columns=unused_columns)

The `El_kWh` and `Duration_hours` columns are object data types. The data is following European notation where commas `,` are used as decimals instead of periods.

Replace `,` with `.` in these two columns.

In [61]:
# Swap the decimal comma for a decimal point (plain-text match, no regex)
ev_charging_traffic['El_kWh'] = ev_charging_traffic['El_kWh'].str.replace(
    ',', '.', regex=False
)

In [62]:
# Swap the decimal comma for a decimal point (plain-text match, no regex)
ev_charging_traffic['Duration_hours'] = ev_charging_traffic['Duration_hours'].str.replace(
    ',', '.', regex=False
)

In [63]:
# Keep only the rows where neither of these two traffic counters holds the
# '-' placeholder
kroppan_ok = ev_charging_traffic['KROPPAN BRU'] != '-'
moholtlia_ok = ev_charging_traffic['MOHOLTLIA'] != '-'
ev_charging_traffic = ev_charging_traffic[kroppan_ok & moholtlia_ok]

In [64]:
# Cast every remaining column to float in one call. This yields the same
# all-float frame as the column-by-column loop, but returns a fresh DataFrame
# instead of assigning into the row-filtered frame one column at a time,
# which can raise pandas' SettingWithCopyWarning.
ev_charging_traffic = ev_charging_traffic.astype(float)

In [65]:
# Discard every row that still has at least one missing value
ev_charging_traffic = ev_charging_traffic.dropna(axis=0, how='any')

In [66]:
# Confirm that every remaining column is float64 and free of nulls
ev_charging_traffic.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6833 entries, 0 to 6877
Data columns (total 27 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   User_private                          6833 non-null   float64
 1   El_kWh                                6833 non-null   float64
 2   Duration_hours                        6833 non-null   float64
 3   month_plugin_Apr                      6833 non-null   float64
 4   month_plugin_Aug                      6833 non-null   float64
 5   month_plugin_Dec                      6833 non-null   float64
 6   month_plugin_Feb                      6833 non-null   float64
 7   month_plugin_Jan                      6833 non-null   float64
 8   month_plugin_Jul                      6833 non-null   float64
 9   month_plugin_Jun                      6833 non-null   float64
 10  month_plugin_Mar                      6833 non-null   float64
 11  month_plugin_May      

#### Train Test Split

- `X` contains only the input numerical features
- `y` contains only the target column `El_kWh`

In [67]:
# Target is the El_kWh column; the features are every other column
y = ev_charging_traffic.loc[:, 'El_kWh']
X = ev_charging_traffic.drop(columns='El_kWh')

Use `sklearn` to split `X` and `y` into training and testing datasets. The training set should use 80% of the data.

In [68]:
from sklearn.model_selection import train_test_split

In [69]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

#### Linear Regression Baseline

The linear regression will be used as a baseline to compare against the neural network we will train later.

In [70]:
from sklearn.linear_model import LinearRegression

In [71]:
# Baseline estimator with scikit-learn's default settings.
# NOTE(review): the name `model` is rebound to the PyTorch network later in
# this notebook, so the fitted baseline is not reachable after that cell runs.
model = LinearRegression()

In [72]:
# Fit the baseline on the training split only; the test split stays unseen
model.fit(X_train, y_train)

In [73]:
from sklearn.metrics import mean_squared_error

Linear regression baseline is calculated by the MSE on the testing data using `mean_squared_error` from `sklearn.metrics`.

Saving the testing MSE to the variable `test_mse` and printing it out.

In [74]:
# Score the baseline on the held-out split: print the MSE, then its square root
y_pred = model.predict(X_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(test_mse, test_mse ** 0.5, sep='\n')

131.41881633566425
11.463804618697242


The mean squared error is around `131.4`, which is expressed in squared units.
Taking the square root gives a root mean squared error of about `11.5`. One way of interpreting this is to say that the linear regression's predictions are typically off by roughly `11.5 kWh`.

####  Train a Neural Network Using PyTorch

Let's now create a neural network using PyTorch to predict EV charging loads.

In [75]:
import torch
import torch.nn as nn
import torch.optim as optim

Before training the neural network, we need to convert the training and testing sets into PyTorch tensors and specify `float` as the data type for the values.

In [76]:
# Convert the pandas splits to float32 tensors. The targets are reshaped to
# column vectors (n, 1) so they line up with the network's single output node.
X_train = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
print(X_train)

X_test = torch.tensor(X_test.to_numpy(), dtype=torch.float32)

y_train = torch.tensor(y_train.to_numpy(), dtype=torch.float32).reshape(-1, 1)

y_test = torch.tensor(y_test.to_numpy(), dtype=torch.float32).reshape(-1, 1)

tensor([[0.0000e+00, 1.9267e+00, 1.0000e+00,  ..., 5.2200e+02, 1.3300e+02,
         5.4500e+02],
        [1.0000e+00, 1.0972e+00, 0.0000e+00,  ..., 5.3500e+02, 2.1400e+02,
         7.8100e+02],
        [1.0000e+00, 1.6212e+01, 0.0000e+00,  ..., 8.1700e+02, 3.0100e+02,
         1.0730e+03],
        ...,
        [1.0000e+00, 3.7301e+01, 0.0000e+00,  ..., 3.3700e+02, 1.5200e+02,
         5.3700e+02],
        [1.0000e+00, 1.6782e+01, 0.0000e+00,  ..., 8.3000e+02, 3.3600e+02,
         1.2530e+03],
        [1.0000e+00, 1.5542e+01, 0.0000e+00,  ..., 4.0000e+02, 1.6300e+02,
         6.2900e+02]])


Next, let's use `nn.Sequential` to create a neural network.

First, we set a random seed using `torch.manual_seed(42)`.

Then, we create a sequential neural network with the following architecture:

- input layer with number of nodes equal to the number of training features
- a first hidden layer with `56` nodes and a ReLU activation
- a second hidden layer with `26` nodes and a ReLU activation
- an output layer with `1` node

Then, we save the network to the variable `model`.

In [77]:
torch.manual_seed(42)  # fix the weight initialisation so runs are repeatable

# Size the input layer from the training data instead of hard-coding 26: the
# number of feature columns depends on which month/weekday indicator columns
# the one-hot encoding produced for this dataset.
n_features = X_train.shape[1]

# Fully connected regressor: n_features -> 56 -> 26 -> 1, ReLU between layers
model = nn.Sequential(
    nn.Linear(n_features, 56),
    nn.ReLU(),
    nn.Linear(56, 26),
    nn.ReLU(),
    nn.Linear(26, 1),
)

Next, let's define the loss function and optimizer used for training:

- set the MSE loss function to the variable `loss`
- set the Adam optimizer to the variable `optimizer` with a learning rate of `0.0007`

In [78]:
# Mean-squared-error criterion and an Adam optimiser over all network parameters
loss = nn.MSELoss(reduction='mean')
optimizer = optim.Adam(params=model.parameters(), lr=7e-4)

Create a training loop to train our neural network for 4500 epochs.

Keep track of the training loss by printing out the MSE every 500 epochs.

In [79]:
num_epochs = 4500
for epoch in range(num_epochs):
    # one full-batch optimisation step
    predictions = model(X_train)        # forward pass over the whole training set
    MSE = loss(predictions, y_train)    # training loss for this pass
    MSE.backward()                      # back-propagate to get gradients
    optimizer.step()                    # apply the parameter update
    optimizer.zero_grad()               # clear gradients before the next pass

    # report the training loss on every 500th epoch (epoch is 0-based)
    if epoch % 500 == 499:
        print(f'The Mean Squared Loss is: {MSE}')

The Mean Squared Loss is: 146.82281494140625
The Mean Squared Loss is: 126.54449462890625
The Mean Squared Loss is: 116.3550796508789
The Mean Squared Loss is: 110.85167694091797
The Mean Squared Loss is: 106.99231719970703
The Mean Squared Loss is: 103.75149536132812
The Mean Squared Loss is: 103.26358032226562
The Mean Squared Loss is: 101.94781494140625
The Mean Squared Loss is: 99.43250274658203


In [80]:
# Save the neural network: writes the whole `model` object to the file 'EV_Model'
# NOTE(review): this passes the module itself rather than `model.state_dict()`.
# PyTorch's saving/loading guide recommends the state_dict form because a
# whole-object save is pickle-based and tied to the code that defined the
# model — confirm whether anything loads 'EV_Model' before changing it.
torch.save(model, 'EV_Model')

Evaluating the neural network on the testing set. 

Save the testing data loss to the variable `test_loss` and use `.item()` to extract and print out the loss. 

In [81]:
# Put the network in evaluation mode and score it on the held-out split
# without recording gradients
model.eval()
with torch.no_grad():
    predictions = model(X_test)
    test_MSE = loss(input=predictions, target=y_test)

In [83]:
# Pull the scalar loss out of the 0-d tensor with .item() so it prints as a
# plain number, in the same MSE / RMSE form as the linear-regression baseline
# (previously the raw tensors were printed and np.sqrt was applied to a tensor).
test_loss = test_MSE.item()
print(test_loss)
print(test_loss ** 0.5)

tensor(120.7677)
tensor(10.9894)


The increased training improved our test loss to about `120.8`, an improvement on the linear regression baseline of about `131.4`. So the nonlinearity introduced by the neural network actually helped us out.