In [1]:
import torch
import numpy as np
import pandas as pd

from sklearn import preprocessing

In [2]:
data = pd.read_csv('processed_data.csv')
data.head(3)

Unnamed: 0,Year,Month,Day,Max Temp (°C),Min Temp (°C),Mean Temp (°C),Dew Point Temp (°C),Rel Hum (%),Wind Dir (10s deg),Wind Spd (km/h),Berri1,Maisonneuve_2,Maisonneuve_1
0,2009.0,1.0,1.0,-14.7,-19.9,-17.3,-23.125,59.916667,28.291667,17.375,29.0,35.0,20.0
1,2009.0,1.0,2.0,-11.3,-16.2,-13.8,-17.029167,76.25,7.083333,12.25,19.0,22.0,3.0
2,2009.0,1.0,3.0,-8.1,-13.0,-10.6,-15.441667,67.958333,28.208333,22.541667,24.0,22.0,12.0


In [3]:
X = torch.Tensor(data.drop(columns=['Berri1','Maisonneuve_2','Maisonneuve_1']).to_numpy())
X = X[:, 3:]
rnd_idx = torch.randperm(X.shape[0])
X = X[rnd_idx]
y = torch.Tensor(data['Berri1'].to_numpy()).unsqueeze(1)
y = y[rnd_idx]

Create classes for each bin and one-hot encode the labels

In [4]:
bins = [0, 100, 500, 1000, 2000, 4000, 8000, 1000000]
y_bin = np.digitize(y, bins)
from sklearn.preprocessing import OneHotEncoder
one_hot = OneHotEncoder()
y_hot = torch.Tensor((one_hot.fit_transform(y_bin)).toarray())
y_hot[0:4]

tensor([[0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 1., 0.],
        [1., 0., 0., 0., 0., 0., 0.]])

Let's Split the test and train datasets

In [5]:
split = 0.08 # 8% train/test split

X_train = X[:int(X.shape[0]*(1-split))]
X_test = X[int(X.shape[0]*(1-split)):]

y_train = y_hot[:int(X.shape[0]*(1-split))]
y_test = y_hot[int(X.shape[0]*(1-split)):]
print("X:", X.shape, "y:", y.shape,
      "\nX_trn:", X_train.shape, "y_trn:", y_train.shape,
      "\nX_tst:", X_test.shape, "y_tst:", y_test.shape)


X: torch.Size([2743, 7]) y: torch.Size([2743, 1]) 
X_trn: torch.Size([2523, 7]) y_trn: torch.Size([2523, 7]) 
X_tst: torch.Size([220, 7]) y_tst: torch.Size([220, 7])


For this first model we will simply feed the data and solve a regression task.

In [None]:
class BikeModel(torch.nn.Module):
    def __init__(self,batch_first=True, linear=10, drop=0.5):
        super(BikeModel, self).__init__()
        self.linear1 = torch.nn.Linear(in_features=7, out_features=linear)
        self.linear2 = torch.nn.Linear(in_features=linear,out_features=linear)
        self.linear3 = torch.nn.Linear(in_features=linear,out_features=7)
        self.drop1 = torch.nn.Dropout(p=drop)
        self.drop2 = torch.nn.Dropout(p=drop)
    def forward(self, X):
        H = self.linear1(X)
        H = self.drop1(torch.nn.functional.relu(H))
        Z = self.linear2(H)
        Z = self.drop2(torch.nn.functional.relu(Z))
        Z = self.linear3(Z)
        return Z

In [None]:
from ray import tune

# Hyperparameters
lr = 0.01
batch_size = 1000
num_epoch = 2000


# Training Loop
def train(config):
    # Initialize the model 
    ann = BikeModel(linear=config['linear'])
    batch_size = config['batch']
    # Initialize the Loss. 
    loss = torch.nn.CrossEntropyLoss()
    train_loss = []
    test_loss = []
    # Initialize the Optimizer.
    optimizer = torch.optim.AdamW(ann.parameters(), lr=config["lr"])
    for epoch in range(num_epoch):
        for i in range(0, X_train.shape[0], batch_size):

            # Read minibatches (for both X and y)
            Xi = X_train[i:i+batch_size]
            yi = y_train[i:i+batch_size]

            # Run the RNN model
            output = ann.forward(Xi)

            #print('output',output.shape,'yi',yi.shape)
            l = loss(output,yi)
            #report for hyperparam search
            

            # Update the parameters (zero_grad, backward, optimization step)
            ann.zero_grad()
            l.backward()
            optimizer.step()

        train_loss.append(l.item())
        ann.eval()
        output = ann.forward(X_test)
        l_test = loss(output, y_test)
        test_loss.append(l_test.item())
        tune.report(l_test.item())
        # Print loss
#         if (epoch + 1) % 200 == 0:
#             print("Epoch %03d: Train_loss: %.4f " %(epoch+1, l.item()))
#             print("Epoch %03d: Test_loss: %.4f " %(epoch+1, l_test))
config = {"lr":tune.grid_search([0.005, 0.01, 0.05]),
          "batch":tune.grid_search([256, 512, 1024, 2048]),
         "linear":tune.grid_search([5,10,20,40,50])}
analysis = tune.run(train, config=config)

In [8]:
df = analysis.dataframe()
df



Unnamed: 0,_metric,time_this_iter_s,done,timesteps_total,episodes_total,training_iteration,trial_id,experiment_id,date,timestamp,...,pid,hostname,node_ip,time_since_restore,timesteps_since_restore,iterations_since_restore,config/batch,config/linear,config/lr,logdir
0,0.905208,0.018994,False,,,721,2f21d_00000,90ff82c955bc423abedf74f0e98973fd,2022-04-07_22-11-46,1649383906,...,1516,DESKTOP-IMPI4GQ,127.0.0.1,23.575493,0,721,256,5,0.005,C:\Users\samue\ray_results\train_2022-04-07_22...
1,0.945296,0.017994,False,,,933,2f21d_00001,0f10ca33ba44479b95197497ffe3941b,2022-04-07_22-11-46,1649383906,...,11348,DESKTOP-IMPI4GQ,127.0.0.1,21.656114,0,933,512,5,0.005,C:\Users\samue\ray_results\train_2022-04-07_22...
2,0.95395,0.03199,False,,,931,2f21d_00002,d5cca8fc61c64db89719416f46d1249f,2022-04-07_22-11-46,1649383906,...,5944,DESKTOP-IMPI4GQ,127.0.0.1,21.679107,0,931,1024,5,0.005,C:\Users\samue\ray_results\train_2022-04-07_22...
3,0.928964,0.011997,False,,,965,2f21d_00003,847fdcbc49d9445094e6ce98f95631a0,2022-04-07_22-11-46,1649383906,...,14064,DESKTOP-IMPI4GQ,127.0.0.1,21.518159,0,965,2048,5,0.005,C:\Users\samue\ray_results\train_2022-04-07_22...


In [None]:
import matplotlib.pyplot as plt
ax = plt.subplot((111))
plt.plot(train_loss[25:], 'green')
plt.ylabel('train', c='green')
ax2= ax.twinx()
plt.plot(test_loss[25:], 'orange')
plt.ylabel('test', c='orange');

Train Data performance

In [None]:
#from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
ann.eval()
vx_train, ix_train = torch.max(ann.forward(X_train).detach(), 1)
vy_train, iy_train = torch.max(y_train, 1)
plt.plot(ix_train[0:15])
plt.plot(iy_train[0:15])
ix_train[0:10], iy_train[0:10]

Test data performance

In [None]:
vx_test, ix_test = torch.max(ann.forward(X_test).detach(), 1)
vy_test, iy_test = torch.max(y_test, 1)
plt.plot(ix_test[0:15])
plt.plot(iy_test[0:15])

In [156]:
from sklearn.metrics import accuracy_score

print("Training accuracy: {:.2f}%".format( accuracy_score(ix_train,iy_train)*100) )
print("Testing accuracy: {:.2f}%".format( accuracy_score(ix_test,iy_test)*100) )

Training accuracy: 89.06%
Testing accuracy: 49.09%


We can verify that the propper bins were in fact by printing the first few values