## Initialize Library

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive; the dataset read
# further down lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd 'drive/MyDrive/Master’s Thesis/GenoaData/processed_data'

/content/drive/MyDrive/Kuliah/BDMA/Master’s Thesis/GenoaData/processed_data


In [None]:
!pip install wandb -qU

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━[0m [32m5.0/7.1 MB[0m [31m149.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.1/7.1 MB[0m [31m160.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m95.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/207.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/301.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━

In [None]:
import pandas as pd
import torch
import numpy as np
import math
import torch.nn.functional as F
import glob
import os

from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GroupShuffleSplit
from sklearn import metrics
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

In [None]:
import wandb

# SECURITY: never commit an API key with the notebook. The key is read from the
# WANDB_API_KEY environment variable (e.g. populated from Colab secrets); when
# it is unset, `key=None` lets wandb fall back to its own credential lookup.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Importing the dataset

In [None]:
# Read the cleaned CSV, parse the two timestamp columns, derive the calendar
# date from `t`, force the id-like columns to strings and drop the columns
# this notebook does not use.
dataset = (
    pd.read_csv('./dataset_cleaned_v5.csv')
    .assign(
        t=lambda frame: pd.to_datetime(frame['t']),
        datetime=lambda frame: pd.to_datetime(frame['datetime']),
        # evaluated after `t` has been parsed above
        date=lambda frame: frame['t'].dt.date,
        vehicle=lambda frame: frame['vehicle'].astype(str),
        road_segment_id=lambda frame: frame['road_segment_id'].astype(str),
    )
    .drop(columns=[
        'segment_id',
        'snowfall',
        'est_segment_battery_consumption(kwh)',
        'rain',
        'soil_moisture_0_to_7cm',
        'soil_temperature_0_to_7cm',
        'geometry',
        'speed_meos',
    ])
)

In [None]:
dataset = dataset.sort_values(['vehicle', 'date', 'datetime'])

# One sunshine value per (vehicle, date, datetime) key, then take the value of
# the *next* key as the "current" sunshine duration.
# The shift is done inside each (vehicle, date) group: a plain `.shift(-1)` on
# the whole series would hand the last key of one vehicle/day the first value
# of the following vehicle/day. Keys without a successor become NaN and are
# filled with 0 below.
grouped = (
    dataset.groupby(['vehicle', 'date', 'datetime'])['sunshine_duration']
    .first()
    .groupby(level=['vehicle', 'date'])
    .shift(-1)
    .rename('current_sunshine_duration')
    .reset_index()
)

# Merge the shifted sunshine duration back into the original dataset
dataset = dataset.merge(grouped, on=['vehicle', 'date', 'datetime'], how='left')

# The raw weather columns are no longer needed once the shifted value exists.
dataset = dataset.drop(columns=['sunshine_duration', 'is_day'])
dataset['current_sunshine_duration'] = dataset['current_sunshine_duration'].fillna(0)

In [None]:
# Weekday name (e.g. "Sunday") of every observation.
dataset['day_of_week'] = dataset['t'].dt.day_name()

# Hourly buckets expressed in minutes since midnight: 25 edges -> 24 intervals,
# each labelled "HH:00-HH:00".
bins = [hour * 60 for hour in range(25)]
labels = [f'{hour:02d}:00-{hour + 1:02d}:00' for hour in range(24)]

# Assign every timestamp to its left-closed hourly bucket.
dataset['time_range'] = pd.cut(
    dataset['t'].dt.hour * 60 + dataset['t'].dt.minute,
    bins=bins,
    labels=labels,
    right=False,
    include_lowest=True,
)

In [None]:
# Seconds until the next observation of the same vehicle on the same day.
# `groupby(...).shift(-1)` keeps the original row index, so the subtraction is
# index-aligned and does not depend on the frame being sorted by group or on
# how `groupby.apply` lays out its result in a given pandas version.
next_t = dataset.groupby(['vehicle', 'date'])['t'].shift(-1)
dataset['duration'] = (next_t - dataset['t']).dt.total_seconds()

# The last observation of each vehicle/day has no successor -> 0 seconds.
dataset['duration'] = dataset['duration'].fillna(0.0)

# Rows sharing one timestamp get a 0 duration except the last of them; give
# every such row the largest duration recorded for that timestamp.
max_durations = dataset.groupby(['vehicle', 'date', 't'])['duration'].transform('max')
dataset['duration'] = dataset['duration'].mask(dataset['duration'] == 0, max_durations)

In [None]:
# Flag rows whose trip id contains "stop" and keep only the remaining rows.
dataset['stop'] = dataset['trip_id'].str.contains('stop')
# `.copy()` makes the filtered frame independent of the original, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
dataset = dataset[~dataset['stop']].copy()
# Trip ids are only unique per vehicle and day; build a globally unique key.
dataset['unique_trip_id'] = dataset['vehicle'].astype(str) + '_' + dataset['date'].astype(str) + '_' + dataset['trip_id'].astype(str)

In [None]:
# Remove the helper columns that have served their purpose, then preview the frame.
dataset = dataset.drop(['datetime', 'stop', 'trip_id'], axis='columns')
dataset.head()

Unnamed: 0,vehicle,t,road_segment_id,est_battery_consumption(kwh),date,segment_length,distance_traveled,temperature_2m,relative_humidity_2m,precipitation,wind_speed_10m,wind_direction_10m,mean_elevation,avg_grade,avg_angle,current_sunshine_duration,day_of_week,time_range,duration,unique_trip_id
0,E301,2024-01-21 08:49:19,553983725,1.0,2024-01-21,14.361187,1.625764,4.683,29.58425,0.0,10.483357,15.945477,8.699299,-1.34347,-0.769706,3600.0,Sunday,08:00-09:00,30.0,E301_2024-01-21_trip_20
1,E301,2024-01-21 08:49:49,211285373,0.199855,2024-01-21,63.803733,43.126976,4.683,29.58425,0.0,10.483357,15.945477,8.5438,-0.184659,-0.105802,3600.0,Sunday,08:00-09:00,30.0,E301_2024-01-21_trip_20
2,E301,2024-01-21 08:49:49,19784065,0.227071,2024-01-21,48.896393,49.0,4.683,29.58425,0.0,10.483357,15.945477,8.391276,-0.414343,-0.237398,3600.0,Sunday,08:00-09:00,30.0,E301_2024-01-21_trip_20
3,E301,2024-01-21 08:49:49,474230989,0.064877,2024-01-21,13.874596,14.0,4.683,29.58425,0.0,10.483357,15.945477,8.100567,-1.743121,-0.998634,3600.0,Sunday,08:00-09:00,30.0,E301_2024-01-21_trip_20
4,E301,2024-01-21 08:50:19,553819742,0.508197,2024-01-21,74.361373,54.949292,4.683,29.58425,0.0,10.483357,15.945477,7.554913,-0.185431,-0.106138,3600.0,Sunday,08:00-09:00,31.0,E301_2024-01-21_trip_20


## Training the Model

### Preparing the dataset

In [None]:
class SegmentDataset(Dataset):
    """Sliding-window dataset built from per-trip rows of a DataFrame.

    Rows are grouped by ``['vehicle', 'date', 'unique_trip_id']`` and ordered
    by ``t``. For every row a window of ``sequence_length`` rows ending at that
    row is produced; windows at the start of a trip are left-padded with
    all-zero rows.

    Each item is the tuple ``(X, y, cat_X, ids)``:

    * ``X``     -- float32 tensor ``(sequence_length, len(features))``
    * ``y``     -- float32 scalar, the target of the window's last row
    * ``cat_X`` -- long tensor ``(sequence_length, len(cat_features))``
    * ``ids``   -- float32 tensor ``(len(identifiers),)`` of the last row

    All selected columns must already be numeric (categoricals label-encoded).
    NOTE(review): zero padding shares the value 0 with the first encoded
    category of every categorical feature -- confirm this is intended.
    """

    def __init__(self, df, target='avg_kwh', features=['incline', 'distance', 'time_difference'],
                 cat_features=['A', 'B'], identifiers=['id1', 'id2'], sequence_length=4):
        # The list defaults are never mutated, so sharing them between
        # instances is harmless.
        self.features = features
        self.cat_features = cat_features
        self.target = target
        self.identifiers = identifiers
        self.sequence_length = sequence_length
        self.X, self.y, self.cat_X, self.ids = self.__prepare_sequence__(
            df, features, cat_features, target, identifiers, sequence_length
        )

    def __prepare_sequence__(self, df, features, cat_features, target, identifiers, sequence_length):
        """Build the stacked window tensors ``(X, y, cat_X, ids)`` for ``df``."""
        columns = list(features) + list(cat_features) + [target] + list(identifiers)

        # Explicit column offsets. Negative offsets derived from
        # ``len(identifiers)`` break when there are no identifiers
        # (``row[-0:]`` is the whole row, not an empty slice).
        cat_start = len(features)
        target_col = cat_start + len(cat_features)
        id_start = target_col + 1

        feature_sequences = []
        cat_feature_sequences = []
        labels = []
        identifier_seqs = []

        for _, group in df.groupby(by=['vehicle', 'date', 'unique_trip_id']):
            # Stable sort: rows sharing a timestamp keep their incoming order.
            group_array = group.sort_values('t', kind='stable')[columns].to_numpy()

            for i in range(len(group_array)):
                start_index = i + 1 - sequence_length
                if start_index >= 0:
                    window = group_array[start_index:i + 1]
                else:
                    # Not enough history yet: left-pad with zero rows.
                    padding = np.zeros((-start_index, group_array.shape[1]))
                    window = np.vstack((padding, group_array[:i + 1]))

                feature_sequences.append(
                    torch.tensor(window[:, :cat_start], dtype=torch.float32))
                cat_feature_sequences.append(
                    torch.tensor(window[:, cat_start:target_col], dtype=torch.long))

                # Target and identifiers come from the window's last (current) row.
                labels.append(torch.tensor(window[-1, target_col], dtype=torch.float32))
                identifier_seqs.append(torch.tensor(window[-1, id_start:], dtype=torch.float32))

        return (torch.stack(feature_sequences), torch.stack(labels),
                torch.stack(cat_feature_sequences), torch.stack(identifier_seqs))

    def __len__(self):
        """Number of windows (one per input row)."""
        return self.X.shape[0]

    def __getitem__(self, i):
        """Return ``(X, y, cat_X, ids)`` for window ``i``."""
        return self.X[i], self.y[i], self.cat_X[i], self.ids[i]

In [None]:
def encode_data(dataset, categorical_features):
    """Label-encode the given columns of ``dataset`` in place.

    Every column named in ``categorical_features`` is overwritten with the
    integer codes produced by a freshly fitted ``LabelEncoder``.

    Returns:
        list: the fitted encoders, in the same order as ``categorical_features``.
    """
    def _fit_and_replace(column):
        # Fit on the full column and overwrite it with the integer codes.
        column_encoder = LabelEncoder()
        dataset[column] = column_encoder.fit_transform(dataset[column])
        return column_encoder

    return [_fit_and_replace(column) for column in categorical_features]


def split_data(dataset, splits):
    """Split ``dataset`` into train/validation/test frames without sharing trips.

    Args:
        dataset: DataFrame with at least ``vehicle``, ``t`` and
            ``unique_trip_id`` columns. It is not modified.
        splits: sequence whose first two entries are the train and validation
            fractions; the remainder becomes the test set. The fractions apply
            to the number of trips (groups), not rows.

    Returns:
        tuple: ``(train_df, val_df, test_df)``. Each frame is an independent
        copy, so callers may modify them in place (as ``scale_data`` does)
        without a SettingWithCopyWarning.

    Raises:
        AssertionError: if a trip ends up in more than one split.
    """
    # Work on a copy sorted by vehicle and time to maintain order
    df = dataset.copy().sort_values(['vehicle', 't'])

    train_val_fraction = splits[0] + splits[1]
    train_test_splitter = GroupShuffleSplit(n_splits=1, train_size=train_val_fraction, random_state=42)
    train_val_splitter = GroupShuffleSplit(n_splits=1, train_size=splits[0] / train_val_fraction, random_state=42)

    # Split into train + val and test
    train_val_idx, test_idx = next(train_test_splitter.split(df, groups=df['unique_trip_id']))
    train_val_df = df.iloc[train_val_idx].reset_index(drop=True)
    test_df = df.iloc[test_idx].copy()

    # Split train + val into train and val
    train_idx, val_idx = next(train_val_splitter.split(train_val_df, groups=train_val_df['unique_trip_id']))
    train_df = train_val_df.iloc[train_idx].copy()
    val_df = train_val_df.iloc[val_idx].copy()

    # Assert that trips are unique across splits
    train_trips = set(train_df['unique_trip_id'])
    val_trips = set(val_df['unique_trip_id'])
    test_trips = set(test_df['unique_trip_id'])

    assert train_trips.isdisjoint(val_trips), "Train and validation sets have overlapping trips"
    assert train_trips.isdisjoint(test_trips), "Train and test sets have overlapping trips"
    assert val_trips.isdisjoint(test_trips), "Validation and test sets have overlapping trips"

    return train_df, val_df, test_df

def scale_data(train_df, val_df, test_df, features):
    """Standardise the ``features`` columns of all three frames in place.

    The scaler is fitted on the training frame only and then applied to the
    training, test and validation frames.

    Returns:
        StandardScaler: the fitted scaler.
    """
    scaler = StandardScaler()
    scaler.fit(train_df[features])

    # Same statistics for every split; each frame is overwritten in place.
    for frame in (train_df, test_df, val_df):
        frame[features] = scaler.transform(frame[features])

    return scaler

def load_and_preprocess_data(dataset, target, features, identifiers,
                             categorical_features, non_categorical_features,
                             batch_size, sequence_length, splits):
    """Encode, split and scale ``dataset`` and wrap the result in DataLoaders.

    Side effect: ``dataset`` itself is label-encoded in place by
    ``encode_data``.

    Args:
        dataset: source DataFrame.
        target: list whose first element names the target column.
        features: numerical feature columns (scaled with ``StandardScaler``).
        identifiers: columns returned alongside each sample for traceability.
        categorical_features: columns to label-encode and feed as categoricals.
        non_categorical_features: accepted for interface compatibility; unused.
        batch_size: batch size of all three loaders.
        sequence_length: window length of each ``SegmentDataset`` sample.
        splits: train/validation/test fractions, see ``split_data``.

    Returns:
        tuple: ``(train_loader, val_loader, test_loader, test_df, encoders, scaler)``.
    """
    # Preprocess the data
    encoders = encode_data(dataset, categorical_features)
    train_df, val_df, test_df = split_data(dataset, splits)
    scaler = scale_data(train_df, val_df, test_df, features)

    torch.manual_seed(101)

    def _make_loader(frame, shuffle):
        # One place for the dataset/loader settings shared by all splits.
        segment_dataset = SegmentDataset(
            frame,
            target=target[0],
            features=features,
            cat_features=categorical_features,
            identifiers=identifiers,
            sequence_length=sequence_length,
        )
        return DataLoader(
            segment_dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            # Pinned host memory only helps (and only works warning-free) with CUDA.
            pin_memory=torch.cuda.is_available(),
        )

    train_loader = _make_loader(train_df, shuffle=True)
    # Evaluation data is iterated in a fixed order so validation is repeatable.
    val_loader = _make_loader(val_df, shuffle=False)
    test_loader = _make_loader(test_df, shuffle=False)

    return train_loader, val_loader, test_loader, test_df, encoders, scaler

### Building the model

#### Unused model

In [None]:
# def split_data(dataset, splits):
#     # Split the data based on date instead of randomizing it as it is very important for training
#     maximum_dates = dataset['date'].unique().max()
#     train_split = math.floor(splits[0] * maximum_dates)
#     val_split = train_split + math.floor(splits[1] * maximum_dates)

#     # Function to split each group
#     def split_group(group):
#         train = group[group['date'] < train_split]
#         val = group[(group['date'] >= train_split) & (group['date'] < val_split)]
#         test = group[group['date'] >= val_split]
#         return train, val, test

#     # Applying the function to each group and collecting results
#     train_frames = []
#     val_frames = []
#     test_frames = []

#     for _, group in dataset.groupby(by=['vehicle']):
#         train, val, test = split_group(group)
#         train_frames.append(train)
#         val_frames.append(val)
#         test_frames.append(test)

#     # Concatenate all the training and testing groups back into DataFrames
#     train_df = pd.concat(train_frames)
#     val_df = pd.concat(val_frames)
#     test_df = pd.concat(test_frames)

#     return train_df, val_df, test_df

# class LSTMModel(nn.Module):
#     def __init__(self, num_embeddings=2, num_lstm_layers=3, num_categories=):
#         super(LSTMModel, self).__init__()
#         self.num_embeddings = num_embeddings
#         self.lstm_hidden_size = lstm_hidden_size
#         self.num_lstm_layers = num_lstm_layers
#         self.dense_hidden_size = dense_hidden_size
#         self.dropout_prob = dropout_prob

#         self.embeddings = nn.nn.ModuleList([
#             nn.Embedding(num_categories[i], embedding_dims[i])
#             for i in range(num_embeddings)
#         ])

#         # Embedding layers for categorical features
#         self.cat1_embedding = nn.Embedding(num_cat1_categories, cat1_embedding_dim)
#         self.cat2_embedding = nn.Embedding(num_cat2_categories, cat2_embedding_dim)

#         # LSTM layers
#         input_dim = num_numerical_features + cat1_embedding_dim + cat2_embedding_dim
#         self.lstm1 = nn.LSTM(input_dim, hidden_dims[0], batch_first=True)
#         self.lstm2 = nn.LSTM(hidden_dims[0], hidden_dims[1], batch_first=True)
#         self.lstm3 = nn.LSTM(hidden_dims[1], hidden_dims[2], batch_first=True)

#         # Dropout layer after the last LSTM layer
#         self.dropout1 = nn.Dropout(dropout_prob)

#         # Fully connected layer with ReLU activation
#         self.fc1 = nn.Linear(hidden_dims[2], 64)
#         self.relu = nn.ReLU()

#         # Dropout layer before the final dense layer
#         self.dropout2 = nn.Dropout(dropout_prob)

#         # Final fully connected layer
#         self.fc2 = nn.Linear(64, output_dim)

    # def forward(self, numerical, cat1, cat2):
    #     # Embed categorical features
    #     cat1_embedded = self.cat1_embedding(cat1)
    #     cat2_embedded = self.cat2_embedding(cat2)

    #     # Concatenate numerical and embedded categorical features
    #     x = torch.cat((numerical, cat1_embedded, cat2_embedded), dim=-1)

    #     # LSTM layers
    #     lstm_out1, (hn1, cn1) = self.lstm1(x)
    #     lstm_out2, (hn2, cn2) = self.lstm2(lstm_out1)
    #     lstm_out3, (hn3, cn3) = self.lstm3(lstm_out2)

    #     # Dropout after the last LSTM layer
    #     x = self.dropout1(lstm_out3[:, -1, :])

    #     # Fully connected layer with ReLU activation
    #     x = self.relu(self.fc1(x))

    #     # Dropout before the final dense layer
    #     x = self.dropout2(x)

    #     # Final dense layer
    #     output = self.fc2(x)

    #     return output

SyntaxError: invalid syntax (<ipython-input-13-c4628c67c579>, line 2)

#### Defining the model to train

In [None]:
class LSTMModel(nn.Module):
    """Stacked-LSTM regressor over numerical features plus embedded categoricals.

    Args:
        num_features: number of numerical features per time step.
        output_dim: size of the final linear output.
        num_embeddings: number of categorical inputs (one embedding each).
        num_lstm_layers: number of stacked ``nn.LSTM`` layers.
        num_categories: vocabulary size per categorical input.
        embedding_dims: embedding width per categorical input.
        lstm_hidden_size: hidden size per LSTM layer (sequence), or one int
            used for every layer.
        dense_hidden_size: sizes of the two hidden dense layers (sequence), or
            one int used for both.
        activation: ``'relu'``, ``'leakyrelu'`` or ``'prelu'``.
        dropout_prob: dropout probability after each hidden dense layer.

    Raises:
        KeyError: if ``activation`` is not one of the supported names.
    """

    def __init__(self, num_features=3, output_dim=1, num_embeddings=2, num_lstm_layers=3,
                 num_categories=[1,2], embedding_dims=[1,2], lstm_hidden_size=3, dense_hidden_size=3,
                 activation='relu', dropout_prob=0.2):
        super(LSTMModel, self).__init__()

        # The int defaults used to be indexed like lists and crashed; broadcast
        # a scalar size to every layer instead.
        if isinstance(lstm_hidden_size, int):
            lstm_hidden_size = [lstm_hidden_size] * num_lstm_layers
        if isinstance(dense_hidden_size, int):
            dense_hidden_size = [dense_hidden_size] * 2

        self.num_embeddings = num_embeddings
        self.lstm_hidden_size = lstm_hidden_size
        self.num_lstm_layers = num_lstm_layers
        self.dense_hidden_size = dense_hidden_size
        self.dropout_prob = dropout_prob

        # Embedding layers
        self.embeddings = nn.ModuleList([
            nn.Embedding(num_categories[i], embedding_dims[i])
            for i in range(num_embeddings)
        ])

        for embedding in self.embeddings:
            nn.init.xavier_uniform_(embedding.weight)

        # LSTM layers. Only the embeddings actually created contribute to the
        # input width.
        input_dim = num_features + sum(embedding_dims[i] for i in range(num_embeddings))
        self.lstm_layers = nn.ModuleList([
            nn.LSTM(input_dim if i == 0 else lstm_hidden_size[i-1], lstm_hidden_size[i], batch_first=True)
            for i in range(num_lstm_layers)
        ])

        # Dropout layers
        self.dropout1 = nn.Dropout(dropout_prob)
        self.dropout2 = nn.Dropout(dropout_prob)

        # Fully connected layers
        self.dense1 = nn.Linear(lstm_hidden_size[-1], dense_hidden_size[0])
        self.dense2 = nn.Linear(dense_hidden_size[0], dense_hidden_size[1])
        self.dense3 = nn.Linear(dense_hidden_size[1], output_dim)

        # Stateless activations can live in a plain dict.
        self.activations = {
            'relu': F.relu,
            'leakyrelu': F.leaky_relu,
        }
        if activation == 'prelu':
            # PReLU has a learnable slope: it must be assigned as a module
            # attribute so the parameter is registered (moved by .to(), seen by
            # the optimizer, saved in state_dict). Inside a dict it is not.
            self.activation = nn.PReLU()
        else:
            self.activation = self.activations[activation]

    def forward(self, numerical, *cats):
        """Return predictions of shape ``(batch, output_dim)``.

        Args:
            numerical: float tensor ``(batch, seq_len, num_features)``.
            *cats: ``num_embeddings`` index tensors of shape ``(batch, seq_len)``.
        """
        # Embed categorical features
        embedded = [self.embeddings[i](cats[i]) for i in range(self.num_embeddings)]

        # Concatenate numerical and embedded categorical features
        x = torch.cat((numerical, *embedded), dim=-1)

        # LSTM layers (hidden/cell states are discarded)
        for i in range(self.num_lstm_layers):
            x, _ = self.lstm_layers[i](x)

        # Keep only the output of the last time step
        x = x[:, -1, :]

        # First hidden dense layer
        x = self.activation(self.dense1(x))
        x = self.dropout1(x)

        # Second hidden dense layer
        x = self.activation(self.dense2(x))
        x = self.dropout2(x)

        # Output layer (no activation)
        output = self.dense3(x)

        return output

#### Function for training the model

In [None]:
def print_embedding_weights(model):
    """Print the weight data of every layer in ``model.embeddings``, in order."""
    layer_index = 0
    for layer in model.embeddings:
        print(f'Embedding Layer {layer_index} Weights:')
        print(layer.weight.data)
        layer_index += 1

def train_one_epoch(model, optimizer, loss_fn, train_loader, epoch_index, tb_writer, device=None):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: network called as ``model(numerical, *categorical_inputs)``.
        optimizer: optimizer stepping ``model``'s parameters.
        loss_fn: loss called as ``loss_fn(outputs, targets[:, None])``.
        train_loader: iterable with ``len()`` yielding
            ``(numerical, target, categorical, identifiers)`` batches, where
            ``categorical`` is indexed as ``[:, :, k]``.
        epoch_index: zero-based epoch number (used for the TensorBoard step).
        tb_writer: object exposing ``add_scalar(tag, value, step)``.
        device: device the batches are moved to. Defaults to the device of the
            model's parameters, so no module-level ``device`` is required.

    Returns:
        float: mean of the per-batch losses of this epoch.

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    """
    if device is None:
        first_parameter = next(model.parameters(), None)
        device = first_parameter.device if first_parameter is not None else torch.device('cpu')

    # Number of categorical inputs the model consumes (two when unspecified).
    num_categorical = getattr(model, 'num_embeddings', 2)

    model.train()
    running_loss = 0.0
    last_batch_index = -1

    for last_batch_index, (numerical_batch, target_batch, cat_batch, _) in enumerate(train_loader):
        numerical_batch = numerical_batch.to(device)
        target_batch = target_batch.to(device)
        cat_batch = cat_batch.to(device)
        # One (batch, seq_len) index tensor per embedding.
        categorical_inputs = [cat_batch[:, :, k] for k in range(num_categorical)]

        optimizer.zero_grad()

        outputs = model(numerical_batch, *categorical_inputs)
        loss = loss_fn(outputs, target_batch.unsqueeze(1))

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    if last_batch_index < 0:
        raise ValueError("train_loader yielded no batches")

    last_loss = running_loss / len(train_loader)
    # Step = global index of the last batch processed in this epoch.
    tb_writer.add_scalar('Loss/train', last_loss, epoch_index * len(train_loader) + last_batch_index)

    wandb.log({"training loss": last_loss})

    return last_loss


def validate_model(model, val_loader, loss_fn, device, epoch_index, tb_writer):
    """Compute the mean per-batch loss on ``val_loader`` without gradients.

    Args:
        model: network called as ``model(numerical, *categorical_inputs)``.
        val_loader: iterable with ``len()`` yielding
            ``(numerical, target, categorical, identifiers)`` batches.
        loss_fn: loss called as ``loss_fn(outputs, targets[:, None])``.
        device: device the batches are moved to.
        epoch_index: zero-based epoch; the value is logged at step ``epoch_index + 1``.
        tb_writer: object exposing ``add_scalar(tag, value, step)``.

    Returns:
        float: mean of the per-batch validation losses.
    """
    # Feed as many categorical inputs as the model has embeddings (two when the
    # model does not say), matching the model's ``forward(numerical, *cats)``.
    num_categorical = getattr(model, 'num_embeddings', 2)

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for numerical_batch, target_batch, cat_batch, _ in val_loader:
            numerical_batch = numerical_batch.to(device)
            target_batch = target_batch.to(device)
            cat_batch = cat_batch.to(device)
            categorical_inputs = [cat_batch[:, :, k] for k in range(num_categorical)]

            outputs = model(numerical_batch, *categorical_inputs)
            loss = loss_fn(outputs, target_batch.unsqueeze(1))

            val_loss += loss.item()

    val_loss /= len(val_loader)
    tb_writer.add_scalar('Loss/valid', val_loss, epoch_index + 1)
    return val_loss

def evaluate_model(model, test_loader, loss_fn, device):
    """Print the mean per-batch loss of ``model`` on ``test_loader``.

    Args:
        model: network called as ``model(numerical, *categorical_inputs)``.
        test_loader: iterable with ``len()`` yielding
            ``(numerical, target, categorical, identifiers)`` batches.
        loss_fn: loss called as ``loss_fn(outputs, targets[:, None])``.
        device: device the batches are moved to.

    The result is printed as ``Test Loss: <value>``; nothing is returned.
    """
    # Feed as many categorical inputs as the model has embeddings (two when the
    # model does not say), matching the model's ``forward(numerical, *cats)``.
    num_categorical = getattr(model, 'num_embeddings', 2)

    model.eval()
    test_loss = 0
    with torch.no_grad():
        for numerical_batch, target_batch, cat_batch, _ in test_loader:
            numerical_batch = numerical_batch.to(device)
            target_batch = target_batch.to(device)
            cat_batch = cat_batch.to(device)
            categorical_inputs = [cat_batch[:, :, k] for k in range(num_categorical)]

            outputs = model(numerical_batch, *categorical_inputs)
            loss = loss_fn(outputs, target_batch.unsqueeze(1))

            test_loss += loss.item()

    test_loss /= len(test_loader)
    print(f'Test Loss: {test_loss}')


def build_optimizer(network, optimizer, learning_rate):
    """Create the optimizer named ``optimizer`` for ``network``'s parameters.

    Args:
        network: module whose ``parameters()`` are optimised.
        optimizer: one of ``"sgd"`` (momentum 0.9), ``"adam"``, ``"rmsprop"``
            or ``"adagrad"``; matching is case-insensitive.
        learning_rate: learning rate passed to the optimizer.

    Returns:
        torch.optim.Optimizer: the configured optimizer.

    Raises:
        ValueError: for an unknown name. Previously the name string itself was
            returned, which only failed later at ``optimizer.zero_grad()``.
    """
    factories = {
        "sgd": lambda params: optim.SGD(params, lr=learning_rate, momentum=0.9),
        "adam": lambda params: optim.Adam(params, lr=learning_rate),
        "rmsprop": lambda params: optim.RMSprop(params, lr=learning_rate),
        "adagrad": lambda params: optim.Adagrad(params, lr=learning_rate),
    }
    name = str(optimizer).lower()
    if name not in factories:
        raise ValueError(
            f"Unknown optimizer {optimizer!r}; expected one of {sorted(factories)}"
        )
    return factories[name](network.parameters())

def prepare_model(num_features, output_dim, num_embeddings,
                  num_lstm_layers, num_categories, embedding_dims,
                  lstm_hidden_size, dense_hidden_size, activation, dropout_prob,
                  optimizer="Adam", learning_rate=0.0001, device=None):
    """Build the LSTM model together with its loss function and optimizer.

    The model arguments are forwarded unchanged to ``LSTMModel``.

    Args:
        optimizer: optimizer name understood by ``build_optimizer``; it is
            lower-cased first, so the default ``"Adam"`` selects Adam.
        learning_rate: learning rate for the optimizer.
        device: device to move the model to; the model stays where it was
            created when this is ``None``.

    Returns:
        tuple: ``(model, loss_fn, optimizer)`` where ``loss_fn`` is ``nn.MSELoss()``.
    """
    # Initialize model, loss function, and optimizer
    model = LSTMModel(
        num_features=num_features,
        output_dim=output_dim,
        num_embeddings=num_embeddings,
        num_lstm_layers=num_lstm_layers,
        num_categories=num_categories,
        embedding_dims=embedding_dims,
        lstm_hidden_size=lstm_hidden_size,
        dense_hidden_size=dense_hidden_size,
        activation=activation,
        dropout_prob=dropout_prob
    )
    if device is not None:
        model.to(device)

    loss_fn = nn.MSELoss()

    # build_optimizer compares against lowercase names.
    optimizer_name = optimizer.lower() if isinstance(optimizer, str) else optimizer
    optimizer = build_optimizer(model, optimizer_name, learning_rate)

    return model, loss_fn, optimizer

#### Defining train routines

In [None]:
class EarlyStopper:
    """Signal when the validation loss has stopped improving.

    A call to ``early_stop`` counts as an improvement only when
    ``validation_loss + min_delta`` is strictly below the best loss seen so
    far. Any other value -- including a tie or a NaN loss -- counts as "no
    improvement".

    Args:
        patience: number of consecutive non-improving calls after which
            ``early_stop`` returns True.
        min_delta: minimum decrease required to count as an improvement.
    """

    def __init__(self, patience=1, min_delta=0.0):
        self.patience = patience  # non-improving calls tolerated before stopping
        self.min_delta = min_delta  # the minimum change to be counted as improvement
        self.counter = 0  # consecutive calls without sufficient improvement
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss):
        """Record ``validation_loss``; return True when training should stop."""
        if (validation_loss + self.min_delta) < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0  # reset the counter if validation loss decreased at least by min_delta
            return False

        # Plain "else" on purpose: comparisons with NaN are always False, so an
        # explicit ">" test would never count a diverged (NaN) loss.
        self.counter += 1
        return self.counter >= self.patience


def train(loaders=None, config=None, device=None, encoders=None, scaler=None):
    """Train the LSTM model inside a Weights & Biases run.

    Args:
        loaders: ``(train_loader, val_loader, test_loader)``.
        config: hyperparameter dict handed to ``wandb.init``; inside the run
            ``wandb.config`` is used (it is set by the sweep controller when
            called from a sweep agent).
        device: device used for the model and the batches.
        encoders: fitted label encoders, returned unchanged. Defaults to the
            module-level ``encoders`` if one exists, else ``None``.
        scaler: fitted feature scaler, returned unchanged. Defaults to the
            module-level ``scaler`` if one exists, else ``None``.

    Returns:
        tuple: ``(model, loss_fn, test_loader, epoch, encoders, scaler)`` where
        ``epoch`` is the index of the last epoch run (``-1`` if none ran).
        NOTE(review): ``model`` holds the weights of the last epoch; the best
        validation weights are only in the saved checkpoint file.
    """
    experiment_name = "battery-prediction-lstm-test"
    print(config)

    # These used to be read from hidden notebook globals at the very end, so a
    # missing global only surfaced as a NameError after training had finished.
    if encoders is None:
        encoders = globals().get('encoders')
    if scaler is None:
        scaler = globals().get('scaler')

    # Initialize a new wandb run
    with wandb.init(config=config, project="battery-consumption", name=experiment_name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config

        # Load data
        train_loader, val_loader, test_loader = loaders

        # load model
        model, loss_fn, optimizer = prepare_model(config.num_features, config.output_dim, config.num_embeddings,
                                                   config.num_lstm_layers, config.num_categories, config.embedding_dims,
                                                   config.lstm_hidden_size, config.dense_hidden_size, config.activation, config.dropout_prob,
                                                   config.optimizer, config.learning_rate, device)

        print(model)
        tb_writer = SummaryWriter()
        best_vloss = float('inf')
        epoch = -1  # stays -1 when config.epochs == 0

        early_stopper = EarlyStopper(patience=10, min_delta=0.0005)
        try:
            for epoch in tqdm(range(config.epochs)):
                train_one_epoch(model, optimizer, loss_fn, train_loader, epoch, tb_writer)
                avg_vloss = validate_model(model, val_loader, loss_fn, device, epoch, tb_writer)

                print(f'Epoch: {epoch}, avg_vloss: {avg_vloss},  best_vloss: {best_vloss}')
                if avg_vloss < best_vloss:
                    # Checkpoint whenever the validation loss reaches a new minimum.
                    best_vloss = avg_vloss
                    torch.save(model.state_dict(), 'model_battery_best_1.6_embd.pth')
                wandb.log({"loss": avg_vloss, "epoch": epoch, "best_loss": best_vloss})

                if early_stopper.early_stop(avg_vloss):
                    break
        finally:
            # Flush and release the TensorBoard writer even if an epoch raises.
            tb_writer.close()

        return model, loss_fn, test_loader, epoch, encoders, scaler

def sweep_train(config=None):
    """Entry point for a W&B sweep agent: run one training with ``config``.

    Uses the notebook-level ``train_loader``, ``val_loader``, ``test_loader``
    and ``device``, which must have been created before the agent starts.
    The previous body called ``train(dataset=...)`` -- a keyword ``train`` does
    not accept -- and unpacked three of its six return values.

    NOTE(review): the loaders are pre-built, so a ``batch_size`` value in the
    sweep configuration has no effect here.
    """
    train(loaders=(train_loader, val_loader, test_loader), config=config, device=device)

#### Preparing the training

In [None]:
def get_embedding_size(num_categories):
    """Return the embedding width for a categorical feature.

    The width is the fourth root of ``num_categories`` rounded to the nearest
    integer.
    """
    # Alternative rules that were tried and set aside:
    #   min(600, round(1.6 * math.sqrt(num_categories)))
    #   min(500, num_categories // 2)
    fourth_root = num_categories ** 0.25
    return round(fourth_root)

categorical_features = ['day_of_week', 'time_range', 'vehicle', 'road_segment_id', 'date', 'unique_trip_id']
non_categorical_features = dataset.columns.difference(categorical_features).tolist()
identifiers = ['vehicle', 'road_segment_id', 'date', 'unique_trip_id']
time = ['t']
target = ['est_battery_consumption(kwh)']

# Numerical model inputs: every column that is not a target, identifier,
# categorical or timestamp. The DataFrame's column order is kept on purpose:
# building this list from a `set` gives a different order in every kernel
# session, which silently changes the meaning of each input position for the
# scaler and for any checkpoint saved earlier.
excluded_columns = set(target) | set(identifiers) | set(categorical_features) | set(time)
features = [column for column in dataset.columns if column not in excluded_columns]

# Single source of truth for the LSTM stack (one entry per layer).
lstm_hidden_size = [128, 128, 64]

config = {
    # Hyperparameters
    "epochs": 30,
    "batch_size": 64,
    "sequence_length": 4,

    "splits": [0.5, 0.2, 0.3], # train, val, test

    "num_features": len(features),
    "output_dim": 1,
    "num_embeddings": 2,
    "num_lstm_layers": len(lstm_hidden_size),
    "num_categories": [len(dataset['day_of_week'].unique()), len(dataset['time_range'].unique())],
    "embedding_dims": [get_embedding_size(len(dataset['day_of_week'].unique())),  get_embedding_size(len(dataset['time_range'].unique()))],
    "lstm_hidden_size": lstm_hidden_size,
    "dense_hidden_size": [64, 32],
    "activation": 'leakyrelu',
    "dropout_prob": 0.2,
    "optimizer": 'adam',
    "learning_rate": 0.001,
}

# Train on the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Load the data

In [None]:
# Encode, split and scale the dataset, and wrap each split in a DataLoader.
train_loader, val_loader, test_loader, test_df, encoders, scaler = load_and_preprocess_data(
    dataset=dataset,
    target=target,
    features=features,
    identifiers=identifiers,
    categorical_features=categorical_features,
    non_categorical_features=non_categorical_features,
    batch_size=config['batch_size'],
    sequence_length=config['sequence_length'],
    splits=config['splits'],
)

In [None]:
# Show the column groups used to build the datasets, one list per line.
print(features, categorical_features, non_categorical_features, identifiers, target, sep='\n')

['wind_speed_10m', 'relative_humidity_2m', 'distance_traveled', 'wind_direction_10m', 'current_sunshine_duration', 'temperature_2m', 'avg_angle', 'precipitation', 'avg_grade', 'mean_elevation', 'segment_length', 'duration']
['day_of_week', 'time_range', 'vehicle', 'road_segment_id', 'date', 'unique_trip_id']
['avg_angle', 'avg_grade', 'current_sunshine_duration', 'distance_traveled', 'duration', 'est_battery_consumption(kwh)', 'mean_elevation', 'precipitation', 'relative_humidity_2m', 'segment_length', 't', 'temperature_2m', 'wind_direction_10m', 'wind_speed_10m']
['vehicle', 'road_segment_id', 'date', 'unique_trip_id']
['est_battery_consumption(kwh)']


In [None]:
# Number of windows in the train, test and validation datasets (in that order).
tuple(len(loader.dataset) for loader in (train_loader, test_loader, val_loader))

(122812, 75281, 49216)

### Run the training

In [None]:
# Train with the configuration defined above.
model, loss_fn, test_loader, epoch, encoders, scaler = train(
    loaders=(train_loader, val_loader, test_loader),
    config=config,
    device=device,
)

# Evaluate the model
evaluate_model(model=model, test_loader=test_loader, loss_fn=loss_fn, device=device)

[34m[1mwandb[0m: Currently logged in as: [33msatriabw[0m. Use [1m`wandb login --relogin`[0m to force relogin


{'epochs': 30, 'batch_size': 64, 'sequence_length': 4, 'splits': [0.5, 0.2, 0.3], 'num_features': 12, 'output_dim': 1, 'num_embeddings': 2, 'num_lstm_layers': 3, 'num_categories': [7, 19], 'embedding_dims': [2, 2], 'lstm_hidden_size': [128, 128, 64], 'dense_hidden_size': [64, 32], 'activation': 'leakyrelu', 'dropout_prob': 0.2, 'optimizer': 'adam', 'learning_rate': 0.001}


LSTMModel(
  (embeddings): ModuleList(
    (0): Embedding(7, 2)
    (1): Embedding(19, 2)
  )
  (lstm_layers): ModuleList(
    (0): LSTM(16, 128, batch_first=True)
    (1): LSTM(128, 128, batch_first=True)
    (2): LSTM(128, 64, batch_first=True)
  )
  (dropout1): Dropout(p=0.2, inplace=False)
  (dropout2): Dropout(p=0.2, inplace=False)
  (dense1): Linear(in_features=64, out_features=64, bias=True)
  (dense2): Linear(in_features=64, out_features=32, bias=True)
  (dense3): Linear(in_features=32, out_features=1, bias=True)
)


  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 0, avg_vloss: 0.010922270404024749,  best_vloss: inf


  7%|▋         | 2/30 [00:22<05:15, 11.26s/it]

Epoch: 1, avg_vloss: 0.009808952399373287,  best_vloss: 0.010922270404024749


 10%|█         | 3/30 [00:33<04:51, 10.80s/it]

Epoch: 2, avg_vloss: 0.009949510286547612,  best_vloss: 0.009808952399373287


 13%|█▎        | 4/30 [00:43<04:35, 10.59s/it]

Epoch: 3, avg_vloss: 0.009700793984875335,  best_vloss: 0.009808952399373287


 17%|█▋        | 5/30 [00:53<04:21, 10.44s/it]

Epoch: 4, avg_vloss: 0.00879661481335798,  best_vloss: 0.009700793984875335


 20%|██        | 6/30 [01:03<04:08, 10.37s/it]

Epoch: 5, avg_vloss: 0.00865361093521065,  best_vloss: 0.00879661481335798


 23%|██▎       | 7/30 [01:14<03:57, 10.35s/it]

Epoch: 6, avg_vloss: 0.008518985790528734,  best_vloss: 0.00865361093521065


 27%|██▋       | 8/30 [01:24<03:48, 10.37s/it]

Epoch: 7, avg_vloss: 0.008134531905397443,  best_vloss: 0.008518985790528734


 30%|███       | 9/30 [01:34<03:35, 10.28s/it]

Epoch: 8, avg_vloss: 0.008528379226128858,  best_vloss: 0.008134531905397443


 33%|███▎      | 10/30 [01:44<03:25, 10.28s/it]

Epoch: 9, avg_vloss: 0.008149416418746114,  best_vloss: 0.008134531905397443


 37%|███▋      | 11/30 [01:54<03:14, 10.23s/it]

Epoch: 10, avg_vloss: 0.008406488761525015,  best_vloss: 0.008134531905397443


 40%|████      | 12/30 [02:05<03:03, 10.21s/it]

Epoch: 11, avg_vloss: 0.00792752801648357,  best_vloss: 0.008134531905397443


 43%|████▎     | 13/30 [02:15<02:53, 10.22s/it]

Epoch: 12, avg_vloss: 0.008283328237096805,  best_vloss: 0.00792752801648357


 47%|████▋     | 14/30 [02:25<02:44, 10.26s/it]

Epoch: 13, avg_vloss: 0.00840820055118894,  best_vloss: 0.00792752801648357


 50%|█████     | 15/30 [02:36<02:34, 10.30s/it]

Epoch: 14, avg_vloss: 0.008177312033617885,  best_vloss: 0.00792752801648357


 53%|█████▎    | 16/30 [02:46<02:23, 10.28s/it]

Epoch: 15, avg_vloss: 0.007975455442708718,  best_vloss: 0.00792752801648357


 57%|█████▋    | 17/30 [02:56<02:13, 10.29s/it]

Epoch: 16, avg_vloss: 0.008199394688583057,  best_vloss: 0.00792752801648357


 57%|█████▋    | 17/30 [03:06<02:22, 10.99s/it]

Epoch: 17, avg_vloss: 0.007995681769683259,  best_vloss: 0.00792752801648357





VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
best_loss,█▅▅▅▃▃▂▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
loss,█▅▆▅▃▃▂▁▂▂▂▁▂▂▂▁▂▁
training loss,█▅▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁

0,1
best_loss,0.00793
epoch,17.0
loss,0.008
training loss,0.00631


Test Loss: 0.00809438745443236


### Run sweep config

In [None]:
import pprint

# Columns that are not counted as numerical inputs: the
# 'est_battery_consumption(kwh)' column (presumably the regression target --
# confirm), the identifier columns, the two categorical columns that get
# embeddings, and the timestamp.
non_feature_columns = {
    'est_battery_consumption(kwh)',
    'vehicle', 'road_segment_id', 'date',
    'day_of_week', 'time_range',
    't',
}
num_numeric_features = len(set(dataset.columns) - non_feature_columns)

# Number of distinct values in each embedded categorical column.
num_day_values = len(dataset['day_of_week'].unique())
num_time_range_values = len(dataset['time_range'].unique())

# Metric the sweep optimizes.
metric = dict(name='loss', goal='minimize')

# Hyperparameters: 'values' lists are swept by the grid, 'value' entries are
# fixed constants. Only the learning rate has more than one candidate here.
parameters_dict = {
    'optimizer': {'values': ['adam']},
    'lstm_hidden_size': {'values': [[128, 128, 64]]},
    'num_lstm_layers': {'values': [3]},
    'dense_hidden_size': {'values': [[64, 32]]},
    'dropout_prob': {'values': [0.2]},
    'batch_size': {'values': [64]},
    'activation': {'values': ['leakyrelu']},
    'learning_rate': {'values': [0.005, 0.001, 0.0005]},
    'num_features': {'value': num_numeric_features},
    'output_dim': {'value': 1},
    'num_embeddings': {'value': 2},
    'num_categories': {'value': [num_day_values, num_time_range_values]},
    'embedding_dims': {
        'value': [
            get_embedding_size(num_day_values),
            get_embedding_size(num_time_range_values),
        ]
    },
    'epochs': {'value': 10},
}

# Full grid-search sweep definition.
sweep_config = {
    'method': 'grid',
    'metric': metric,
    'parameters': parameters_dict,
}

# Register the sweep with W&B, then echo the configuration for the record.
sweep_id = wandb.sweep(sweep_config, project="battery-sweeps-v2")
pprint.pprint(sweep_config)

Create sweep with ID: sw9l2g1f
Sweep URL: https://wandb.ai/satriabw/battery-sweeps-v2/sweeps/sw9l2g1f
{'method': 'grid',
 'metric': {'goal': 'minimize', 'name': 'loss'},
 'parameters': {'activation': {'values': ['leakyrelu']},
                'batch_size': {'values': [64]},
                'dense_hidden_size': {'values': [[64, 32]]},
                'dropout_prob': {'values': [0.2]},
                'embedding_dims': {'value': [2, 2]},
                'epochs': {'value': 10},
                'learning_rate': {'values': [0.005, 0.001, 0.0005]},
                'lstm_hidden_size': {'values': [[128, 128, 64]]},
                'num_categories': {'value': [7, 19]},
                'num_embeddings': {'value': 2},
                'num_features': {'value': 12},
                'num_lstm_layers': {'values': [3]},
                'optimizer': {'values': ['adam']},
                'output_dim': {'value': 1}}}


In [None]:
def sweep_train(config=None):
    """Run one training trial for the W&B sweep agent.

    Selects CUDA when it is available (CPU otherwise) and delegates to
    ``train`` with the notebook-level train/validation/test loaders.

    Args:
        config: Forwarded to ``train`` unchanged; defaults to ``None``.
    """
    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
    loaders = (train_loader, val_loader, test_loader)
    train(loaders, config, torch.device(backend))

In [None]:
# Launch the sweep agent; it calls `sweep_train` once per configuration.
wandb.agent(sweep_id, function=sweep_train)

[34m[1mwandb[0m: Agent Starting Run: 5oh4am00 with config:
[34m[1mwandb[0m: 	activation: leakyrelu
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dense_hidden_size: [64, 32]
[34m[1mwandb[0m: 	dropout_prob: 0.2
[34m[1mwandb[0m: 	embedding_dims: [2, 2]
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 0.005
[34m[1mwandb[0m: 	lstm_hidden_size: [128, 128, 64]
[34m[1mwandb[0m: 	num_categories: [7, 19]
[34m[1mwandb[0m: 	num_embeddings: 2
[34m[1mwandb[0m: 	num_features: 12
[34m[1mwandb[0m: 	num_lstm_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	output_dim: 1


None




LSTMModel(
  (embeddings): ModuleList(
    (0): Embedding(7, 2)
    (1): Embedding(19, 2)
  )
  (lstm_layers): ModuleList(
    (0): LSTM(16, 128, batch_first=True)
    (1): LSTM(128, 128, batch_first=True)
    (2): LSTM(128, 64, batch_first=True)
  )
  (dropout1): Dropout(p=0.2, inplace=False)
  (dropout2): Dropout(p=0.2, inplace=False)
  (dense1): Linear(in_features=64, out_features=64, bias=True)
  (dense2): Linear(in_features=64, out_features=32, bias=True)
  (dense3): Linear(in_features=32, out_features=1, bias=True)
)


 10%|█         | 1/10 [00:20<03:02, 20.24s/it]

Epoch: 0, avg_vloss: 0.011629288345816531,  best_vloss: inf


 20%|██        | 2/10 [00:40<02:40, 20.08s/it]

Epoch: 1, avg_vloss: 0.009960436930040185,  best_vloss: 0.011629288345816531


 30%|███       | 3/10 [01:00<02:20, 20.09s/it]

Epoch: 2, avg_vloss: 0.009654466491527304,  best_vloss: 0.009960436930040185


 40%|████      | 4/10 [01:20<02:00, 20.09s/it]

Epoch: 3, avg_vloss: 0.008984821127067038,  best_vloss: 0.009654466491527304


 50%|█████     | 5/10 [01:40<01:40, 20.08s/it]

Epoch: 4, avg_vloss: 0.00837535418983033,  best_vloss: 0.008984821127067038


 60%|██████    | 6/10 [02:00<01:20, 20.07s/it]

Epoch: 5, avg_vloss: 0.00861619246035005,  best_vloss: 0.00837535418983033


 70%|███████   | 7/10 [02:20<01:00, 20.03s/it]

Epoch: 6, avg_vloss: 0.008290463427279094,  best_vloss: 0.00837535418983033


 80%|████████  | 8/10 [02:40<00:39, 19.97s/it]

Epoch: 7, avg_vloss: 0.008417421531761124,  best_vloss: 0.008290463427279094


 90%|█████████ | 9/10 [03:00<00:19, 19.93s/it]

Epoch: 8, avg_vloss: 0.007971320492961202,  best_vloss: 0.008290463427279094


100%|██████████| 10/10 [03:20<00:00, 20.02s/it]

Epoch: 9, avg_vloss: 0.008040569332304232,  best_vloss: 0.007971320492961202





VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
best_loss,█▅▄▃▂▂▂▂▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▅▄▃▂▂▂▂▁▁
training loss,█▄▃▃▂▂▁▁▁▁

0,1
best_loss,0.00797
epoch,9.0
loss,0.00804
training loss,0.00981


[34m[1mwandb[0m: Agent Starting Run: 07vl2oz4 with config:
[34m[1mwandb[0m: 	activation: leakyrelu
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dense_hidden_size: [64, 32]
[34m[1mwandb[0m: 	dropout_prob: 0.2
[34m[1mwandb[0m: 	embedding_dims: [2, 2]
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	lstm_hidden_size: [128, 128, 64]
[34m[1mwandb[0m: 	num_categories: [7, 19]
[34m[1mwandb[0m: 	num_embeddings: 2
[34m[1mwandb[0m: 	num_features: 12
[34m[1mwandb[0m: 	num_lstm_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	output_dim: 1


None




LSTMModel(
  (embeddings): ModuleList(
    (0): Embedding(7, 2)
    (1): Embedding(19, 2)
  )
  (lstm_layers): ModuleList(
    (0): LSTM(16, 128, batch_first=True)
    (1): LSTM(128, 128, batch_first=True)
    (2): LSTM(128, 64, batch_first=True)
  )
  (dropout1): Dropout(p=0.2, inplace=False)
  (dropout2): Dropout(p=0.2, inplace=False)
  (dense1): Linear(in_features=64, out_features=64, bias=True)
  (dense2): Linear(in_features=64, out_features=32, bias=True)
  (dense3): Linear(in_features=32, out_features=1, bias=True)
)


 10%|█         | 1/10 [00:20<03:00, 20.01s/it]

Epoch: 0, avg_vloss: 0.010314659393578571,  best_vloss: inf


 20%|██        | 2/10 [00:40<02:40, 20.02s/it]

Epoch: 1, avg_vloss: 0.00950981193208815,  best_vloss: 0.010314659393578571


 30%|███       | 3/10 [00:59<02:19, 19.93s/it]

Epoch: 2, avg_vloss: 0.009038012663673326,  best_vloss: 0.00950981193208815


 40%|████      | 4/10 [01:19<01:59, 19.88s/it]

Epoch: 3, avg_vloss: 0.008463051427061222,  best_vloss: 0.009038012663673326


 50%|█████     | 5/10 [01:39<01:39, 19.81s/it]

Epoch: 4, avg_vloss: 0.008588059801887917,  best_vloss: 0.008463051427061222


 60%|██████    | 6/10 [01:59<01:19, 19.78s/it]

Epoch: 5, avg_vloss: 0.008011181548695094,  best_vloss: 0.008463051427061222


 70%|███████   | 7/10 [02:18<00:59, 19.75s/it]

Epoch: 6, avg_vloss: 0.007843138626432065,  best_vloss: 0.008011181548695094


 80%|████████  | 8/10 [02:38<00:39, 19.80s/it]

Epoch: 7, avg_vloss: 0.0076007639049566166,  best_vloss: 0.007843138626432065


 90%|█████████ | 9/10 [02:58<00:19, 19.88s/it]

Epoch: 8, avg_vloss: 0.00767553244209425,  best_vloss: 0.0076007639049566166


100%|██████████| 10/10 [03:18<00:00, 19.86s/it]

Epoch: 9, avg_vloss: 0.007696353205803085,  best_vloss: 0.0076007639049566166





VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
best_loss,█▆▅▃▃▂▂▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▆▅▃▄▂▂▁▁▁
training loss,█▅▄▃▃▂▂▂▁▁

0,1
best_loss,0.0076
epoch,9.0
loss,0.0077
training loss,0.00817


[34m[1mwandb[0m: Agent Starting Run: nx0zjoi0 with config:
[34m[1mwandb[0m: 	activation: leakyrelu
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dense_hidden_size: [64, 32]
[34m[1mwandb[0m: 	dropout_prob: 0.2
[34m[1mwandb[0m: 	embedding_dims: [2, 2]
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	lstm_hidden_size: [128, 128, 64]
[34m[1mwandb[0m: 	num_categories: [7, 19]
[34m[1mwandb[0m: 	num_embeddings: 2
[34m[1mwandb[0m: 	num_features: 12
[34m[1mwandb[0m: 	num_lstm_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	output_dim: 1


None




LSTMModel(
  (embeddings): ModuleList(
    (0): Embedding(7, 2)
    (1): Embedding(19, 2)
  )
  (lstm_layers): ModuleList(
    (0): LSTM(16, 128, batch_first=True)
    (1): LSTM(128, 128, batch_first=True)
    (2): LSTM(128, 64, batch_first=True)
  )
  (dropout1): Dropout(p=0.2, inplace=False)
  (dropout2): Dropout(p=0.2, inplace=False)
  (dense1): Linear(in_features=64, out_features=64, bias=True)
  (dense2): Linear(in_features=64, out_features=32, bias=True)
  (dense3): Linear(in_features=32, out_features=1, bias=True)
)


 10%|█         | 1/10 [00:19<02:59, 19.91s/it]

Epoch: 0, avg_vloss: 0.01137790287009009,  best_vloss: inf


 20%|██        | 2/10 [00:39<02:38, 19.85s/it]

Epoch: 1, avg_vloss: 0.010501276465966198,  best_vloss: 0.01137790287009009


 30%|███       | 3/10 [00:59<02:19, 19.89s/it]

Epoch: 2, avg_vloss: 0.009460076285589163,  best_vloss: 0.010501276465966198


 40%|████      | 4/10 [01:19<01:58, 19.81s/it]

Epoch: 3, avg_vloss: 0.010037745274657513,  best_vloss: 0.009460076285589163


 50%|█████     | 5/10 [01:39<01:39, 19.88s/it]

Epoch: 4, avg_vloss: 0.009287769618630213,  best_vloss: 0.009460076285589163


 60%|██████    | 6/10 [01:59<01:19, 19.91s/it]

Epoch: 5, avg_vloss: 0.008425617751940577,  best_vloss: 0.009287769618630213


 70%|███████   | 7/10 [02:19<00:59, 19.90s/it]

Epoch: 6, avg_vloss: 0.008107148861043927,  best_vloss: 0.008425617751940577


 80%|████████  | 8/10 [02:39<00:39, 19.93s/it]

Epoch: 7, avg_vloss: 0.008539308427799056,  best_vloss: 0.008107148861043927


 90%|█████████ | 9/10 [02:59<00:19, 19.94s/it]

Epoch: 8, avg_vloss: 0.008068957718248709,  best_vloss: 0.008107148861043927


100%|██████████| 10/10 [03:19<00:00, 19.90s/it]

Epoch: 9, avg_vloss: 0.007876123870532452,  best_vloss: 0.008068957718248709





VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
best_loss,█▆▄▄▄▂▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▆▄▅▄▂▁▂▁▁
training loss,█▅▄▃▃▂▂▂▁▁

0,1
best_loss,0.00788
epoch,9.0
loss,0.00788
training loss,0.00892


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


### Evaluation of the model

In [None]:
def predict(model, test_loader, device, checkpoint_path='model_battery_best_1.6_embd.pth'):
    """Load saved weights into ``model`` and run inference over ``test_loader``.

    Args:
        model: Module called as ``model(numerical, cat1, cat2)``. Its weights
            are overwritten by the checkpoint and it is moved to ``device``.
        test_loader: Iterable yielding ``(numerical, _, categorical, ids)``
            batches. ``categorical`` is indexed as ``[:, :, 0]`` and
            ``[:, :, 1]``; the second element is ignored.
        device: Device used for the checkpoint tensors, the model, and the
            input batches.
        checkpoint_path: Path to the ``state_dict`` file. Defaults to the
            file name the notebook originally hard-coded.

    Returns:
        ``(predictions, identifiers)``: two lists of NumPy arrays, extended
        batch by batch with one entry per element along the first axis of
        the model output and of ``ids`` respectively.
    """
    # map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
    state_dict = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(state_dict)
    # Keep the model on the same device as the batches moved below.
    model.to(device)
    model.eval()

    predictions = []
    identifiers = []
    with torch.no_grad():
        for numerical_batch, _, cat_batch, ids in test_loader:
            numerical_batch = numerical_batch.to(device)
            cat_batch = cat_batch.to(device)

            # Split the two categorical channels expected by the model.
            cat1_batch = cat_batch[:, :, 0]
            cat2_batch = cat_batch[:, :, 1]

            outputs = model(numerical_batch, cat1_batch, cat2_batch)
            predictions.extend(outputs.cpu().numpy())
            identifiers.extend(ids.cpu().numpy())

    return predictions, identifiers

# Run inference on the test split.
predictions, ids = predict(model, test_loader, device)
test = []  # not used in this cell; kept so the notebook state is unchanged

# Pair each identifier row with its prediction. Identifier rows are read
# positionally as (vehicle, road_segment_id, date, unique_trip_id), and only
# the first element of each prediction is kept.
paired = list(zip(ids, predictions))
result = {
    'vehicle': [identifier[0] for identifier, _ in paired],
    'date': [identifier[2] for identifier, _ in paired],
    'road_segment_id': [identifier[1] for identifier, _ in paired],
    'unique_trip_id': [identifier[3] for identifier, _ in paired],
    'value': [prediction[0] for _, prediction in paired],
}

In [None]:
result_df = pd.DataFrame(result)

# Position of each identifier column's fitted encoder inside `encoders`.
mapping = {'vehicle': 2,  'road_segment_id': 3, 'date': 4, 'unique_trip_id': 5}

# NOTE(review): `identifiers` and `encoders` come from earlier notebook state
# (presumably the list of identifier column names and the fitted label
# encoders) -- confirm they are defined before this cell on a fresh run.
# NOTE(review): `test_df` is decoded in place, so re-running this cell would
# apply inverse_transform to already-decoded labels.
for column in identifiers:
    encoder = encoders[mapping[column]]
    # Predictions carry the encoded ids as numbers; cast to int before decoding.
    encoded_ids = result_df[column].astype(int)
    result_df[column] = encoded_ids

    result_df[column] = encoder.inverse_transform(result_df[column])
    test_df[column] = encoder.inverse_transform(test_df[column])

In [None]:
# Order the test rows by time 't' within each (vehicle, date) group.
# The sorted groups are collected first and concatenated once: calling
# pd.concat inside the loop re-copies the accumulated frame on every
# iteration, and seeding it with an empty DataFrame is unnecessary.
sorted_groups = [
    group.sort_values('t')
    for _, group in test_df.groupby(by=['vehicle', 'date'])
]
# pd.concat raises on an empty list, so fall back to an empty frame.
gt_df = pd.concat(sorted_groups) if sorted_groups else pd.DataFrame()

### Read Ground Truth

In [None]:
# Estimated consumption summed per (vehicle, date).
# NOTE(review): this uses `dataset` as it exists *before* the reload further
# down in this cell -- confirm that is intended.
dt = dataset.groupby(['vehicle', 'date'])['est_battery_consumption(kwh)'].sum().reset_index()

# Load every raw signal CSV; sorted() makes the row order deterministic.
csv_paths = sorted(glob.glob(os.path.join('../data', "*.csv")))
if not csv_paths:
    raise FileNotFoundError("No CSV files found in '../data'")
battery_df = pd.concat(map(pd.read_csv, csv_paths))
battery_df = battery_df.rename(columns={'Segnale': 'signal', 'Valore': 'value', 'DataOra': 't', 'Latitudine': 'latitude', 'Veicolo': 'vehicle', 'Longitudine': 'longitude'})

# Keep only rows inside the longitude/latitude bounding box
# (appears to be a coarse box around Italy -- TODO confirm).
battery_df = battery_df[(battery_df["longitude"] >= 5.93) & (battery_df["longitude"] <= 18.99)]
battery_df = battery_df[(battery_df["latitude"] >= 34.76) & (battery_df["latitude"] <= 47.1)]

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of battery_df (this was raising SettingWithCopyWarning).
consumption_df = battery_df[battery_df['signal'] == 'lastDayTotalConsumedEnergy'].copy()
consumption_df['t'] = pd.to_datetime(consumption_df['t'])
consumption_df['date'] = consumption_df['t'].dt.date
consumption_df = consumption_df.sort_values(by=['vehicle', 't'])

# Reload the cleaned dataset and rebuild the per-trip key
# "<vehicle>_<date>_<trip_id>" used to match predictions to trips.
dataset = pd.read_csv('./dataset_cleaned_v5.csv')
dataset['t'] = pd.to_datetime(dataset['t'])
dataset['unique_trip_id'] = dataset['vehicle'].astype(str) + '_' + dataset['date'].astype(str) + '_' + dataset['trip_id'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  consumption_df['t'] = pd.to_datetime(consumption_df['t'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  consumption_df['date'] = consumption_df['t'].dt.date


### Evaluate against daily total consumption

In [None]:
# Per-trip evaluation: the summed predictions of each trip are compared with
# the change of the 'lastDayTotalConsumedEnergy' signal between the first and
# last reading that falls inside the trip's time window.
res = {'vehicle': [], 'date': [], 'mse': [], 'rmse': [], 'mape': [], 'y_pred':[], 'y': [], 'times': [], 'weathers': [], 'trip_id': []}
skipped_trips = []  # trips with no consumption reading inside their window

for name, group in result_df.groupby('unique_trip_id'):
    # The key is "<vehicle>_<date>_<trip_id>"; maxsplit=2 keeps this working
    # however many underscores the trip_id part contains.
    vehicle, date, _ = name.split("_", 2)
    date = pd.to_datetime(date).date()

    test_data = dataset[dataset['unique_trip_id'] == name]
    start, end = test_data['t'].min(), test_data['t'].max()

    mask_id = (consumption_df['vehicle'] == vehicle) & (consumption_df['date'] == date)
    mask_trip = (consumption_df['t'] >= start) & (consumption_df['t'] <= end)

    consumption = consumption_df[mask_id & mask_trip].sort_values('t')

    # Without any reading in the window there is no ground truth to compare
    # against (iloc[-1] / iloc[0] would raise IndexError), so skip the trip.
    if consumption.empty:
        skipped_trips.append(name)
        continue

    y = [consumption['value'].iloc[-1] - consumption['value'].iloc[0]]
    y_pred = [group['value'].sum()]

    # Compute the squared error once and derive RMSE from it.
    trip_mse = metrics.mean_squared_error(y, y_pred)

    res['vehicle'].append(vehicle)
    res['date'].append(date)
    res['trip_id'].append(name)

    res['times'].append(consumption['t'].values)
    res['weathers'].append(test_data['precipitation'].values)

    res['y_pred'].append(y_pred[0])
    res['y'].append(y[0])

    res['mse'].append(trip_mse)
    res['rmse'].append(math.sqrt(trip_mse))
    # NOTE(review): MAPE divides by |y|, so a trip whose measured change is 0
    # yields an enormous value; this presumably explains the very large
    # overall MAPE printed by this cell -- confirm.
    res['mape'].append(metrics.mean_absolute_percentage_error(y, y_pred))

if skipped_trips:
    print(f"Skipped {len(skipped_trips)} trip(s) without consumption readings")

res_df = pd.DataFrame(res)
res_df.to_csv('./lstm_prediction_trips.csv', index=False)

# Overall metrics across all evaluated trips; std is the spread of the residuals.
mse = metrics.mean_squared_error(res_df['y'], res_df['y_pred'])
mape = metrics.mean_absolute_percentage_error(res_df['y'], res_df['y_pred'])
std = np.std(res_df['y'] - res_df['y_pred'])

print(f"Model , Overall RMSE: {math.sqrt(mse)}, MAPE: {mape}, std: {std}")

Model , Overall RMSE: 1.3159582320452725, MAPE: 32645291076127.816, std: 1.3154711406428887


In [None]:
# Aggregate the per-trip results into one row of error metrics per date.
metrics_date = {
    column: []
    for column in ('date', 'mse', 'rmse', 'mae', 'mape', 'variance', 'r2', 'std')
}

for day, day_trips in res_df.groupby('date'):
    actual = day_trips['y'].values
    predicted = day_trips['y_pred'].values
    day_mse = metrics.mean_squared_error(actual, predicted)

    row = {
        'date': day,
        'mse': day_mse,
        'rmse': np.sqrt(day_mse),
        'mae': metrics.mean_absolute_error(actual, predicted),
        'mape': metrics.mean_absolute_percentage_error(actual, predicted),
        # variance / std measure the spread of the predictions themselves,
        # not of the residuals.
        'variance': np.var(predicted),
        'r2': metrics.r2_score(actual, predicted),
        'std': np.std(predicted),
    }
    for column, value in row.items():
        metrics_date[column].append(value)

# Tabulate, persist, and display the per-date metrics.
metrics_date_df = pd.DataFrame(metrics_date)
metrics_date_df.to_csv('./lstm_prediction_by_date_with_trip.csv')
metrics_date_df

Unnamed: 0,date,mse,rmse,mae,mape,variance,r2,std
0,2024-02-03,231.741985,15.223074,10.647407,0.131973,1356.713257,0.807337,36.833588
1,2024-02-04,57.929291,7.611129,5.909008,0.134763,779.126648,0.922676,27.91284
2,2024-02-06,117.313487,10.831135,9.072352,0.094665,605.94165,0.840464,24.615883
3,2024-02-07,60.514849,7.779129,6.88249,0.075958,961.554016,0.941041,31.008934
4,2024-02-08,70.636094,8.404528,5.642706,0.055163,1076.687988,0.93911,32.812923
5,2024-02-09,137.449062,11.723867,9.518007,0.08999,938.805054,0.840657,30.639925


In [None]:
# Aggregate the per-trip results into one row of error metrics per vehicle.
# NOTE(review): the accumulator keeps the name `metrics_date` although it now
# holds per-vehicle values; the name is left unchanged so notebook state stays
# the same.
metrics_date = {
    column: []
    for column in ('vehicle', 'mse', 'rmse', 'mae', 'mape', 'variance', 'r2', 'std')
}

for vehicle_id, vehicle_trips in res_df.groupby('vehicle'):
    actual = vehicle_trips['y'].values
    predicted = vehicle_trips['y_pred'].values
    vehicle_mse = metrics.mean_squared_error(actual, predicted)

    row = {
        'vehicle': vehicle_id,
        'mse': vehicle_mse,
        'rmse': np.sqrt(vehicle_mse),
        'mae': metrics.mean_absolute_error(actual, predicted),
        'mape': metrics.mean_absolute_percentage_error(actual, predicted),
        # variance / std measure the spread of the predictions themselves,
        # not of the residuals.
        'variance': np.var(predicted),
        'r2': metrics.r2_score(actual, predicted),
        'std': np.std(predicted),
    }
    for column, value in row.items():
        metrics_date[column].append(value)

# Tabulate, persist, and display the per-vehicle metrics.
metrics_df = pd.DataFrame(metrics_date)
metrics_df.to_csv('./lstm_metrics_split_vehicle_with_trip.csv')
metrics_df

Unnamed: 0,vehicle,mse,rmse,mae,mape,variance,r2,std
0,E302,87.607681,9.359897,7.323351,0.11753,1888.808838,0.952182,43.46043
1,E304,44.770366,6.691066,5.044176,0.042284,360.792145,0.886239,18.99453
2,E305,1028.07807,32.063657,23.142878,0.443331,3973.840576,0.354082,63.038406
3,E306,118.244224,10.874016,6.408887,0.053874,255.420074,-0.519746,15.981867
4,E307,146.95386,12.122453,9.967712,0.079857,664.041931,0.861001,25.769011
5,E308,180.189883,13.423483,12.80629,0.112108,743.028259,0.763783,27.258545
6,E309,68.143693,8.254919,7.762956,0.071305,424.898346,0.806517,20.613062
7,E310,30.934659,5.561894,4.736547,0.098644,1514.777954,0.979326,38.920147
8,E311,99.182225,9.959027,8.132845,0.084212,549.639404,0.683362,23.444389
9,E312,36.144872,6.012061,4.690858,0.071646,1089.851318,0.961501,33.012897
