In [2]:
import pandas as pd
from sklearn.utils import shuffle

X_test = pd.read_csv('X_test.csv')
X_train = pd.read_csv('X_train.csv')
X_val = pd.read_csv('X_val.csv')
y_train = pd.read_csv('y_train.csv')
y_val = pd.read_csv('y_val.csv')

In [3]:
X_train.head(50)

Unnamed: 0,Traffic_Volume,tavg,tmin,tmax,prcp,snow,wdir,wspd,pres,year,...,month_sin,month_cos,week_number_sin,week_number_cos,quarter_sin,quarter_cos,four_month_sin,four_month_cos,half_year_sin,half_year_cos
0,-0.277074,-0.101973,-0.381232,-0.134296,1.188318,-0.137643,1.603098,0.133181,-1.822248,2022,...,0.5,0.866025,6.432491e-16,1.0,1.0,6.123234000000001e-17,0.866025,-0.5,1.224647e-16,-1.0
1,-0.499842,-1.17281,-1.231675,-1.351007,-0.306376,-0.137643,-2.00976,0.485982,0.014492,2022,...,0.5,0.866025,6.432491e-16,1.0,1.0,6.123234000000001e-17,0.866025,-0.5,1.224647e-16,-1.0
2,-0.332017,-1.617308,-1.345759,-1.712218,-0.378236,-0.137643,1.018358,-0.774021,1.760901,2022,...,0.5,0.866025,0.1205367,0.992709,1.0,6.123234000000001e-17,0.866025,-0.5,1.224647e-16,-1.0
3,-1.266043,-1.475877,-1.345759,-1.189412,-0.378236,-0.137643,-0.25554,0.737983,0.52637,2022,...,0.5,0.866025,0.1205367,0.992709,1.0,6.123234000000001e-17,0.866025,-0.5,1.224647e-16,-1.0
4,-0.734597,-1.081889,-1.750238,-1.246446,-0.378236,-0.137643,0.579803,3.15719,-0.99421,2022,...,0.5,0.866025,0.1205367,0.992709,1.0,6.123234000000001e-17,0.866025,-0.5,1.224647e-16,-1.0
5,-0.596741,-2.152726,-1.978405,-2.396618,-0.378236,-0.137643,0.893057,0.670783,0.511315,2022,...,0.5,0.866025,0.1205367,0.992709,1.0,6.123234000000001e-17,0.866025,-0.5,1.224647e-16,-1.0
6,-0.729602,-2.425486,-2.382884,-2.339584,-0.378236,-0.137643,0.684221,-0.35402,1.685625,2022,...,0.5,0.866025,0.1205367,0.992709,1.0,6.123234000000001e-17,0.866025,-0.5,1.224647e-16,-1.0
7,-0.084275,-1.890068,-1.750238,-1.293973,0.239762,-0.137643,-0.464375,0.737983,1.309243,2022,...,0.5,0.866025,0.1205367,0.992709,1.0,6.123234000000001e-17,0.866025,-0.5,1.224647e-16,-1.0
8,-1.569726,-0.980867,-1.573926,-1.132379,1.087714,-0.137643,0.91394,1.157984,1.203857,2022,...,0.5,0.866025,0.1205367,0.992709,1.0,6.123234000000001e-17,0.866025,-0.5,1.224647e-16,-1.0
9,-1.152162,-1.910272,-1.812465,-1.864307,-0.378236,-0.137643,0.934824,0.368382,2.61905,2022,...,0.5,0.866025,0.2393157,0.970942,1.0,6.123234000000001e-17,0.866025,-0.5,1.224647e-16,-1.0


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim

X_train_numeric = X_train.select_dtypes(include=[float, int])
X_val_numeric = X_val.select_dtypes(include=[float, int])

X_train_tensor = torch.tensor(X_train_numeric.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)  
X_val_tensor = torch.tensor(X_val_numeric.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)  

In [7]:
y_train_labels = y_train_tensor
y_val_labels = y_val_tensor

In [8]:
y_train_tensor.shape

torch.Size([705, 1])

In [9]:
train_combined = torch.cat((X_train_tensor, y_train_tensor), dim=1)
val_combined = torch.cat((X_val_tensor, y_val_tensor), dim=1)
full_dataset = torch.cat((train_combined, val_combined), dim=0)

In [12]:
from torch.utils.data import DataLoader, Dataset

class TimeSeriesDataset(Dataset):
    def __init__(self, sequences, targets):
        self.sequences = sequences
        self.targets = targets

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.targets[idx]

In [16]:
def create_subsequences(full_data, seq_length):
    sequences = []
    targets = []

    X_data = full_data[:, :-1]  
    y_data = full_data[:, -1]  

    for i in range(seq_length, len(full_data)):
        X_seq = X_data[i - seq_length:i]
        y_seq = y_data[i] 
        sequences.append(X_seq)
        targets.append(y_seq)
        
    return torch.stack(sequences), torch.tensor(targets).long()

sequence_length = 4

X_sequences, y_sequences = create_subsequences(full_dataset, sequence_length)

print(f"X_sequences shape: {X_sequences.shape}")
print(f"y_sequences shape: {y_sequences.shape}")

X_sequences shape: torch.Size([878, 4, 35])
y_sequences shape: torch.Size([878])


In [18]:
full_dataset_df = pd.DataFrame(full_dataset.numpy())
full_dataset_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,-0.277074,-0.101973,-0.381232,-0.134296,1.188318,-0.137643,1.603098,0.133181,-1.822248,2022.0,...,0.866025,6.432490e-16,1.000000,1.000000e+00,6.123234e-17,0.866025,-0.5,1.224647e-16,-1.0,0.0
1,-0.499842,-1.172810,-1.231675,-1.351007,-0.306376,-0.137643,-2.009760,0.485982,0.014492,2022.0,...,0.866025,6.432490e-16,1.000000,1.000000e+00,6.123234e-17,0.866025,-0.5,1.224647e-16,-1.0,0.0
2,-0.332017,-1.617308,-1.345759,-1.712218,-0.378236,-0.137643,1.018358,-0.774021,1.760901,2022.0,...,0.866025,1.205367e-01,0.992709,1.000000e+00,6.123234e-17,0.866025,-0.5,1.224647e-16,-1.0,0.0
3,-1.266043,-1.475877,-1.345759,-1.189412,-0.378236,-0.137643,-0.255540,0.737983,0.526370,2022.0,...,0.866025,1.205367e-01,0.992709,1.000000e+00,6.123234e-17,0.866025,-0.5,1.224647e-16,-1.0,0.0
4,-0.734597,-1.081890,-1.750238,-1.246446,-0.378236,-0.137643,0.579803,3.157190,-0.994210,2022.0,...,0.866025,1.205367e-01,0.992709,1.000000e+00,6.123234e-17,0.866025,-0.5,1.224647e-16,-1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
877,1.530042,0.726410,0.946289,0.550104,-0.335120,-0.137643,0.611129,1.275585,-1.566309,2024.0,...,-0.866025,4.647232e-01,-0.885456,1.224647e-16,-1.000000e+00,-0.866025,-0.5,1.224647e-16,-1.0,0.0
878,1.462113,0.675899,0.655894,0.654665,-0.306376,-0.137643,0.861731,0.133181,-0.361889,2024.0,...,-0.866025,4.647232e-01,-0.885456,1.224647e-16,-1.000000e+00,-0.866025,-0.5,1.224647e-16,-1.0,0.0
879,2.159386,0.473854,0.541810,0.502577,-0.378236,-0.137643,1.237636,0.183582,0.360763,2024.0,...,-0.866025,4.647232e-01,-0.885456,1.224647e-16,-1.000000e+00,-0.866025,-0.5,1.224647e-16,-1.0,0.0
880,2.273267,0.514263,0.313642,0.607138,-0.378236,-0.137643,-1.967993,-1.513224,0.752199,2024.0,...,-0.866025,4.647232e-01,-0.885456,1.224647e-16,-1.000000e+00,-0.866025,-0.5,1.224647e-16,-1.0,0.0


In [197]:
import torch
import random

def create_subsequences_with_duplication(full_data, seq_length, random_duplicate):
    sequences = []
    targets = []

    X_data = full_data[:, :-1]  
    y_data = full_data[:, -1]  

    for i in range(seq_length, len(full_data)):
        X_seq = X_data[i - seq_length:i]
        y_seq = y_data[i]
        
        sequences.append(X_seq)
        targets.append(y_seq)
        
        if y_seq == 1:
            
            num_duplicates = 1
            for _ in range(num_duplicates):
                sequences.append(X_seq)
                targets.append(y_seq)
        elif y_seq == 2:
            num_duplicates = 1
            for _ in range(num_duplicates):
                sequences.append(X_seq)
                targets.append(y_seq)
        elif y_seq == 3:
            num_duplicates = 2
            for _ in range(num_duplicates):
                sequences.append(X_seq)
                targets.append(y_seq)
        elif y_seq == 4:
            num_duplicates = 2
            for _ in range(num_duplicates):
                sequences.append(X_seq)
                targets.append(y_seq)
        
        
    return torch.stack(sequences), torch.tensor(targets).long()

sequence_length = 5
random_duplicate = 2

X_sequences, y_sequences = create_subsequences_with_duplication(full_dataset, sequence_length, random_duplicate)

In [199]:
y_sequences.shape

torch.Size([911])

In [201]:
import torch
import random

def shuffle_sequences(X_sequences, y_sequences):
    combined = list(zip(X_sequences, y_sequences))
    random.shuffle(combined)
    X_shuffled, y_shuffled = zip(*combined)
    
    X_shuffled = torch.stack(X_shuffled)
    y_shuffled = torch.tensor(y_shuffled).long()
    
    return X_shuffled, y_shuffled

X_shuffled, y_shuffled = shuffle_sequences(X_sequences, y_sequences)

In [203]:
X_sequences, y_sequences = X_shuffled, y_shuffled

In [205]:
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter

def stratified_split(X, y, test_size=0.2):
    unique, counts = np.unique(y, return_counts=True)
    label_counts = dict(zip(unique, counts))

    rare_indices = []
    common_indices = []
    for label, count in label_counts.items():
        indices = np.where(y == label)[0]
        if count == 1:
            continue
            rare_indices.extend(indices)
        else:
            common_indices.extend(indices)
    
    common_train_idx, common_val_idx = train_test_split(
        common_indices, test_size=test_size, stratify=y[common_indices]
    )
    
    train_idx = np.concatenate((common_train_idx, rare_indices)).astype(int)
    val_idx = np.concatenate((common_val_idx, rare_indices)).astype(int)
    
    return train_idx, val_idx

X_np = X_sequences.numpy()
y_np = y_sequences.numpy()

train_idx, val_idx = stratified_split(X_np, y_np)

X_train, y_train = X_np[train_idx], y_np[train_idx]
X_val, y_val = X_np[val_idx], y_np[val_idx]

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

train_dataset = TimeSeriesDataset(X_train_tensor, y_train_tensor)
val_dataset = TimeSeriesDataset(X_val_tensor, y_val_tensor)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")


Train dataset size: 728
Validation dataset size: 183


In [207]:
unique_labels, counts = y_val_tensor.unique(return_counts=True)
print(f"Unique labels in y_val_tensor: {unique_labels}")
print(f"Counts of each label: {counts}")

unique_labels, counts = y_train_tensor.unique(return_counts=True)
print(f"Unique labels in y_train_tensor: {unique_labels}")
print(f"Counts of each label: {counts}")

Unique labels in y_val_tensor: tensor([0, 1, 2, 4])
Counts of each label: tensor([170,  11,   1,   1])
Unique labels in y_train_tensor: tensor([0, 1, 2, 3, 4])
Counts of each label: tensor([675,  43,   5,   3,   2])


In [209]:
import torch
from collections import Counter

val_y_values = [y for _, y in val_dataset]

val_y_tensor = torch.tensor(val_y_values)

unique_values, counts = torch.unique(val_y_tensor, return_counts=True)

print("Unique y values in validation dataset:")
for value, count in zip(unique_values.tolist(), counts.tolist()):
    print(f"Class {value}: {count} samples")

total_samples = len(val_y_values)
print("\nClass distribution in validation dataset:")
for value, count in zip(unique_values.tolist(), counts.tolist()):
    percentage = (count / total_samples) * 100
    print(f"Class {value}: {percentage:.2f}%")

val_y_counter = Counter(val_y_values)
print("\nDetailed count of y values:")
print(val_y_counter)

Unique y values in validation dataset:
Class 0: 170 samples
Class 1: 11 samples
Class 2: 1 samples
Class 4: 1 samples

Class distribution in validation dataset:
Class 0: 92.90%
Class 1: 6.01%
Class 2: 0.55%
Class 4: 0.55%

Detailed count of y values:
Counter({tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1

In [211]:
import torch
from collections import Counter

train_y_values = [y for _, y in train_dataset]

train_y_tensor = torch.tensor(train_y_values)

unique_values, counts = torch.unique(train_y_tensor, return_counts=True)

print("Unique y values in train dataset:")
for value, count in zip(unique_values.tolist(), counts.tolist()):
    print(f"Class {value}: {count} samples")

total_samples = len(train_y_values)
print("\nClass distribution in train dataset:")
for value, count in zip(unique_values.tolist(), counts.tolist()):
    percentage = (count / total_samples) * 100
    print(f"Class {value}: {percentage:.2f}%")

train_y_counter = Counter(val_y_values)
print("\nDetailed count of y values:")
print(train_y_counter)

Unique y values in train dataset:
Class 0: 675 samples
Class 1: 43 samples
Class 2: 5 samples
Class 3: 3 samples
Class 4: 2 samples

Class distribution in train dataset:
Class 0: 92.72%
Class 1: 5.91%
Class 2: 0.69%
Class 3: 0.41%
Class 4: 0.27%

Detailed count of y values:
Counter({tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, te

In [213]:
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

for X_batch, y_batch in train_loader:
    print(f"Train batch shapes: X: {X_batch.shape}, y: {y_batch.shape}")
    break

for X_batch, y_batch in val_loader:
    print(f"Validation batch shapes: X: {X_batch.shape}, y: {y_batch.shape}")
    break

Train batch shapes: X: torch.Size([16, 5, 35]), y: torch.Size([16])
Validation batch shapes: X: torch.Size([16, 5, 35]), y: torch.Size([16])


In [350]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class GatedLinearUnit(nn.Module):
    def __init__(self, input_size, output_size):
        super(GatedLinearUnit, self).__init__()
        self.linear = nn.Linear(input_size, output_size)
        self.gate = nn.Linear(input_size, output_size)

    def forward(self, x):
        return self.linear(x) * torch.sigmoid(self.gate(x))


class LayerNormLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout, bidirectional):
        super(LayerNormLSTM, self).__init__()
        self.lstm = nn.LSTM(
            input_size, hidden_size, num_layers=num_layers,
            batch_first=True, dropout=dropout, bidirectional=bidirectional
        )
        self.layer_norm = nn.LayerNorm(hidden_size * (2 if bidirectional else 1))

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        lstm_out = self.layer_norm(lstm_out)
        return lstm_out

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, hidden_size, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads, batch_first=True)

    def forward(self, x):
        attn_output, _ = self.attention(x, x, x)
        return attn_output

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.pos_embedding = nn.Embedding(max_len, d_model)

    def forward(self, x):
        seq_len = x.size(1)
        positions = torch.arange(0, seq_len, dtype=torch.long, device=x.device).unsqueeze(0)
        pos_enc = self.pos_embedding(positions)
        return x + pos_enc

class Mish(nn.Module):
    def forward(self, x):
        return x * torch.tanh(F.softplus(x))


import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, lstm_dropout=0.3, fcn_dropout=0.5):
        super(LSTMModel, self).__init__()
        self.bidirectional = True
        self.num_directions = 2 if self.bidirectional else 1
        
        self.num_heads = 16
        self.hidden_size = (hidden_size // (self.num_directions * 3 * self.num_heads)) * self.num_heads
        
        print(f"Adjusted hidden_size: {self.hidden_size}")
        
        self.embedding_size = input_size

        self.positional_encoding = PositionalEncoding(self.embedding_size)

        self.lstm1 = LayerNormLSTM(
            self.embedding_size, self.hidden_size, num_layers=num_layers,
            dropout=lstm_dropout, bidirectional=self.bidirectional
        )
        self.lstm2 = ResidualLSTM(
            self.embedding_size, self.hidden_size, num_layers=num_layers,
            dropout=lstm_dropout, bidirectional=self.bidirectional
        )
        self.lstm3 = LayerDropLSTM(
            self.embedding_size, self.hidden_size, num_layers=num_layers,
            dropout=lstm_dropout, bidirectional=self.bidirectional
        )

        self.combined_lstm_size = self.hidden_size * self.num_directions * 3
        
        print(f"Combined LSTM size: {self.combined_lstm_size}")
        print(f"Number of heads: {self.num_heads}")
        print(f"Is combined_lstm_size divisible by num_heads? {self.combined_lstm_size % self.num_heads == 0}")

        self.transformer_layer = nn.TransformerEncoderLayer(
            d_model=self.combined_lstm_size,
            nhead=self.num_heads,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            self.transformer_layer, num_layers=1
        )

        self.attention = MultiHeadSelfAttention(self.combined_lstm_size, self.num_heads)

        self.glu1 = GatedLinearUnit(self.combined_lstm_size, self.combined_lstm_size // 2)
        self.glu2 = GatedLinearUnit(self.combined_lstm_size // 2, self.combined_lstm_size // 4)

        self.fc = nn.Linear(self.combined_lstm_size // 4, num_classes)

        self.layer_norm2 = nn.LayerNorm(self.combined_lstm_size // 2)
        self.layer_norm3 = nn.LayerNorm(self.combined_lstm_size // 4)

        self.dropout = nn.Dropout(p=fcn_dropout)

    def forward(self, x):
        x = self.positional_encoding(x)

        lstm_out1 = self.lstm1(x)
        lstm_out2 = self.lstm2(x)
        lstm_out3 = self.lstm3(x)

        lstm_out_concat = torch.cat((lstm_out1, lstm_out2, lstm_out3), dim=-1)

        transformer_out = self.transformer_encoder(lstm_out_concat)
        transformer_out = lstm_out_concat + transformer_out  

        attn_out = self.attention(transformer_out)
        attn_out = transformer_out + attn_out  


        global_avg_pool = torch.mean(attn_out, dim=1)

        out = self.glu1(global_avg_pool)
        out = self.layer_norm2(out)
        out = self.dropout(out)

        out = self.glu2(out)
        out = self.layer_norm3(out)
        out = self.dropout(out)

        out = self.fc(out)

        return out


class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean', label_smoothing=0.1):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.label_smoothing = label_smoothing

    def forward(self, inputs, targets):
        num_classes = inputs.size(1)
        smoothed_labels = F.one_hot(targets, num_classes=num_classes)
        smoothed_labels = smoothed_labels * (1 - self.label_smoothing) + self.label_smoothing / num_classes

        ce_loss = F.cross_entropy(inputs, targets, reduction='none', label_smoothing=self.label_smoothing)
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss


In [509]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GatedLinearUnit(nn.Module):
    def __init__(self, input_size, output_size):
        super(GatedLinearUnit, self).__init__()
        self.linear = nn.Linear(input_size, output_size)
        self.gate = nn.Linear(input_size, output_size)

    def forward(self, x):
        return self.linear(x) * torch.sigmoid(self.gate(x))

class LayerNormLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout, bidirectional):
        super(LayerNormLSTM, self).__init__()
        self.lstm = nn.LSTM(
            input_size, hidden_size, num_layers=num_layers,
            batch_first=True, dropout=dropout, bidirectional=bidirectional
        )
        self.layer_norm = nn.LayerNorm(hidden_size * (2 if bidirectional else 1))

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        lstm_out = self.layer_norm(lstm_out)
        return lstm_out

class LayerDropLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout, bidirectional, layer_drop_prob=0.2):
        super(LayerDropLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers,
                            batch_first=True, dropout=dropout, bidirectional=bidirectional)
        self.layer_drop_prob = layer_drop_prob
        self.projection = nn.Linear(input_size, hidden_size * (2 if bidirectional else 1))

    def forward(self, x):
        if self.training and torch.rand(1).item() < self.layer_drop_prob:
            return self.projection(x)  
        lstm_out, _ = self.lstm(x)
        return lstm_out

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, hidden_size, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads, batch_first=True)

    def forward(self, x):
        attn_output, _ = self.attention(x, x, x)
        return attn_output

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.pos_embedding = nn.Embedding(max_len, d_model)

    def forward(self, x):
        seq_len = x.size(1)
        positions = torch.arange(0, seq_len, dtype=torch.long, device=x.device).unsqueeze(0)
        pos_enc = self.pos_embedding(positions)
        return x + pos_enc

class Mish(nn.Module):
    def forward(self, x):
        return x * torch.tanh(F.softplus(x))


class ResidualLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout, bidirectional):
        super(ResidualLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers,
                            batch_first=True, dropout=dropout, bidirectional=bidirectional)
        self.projection = nn.Linear(hidden_size * (2 if bidirectional else 1), hidden_size * (2 if bidirectional else 1))

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        projected_out = self.projection(lstm_out)
        return projected_out



class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, lstm_dropout=0.3, fcn_dropout=0.5, debug=False):
        super(LSTMModel, self).__init__()
        self.bidirectional = True
        self.num_directions = 2 if self.bidirectional else 1
        self.debug = debug
        
        self.num_heads = 16
        self.hidden_size = hidden_size // (self.num_directions * 3)
        
        if self.debug:
            print(f"Adjusted hidden_size: {self.hidden_size}")
        
        self.embedding_size = input_size

        self.positional_encoding = PositionalEncoding(self.embedding_size)

        self.lstm1 = LayerNormLSTM(
            self.embedding_size, self.hidden_size, num_layers=num_layers,
            dropout=lstm_dropout, bidirectional=self.bidirectional
        )
        self.lstm2 = ResidualLSTM(
            self.embedding_size, self.hidden_size, num_layers=num_layers,
            dropout=lstm_dropout, bidirectional=self.bidirectional
        )
        self.lstm3 = LayerDropLSTM(
            self.embedding_size, self.hidden_size, num_layers=num_layers,
            dropout=lstm_dropout, bidirectional=self.bidirectional
        )

        self.combined_lstm_size = self.hidden_size * self.num_directions * 3
        
        if self.debug:
            print(f"Combined LSTM size: {self.combined_lstm_size}")
            print(f"Number of heads: {self.num_heads}")
            print(f"Is combined_lstm_size divisible by num_heads? {self.combined_lstm_size % self.num_heads == 0}")

        self.transformer_layer = nn.TransformerEncoderLayer(
            d_model=self.combined_lstm_size,
            nhead=self.num_heads,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            self.transformer_layer, num_layers=3
        )

        self.attention = MultiHeadSelfAttention(self.combined_lstm_size, self.num_heads)

        self.glu1 = GatedLinearUnit(self.combined_lstm_size, self.combined_lstm_size // 2)
        self.glu2 = GatedLinearUnit(self.combined_lstm_size // 2, self.combined_lstm_size // 4)

        self.fc = nn.Linear(self.combined_lstm_size // 4, num_classes)

        self.layer_norm2 = nn.LayerNorm(self.combined_lstm_size // 2)
        self.layer_norm3 = nn.LayerNorm(self.combined_lstm_size // 4)

        self.dropout = nn.Dropout(p=fcn_dropout)

    def forward(self, x):
        x = self.positional_encoding(x)
        
        if self.debug:
            print(f"Shape after positional encoding: {x.shape}")


        lstm_out1 = self.lstm1(x)
        lstm_out2 = self.lstm2(x)
        lstm_out3 = self.lstm3(x)
        
        if self.debug:
            print(f"Shape of lstm_out1: {lstm_out1.shape}")
            print(f"Shape of lstm_out2: {lstm_out2.shape}")
            print(f"Shape of lstm_out3: {lstm_out3.shape}")

        lstm_out_concat = torch.cat((lstm_out1, lstm_out2, lstm_out3), dim=-1)
        
        if self.debug:
            print(f"Shape after concatenation: {lstm_out_concat.shape}")
        transformer_out = self.transformer_encoder(lstm_out_concat)
        transformer_out = lstm_out_concat + transformer_out  #Adding residual connection
        
        if self.debug:
            print(f"Shape after transformer: {transformer_out.shape}")

        #Attention mechanism
        attn_out = self.attention(transformer_out)
        attn_out = transformer_out + attn_out  #Adding residual connection
        
        if self.debug:
            print(f"Shape after attention: {attn_out.shape}")

        #Global average pooling
        global_avg_pool = torch.mean(attn_out, dim=1)
        
        if self.debug:
            print(f"Shape after global average pooling: {global_avg_pool.shape}")

        #Passing through GLUs
        out = self.glu1(global_avg_pool)
        out = self.layer_norm2(out)
        out = self.dropout(out)
        
        if self.debug:
            print(f"Shape after first GLU: {out.shape}")

        out = self.glu2(out)
        out = self.layer_norm3(out)
        out = self.dropout(out)
        
        if self.debug:
            print(f"Shape after second GLU: {out.shape}")

        out = self.fc(out)
        
        if self.debug:
            print(f"Final output shape: {out.shape}")

        return out

class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean', label_smoothing=0.1):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.label_smoothing = label_smoothing

    def forward(self, inputs, targets):
        num_classes = inputs.size(1)
        smoothed_labels = F.one_hot(targets, num_classes=num_classes)
        smoothed_labels = smoothed_labels * (1 - self.label_smoothing) + self.label_smoothing / num_classes

        ce_loss = F.cross_entropy(inputs, targets, reduction='none', label_smoothing=self.label_smoothing)
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

In [511]:
X_sequences.shape

torch.Size([911, 5, 35])

In [513]:
input_size = X_sequences.shape[2] #should match your X_sequences.shape[2] which is 35
hidden_size = 384  #should be divisible by (num_directions * 3 * num_heads)
num_layers = 4
num_classes = 5

print(f"Input size: {input_size}")
print(f"Initial hidden size: {hidden_size}")

model = LSTMModel(input_size, hidden_size, num_layers, num_classes, debug=True)

batch_size = 16
sequence_length = 100
dummy_input = torch.randn(batch_size, sequence_length, input_size)

output = model(dummy_input)

Input size: 35
Initial hidden size: 384
Adjusted hidden_size: 64
Combined LSTM size: 384
Number of heads: 16
Is combined_lstm_size divisible by num_heads? True
Shape after positional encoding: torch.Size([16, 100, 35])
Shape of lstm_out1: torch.Size([16, 100, 128])
Shape of lstm_out2: torch.Size([16, 100, 128])
Shape of lstm_out3: torch.Size([16, 100, 128])
Shape after concatenation: torch.Size([16, 100, 384])
Shape after transformer: torch.Size([16, 100, 384])
Shape after attention: torch.Size([16, 100, 384])
Shape after global average pooling: torch.Size([16, 384])
Shape after first GLU: torch.Size([16, 192])
Shape after second GLU: torch.Size([16, 96])
Final output shape: torch.Size([16, 5])


In [454]:
input_size = X_sequences.shape[2]
hidden_size = 384 #should be divisible by (num_directions * 3 * num_heads)
num_layers = 4
num_layers = 4
num_epochs = 70
num_classes = 5

In [456]:
model = LSTMModel(input_size, hidden_size, num_layers, num_classes, debug = False)
criterion = FocalLoss(alpha=1, gamma=2, label_smoothing=0.1)
optimizer = optim.AdamW(model.parameters(), lr=0.0001, weight_decay=1e-7)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

LSTMModel(
  (positional_encoding): PositionalEncoding(
    (pos_embedding): Embedding(5000, 35)
  )
  (lstm1): LayerNormLSTM(
    (lstm): LSTM(35, 64, num_layers=4, batch_first=True, dropout=0.3, bidirectional=True)
    (layer_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  )
  (lstm2): ResidualLSTM(
    (lstm): LSTM(35, 64, num_layers=4, batch_first=True, dropout=0.3, bidirectional=True)
    (projection): Linear(in_features=128, out_features=128, bias=True)
  )
  (lstm3): LayerDropLSTM(
    (lstm): LSTM(35, 64, num_layers=4, batch_first=True, dropout=0.3, bidirectional=True)
    (projection): Linear(in_features=35, out_features=128, bias=True)
  )
  (transformer_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True)
    )
    (linear1): Linear(in_features=384, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_f

In [519]:
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if 'weight_ih' in name:
                nn.init.xavier_uniform_(param.data)
            elif 'weight_hh' in name:
                nn.init.orthogonal_(param.data)
            elif 'bias_ih' in name:
                nn.init.zeros_(param.data)
                #setting forget gate bias to 1
                n = param.size(0)
                start, end = n // 4, n // 2
                param.data[start:end].fill_(1.)
            elif 'bias_hh' in name:
                nn.init.zeros_(param.data)
    elif isinstance(m, nn.LayerNorm):
        nn.init.ones_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.MultiheadAttention):
        if m.in_proj_weight is not None:
            nn.init.xavier_uniform_(m.in_proj_weight)
        if m.out_proj.weight is not None:
            nn.init.xavier_uniform_(m.out_proj.weight)
        if m.in_proj_bias is not None:
            nn.init.zeros_(m.in_proj_bias)
        if m.out_proj.bias is not None:
            nn.init.zeros_(m.out_proj.bias)

model.apply(init_weights)

print("Model weights initialized.")

Model weights initialized.


In [458]:
# scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)

# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=9)

scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=0.001, steps_per_epoch=len(train_loader), epochs=num_epochs
)

# scheduler = optim.lr_scheduler.CyclicLR(
#     optimizer, base_lr=0.0001, max_lr=0.01, step_size_up=5, mode='triangular2'
# )

# optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-6)

In [460]:
early_stop_patience = 10
best_val_loss = float('inf')
epochs_no_improve = 0

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    correct = 0
    total = 0

    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        train_loss += loss.item() * X_batch.size(0)
        _, predicted = torch.max(outputs.data, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

    scheduler.step()

    train_loss /= len(train_loader.dataset)
    train_accuracy = 100 * correct / total
    
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item() * X_batch.size(0)
            _, predicted = torch.max(outputs.data, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

    val_loss /= len(val_loader.dataset)
    val_accuracy = 100 * correct / total

    print(f'Epoch [{epoch+1}/{num_epochs}], '
          f'Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%')

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        best_model = model.state_dict()
    else:
        epochs_no_improve += 1
        if epochs_no_improve == early_stop_patience:
            print("Early stopping!")
            model.load_state_dict(best_model)
            break

Epoch [1/70], Train Loss: 0.3912, Train Acc: 81.32%, Val Loss: 0.2539, Val Acc: 92.90%
Epoch [2/70], Train Loss: 0.2750, Train Acc: 92.72%, Val Loss: 0.2466, Val Acc: 92.90%
Epoch [3/70], Train Loss: 0.2408, Train Acc: 92.58%, Val Loss: 0.2178, Val Acc: 92.90%
Epoch [4/70], Train Loss: 0.2344, Train Acc: 92.72%, Val Loss: 0.2228, Val Acc: 92.90%
Epoch [5/70], Train Loss: 0.2542, Train Acc: 92.58%, Val Loss: 0.2279, Val Acc: 92.90%
Epoch [6/70], Train Loss: 0.2379, Train Acc: 92.58%, Val Loss: 0.2182, Val Acc: 92.90%
Epoch [7/70], Train Loss: 0.2391, Train Acc: 92.72%, Val Loss: 0.2118, Val Acc: 92.90%
Epoch [8/70], Train Loss: 0.2458, Train Acc: 92.72%, Val Loss: 0.2133, Val Acc: 92.90%
Epoch [9/70], Train Loss: 0.2372, Train Acc: 92.72%, Val Loss: 0.2313, Val Acc: 92.90%
Epoch [10/70], Train Loss: 0.2391, Train Acc: 92.72%, Val Loss: 0.2315, Val Acc: 92.90%
Epoch [11/70], Train Loss: 0.2336, Train Acc: 92.45%, Val Loss: 0.2115, Val Acc: 92.90%
Epoch [12/70], Train Loss: 0.2192, Train 

In [462]:
torch.save(model.state_dict(), 'best_lstm_model_improved.pth')

In [464]:
merged_data_2024_test_cleaned.to_csv('X_test.csv', index=False)

In [466]:
X_test.shape

(133, 12)

In [472]:
X_test_numeric = merged_data_2024_test_cleaned.select_dtypes(include=[float, int])


X_test_tensor = torch.tensor(X_test_numeric.values, dtype=torch.float32)

def create_sequences_test(X, seq_length):
    sequences = []
    for i in range(seq_length, len(X)):
        X_seq = X[i-seq_length:i]
        sequences.append(X_seq)
    return torch.stack(sequences)

sequence_length = 5
X_test_sequences = create_sequences_test(X_test_tensor, sequence_length)

test_dataset = torch.utils.data.TensorDataset(X_test_sequences)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

In [474]:
model.load_state_dict(torch.load('best_lstm_model_improved.pth', weights_only=True))

model.to(device)
model.eval()
sequence_predictions = []
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch[0].to(device)
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)
        sequence_predictions.extend(predicted.cpu().numpy())
        
individual_predictions = []
for i in range(len(X_test_numeric) - sequence_length):
    individual_predictions.append(sequence_predictions[i])

for i in range(sequence_length):
    individual_predictions.insert(0, individual_predictions[0])

In [476]:
label_mapping_legend = {'Good': 0, 'Moderate': 1, 'Poor': 2, 'Severe': 3, 'Unhealthy': 4}
reverse_label_mapping = {v: k for k, v in label_mapping_legend.items()}
test_predictions_labels = pd.Series([reverse_label_mapping[pred] for pred in individual_predictions], name='Predicted_AQI')
ID_column = pd.Series(range(1, len(individual_predictions) + 1), name='ID')
predictions_df = pd.concat([ID_column, test_predictions_labels], axis=1)

predictions_df.to_csv('test_predictions_with_labels.csv', index=False)
print(predictions_df.head())
print(f"Total predictions: {len(predictions_df)}")
print(f"Original data points: {len(X_test_numeric)}")

   ID Predicted_AQI
0   1          Good
1   2          Good
2   3          Good
3   4          Good
4   5          Good
Total predictions: 133
Original data points: 133


In [478]:
unique_value_counts = predictions_df['Predicted_AQI'].value_counts()
unique_value_counts

Predicted_AQI
Good    133
Name: count, dtype: int64