### Training

In [11]:
!pip install torch

Collecting torch
  Downloading torch-2.6.0-cp312-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Downloading torch-2.6.0-cp312-none-macosx_11_0_arm64.whl (66.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading sympy-1.13.1-py3-none-any.whl (6.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sympy, torch
  Attempting uninstall: sympy
    Found existing installation: sympy 1.12
    Uninstalling sympy-1.12:
      Successfully uninstalled sympy-1.12
Successfully installed sympy-1.13.1 torch-2.6.0


In [23]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Using cached huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hUsing cached huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
Downloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl (418 kB)
Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl (2.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m14.7 MB/s[0m eta [3

## Due to Jupyter constraints and crash issues, this notebook represents only a sample of the training process (1 epoch).
## In reality, every model is trained separately in the terminal for 50 epochs or until early stopping.
## The log files for the 50-epoch training runs are attached as part of the repository under the folder "training_logs".
## The models used for predictions, inference and loss calculations are the ones trained for 50 epochs.

In [97]:
import os
import json
import torch
import yaml
import time
import random
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizer

In [73]:
# Location of the pre-split dataset, relative to the notebook's working directory.
json_path = "../src/dataset/processed/split_data.json"

In [74]:
# Load the whole split file into memory; later cells read the keys
# "train_texts", "train_labels", "valid_texts" and "valid_labels" from it.
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [75]:
def set_seed(seed_value=42):
    """Seed every random number generator used in this notebook.

    The same ``seed_value`` is applied, in order, to Python's ``random``
    module, NumPy's global generator, PyTorch's default generator and all
    CUDA generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed_value)

In [76]:
class DatasetLoader(Dataset):
    """Map-style dataset that tokenises one text per access.

    Args:
        texts: Sequence of raw input strings.
        labels: Sequence of labels, one per text. Each label is converted with
            ``torch.tensor`` as-is, so its shape and dtype follow the stored value.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_length: Length every example is padded or truncated to. Defaults to
            120, the value that was previously hard-coded.

    Raises:
        AssertionError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=120):
        # Validate first so a mismatched pair never yields a half-initialised dataset.
        assert len(texts) == len(labels), "Length of texts and labels must be the same"
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``((input_ids, attention_mask), label)`` for example ``idx``."""
        encoded_text = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        label = torch.tensor(self.labels[idx])

        # squeeze(0) drops the leading dimension of the returned tensors so the
        # DataLoader can stack examples into a batch itself.
        input_ids = encoded_text['input_ids'].squeeze(0)
        attention_mask = encoded_text['attention_mask'].squeeze(0)
        return (input_ids, attention_mask), label


In [77]:
# Pull the pre-split texts and labels out of the loaded JSON.
train_texts, train_labels = data["train_texts"], data["train_labels"]
valid_texts, valid_labels = data["valid_texts"], data["valid_labels"]
# Pretrained 'bert-base-uncased' tokenizer; only its token ids are used here,
# the models below learn their own embedding tables.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Passed to every model as `vocab_size` to size its embedding table.
tokenizer_vocab_size = tokenizer.vocab_size

In [78]:
# Wrap each split; tokenisation happens per example when an item is accessed.
train_dataset = DatasetLoader(train_texts, train_labels, tokenizer)
valid_dataset = DatasetLoader(valid_texts, valid_labels, tokenizer)

In [79]:
# Only the training batches are shuffled; validation keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=128)

In [80]:
# Pick the best available backend: CUDA GPU first, then Apple-silicon MPS,
# falling back to the CPU. DEVICE_TYPE is the default device of train() and
# evaluate() defined further down.
if torch.cuda.is_available():
    DEVICE_TYPE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE_TYPE = "mps"
else:
    DEVICE_TYPE = "cpu"

In [176]:
# Model identifiers: each one is the section key looked up in the
# hyper-parameter YAML and is also embedded in the checkpoint filename.
cnn_model = "CNN_Model"
lstm_tc_model = "LSTM_Text_Classifier"
mlp_class_model = "MLP_Classifier"
lstm_multihead = "LSTM_Multi_Head_Attention"
rcnn = "RCNN_Text_Classifier"
# NOTE(review): no training cell for this identifier is visible in this part of the notebook.
bigru = "BiGRU_Attention_Residual"
# YAML file holding the per-model hyper-parameters.
yml_file_path = "../src/configs/hyperparams.yaml"

In [82]:
def read_config_file(file_name, model_name):
    """Read hyper-parameters from a YAML configuration file.

    Args:
        file_name: Path of the YAML file.
        model_name: Top-level key whose value should be returned. When falsy,
            the whole parsed document is returned instead.

    Returns:
        The parameters stored under ``model_name`` (or the whole document).
        An empty dict is returned, after printing the reason, when the file
        cannot be read or parsed, or when ``model_name`` is not a key in it.
    """
    try:
        with open(file_name, "r") as yamlfile:
            # NOTE(review): FullLoader can build more than plain scalars and
            # containers; prefer yaml.safe_load if the config may be untrusted.
            config = yaml.load(yamlfile, Loader=yaml.FullLoader)
    except Exception as e:
        # Best effort by design: report the problem and let the caller decide.
        print(e)
        return {}

    if not model_name:
        return config

    if isinstance(config, dict) and model_name in config:
        return config[model_name]

    # Previously this case surfaced only as a swallowed UnboundLocalError.
    print(f"No configuration found for model '{model_name}' in {file_name}")
    return {}

In [83]:
class EarlyStopping:
    """Signal when training should stop because validation loss has stalled.

    The object is called once per epoch with the validation loss. It only
    tracks the best loss seen so far; despite the wording of the verbose
    message, nothing is written to disk by this class.
    """

    def __init__(self, patience=7, verbose=False, delta=0):
        """
        Args:
            @patience (int): Number of consecutive non-improving calls tolerated
                             before ``early_stop`` is set.
            @verbose (bool): If True, prints a message for each validation loss improvement.
            @delta (float): Minimum change in the monitored quantity to qualify as an improvement.
        """
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.inf

    def __call__(self, val_loss):
        # Scores are negated losses, so a larger score is better.
        score = -val_loss
        is_first_call = self.best_score is None

        if not is_first_call and score < self.best_score + self.delta:
            # No sufficient improvement: count it and possibly raise the stop flag.
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
            return

        self.best_score = score
        self.save_checkpoint(val_loss)
        if not is_first_call:
            self.counter = 0

    def save_checkpoint(self, val_loss):
        """
        Record ``val_loss`` as the new best loss, announcing it when verbose.
        """
        if self.verbose:
            message = f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...'
            print(message)
        self.val_loss_min = val_loss

In [94]:
def train(model, optimizer, train_loader, val_loader, loss_fn, epochs=10,
          model_save_path='models/best_model.pth', early_stopping=None, device=DEVICE_TYPE):
    """Train ``model`` and keep the weights with the best validation accuracy.

    Args:
        model: Module called with a batch of input ids; must return class logits.
        optimizer: Optimizer stepping the model's parameters.
        train_loader: Yields ``((input_ids, attention_mask), labels)`` batches.
        val_loader: Same format as ``train_loader``; when ``None`` only the
            training loss is reported and no checkpoint is written.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        epochs: Maximum number of epochs.
        model_save_path: File that receives ``model.state_dict()`` each time the
            validation accuracy improves.
        early_stopping: Optional callable taking the validation loss and
            exposing an ``early_stop`` flag.
        device: Device the batches are moved to. The default is the value of
            ``DEVICE_TYPE`` at the time this function was defined.
    """
    set_seed(2023)
    best_accuracy = 0
    print("=========Starting Training==========")
    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
    print("-"*60)
    for epoch in range(epochs):
        model.train()
        t0_epoch = time.time()
        total_loss = 0
        for batch_X, batch_labels in train_loader:
            # The attention mask is part of the batch but unused by these models.
            input_ids, _attention_masks = batch_X
            model.zero_grad()
            # Use the `device` argument (not the global) so inputs and labels
            # always end up on the same device.
            logits = model(input_ids.to(device))
            loss = loss_fn(logits, batch_labels.to(device))
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        # max(..., 1) avoids a ZeroDivisionError for an empty loader.
        avg_train_loss = total_loss / max(len(train_loader), 1)

        if val_loader is None:
            time_elapsed = time.time() - t0_epoch
            print(f"{epoch + 1:^7} | {avg_train_loss:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")
            continue

        val_loss, val_accuracy = evaluate(model, val_loader, loss_fn, device)
        if early_stopping:
            early_stopping(val_loss)

        # Checkpoint and log before acting on the stop flag, so the epoch that
        # triggers early stopping is still saved (if best) and reported.
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            save_dir = os.path.dirname(model_save_path)
            if save_dir:  # a bare filename has no directory to create
                os.makedirs(save_dir, exist_ok=True)
            torch.save(model.state_dict(), model_save_path)
        time_elapsed = time.time() - t0_epoch
        print(f"{epoch + 1:^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")

        if early_stopping and early_stopping.early_stop:
            print("Early stopping")
            break
    print("\n")
    print("==========Best Accuracy After training================",best_accuracy)

In [95]:
def evaluate(model, val_loader, loss_fn, device=DEVICE_TYPE):
    """Compute validation loss and accuracy without tracking gradients.

    Args:
        model: Module called with a batch of input ids; must return class logits.
        val_loader: Yields ``((input_ids, attention_mask), labels)`` batches.
            The class index is taken as ``argmax`` over dim 1 of ``labels``.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        device: Device the batches are moved to. The default is the value of
            ``DEVICE_TYPE`` at the time this function was defined.

    Returns:
        ``(val_loss, val_accuracy)``: the mean of the per-batch losses and the
        percentage of correctly classified samples. Both are NaN when
        ``val_loader`` yields no batches.

    Note:
        The model is left in evaluation mode.
    """
    model.eval()
    batch_losses = []
    n_correct = 0
    n_samples = 0
    with torch.no_grad():
        for val_batch_input, val_batch_label in val_loader:
            input_ids, _attention_masks = val_batch_input
            val_batch_label = val_batch_label.to(device)
            # Use the `device` argument (not the global) for inputs as well.
            logits = model(input_ids.to(device))
            batch_losses.append(loss_fn(logits, val_batch_label).item())
            preds = torch.argmax(logits, dim=1)
            labels = torch.argmax(val_batch_label, dim=1)
            # Count samples rather than averaging batch accuracies, so a
            # smaller final batch is not over-weighted.
            n_correct += (preds == labels).sum().item()
            n_samples += labels.size(0)

    if n_samples == 0:
        return float("nan"), float("nan")

    val_loss = sum(batch_losses) / len(batch_losses)
    val_accuracy = 100.0 * n_correct / n_samples
    return val_loss, val_accuracy

### CNN Model

In [136]:
class CNN_Model(nn.Module):
    """Text classifier using parallel 1-D convolutions with max-over-time pooling.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding size, also the input channel count of each convolution.
        filter_sizes: Kernel width of each convolution branch.
        num_filters: Output channel count of each branch; must have the same
            length as ``filter_sizes``.
        num_classes: Number of output logits.
        dropout: Dropout probability applied after the first dense layer.

    Raises:
        ValueError: If ``filter_sizes`` and ``num_filters`` differ in length.
    """

    def __init__(self,
                 vocab_size,
                 embed_dim=256,
                 filter_sizes=(3, 4, 5),
                 num_filters=(200, 250, 200),
                 num_classes=3,
                 dropout=0.5
                 ):
        super().__init__()
        if len(filter_sizes) != len(num_filters):
            raise ValueError(
                "filter_sizes and num_filters must have the same length, "
                f"got {len(filter_sizes)} and {len(num_filters)}"
            )
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=n_out, kernel_size=width)
            for width, n_out in zip(filter_sizes, num_filters)
        ])
        # Plain int so that in_features is not a NumPy scalar.
        self.fc1 = nn.Linear(int(sum(num_filters)), 256)
        self.fc2 = nn.Linear(256, num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Return class logits for a batch of token ids ``x`` (batch, seq_len)."""
        # Conv1d expects channels before the sequence axis: (B, T, D) -> (B, D, T).
        embedded = self.embedding(x).permute(0, 2, 1)
        pooled = []
        for conv1d in self.conv1d_list:
            feature_map = F.relu(conv1d(embedded))
            # Max over the whole remaining sequence axis, then drop that axis.
            pooled.append(
                F.max_pool1d(feature_map, kernel_size=feature_map.shape[2]).squeeze(dim=2)
            )
        features = torch.cat(pooled, dim=1)
        hidden = self.dropout(F.relu(self.fc1(features)))
        logits = self.fc2(hidden)
        return logits

In [137]:
# Load the CNN hyper-parameters from the YAML config and add the vocabulary
# size, which the model constructor requires.
model_params = read_config_file(yml_file_path, cnn_model)
model_params['vocab_size'] = tokenizer_vocab_size

In [138]:
# Build the CNN from the loaded parameters and optimise all of its weights with Adadelta.
model = CNN_Model(**model_params)
optimizer = optim.Adadelta(model.parameters(), lr=0.01, rho=0.95)

In [139]:
# Cross-entropy over the class logits returned by the model.
loss_fn = nn.CrossEntropyLoss()

In [140]:
# Move the model to the selected device; the cell output is the module summary.
model.to(DEVICE_TYPE)

CNN_Model(
  (embedding): Embedding(30522, 128)
  (conv1d_list): ModuleList(
    (0): Conv1d(128, 64, kernel_size=(3,), stride=(1,))
    (1): Conv1d(128, 128, kernel_size=(4,), stride=(1,))
    (2): Conv1d(128, 256, kernel_size=(5,), stride=(1,))
  )
  (fc1): Linear(in_features=448, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [141]:
# Fresh tracker per model: stop after 3 consecutive epochs without validation-loss improvement.
early_stopping = EarlyStopping(patience=3, verbose=True)

In [142]:
# Sample run of a single epoch; train() writes the weights to the given path
# whenever the validation accuracy improves.
train(model, optimizer, train_loader, valid_loader, loss_fn, epochs=1, model_save_path=f'models/{cnn_model}_epoch_1', early_stopping=early_stopping)

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
Validation loss decreased (inf --> 0.540565).  Saving model ...
   1    |   0.652857   |  0.540565  |   75.51   |  207.71  




### LSTM Text Classifier Model

In [143]:
class LSTM_Text_Classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        """
        Build an embedding -> LSTM -> linear classifier.

        Parameters:
        @vocab_size: Size of the vocabulary.
        @embedding_dim: Dimension of the input embeddings.
        @hidden_dim: Dimension of the hidden state in the LSTM.
        @output_dim: Number of classes in the output layer.
        @n_layers: Number of layers in the LSTM.
        @bidirectional: If True, initializes a bidirectional LSTM.
        @dropout: Dropout rate, used both between LSTM layers and before the output layer.
        """
        super().__init__()

        # A bidirectional LSTM yields one final state per direction, so the
        # classifier input is twice as wide in that case.
        classifier_in = hidden_dim * 2 if bidirectional else hidden_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(classifier_in, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Forward pass through the model.
        Params:
        @x: input_ids
        Returns: The logits for each class.
        """
        embedded = self.embedding(x)
        _, (final_hidden, _) = self.lstm(embedded)
        if not self.lstm.bidirectional:
            # Final state of the top layer.
            summary = final_hidden[-1]
        else:
            # Last two entries are the top layer's forward and backward states.
            summary = torch.cat((final_hidden[-2], final_hidden[-1]), dim=1)
        return self.fc(self.dropout(summary))

In [144]:
# Load the LSTM classifier hyper-parameters and add the required vocabulary size.
model_params = read_config_file(yml_file_path, lstm_tc_model)
model_params['vocab_size'] = tokenizer_vocab_size

In [145]:
# Note: `model` and `optimizer` are rebound here, replacing the previous section's objects.
model = LSTM_Text_Classifier(**model_params)
optimizer = optim.Adadelta(model.parameters(), lr=0.01, rho=0.95)

In [146]:
# Cross-entropy over the class logits returned by the model.
loss_fn = nn.CrossEntropyLoss()

In [147]:
# Move the model to the selected device; the cell output is the module summary.
model.to(DEVICE_TYPE)

LSTM_Text_Classifier(
  (embedding): Embedding(30522, 128)
  (lstm): LSTM(128, 128, num_layers=6, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=256, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [148]:
# Fresh tracker so that state from the previous model's run is not carried over.
early_stopping = EarlyStopping(patience=3, verbose=True)

In [149]:
# Sample run of a single epoch; weights are saved when the validation accuracy improves.
train(model, optimizer, train_loader, valid_loader, loss_fn, epochs=1, model_save_path=f'models/{lstm_tc_model}_epoch_1', early_stopping=early_stopping)

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
Validation loss decreased (inf --> 1.098479).  Saving model ...
   1    |   1.098663   |  1.098479  |   34.19   |  697.25  




### MLP Classifier Model

In [150]:
class MLP_Classifier(nn.Module):
    def __init__(self, vocab_size, input_dim, hidden_size, num_classes=2, dropout=0.5, max_seq_len=120):
        """
        Embedding followed by a four-layer perceptron over the flattened sequence.

        Params:
        @vocab_size: Size of the vocabulary.
        @input_dim: Dimension of the input embeddings.
        @hidden_size: Width of the first dense layer; the next two layers use
                      hidden_size // 2 and hidden_size // 4.
        @num_classes: Number of classes in the output layer.
        @dropout: Dropout rate for regularization.
        @max_seq_len: Number of tokens in every input sequence. The embeddings
                      are flattened, so inputs must have exactly this length.
                      Defaults to 120, the value that was previously hard-coded.
        """
        super().__init__()
        self.max_seq_len = max_seq_len
        self.embedding = nn.Embedding(vocab_size, input_dim)
        self.fc1 = nn.Linear(input_dim * max_seq_len, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size // 2)
        self.fc3 = nn.Linear(hidden_size // 2, hidden_size // 4)
        self.fc4 = nn.Linear(hidden_size // 4, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class logits for a batch of token ids ``x`` (batch, max_seq_len)."""
        embeddings = self.embedding(x)
        # Flatten (batch, seq, dim) to (batch, seq * dim) for the dense layers.
        x = embeddings.view(embeddings.size(0), -1)
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = self.dropout(self.relu(hidden_layer(x)))
        return self.fc4(x)


In [151]:
# Load the MLP hyper-parameters and add the required vocabulary size.
model_params = read_config_file(yml_file_path, mlp_class_model)
model_params['vocab_size'] = tokenizer_vocab_size

In [152]:
# Note: `model` and `optimizer` are rebound here, replacing the previous section's objects.
model = MLP_Classifier(**model_params)
optimizer = optim.Adadelta(model.parameters(), lr=0.01, rho=0.95)

In [153]:
# Cross-entropy over the class logits returned by the model.
loss_fn = nn.CrossEntropyLoss()

In [154]:
# Move the model to the selected device; the cell output is the module summary.
model.to(DEVICE_TYPE)

MLP_Classifier(
  (embedding): Embedding(30522, 128)
  (fc1): Linear(in_features=15360, out_features=2048, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=2048, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=512, bias=True)
  (fc4): Linear(in_features=512, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [155]:
# Fresh tracker so that state from the previous model's run is not carried over.
early_stopping = EarlyStopping(patience=3, verbose=True)

In [156]:
# Sample run of a single epoch; weights are saved when the validation accuracy improves.
train(model, optimizer, train_loader, valid_loader, loss_fn, epochs=1, model_save_path=f'models/{mlp_class_model}_epoch_1', early_stopping=early_stopping)

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
Validation loss decreased (inf --> 0.834795).  Saving model ...
   1    |   0.897569   |  0.834795  |   59.79   |  174.80  




### LSTM Multi Head Attention

In [157]:
class LSTM_Multi_Head_Attention(nn.Module):
    def __init__(self, vocab_size, input_dim, hidden_dim, output_dim, num_layers, bidirectional , dropout,
                num_heads):
        """
        LSTM encoder followed by per-head self-attention and a linear classifier.

        Params:
        @vocab_size: Size of the vocabulary.
        @input_dim: Dimension of the input embeddings.
        @hidden_dim: Dimension of the hidden state in the LSTM.
        @output_dim: Number of classes in the output layer.
        @num_layers: Number of layers in the LSTM.
        @bidirectional: If True, initializes a bidirectional LSTM.
        @dropout: Dropout rate, used between LSTM layers and on Q, K and V.
        @num_heads: Number of attention heads; must divide the LSTM output size
                    (hidden_dim, or 2 * hidden_dim when bidirectional).
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, input_dim)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, bidirectional=self.bidirectional, batch_first=True, dropout=dropout)

        # Width of each LSTM output vector; every later layer is sized from it
        # so that both the uni- and bidirectional configurations are consistent.
        lstm_out_dim = hidden_dim * 2 if self.bidirectional else hidden_dim
        assert lstm_out_dim % num_heads == 0, \
            "LSTM output size (hidden_dim * num_directions) must be divisible by the number of heads"
        self.head_dim = lstm_out_dim // num_heads

        # Attention Layer: one independent Q/K/V projection per head.
        self.query_layers = nn.ModuleList([nn.Linear(lstm_out_dim, self.head_dim) for _ in range(self.num_heads)])
        self.key_layers = nn.ModuleList([nn.Linear(lstm_out_dim, self.head_dim) for _ in range(self.num_heads)])
        self.value_layers = nn.ModuleList([nn.Linear(lstm_out_dim, self.head_dim) for _ in range(self.num_heads)])
        # Concatenated heads have width head_dim * num_heads == lstm_out_dim.
        self.fc = nn.Linear(lstm_out_dim, output_dim)
        self.last_attention_weights = None
        self.dropout = nn.Dropout(p=dropout)

    def attention_net(self, lstm_outputs):
        """
        Multi-Head Attention mechanism

        Applies scaled dot-product self-attention per head over ``lstm_outputs``
        and concatenates the head outputs along the last dimension.

        Returns: (concatenated head outputs, list of per-head attention weights).
        The weights are also kept in ``self.last_attention_weights``.
        """
        attention_outputs = []
        attention_scores = []
        projections = zip(self.query_layers, self.key_layers, self.value_layers)
        for query_layer, key_layer, value_layer in projections:
            Q = self.dropout(query_layer(lstm_outputs))
            K = self.dropout(key_layer(lstm_outputs))
            V = self.dropout(value_layer(lstm_outputs))
            scores = torch.bmm(Q, K.transpose(1, 2)) / (self.head_dim ** 0.5)
            scores = F.softmax(scores, dim=-1)
            attention_scores.append(scores)
            attention_outputs.append(torch.bmm(scores, V))
        final_output = torch.cat(attention_outputs, dim=-1)
        self.last_attention_weights = attention_scores
        return final_output, attention_scores

    def forward(self, x):
        """Return class logits for a batch of token ids ``x`` (batch, seq_len)."""
        x = self.embedding(x)
        # nn.LSTM starts from zero hidden/cell states when none are given, which
        # matches the explicit zero tensors used before without depending on a
        # global device or on the number of directions.
        lstm_out, _ = self.lstm(x)
        attention_output, _ = self.attention_net(lstm_out)
        # Average over the sequence axis to get one vector per example.
        final_attention_output = torch.mean(attention_output, dim=1)
        out = self.fc(final_attention_output)
        return out

In [158]:
# Load the attention-LSTM hyper-parameters and add the required vocabulary size.
model_params = read_config_file(yml_file_path, lstm_multihead)
model_params['vocab_size'] = tokenizer_vocab_size

In [159]:
# Note: `model` and `optimizer` are rebound here, replacing the previous section's objects.
model = LSTM_Multi_Head_Attention(**model_params)
optimizer = optim.Adadelta(model.parameters(), lr=0.01, rho=0.95)

In [160]:
# Cross-entropy over the class logits returned by the model.
loss_fn = nn.CrossEntropyLoss()

In [161]:
# Move the model to the selected device; the cell output is the module summary.
model.to(DEVICE_TYPE)

LSTM_Multi_Head_Attention(
  (embedding): Embedding(30522, 128)
  (lstm): LSTM(128, 128, num_layers=6, batch_first=True, dropout=0.5, bidirectional=True)
  (query_layers): ModuleList(
    (0-7): 8 x Linear(in_features=256, out_features=32, bias=True)
  )
  (key_layers): ModuleList(
    (0-7): 8 x Linear(in_features=256, out_features=32, bias=True)
  )
  (value_layers): ModuleList(
    (0-7): 8 x Linear(in_features=256, out_features=32, bias=True)
  )
  (fc): Linear(in_features=256, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [162]:
# Fresh tracker so that state from the previous model's run is not carried over.
early_stopping = EarlyStopping(patience=3, verbose=True)

In [163]:
# Sample run of a single epoch; weights are saved when the validation accuracy improves.
train(model, optimizer, train_loader, valid_loader, loss_fn, epochs=1, model_save_path=f'models/{lstm_multihead}_epoch_1', early_stopping=early_stopping)

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
Validation loss decreased (inf --> 1.098421).  Saving model ...
   1    |   1.098405   |  1.098421  |   34.19   |  1038.37 




### RCNN Text Classifier (Combines LSTM and CNN)

In [164]:
class RCNN_Text_Classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout):
        """
        Recurrent-convolutional classifier: a single-layer bidirectional LSTM
        whose outputs are concatenated with the embeddings and fed to a Conv1d.

        Params:
        @vocab_size: Size of vocabulary.
        @embedding_dim: Dimension of input embeddings.
        @hidden_dim: Hidden state size for LSTM.
        @output_dim: Number of output classes.
        @dropout: Dropout rate applied to the pooled features.
        """
        super().__init__()
        # Each token is described by its embedding plus both LSTM directions.
        conv_in_channels = embedding_dim + 2 * hidden_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        self.conv = nn.Conv1d(
            in_channels=conv_in_channels,
            out_channels=128,
            kernel_size=3,
            padding=1,
        )
        self.fc = nn.Linear(128, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class logits for a batch of token ids ``x``."""
        token_vectors = self.embedding(x)                      # [B, T, D]
        context_vectors, _ = self.lstm(token_vectors)          # [B, T, 2H]
        features = torch.cat([token_vectors, context_vectors], dim=2)  # [B, T, D+2H]
        # Conv1d wants channels ahead of the sequence axis.
        features = features.permute(0, 2, 1)                   # [B, D+2H, T]
        activated = F.relu(self.conv(features))                # [B, 128, T]
        # Max over the full sequence axis, then drop that axis.
        seq_len = activated.shape[2]
        pooled = F.max_pool1d(activated, kernel_size=seq_len).squeeze(2)  # [B, 128]
        return self.fc(self.dropout(pooled))

In [170]:
# Load the RCNN hyper-parameters and add the required vocabulary size.
rcnn_model_params = read_config_file(yml_file_path, rcnn)
rcnn_model_params['vocab_size'] = tokenizer_vocab_size

In [171]:
# This section uses its own variable names, so the previous `model`/`optimizer` stay untouched.
rcnn_model = RCNN_Text_Classifier(**rcnn_model_params)
rcnn_optimizer = optim.Adadelta(rcnn_model.parameters(), lr=0.01, rho=0.95)

In [172]:
# Cross-entropy over the class logits returned by the model.
loss_fn = nn.CrossEntropyLoss()

In [173]:
# Move the model to the selected device; the cell output is the module summary.
rcnn_model.to(DEVICE_TYPE)

RCNN_Text_Classifier(
  (embedding): Embedding(30522, 128)
  (lstm): LSTM(128, 128, batch_first=True, bidirectional=True)
  (conv): Conv1d(384, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (fc): Linear(in_features=128, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [174]:
# Dedicated tracker for the RCNN run: stop after 3 consecutive epochs without validation-loss improvement.
rcnn_early_stopping = EarlyStopping(patience=3, verbose=True)

In [175]:
# Sample run of a single epoch; weights are saved when the validation accuracy improves.
train(rcnn_model, rcnn_optimizer, train_loader, valid_loader, loss_fn, epochs=1, model_save_path=f'models/{rcnn}_epoch_1', early_stopping=rcnn_early_stopping)

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
Validation loss decreased (inf --> 0.569093).  Saving model ...
   1    |   0.692761   |  0.569093  |   74.90   |  244.58  


