In [3]:
import torch
from torchtext import data
import random
import torch.nn as nn
import torch.optim as optim
import torchtext.vocab as vocab
import torch.nn.functional as F
import nltk
from nltk.tokenize import word_tokenize

In [8]:
custom_embeddings = vocab.Vectors(name='W2V-Kin-50.txt',
                                  cache='cache',
                                  unk_init=torch.Tensor.normal_)

In [4]:
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Define fields to hold the data using NLTK tokenizer
LABEL = data.LabelField(dtype=torch.float)
TITLE = data.Field(tokenize=word_tokenize)
TEXT = data.Field(tokenize=word_tokenize)

fields = [('label', LABEL), ('title', TITLE), ('content', TEXT)]

In [6]:
train_data, test_data = data.TabularDataset.splits(
    path='cleaned',
    train='train.csv', 
    test='test.csv',
    format='csv',
    fields=fields,
    skip_header=True  # dataset has a header(title)
)

In [9]:
# Perform train/validation set split
train_data, valid_data = train_data.split(split_ratio=0.9, random_state=random.seed(SEED))

# Build the vocabulary
TEXT.build_vocab(train_data.title, train_data.content, max_size=15000, vectors=custom_embeddings)
TITLE.vocab = TEXT.vocab
LABEL.build_vocab(train_data)

In [10]:
len(TEXT.vocab)

15002

In [6]:
# Create the iterator and place the tensor it returned on GPU(if it is available)
BATCH_SIZE = 32
device = torch.device('mps' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.content),
    device=device
)

In [7]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers,
                          bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        output, hidden = self.rnn(embedded)
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        return self.fc(hidden.squeeze(0))

In [12]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 50
HIDDEN_DIM = 256
OUTPUT_DIM = len(LABEL.vocab)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

In [14]:
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

optimizer = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [15]:
def multiclass_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    rounded_preds = torch.max(preds, 1)[1]
    correct = (rounded_preds == y).float()  # convert into float for division
    acc = correct.sum() / len(correct)
    return acc

In [17]:
def multiclass_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    rounded_preds = torch.max(preds, 1)[1]
    correct = (rounded_preds == y).float()  # convert into float for division
    acc = correct.sum() / len(correct)
    return acc

# Train the model
def train(model, iterator, optimizer, criterion):
    """Training the model"""
    epoch_loss = 0
    epoch_acc = 0
    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        predictions = model(torch.cat((batch.title, batch.content), 0)).squeeze(1)
        loss = criterion(predictions, batch.label.type(torch.long).to(device))
        acc = multiclass_accuracy(predictions, batch.label.type(torch.long).to(device))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# Evaluate the model
def evaluate(model, iterator, criterion):
    """Evaluating the model"""
    epoch_loss = 0
    epoch_acc = 0
    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(torch.cat((batch.title, batch.content), 0)).squeeze(1)
            loss = criterion(predictions, batch.label.type(torch.long).to(device))
            acc = multiclass_accuracy(predictions, batch.label.type(torch.long).to(device))
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


In [18]:
from tqdm import tqdm

if __name__ == "__main__":
    N_EPOCHS = 10

    for epoch in range(N_EPOCHS):
        # Wrapping train_iterator with tqdm for a progress bar
        train_iterator = tqdm(train_iterator, desc=f"Training Epoch {epoch + 1}")
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        
        # Wrapping valid_iterator with tqdm for a progress bar
        valid_iterator = tqdm(valid_iterator, desc=f"Validating Epoch {epoch + 1}")
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        
        print(f'\n| Epoch: {epoch + 1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}% |'
              f' Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc * 100:.2f}% |')

    # Wrapping test_iterator with tqdm for a progress bar
    test_iterator = tqdm(test_iterator, desc="Testing")
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}% |')

    print(f'The model has {count_parameters(model):,} trainable parameters')


Training Epoch 1: 100%|███████████████████████| 466/466 [32:01<00:00,  4.12s/it]
Validating Epoch 1: 100%|███████████████████████| 52/52 [01:01<00:00,  1.18s/it]



| Epoch: 01 | Train Loss: 1.018 | Train Acc: 69.50% | Val. Loss: 0.679 | Val. Acc: 79.69% |


Training Epoch 2: 100%|███████████████████████| 466/466 [35:57<00:00,  4.63s/it]
Validating Epoch 2: 100%|███████████████████████| 52/52 [00:59<00:00,  1.14s/it]



| Epoch: 02 | Train Loss: 0.615 | Train Acc: 81.62% | Val. Loss: 0.557 | Val. Acc: 83.14% |


Training Epoch 3: 100%|███████████████████████| 466/466 [42:12<00:00,  5.43s/it]
Validating Epoch 3: 100%|███████████████████████| 52/52 [00:49<00:00,  1.06it/s]



| Epoch: 03 | Train Loss: 0.502 | Train Acc: 84.76% | Val. Loss: 0.485 | Val. Acc: 84.99% |


Training Epoch 4: 100%|███████████████████████| 466/466 [40:56<00:00,  5.27s/it]
Validating Epoch 4: 100%|███████████████████████| 52/52 [02:26<00:00,  2.82s/it]



| Epoch: 04 | Train Loss: 0.421 | Train Acc: 87.23% | Val. Loss: 0.461 | Val. Acc: 85.97% |


Training Epoch 5: 100%|███████████████████████| 466/466 [36:50<00:00,  4.74s/it]
Validating Epoch 5: 100%|███████████████████████| 52/52 [00:42<00:00,  1.21it/s]



| Epoch: 05 | Train Loss: 0.368 | Train Acc: 88.70% | Val. Loss: 0.407 | Val. Acc: 88.50% |


Training Epoch 6: 100%|███████████████████████| 466/466 [30:45<00:00,  3.96s/it]
Validating Epoch 6: 100%|███████████████████████| 52/52 [00:52<00:00,  1.01s/it]



| Epoch: 06 | Train Loss: 0.333 | Train Acc: 89.85% | Val. Loss: 0.405 | Val. Acc: 88.03% |


Training Epoch 7: 100%|███████████████████████| 466/466 [31:58<00:00,  4.12s/it]
Validating Epoch 7: 100%|███████████████████████| 52/52 [00:39<00:00,  1.31it/s]



| Epoch: 07 | Train Loss: 0.296 | Train Acc: 90.79% | Val. Loss: 0.400 | Val. Acc: 89.34% |


Training Epoch 8: 100%|███████████████████████| 466/466 [31:16<00:00,  4.03s/it]
Validating Epoch 8: 100%|███████████████████████| 52/52 [00:43<00:00,  1.19it/s]



| Epoch: 08 | Train Loss: 0.266 | Train Acc: 91.72% | Val. Loss: 0.402 | Val. Acc: 88.20% |


Training Epoch 9: 100%|███████████████████████| 466/466 [30:40<00:00,  3.95s/it]
Validating Epoch 9: 100%|███████████████████████| 52/52 [00:50<00:00,  1.04it/s]



| Epoch: 09 | Train Loss: 0.237 | Train Acc: 92.63% | Val. Loss: 0.454 | Val. Acc: 87.79% |


Training Epoch 10: 100%|██████████████████████| 466/466 [31:32<00:00,  4.06s/it]
Validating Epoch 10: 100%|██████████████████████| 52/52 [01:21<00:00,  1.57s/it]



| Epoch: 10 | Train Loss: 0.219 | Train Acc: 93.12% | Val. Loss: 0.422 | Val. Acc: 88.36% |


Testing: 100%|████████████████████████████████| 130/130 [02:30<00:00,  1.16s/it]

| Test Loss: 0.433 | Test Acc: 88.51% |
The model has 2,412,064 trainable parameters





In [19]:
torch.save(model.state_dict(), 'bigru_kinn_model.pt')

## Kirundi - without fine tuning

In [20]:
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Define fields to hold the data using NLTK tokenizer
LABEL = data.LabelField(dtype=torch.long, use_vocab=True)
TITLE = data.Field(tokenize=word_tokenize)
TEXT = data.Field(tokenize=word_tokenize)

fields = [('label', LABEL), ('title', TITLE), ('content', TEXT)]

In [21]:
# Load your dataset (modify according to your actual dataset path and format)
train_data, test_data = data.TabularDataset.splits(
    path='cleaned',  # Change to the directory containing your CSV files
    train='train.csv',  # Change to your train CSV file name
    test='kir_test.csv',  # Change to your test CSV file name
    format='csv',
    fields=fields,
    skip_header=True  # dataset has a header
)

In [22]:
TEXT.build_vocab(train_data, max_size=15000, vectors=custom_embeddings)
TITLE.vocab = TEXT.vocab  # Sharing the same vocab between TITLE and TEXT fields
LABEL.build_vocab(train_data)

In [23]:
# Create the iterator and place the tensor it returned on GPU(if it is available)
BATCH_SIZE = 32
device = torch.device('mps' if torch.cuda.is_available() else 'cpu')

test_iterator = data.BucketIterator(
    (test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.content),
    device=device
)

In [24]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers,
                          bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        output, hidden = self.rnn(embedded)
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        return self.fc(hidden.squeeze(0))

In [25]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 50
HIDDEN_DIM = 256
OUTPUT_DIM = len(LABEL.vocab)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

In [26]:
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

optimizer = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [27]:
model_path = 'bigru_kinn_model.pt'
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

RNN(
  (embedding): Embedding(15002, 50)
  (rnn): GRU(50, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=12, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [29]:
from sklearn.metrics import f1_score

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    all_preds = []
    all_labels = []

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.content).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = (predictions.argmax(1) == batch.label).sum().item() / len(batch.label)
            
            # Store predictions and labels for F1 score calculation
            all_preds.extend(predictions.argmax(1).cpu().numpy())
            all_labels.extend(batch.label.cpu().numpy())

            epoch_loss += loss.item()
            epoch_acc += acc

    # Calculate F1 score
    f1 = f1_score(all_labels, all_preds, average='weighted')  # or 'macro'/'micro' based on your need

    return epoch_loss / len(iterator), epoch_acc / len(iterator), f1

# Evaluate the model
test_loss, test_acc, test_f1 = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Test F1: {test_f1:.2f}')


Test Loss: 3.771 | Test Acc: 24.04% | Test F1: 0.23


## Kirundi - after fine tuning

In [110]:
kir_embeddings = vocab.Vectors(name='W2V-Kir-50.txt',
                                  cache='cache',
                                  unk_init=torch.Tensor.normal_)

In [111]:
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [112]:
# Define fields to hold the data using NLTK tokenizer
LABEL = data.LabelField(dtype=torch.long, use_vocab=True)
TITLE = data.Field(tokenize=word_tokenize)
TEXT = data.Field(tokenize=word_tokenize)

fields = [('label', LABEL), ('title', TITLE), ('content', TEXT)]

In [113]:
train_data, test_data = data.TabularDataset.splits(
    path='cleaned',  # Change to the directory containing your CSV files
    train='kir_train.csv',  # Change to your Kirundi train CSV file name
    test='kir_test.csv',  # Change to your test CSV file name
    format='csv',
    fields=fields,
    skip_header=True  # dataset has a header
)

In [114]:
train_data, valid_data = train_data.split(split_ratio=0.9, random_state=random.seed(SEED))

In [115]:
TEXT.build_vocab(train_data, max_size=15000, vectors=kir_embeddings)
TITLE.vocab = TEXT.vocab  # Sharing the same vocab between TITLE and TEXT fields
LABEL.build_vocab(train_data)

In [116]:
BATCH_SIZE = 32
device = torch.device('mps' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.content),
    device=device
)

In [117]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers,
                          bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        output, hidden = self.rnn(embedded)
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        return self.fc(hidden.squeeze(0))

In [118]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 50  # Updated to match the GloVe embeddings dimension
N_FILTERS = 150
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = len(LABEL.vocab)
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [119]:
pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

optimizer = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [75]:
model_path = 'bigru_kinn_model.pt'
model.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

In [76]:
# Fine-tuning the model on Kirundi training data
N_EPOCHS = 50

# Initialize variables to track the best model
best_valid_loss = float('inf')
model_path = 'Bigru_tuned_model.pt'

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.content).squeeze(1)
        loss = criterion(predictions, batch.label.long())  # Convert batch.label to LongTensor
        acc = (predictions.argmax(1) == batch.label.long()).sum().item() / len(batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.content).squeeze(1)
            loss = criterion(predictions, batch.label.long())  # Convert batch.label to LongTensor
            acc = (predictions.argmax(1) == batch.label.long()).sum().item() / len(batch.label)
            
            epoch_loss += loss.item()
            epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# Training loop with validation and checkpointing
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')
    
    # Save the model if the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_path)
        print(f'Saved best model with validation loss: {best_valid_loss:.3f}')


Epoch: 01, Train Loss: 1.947, Train Acc: 36.61%, Val. Loss: 1.720, Val. Acc: 41.68%
Saved best model with validation loss: 1.720
Epoch: 02, Train Loss: 1.483, Train Acc: 50.20%, Val. Loss: 1.356, Val. Acc: 58.61%
Saved best model with validation loss: 1.356
Epoch: 03, Train Loss: 1.190, Train Acc: 61.67%, Val. Loss: 1.170, Val. Acc: 64.60%
Saved best model with validation loss: 1.170
Epoch: 04, Train Loss: 0.948, Train Acc: 68.79%, Val. Loss: 0.857, Val. Acc: 72.12%
Saved best model with validation loss: 0.857
Epoch: 05, Train Loss: 0.770, Train Acc: 74.20%, Val. Loss: 0.850, Val. Acc: 72.15%
Saved best model with validation loss: 0.850
Epoch: 06, Train Loss: 0.610, Train Acc: 79.52%, Val. Loss: 0.659, Val. Acc: 78.37%
Saved best model with validation loss: 0.659
Epoch: 07, Train Loss: 0.521, Train Acc: 81.33%, Val. Loss: 0.657, Val. Acc: 79.67%
Saved best model with validation loss: 0.657
Epoch: 08, Train Loss: 0.436, Train Acc: 83.31%, Val. Loss: 0.635, Val. Acc: 79.15%
Saved best mo

KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), 'Bigru_tuned_model.pt')

In [77]:
model.eval()

RNN(
  (embedding): Embedding(15002, 50)
  (rnn): GRU(50, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=12, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [122]:
from sklearn.metrics import f1_score

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    all_preds = []
    all_labels = []

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.content).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = (predictions.argmax(1) == batch.label).sum().item() / len(batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc

            all_preds.extend(predictions.argmax(1).cpu().numpy())
            all_labels.extend(batch.label.cpu().numpy())

    f1 = f1_score(all_labels, all_preds, average='weighted')

    return epoch_loss / len(iterator), epoch_acc / len(iterator), f1

test_loss, test_acc, test_f1 = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Test F1 Score: {test_f1:.3f}')


Test Loss: 0.432 | Test Acc: 87.26% | Test F1 Score: 0.870


## Evaluating forgetting on Kinyarwanda

In [79]:
custom_embeddings = vocab.Vectors(name='W2V-Kin-50.txt',
                                  cache='cache',
                                  unk_init=torch.Tensor.normal_)

In [80]:
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Define fields to hold the data using NLTK tokenizer
LABEL = data.LabelField(dtype=torch.long, use_vocab=True)
TITLE = data.Field(tokenize=word_tokenize)
TEXT = data.Field(tokenize=word_tokenize)

fields = [('label', LABEL), ('title', TITLE), ('content', TEXT)]

In [92]:
train_data, test_data = data.TabularDataset.splits(
    path='cleaned',
    train='train.csv', 
    test='kir_test.csv',
    format='csv',
    fields=fields,
    skip_header=True  # dataset has a header(title)
)

In [93]:
TEXT.build_vocab(train_data, max_size=15000, vectors=kir_embeddings)
TITLE.vocab = TEXT.vocab  # Sharing the same vocab between TITLE and TEXT fields
LABEL.build_vocab(train_data)

In [94]:
train_data, valid_data = train_data.split(split_ratio=0.9, random_state=random.seed(SEED))

In [96]:
BATCH_SIZE = 32
device = torch.device('mps' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.content),
    device=device
)

In [97]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers,
                          bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        output, hidden = self.rnn(embedded)
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        return self.fc(hidden.squeeze(0))

In [98]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 50
HIDDEN_DIM = 256
OUTPUT_DIM = len(LABEL.vocab)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

In [99]:
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

optimizer = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [100]:
model_path = 'Bigru_tuned_model.pt'
model.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

In [101]:
model.eval()

RNN(
  (embedding): Embedding(15002, 50)
  (rnn): GRU(50, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=12, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [102]:
from sklearn.metrics import f1_score

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    all_preds = []
    all_labels = []

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.content).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = (predictions.argmax(1) == batch.label).sum().item() / len(batch.label)
            
            # Store predictions and labels for F1 score calculation
            all_preds.extend(predictions.argmax(1).cpu().numpy())
            all_labels.extend(batch.label.cpu().numpy())

            epoch_loss += loss.item()
            epoch_acc += acc

    # Calculate F1 score
    f1 = f1_score(all_labels, all_preds, average='weighted')  # or 'macro'/'micro' based on your need

    return epoch_loss / len(iterator), epoch_acc / len(iterator), f1

# Evaluate the model
test_loss, test_acc, test_f1 = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Test F1: {test_f1:.2f}')


Test Loss: 4.443 | Test Acc: 16.46% | Test F1: 0.14
