In [None]:
import torch
from torchtext import data
import random
import torch.nn as nn
import torch.optim as optim
import torchtext.vocab as vocab
import torch.nn.functional as F
import nltk
from nltk.tokenize import word_tokenize

In [None]:
# Pre-trained word vectors read from 'W2V-Kin-50.txt' (presumably 50-dimensional
# Kinyarwanda word2vec, judging by the file name). Tokens missing from the file
# get a vector drawn with torch.Tensor.normal_; parsed vectors are cached in ./cache.
custom_embeddings = vocab.Vectors(
    name='W2V-Kin-50.txt',
    unk_init=torch.Tensor.normal_,
    cache='cache',
)

In [None]:
# Ensure the necessary NLTK tokenizer data is downloaded.
# word_tokenize (the tokenizer handed to the torchtext fields) needs the 'punkt' models.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' — confirm.
nltk.download('punkt')

In [None]:
# Reproducibility: seed the CPU and CUDA generators and request deterministic cuDNN kernels.
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Define fields to hold the data using NLTK tokenizer.
# Labels are class indices consumed by nn.CrossEntropyLoss, which requires integer
# (long) targets, so the label field is numericalised as torch.long directly instead
# of as floats that have to be cast back on every batch.
LABEL = data.LabelField(dtype=torch.long)
TITLE = data.Field(tokenize=word_tokenize)
TEXT = data.Field(tokenize=word_tokenize)

# (column name, field) pairs; assumed to follow the CSV column order label, title, content.
fields = [('label', LABEL), ('title', TITLE), ('content', TEXT)]

In [None]:
# Parse the Kinyarwanda train/test CSV files found under ./cleaned into torchtext
# datasets, mapping columns to the fields declared above.
train_data, test_data = data.TabularDataset.splits(
    path='cleaned',
    format='csv',
    skip_header=True,  # the first CSV row holds column names, not an example
    fields=fields,
    train='zero_kin_train.csv',
    test='zero_kin_test.csv',
)

In [None]:
# Perform train/validation set split.
# random.seed() returns None, so its result must not be used as `random_state`.
# Seed first, then hand the resulting generator state to split() explicitly so the
# 90/10 split is reproducible by construction rather than by side effect.
random.seed(SEED)
train_data, valid_data = train_data.split(split_ratio=0.9, random_state=random.getstate())

# Build the vocabulary from both text columns of the training split (at most 15000
# tokens plus specials) and attach the pretrained vectors to it.
TEXT.build_vocab(train_data.title, train_data.content, max_size=15000, vectors=custom_embeddings)
# Titles and contents are fed to the same embedding layer, so they share one vocabulary.
TITLE.vocab = TEXT.vocab
LABEL.build_vocab(train_data)

In [None]:
# Create the iterators and place the tensors they return on the best available device.
BATCH_SIZE = 32

# Probe each accelerator separately: CUDA availability says nothing about MPS.
# Preference order is CUDA, then Apple's MPS backend, then CPU.
_mps_backend = getattr(torch.backends, 'mps', None)  # attribute is absent in older PyTorch builds
if torch.cuda.is_available():
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# BucketIterator groups examples of similar `content` length to limit padding.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.content),
    device=device
)

In [None]:
# Build the model
class CNN(nn.Module):
    """Convolutional text classifier: embedding -> parallel convolutions -> max-pool -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each token vector (also the kernel width).
        n_filters: output channels of every convolution.
        filter_sizes: kernel heights, i.e. how many consecutive tokens each
            convolution spans.
        output_dim: number of output logits.
        dropout: dropout probability applied to the pooled features.
        pad_idx: index of the padding token in the embedding table.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters,
                 filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()

        # Plain Python attributes (not parameters or buffers), so the keys of
        # state_dict() are unchanged and existing checkpoints still load.
        self.min_len = max(filter_sizes, default=0)
        self.fill_idx = 0 if pad_idx is None else pad_idx

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=n_filters,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map token ids of shape (seq_len, batch) to logits of shape (batch, output_dim)."""
        text = text.permute(1, 0)  # -> (batch, seq_len)

        # A convolution cannot slide over fewer tokens than its kernel height, so
        # right-pad short batches with the padding token up to the largest kernel.
        missing = self.min_len - text.shape[1]
        if missing > 0:
            text = F.pad(text, (0, missing), value=self.fill_idx)

        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)  # add the single input channel
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max over the whole remaining sequence axis: one value per filter.
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

In [None]:
# Hyperparameters for the CNN classifier.
N_FILTERS = 150
FILTER_SIZES = [3, 4, 5]
DROPOUT = 0.5
EMBEDDING_DIM = 50  # must equal the width of the pretrained vectors copied into the embedding layer

# Sizes and indices derived from the vocabularies built earlier.
INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(LABEL.vocab)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]


In [None]:
# Instantiate the classifier with the hyperparameters defined above.
model = CNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    n_filters=N_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

# Start the embedding table from the pretrained vectors attached to the vocabulary.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# The unknown and padding tokens carry no pretrained meaning: zero both rows.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[[UNK_IDX, PAD_IDX]] = torch.zeros(EMBEDDING_DIM)

# Adam with its default settings; cross-entropy over the class logits.
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# Move model and loss to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def multiclass_accuracy(preds, y):
    """
    Fraction of rows in `preds` whose highest-scoring column equals the label in `y`
    (8 correct out of 10 gives 0.8, not 8). Returned as a 0-dim tensor.
    """
    _, predicted_classes = preds.max(1)
    hits = predicted_classes.eq(y).float()  # 1.0 where prediction matches, else 0.0
    return hits.sum() / hits.shape[0]

In [None]:
# NOTE(review): same behaviour as the definition in the previous cell; this one shadows it.
def multiclass_accuracy(preds, y):
    """
    Per-batch accuracy as a 0-dim tensor in [0, 1]: the share of examples whose
    arg-max over dim 1 of `preds` equals `y` (8/10 right gives 0.8, not 8).
    """
    top_class = torch.max(preds, dim=1)[1]
    matches = torch.eq(top_class, y).float()  # float so the division below is a true ratio
    n_examples = len(matches)
    return matches.sum() / n_examples

# Train the model
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Each batch's title and content token tensors are concatenated along dim 0 and
    fed to the model as one sequence. Returns a tuple
    (mean of per-batch losses, mean of per-batch accuracies); every batch counts
    equally regardless of its size.
    """
    model.train()
    running_loss, running_acc = 0, 0

    for batch in iterator:
        tokens = torch.cat((batch.title, batch.content), 0)
        targets = batch.label.type(torch.long).to(device)  # class indices for the loss

        optimizer.zero_grad()
        predictions = model(tokens).squeeze(1)
        loss = criterion(predictions, targets)
        acc = multiclass_accuracy(predictions, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

# Evaluate the model
def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating weights.

    Uses the same title+content concatenation as training. Returns a tuple
    (mean of per-batch losses, mean of per-batch accuracies).
    """
    model.eval()  # disables dropout
    running_loss, running_acc = 0, 0

    with torch.no_grad():
        for batch in iterator:
            tokens = torch.cat((batch.title, batch.content), 0)
            targets = batch.label.type(torch.long).to(device)

            predictions = model(tokens).squeeze(1)
            running_loss += criterion(predictions, targets).item()
            running_acc += multiclass_accuracy(predictions, targets).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

def count_parameters(model):
    """Return the total number of scalar elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


In [None]:
from tqdm import tqdm

if __name__ == "__main__":
    N_EPOCHS = 8

    for epoch in range(N_EPOCHS):
        # Wrap the iterators in a progress bar for this pass only. The iterator
        # variables themselves are deliberately not rebound: reassigning them on
        # every epoch would stack one tqdm wrapper on top of the previous one.
        train_loss, train_acc = train(
            model, tqdm(train_iterator, desc=f"Training Epoch {epoch + 1}"), optimizer, criterion)
        valid_loss, valid_acc = evaluate(
            model, tqdm(valid_iterator, desc=f"Validating Epoch {epoch + 1}"), criterion)

        print(f'\n| Epoch: {epoch + 1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}% |'
              f' Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc * 100:.2f}% |')

    # Final score on the held-out test set, with the weights from the last epoch.
    test_loss, test_acc = evaluate(model, tqdm(test_iterator, desc="Testing"), criterion)
    print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}% |')

    print(f'The model has {count_parameters(model):,} trainable parameters')


In [None]:
# Persist the trained weights; later cells reload this checkpoint by the same file name.
torch.save(model.state_dict(), 'zero_kinn_model.pt')

## Kirundi - without fine tuning

In [None]:
# Reproducibility: seed the CPU and CUDA generators and request deterministic cuDNN kernels.
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Fresh field objects for this section. Both text columns are tokenised with NLTK's
# word_tokenize; labels go through a vocabulary and are emitted as long class ids.
TEXT = data.Field(tokenize=word_tokenize)
TITLE = data.Field(tokenize=word_tokenize)
LABEL = data.LabelField(use_vocab=True, dtype=torch.long)

# (column name, field) pairs in the order label, title, content.
fields = list(zip(('label', 'title', 'content'), (LABEL, TITLE, TEXT)))

In [None]:
# Zero-shot setup: the training file is the Kinyarwanda set ('zero_kin_train.csv'),
# while the test file is the Kirundi set ('zero_kir_test.csv'). Both live under ./cleaned.
train_data, test_data = data.TabularDataset.splits(
    path='cleaned',
    format='csv',
    skip_header=True,  # the first CSV row holds column names
    fields=fields,
    train='zero_kin_train.csv',
    test='zero_kir_test.csv',
)

In [None]:
# The checkpoint evaluated in this section ('zero_kinn_model.pt') was trained with a
# vocabulary built from the title AND content columns of the 90% training split
# obtained after random.seed(SEED). Embedding rows and output units are addressed
# by vocabulary index, so exactly that vocabulary has to be rebuilt here; building it
# from different data would silently map tokens and labels to the wrong weights.
random.seed(SEED)
vocab_source, _unused_holdout = train_data.split(split_ratio=0.9, random_state=random.getstate())
TEXT.build_vocab(vocab_source.title, vocab_source.content, max_size=15000, vectors=custom_embeddings)
TITLE.vocab = TEXT.vocab  # Sharing the same vocab between TITLE and TEXT fields
LABEL.build_vocab(vocab_source)

In [None]:
# Create the iterator and place the tensors it returns on the best available device.
BATCH_SIZE = 32

# Probe each accelerator separately: CUDA availability says nothing about MPS.
# Preference order is CUDA, then Apple's MPS backend, then CPU.
_mps_backend = getattr(torch.backends, 'mps', None)  # attribute is absent in older PyTorch builds
if torch.cuda.is_available():
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# Only the test set is iterated in this section; batches group similar `content` lengths.
test_iterator = data.BucketIterator(
    test_data,
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.content),
    device=device
)

In [None]:
# Build the model
class CNN(nn.Module):
    """Convolutional text classifier: embedding -> parallel convolutions -> max-pool -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each token vector (also the kernel width).
        n_filters: output channels of every convolution.
        filter_sizes: kernel heights, i.e. how many consecutive tokens each
            convolution spans.
        output_dim: number of output logits.
        dropout: dropout probability applied to the pooled features.
        pad_idx: index of the padding token in the embedding table.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters,
                 filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()

        # Plain Python attributes (not parameters or buffers), so the keys of
        # state_dict() are unchanged and existing checkpoints still load.
        self.min_len = max(filter_sizes, default=0)
        self.fill_idx = 0 if pad_idx is None else pad_idx

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=n_filters,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map token ids of shape (seq_len, batch) to logits of shape (batch, output_dim)."""
        text = text.permute(1, 0)  # -> (batch, seq_len)

        # A convolution cannot slide over fewer tokens than its kernel height, so
        # right-pad short batches with the padding token up to the largest kernel.
        missing = self.min_len - text.shape[1]
        if missing > 0:
            text = F.pad(text, (0, missing), value=self.fill_idx)

        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)  # add the single input channel
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max over the whole remaining sequence axis: one value per filter.
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

In [None]:
# Architecture settings; they have to match the ones the checkpoint was trained with.
FILTER_SIZES = [3, 4, 5]
N_FILTERS = 150
DROPOUT = 0.5
EMBEDDING_DIM = 50  # width of the pretrained word vectors

# Sizes and indices derived from the vocabularies built above.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(LABEL.vocab)

In [None]:
# Build an untrained network with the checkpoint's architecture.
model = CNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    n_filters=N_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

# Seed the embedding table with the vectors attached to the vocabulary.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# Zero the rows of the unknown and padding tokens.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX].zero_()
model.embedding.weight.data[PAD_IDX].zero_()

# The optimizer is created for parity with the training section; no cell in this
# section calls optimizer.step().
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [None]:
# Restore the weights saved by the Kinyarwanda training run. map_location lets a
# checkpoint written on one device be loaded onto the device selected here.
model_path = 'zero_kinn_model.pt'
checkpoint_state = torch.load(model_path, map_location=device)
model.load_state_dict(checkpoint_state)
model.eval()

In [None]:
from sklearn.metrics import f1_score

# Evaluation function
def evaluate(model, iterator, criterion):
    """Score `model` on every batch of `iterator` using the `content` tokens only.

    Loss and accuracy are averaged per example rather than per batch, so a smaller
    final batch is not over-weighted and both numbers cover exactly the samples the
    F1 score is computed on.

    Returns:
        (mean loss per example, accuracy in [0, 1], weighted F1 score)
    """
    total_loss = 0.0
    total_correct = 0
    total_examples = 0
    all_preds = []
    all_labels = []

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.content).squeeze(1)
            loss = criterion(predictions, batch.label)
            predicted_classes = predictions.argmax(1)
            batch_size = len(batch.label)

            # Assumes `criterion` averages over the batch (the nn.CrossEntropyLoss
            # default), so scaling by the batch size recovers the summed loss.
            total_loss += loss.item() * batch_size
            total_correct += (predicted_classes == batch.label).sum().item()
            total_examples += batch_size

            all_preds.extend(predicted_classes.cpu().numpy())
            all_labels.extend(batch.label.cpu().numpy())

    f1 = f1_score(all_labels, all_preds, average='weighted')

    return total_loss / total_examples, total_correct / total_examples, f1

# Score the restored model on the Kirundi test iterator and report the metrics.
# NOTE(review): this evaluation feeds only `content`, whereas the first section's
# training loop fed title and content concatenated — confirm this is intended.
test_loss, test_acc, test_f1 = evaluate(model=model, iterator=test_iterator, criterion=criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Test F1 Score: {test_f1:.3f}')


## Kirundi - after fine tuning

In [None]:
# Word vectors read from 'W2V-Kir-50.txt' (presumably 50-dimensional Kirundi
# word2vec, judging by the file name). Tokens missing from the file get a vector
# drawn with torch.Tensor.normal_; parsed vectors are cached in ./cache.
kir_embeddings = vocab.Vectors(
    name='W2V-Kir-50.txt',
    unk_init=torch.Tensor.normal_,
    cache='cache',
)

In [None]:
# Reproducibility for the fine-tuning section: seed PyTorch's CPU and CUDA
# generators and request deterministic cuDNN kernels.
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [None]:
# Fresh field objects for the fine-tuning section. Both text columns are tokenised
# with NLTK's word_tokenize; labels go through a vocabulary and come out as long ids.
TEXT = data.Field(tokenize=word_tokenize)
TITLE = data.Field(tokenize=word_tokenize)
LABEL = data.LabelField(use_vocab=True, dtype=torch.long)

# (column name, field) pairs in the order label, title, content.
fields = list(zip(('label', 'title', 'content'), (LABEL, TITLE, TEXT)))

In [None]:
# Parse the Kirundi train/test CSV files found under ./cleaned.
train_data, test_data = data.TabularDataset.splits(
    path='cleaned',
    format='csv',
    skip_header=True,  # the first CSV row holds column names
    fields=fields,
    train='zero_kir_train.csv',
    test='zero_kir_test.csv',
)

In [None]:
# Hold out 10% of the Kirundi training data for validation.
# random.seed() returns None, so seed first and pass the generator state explicitly
# instead of relying on the seeding side effect of an argument expression.
random.seed(SEED)
train_data, valid_data = train_data.split(split_ratio=0.9, random_state=random.getstate())

In [None]:
# Label and token vocabularies for fine-tuning, built from the Kirundi training split.
# Presumably only the `content` column feeds the token vocabulary here, because it is
# the only column bound to the TEXT field object — confirm against torchtext's build_vocab.
LABEL.build_vocab(train_data)
TEXT.build_vocab(train_data, vectors=kir_embeddings, max_size=15000)
TITLE.vocab = TEXT.vocab  # titles reuse the content vocabulary

In [None]:
BATCH_SIZE = 32

# Probe each accelerator separately: CUDA availability says nothing about MPS.
# Preference order is CUDA, then Apple's MPS backend, then CPU.
_mps_backend = getattr(torch.backends, 'mps', None)  # attribute is absent in older PyTorch builds
if torch.cuda.is_available():
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# Batches group examples of similar `content` length to limit padding.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.content),
    device=device
)

In [None]:
# Build the model
class CNN(nn.Module):
    """Convolutional text classifier: embedding -> parallel convolutions -> max-pool -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each token vector (also the kernel width).
        n_filters: output channels of every convolution.
        filter_sizes: kernel heights, i.e. how many consecutive tokens each
            convolution spans.
        output_dim: number of output logits.
        dropout: dropout probability applied to the pooled features.
        pad_idx: index of the padding token in the embedding table.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters,
                 filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()

        # Plain Python attributes (not parameters or buffers), so the keys of
        # state_dict() are unchanged and existing checkpoints still load.
        self.min_len = max(filter_sizes, default=0)
        self.fill_idx = 0 if pad_idx is None else pad_idx

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=n_filters,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map token ids of shape (seq_len, batch) to logits of shape (batch, output_dim)."""
        text = text.permute(1, 0)  # -> (batch, seq_len)

        # A convolution cannot slide over fewer tokens than its kernel height, so
        # right-pad short batches with the padding token up to the largest kernel.
        missing = self.min_len - text.shape[1]
        if missing > 0:
            text = F.pad(text, (0, missing), value=self.fill_idx)

        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)  # add the single input channel
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max over the whole remaining sequence axis: one value per filter.
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

In [None]:
# Architecture settings; they have to match the checkpoint that is loaded below.
FILTER_SIZES = [3, 4, 5]
N_FILTERS = 150
DROPOUT = 0.5
EMBEDDING_DIM = 50  # width of the pretrained word vectors

# Sizes and indices derived from the vocabularies built above.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(LABEL.vocab)

model = CNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    n_filters=N_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [None]:
# Seed the embedding table with the vectors attached to the vocabulary.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# Zero the rows of the unknown and padding tokens.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[[UNK_IDX, PAD_IDX]] = torch.zeros(EMBEDDING_DIM)

# A small learning rate (1e-5) is used for fine-tuning.
optimizer = optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [None]:
# Start fine-tuning from the weights of the Kinyarwanda model.
# NOTE(review): load_state_dict also restores `embedding.weight`, which replaces the
# vectors copied from TEXT.vocab.vectors in the previous cell, and those rows were
# learned under a different vocabulary — confirm this is the intended starting point.
model_path = 'zero_kinn_model.pt'
checkpoint_state = torch.load(model_path, map_location=device)
model.load_state_dict(checkpoint_state)

In [None]:
# Settings for fine-tuning on the Kirundi training data.
# Best-checkpoint tracking: the lowest validation loss seen so far, and the file the
# corresponding weights are written to.
model_path = 'zero_Cnn_tuned_model.pt'
best_valid_loss = float('inf')

# Number of passes over the training iterator.
N_EPOCHS = 100

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator` using each batch's `content` tokens.

    Returns a tuple (mean of per-batch losses, mean of per-batch accuracies); every
    batch counts equally regardless of its size.
    """
    model.train()
    loss_sum, acc_sum = 0, 0

    for batch in iterator:
        targets = batch.label.long()  # Convert batch.label to LongTensor

        optimizer.zero_grad()
        predictions = model(batch.content).squeeze(1)
        loss = criterion(predictions, targets)
        loss.backward()
        optimizer.step()

        n_correct = (predictions.argmax(1) == targets).sum().item()
        loss_sum += loss.item()
        acc_sum += n_correct / len(batch.label)

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` (content tokens only) without updating weights.

    Returns a tuple (mean of per-batch losses, mean of per-batch accuracies).
    """
    model.eval()  # disables dropout
    loss_sum, acc_sum = 0, 0

    with torch.no_grad():
        for batch in iterator:
            targets = batch.label.long()  # Convert batch.label to LongTensor
            predictions = model(batch.content).squeeze(1)

            n_correct = (predictions.argmax(1) == targets).sum().item()
            loss_sum += criterion(predictions, targets).item()
            acc_sum += n_correct / len(batch.label)

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

# Training loop with validation and checkpointing
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model=model, iterator=train_iterator, optimizer=optimizer, criterion=criterion)
    valid_loss, valid_acc = evaluate(model=model, iterator=valid_iterator, criterion=criterion)

    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

    # Nothing to checkpoint unless this epoch beat the best validation loss so far.
    if not valid_loss < best_valid_loss:
        continue

    best_valid_loss = valid_loss
    torch.save(model.state_dict(), model_path)
    print(f'Saved best model with validation loss: {best_valid_loss:.3f}')


In [None]:
# The fine-tuning loop already writes the best-validation weights to this file.
# Saving the in-memory model here would overwrite that checkpoint with the
# last-epoch weights, so restore the best checkpoint instead; the evaluation cells
# that follow then score the model that was actually selected on validation loss.
model.load_state_dict(torch.load('zero_Cnn_tuned_model.pt', map_location=device))

In [None]:
# Switch the model to inference mode (dropout disabled) before the test evaluation.
model.eval()

In [None]:
from sklearn.metrics import f1_score

def evaluate(model, iterator, criterion):
    """Score `model` on every batch of `iterator` using the `content` tokens only.

    Loss and accuracy are averaged per example rather than per batch, so a smaller
    final batch is not over-weighted and both numbers cover exactly the samples the
    F1 score is computed on.

    Returns:
        (mean loss per example, accuracy in [0, 1], weighted F1 score)
    """
    total_loss = 0.0
    total_correct = 0
    total_examples = 0
    all_preds = []
    all_labels = []

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.content).squeeze(1)
            loss = criterion(predictions, batch.label)
            predicted_classes = predictions.argmax(1)
            batch_size = len(batch.label)

            # Assumes `criterion` averages over the batch (the nn.CrossEntropyLoss
            # default), so scaling by the batch size recovers the summed loss.
            total_loss += loss.item() * batch_size
            total_correct += (predicted_classes == batch.label).sum().item()
            total_examples += batch_size

            all_preds.extend(predicted_classes.cpu().numpy())
            all_labels.extend(batch.label.cpu().numpy())

    f1 = f1_score(all_labels, all_preds, average='weighted')

    return total_loss / total_examples, total_correct / total_examples, f1

# Score the fine-tuned model on the Kirundi test iterator and report the metrics.
test_loss, test_acc, test_f1 = evaluate(model=model, iterator=test_iterator, criterion=criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Test F1 Score: {test_f1:.3f}')


## 5-Fold CV

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext import data
from sklearn.model_selection import KFold
from nltk.tokenize import word_tokenize

# Set random seed for reproducibility (PyTorch CPU/CUDA generators, deterministic cuDNN).
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Fresh field objects for cross-validation. Both text columns are tokenised with
# NLTK's word_tokenize; labels go through a vocabulary and come out as long ids.
TEXT = data.Field(tokenize=word_tokenize)
TITLE = data.Field(tokenize=word_tokenize)
LABEL = data.LabelField(use_vocab=True, dtype=torch.long)

# (column name, field) pairs in the order label, title, content.
fields = list(zip(('label', 'title', 'content'), (LABEL, TITLE, TEXT)))

# Load the Kirundi training data as a DataFrame so rows can be selected by fold index.
df = pd.read_csv('cleaned/zero_kir_train.csv')

# Five shuffled folds; seeding the splitter keeps fold membership reproducible.
kf = KFold(shuffle=True, n_splits=5, random_state=SEED)

# Define the CNN model
class CNN(nn.Module):
    """Convolutional text classifier: embedding -> parallel convolutions -> max-pool -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each token vector (also the kernel width).
        n_filters: output channels of every convolution.
        filter_sizes: kernel heights, i.e. how many consecutive tokens each
            convolution spans.
        output_dim: number of output logits.
        dropout: dropout probability applied to the pooled features.
        pad_idx: index of the padding token in the embedding table.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters,
                 filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()

        # Plain Python attributes (not parameters or buffers), so the keys of
        # state_dict() are unchanged and existing checkpoints still load.
        self.min_len = max(filter_sizes, default=0)
        self.fill_idx = 0 if pad_idx is None else pad_idx

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=n_filters,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map token ids of shape (seq_len, batch) to logits of shape (batch, output_dim)."""
        text = text.permute(1, 0)  # -> (batch, seq_len)

        # A convolution cannot slide over fewer tokens than its kernel height, so
        # right-pad short batches with the padding token up to the largest kernel.
        missing = self.min_len - text.shape[1]
        if missing > 0:
            text = F.pad(text, (0, missing), value=self.fill_idx)

        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)  # add the single input channel
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max over the whole remaining sequence axis: one value per filter.
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

# Define training and evaluation functions
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator` using each batch's `content` tokens.

    Returns a tuple (mean of per-batch losses, mean of per-batch accuracies); every
    batch counts equally regardless of its size.
    """
    model.train()
    loss_sum, acc_sum = 0, 0

    for batch in iterator:
        targets = batch.label.long()

        optimizer.zero_grad()
        predictions = model(batch.content).squeeze(1)
        loss = criterion(predictions, targets)
        loss.backward()
        optimizer.step()

        n_correct = (predictions.argmax(1) == targets).sum().item()
        loss_sum += loss.item()
        acc_sum += n_correct / len(batch.label)

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` (content tokens only) without updating weights.

    Returns a tuple (mean of per-batch losses, mean of per-batch accuracies).
    """
    model.eval()  # disables dropout
    loss_sum, acc_sum = 0, 0

    with torch.no_grad():
        for batch in iterator:
            targets = batch.label.long()
            predictions = model(batch.content).squeeze(1)

            n_correct = (predictions.argmax(1) == targets).sum().item()
            loss_sum += criterion(predictions, targets).item()
            acc_sum += n_correct / len(batch.label)

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

# 5-Fold Cross-Validation
# One (train_loss, train_acc, valid_loss, valid_acc) tuple per fold, taken from the
# fold's final epoch.
fold_results = []

# Settings that do not change between folds.
BATCH_SIZE = 32

# Probe each accelerator separately: CUDA availability says nothing about MPS.
# Preference order is CUDA, then Apple's MPS backend, then CPU.
_mps_backend = getattr(torch.backends, 'mps', None)  # attribute is absent in older PyTorch builds
if torch.cuda.is_available():
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
    print(f'Fold {fold+1}/{kf.get_n_splits()}')

    # Split data
    train_df = df.iloc[train_idx]
    valid_df = df.iloc[val_idx]

    # Convert DataFrame rows to examples. Example.fromlist pairs the values with
    # `fields` positionally, so they must be given in the same label, title, content
    # order; any other order would tokenise the label and use text as the label.
    train_examples = [
        data.Example.fromlist([row['label'], row['title'], row['content']], fields)
        for _, row in train_df.iterrows()
    ]
    valid_examples = [
        data.Example.fromlist([row['label'], row['title'], row['content']], fields)
        for _, row in valid_df.iterrows()
    ]

    # Create datasets
    train_data = data.Dataset(train_examples, fields)
    valid_data = data.Dataset(valid_examples, fields)

    # Build vocab from this fold's training portion only.
    # NOTE(review): the GloVe vectors attached here are never copied into the model's
    # embedding layer, which therefore starts from random initialisation — confirm intent.
    TEXT.build_vocab(train_data, max_size=15000, vectors='glove.6B.100d')
    TITLE.vocab = TEXT.vocab
    LABEL.build_vocab(train_data)

    # Create iterators
    train_iterator, valid_iterator = data.BucketIterator.splits(
        (train_data, valid_data),
        batch_size=BATCH_SIZE,
        sort_key=lambda x: len(x.content),
        device=device
    )

    # Initialize model, optimizer, and criterion
    model = CNN(vocab_size=len(TEXT.vocab),
                embedding_dim=100,  # Adjust according to the vectors used
                n_filters=100,
                filter_sizes=[3, 4, 5],
                output_dim=len(LABEL.vocab),
                dropout=0.5,
                pad_idx=TEXT.vocab.stoi[TEXT.pad_token])

    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()

    # The iterators emit tensors on `device`, so model and loss must live there too.
    model = model.to(device)
    criterion = criterion.to(device)

    # Train the model
    best_valid_loss = float('inf')
    for epoch in range(10):  # Adjust epochs if needed
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

        print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

        # Keep the weights of the epoch with the lowest validation loss for this fold.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), f'cnn_model_fold_{fold+1}.pt')
            print(f'Saved best model for fold {fold+1} with validation loss: {best_valid_loss:.3f}')

    fold_results.append((train_loss, train_acc, valid_loss, valid_acc))

# Print average results
# Every fold_results entry is (train_loss, train_acc, valid_loss, valid_acc); average
# each position across the folds.
avg_train_loss, avg_train_acc, avg_valid_loss, avg_valid_acc = (
    np.mean([fold_metrics[position] for fold_metrics in fold_results])
    for position in range(4)
)

print(f'Average Train Loss: {avg_train_loss:.3f}, Average Train Acc: {avg_train_acc*100:.2f}%')
print(f'Average Val. Loss: {avg_valid_loss:.3f}, Average Val. Acc: {avg_valid_acc*100:.2f}%')


## Evaluating forgetting on Kinyarwanda

In [None]:
# Word vectors read from 'W2V-Kin-50.txt' (presumably 50-dimensional Kinyarwanda
# word2vec, judging by the file name). Tokens missing from the file get a vector
# drawn with torch.Tensor.normal_; parsed vectors are cached in ./cache.
custom_embeddings = vocab.Vectors(
    name='W2V-Kin-50.txt',
    unk_init=torch.Tensor.normal_,
    cache='cache',
)

In [None]:
# Reproducibility: seed the CPU and CUDA generators and request deterministic cuDNN kernels.
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Fresh field objects for this section. Both text columns are tokenised with NLTK's
# word_tokenize; labels go through a vocabulary and are emitted as long class ids.
TEXT = data.Field(tokenize=word_tokenize)
TITLE = data.Field(tokenize=word_tokenize)
LABEL = data.LabelField(use_vocab=True, dtype=torch.long)

# (column name, field) pairs in the order label, title, content.
fields = list(zip(('label', 'title', 'content'), (LABEL, TITLE, TEXT)))

In [None]:
# Reload the Kinyarwanda train/test CSV files from ./cleaned to measure how much of
# the original task the fine-tuned model retains.
train_data, test_data = data.TabularDataset.splits(
    path='cleaned',
    format='csv',
    skip_header=True,  # the first CSV row holds column names
    fields=fields,
    train='zero_kin_train.csv',
    test='zero_kin_test.csv',
)

In [None]:
# The checkpoint evaluated in this section descends from the Kinyarwanda model, whose
# vocabulary was built from the title AND content columns of the 90% training split
# obtained after random.seed(SEED). Embedding rows and output units are addressed by
# vocabulary index, so Kinyarwanda text has to be numericalised with that same
# vocabulary; building it from different data would map tokens to the wrong rows.
random.seed(SEED)
vocab_source, _unused_holdout = train_data.split(split_ratio=0.9, random_state=random.getstate())
TEXT.build_vocab(vocab_source.title, vocab_source.content, max_size=15000, vectors=custom_embeddings)
TITLE.vocab = TEXT.vocab  # Sharing the same vocab between TITLE and TEXT fields
LABEL.build_vocab(vocab_source)

In [None]:
BATCH_SIZE = 32

# Probe each accelerator separately: CUDA availability says nothing about MPS.
# Preference order is CUDA, then Apple's MPS backend, then CPU.
_mps_backend = getattr(torch.backends, 'mps', None)  # attribute is absent in older PyTorch builds
if torch.cuda.is_available():
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# Batches group examples of similar `content` length to limit padding.
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.content),
    device=device
)

In [None]:
# Build the model
class CNN(nn.Module):
    """Convolutional text classifier: embedding -> parallel convolutions -> max-pool -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each token vector (also the kernel width).
        n_filters: output channels of every convolution.
        filter_sizes: kernel heights, i.e. how many consecutive tokens each
            convolution spans.
        output_dim: number of output logits.
        dropout: dropout probability applied to the pooled features.
        pad_idx: index of the padding token in the embedding table.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters,
                 filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()

        # Plain Python attributes (not parameters or buffers), so the keys of
        # state_dict() are unchanged and existing checkpoints still load.
        self.min_len = max(filter_sizes, default=0)
        self.fill_idx = 0 if pad_idx is None else pad_idx

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=n_filters,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map token ids of shape (seq_len, batch) to logits of shape (batch, output_dim)."""
        text = text.permute(1, 0)  # -> (batch, seq_len)

        # A convolution cannot slide over fewer tokens than its kernel height, so
        # right-pad short batches with the padding token up to the largest kernel.
        missing = self.min_len - text.shape[1]
        if missing > 0:
            text = F.pad(text, (0, missing), value=self.fill_idx)

        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)  # add the single input channel
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max over the whole remaining sequence axis: one value per filter.
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

In [None]:
# Architecture settings; they have to match the checkpoint that is loaded below.
FILTER_SIZES = [3, 4, 5]
N_FILTERS = 150
DROPOUT = 0.5
EMBEDDING_DIM = 50  # width of the pretrained word vectors

# Sizes and indices derived from the vocabularies built above.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(LABEL.vocab)

model = CNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    n_filters=N_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [None]:
# Seed the embedding table with the vectors attached to the vocabulary.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# Zero the rows of the unknown and padding tokens.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX].fill_(0)
model.embedding.weight.data[PAD_IDX].fill_(0)

# The optimizer is created for parity with the training sections; no cell in this
# section calls optimizer.step().
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [None]:
# Restore the fine-tuned weights. map_location lets a checkpoint written on one
# device be loaded onto the device selected here.
model_path = 'zero_Cnn_tuned_model.pt'
checkpoint_state = torch.load(model_path, map_location=device)
model.load_state_dict(checkpoint_state)
model.eval()

In [None]:
from sklearn.metrics import f1_score

def evaluate(model, iterator, criterion):
    """Score `model` on every batch of `iterator` using the `content` tokens only.

    Loss and accuracy are averaged per example rather than per batch, so a smaller
    final batch is not over-weighted and both numbers cover exactly the samples the
    F1 score is computed on.

    Returns:
        (mean loss per example, accuracy in [0, 1], weighted F1 score)
    """
    total_loss = 0.0
    total_correct = 0
    total_examples = 0
    all_preds = []
    all_labels = []

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.content).squeeze(1)
            loss = criterion(predictions, batch.label)
            predicted_classes = predictions.argmax(1)
            batch_size = len(batch.label)

            # Assumes `criterion` averages over the batch (the nn.CrossEntropyLoss
            # default), so scaling by the batch size recovers the summed loss.
            total_loss += loss.item() * batch_size
            total_correct += (predicted_classes == batch.label).sum().item()
            total_examples += batch_size

            all_preds.extend(predicted_classes.cpu().numpy())
            all_labels.extend(batch.label.cpu().numpy())

    f1 = f1_score(all_labels, all_preds, average='weighted')

    return total_loss / total_examples, total_correct / total_examples, f1

# Score the fine-tuned model on the Kinyarwanda test iterator and report the metrics.
test_loss, test_acc, test_f1 = evaluate(model=model, iterator=test_iterator, criterion=criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Test F1 Score: {test_f1:.3f}')
