In [2]:
import torch
from torchtext import data
import random
import torch.nn as nn
import torch.optim as optim
import torchtext.vocab as vocab
import torch.nn.functional as F
import nltk
from nltk.tokenize import word_tokenize

In [6]:
custom_embeddings = vocab.Vectors(name='W2V-Kin-50.txt',
                                  cache='cache',
                                  unk_init=torch.Tensor.normal_)

In [244]:
# Ensure the necessary NLTK tokenizer data is downloaded
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/casarulez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Define fields to hold the data using NLTK tokenizer
LABEL = data.LabelField(dtype=torch.float)
TITLE = data.Field(tokenize=word_tokenize)
TEXT = data.Field(tokenize=word_tokenize)

fields = [('label', LABEL), ('title', TITLE), ('content', TEXT)]

In [4]:
train_data, test_data = data.TabularDataset.splits(
    path='cleaned',
    train='train.csv', 
    test='test.csv',
    format='csv',
    fields=fields,
    skip_header=True  # dataset has a header(title)
)

In [5]:
# Perform train/validation set split
train_data, valid_data = train_data.split(split_ratio=0.9, random_state=random.seed(SEED))

# Build the vocabulary
TEXT.build_vocab(train_data.title, train_data.content, max_size=15000, vectors=custom_embeddings)
TITLE.vocab = TEXT.vocab
LABEL.build_vocab(train_data)

In [6]:
# Create the iterator and place the tensor it returned on GPU(if it is available)
BATCH_SIZE = 32
device = torch.device('mps' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.content),
    device=device
)

In [7]:
# Build the model
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters,
                 filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=n_filters,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        text = text.permute(1, 0)
        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

In [10]:
# Create the instance of the model
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 50  # Updated to match the GloVe embeddings dimension
N_FILTERS = 150
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = len(LABEL.vocab)
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]


In [11]:
model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

optimizer = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [12]:
def multiclass_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    rounded_preds = torch.max(preds, 1)[1]
    correct = (rounded_preds == y).float()  # convert into float for division
    acc = correct.sum() / len(correct)
    return acc

In [13]:
def multiclass_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    rounded_preds = torch.max(preds, 1)[1]
    correct = (rounded_preds == y).float()  # convert into float for division
    acc = correct.sum() / len(correct)
    return acc

# Train the model
def train(model, iterator, optimizer, criterion):
    """Training the model"""
    epoch_loss = 0
    epoch_acc = 0
    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        predictions = model(torch.cat((batch.title, batch.content), 0)).squeeze(1)
        loss = criterion(predictions, batch.label.type(torch.long).to(device))
        acc = multiclass_accuracy(predictions, batch.label.type(torch.long).to(device))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# Evaluate the model
def evaluate(model, iterator, criterion):
    """Evaluating the model"""
    epoch_loss = 0
    epoch_acc = 0
    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(torch.cat((batch.title, batch.content), 0)).squeeze(1)
            loss = criterion(predictions, batch.label.type(torch.long).to(device))
            acc = multiclass_accuracy(predictions, batch.label.type(torch.long).to(device))
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


In [14]:
from tqdm import tqdm

if __name__ == "__main__":
    N_EPOCHS = 8

    for epoch in range(N_EPOCHS):
        # Wrapping train_iterator with tqdm for a progress bar
        train_iterator = tqdm(train_iterator, desc=f"Training Epoch {epoch + 1}")
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        
        # Wrapping valid_iterator with tqdm for a progress bar
        valid_iterator = tqdm(valid_iterator, desc=f"Validating Epoch {epoch + 1}")
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        
        print(f'\n| Epoch: {epoch + 1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}% |'
              f' Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc * 100:.2f}% |')

    # Wrapping test_iterator with tqdm for a progress bar
    test_iterator = tqdm(test_iterator, desc="Testing")
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}% |')

    print(f'The model has {count_parameters(model):,} trainable parameters')


Training Epoch 1: 100%|███████████████████████| 466/466 [01:11<00:00,  6.48it/s]
Validating Epoch 1: 100%|███████████████████████| 52/52 [00:01<00:00, 37.37it/s]



| Epoch: 01 | Train Loss: 1.057 | Train Acc: 68.41% | Val. Loss: 0.656 | Val. Acc: 79.16% |


Training Epoch 2: 100%|███████████████████████| 466/466 [01:10<00:00,  6.60it/s]
Validating Epoch 2: 100%|███████████████████████| 52/52 [00:01<00:00, 37.56it/s]



| Epoch: 02 | Train Loss: 0.564 | Train Acc: 82.96% | Val. Loss: 0.511 | Val. Acc: 84.01% |


Training Epoch 3: 100%|███████████████████████| 466/466 [01:10<00:00,  6.57it/s]
Validating Epoch 3: 100%|███████████████████████| 52/52 [00:01<00:00, 39.73it/s]



| Epoch: 03 | Train Loss: 0.423 | Train Acc: 86.86% | Val. Loss: 0.461 | Val. Acc: 86.30% |


Training Epoch 4: 100%|███████████████████████| 466/466 [01:12<00:00,  6.43it/s]
Validating Epoch 4: 100%|███████████████████████| 52/52 [00:01<00:00, 35.85it/s]



| Epoch: 04 | Train Loss: 0.327 | Train Acc: 90.01% | Val. Loss: 0.422 | Val. Acc: 87.36% |


Training Epoch 5: 100%|███████████████████████| 466/466 [01:12<00:00,  6.44it/s]
Validating Epoch 5: 100%|███████████████████████| 52/52 [00:01<00:00, 33.87it/s]



| Epoch: 05 | Train Loss: 0.261 | Train Acc: 91.95% | Val. Loss: 0.406 | Val. Acc: 87.80% |


Training Epoch 6: 100%|███████████████████████| 466/466 [01:10<00:00,  6.58it/s]
Validating Epoch 6: 100%|███████████████████████| 52/52 [00:01<00:00, 34.49it/s]



| Epoch: 06 | Train Loss: 0.203 | Train Acc: 93.87% | Val. Loss: 0.419 | Val. Acc: 87.88% |


Training Epoch 7: 100%|███████████████████████| 466/466 [01:11<00:00,  6.50it/s]
Validating Epoch 7: 100%|███████████████████████| 52/52 [00:01<00:00, 30.89it/s]



| Epoch: 07 | Train Loss: 0.161 | Train Acc: 95.35% | Val. Loss: 0.433 | Val. Acc: 87.89% |


Training Epoch 8: 100%|███████████████████████| 466/466 [01:14<00:00,  6.25it/s]
Validating Epoch 8: 100%|███████████████████████| 52/52 [00:01<00:00, 37.01it/s]



| Epoch: 08 | Train Loss: 0.131 | Train Acc: 96.33% | Val. Loss: 0.441 | Val. Acc: 87.88% |


Testing: 100%|████████████████████████████████| 130/130 [00:03<00:00, 36.16it/s]

| Test Loss: 0.465 | Test Acc: 87.40% |
The model has 845,962 trainable parameters





In [15]:
torch.save(model.state_dict(), 'kinn_model.pt')

## Kirundi - without fine tuning

In [3]:
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Define fields to hold the data using NLTK tokenizer
LABEL = data.LabelField(dtype=torch.long, use_vocab=True)
TITLE = data.Field(tokenize=word_tokenize)
TEXT = data.Field(tokenize=word_tokenize)

fields = [('label', LABEL), ('title', TITLE), ('content', TEXT)]

In [4]:
# Load your dataset (modify according to your actual dataset path and format)
train_data, test_data = data.TabularDataset.splits(
    path='cleaned',  # Change to the directory containing your CSV files
    train='train.csv',  # Change to your train CSV file name
    test='kir_test.csv',  # Change to your test CSV file name
    format='csv',
    fields=fields,
    skip_header=True  # dataset has a header
)

In [7]:
TEXT.build_vocab(train_data, max_size=15000, vectors=custom_embeddings)
TITLE.vocab = TEXT.vocab  # Sharing the same vocab between TITLE and TEXT fields
LABEL.build_vocab(train_data)

In [8]:
# Create the iterator and place the tensor it returned on GPU(if it is available)
BATCH_SIZE = 32
device = torch.device('mps' if torch.cuda.is_available() else 'cpu')

test_iterator = data.BucketIterator(
    (test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.content),
    device=device
)

In [9]:
# Build the model
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters,
                 filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=n_filters,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        text = text.permute(1, 0)
        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

In [10]:
# Create the instance of the model
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 50  # Updated to match the GloVe embeddings dimension
N_FILTERS = 150
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = len(LABEL.vocab)
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

In [11]:
model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

optimizer = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [12]:
model_path = 'kinn_model.pt'
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

CNN(
  (embedding): Embedding(15002, 50, padding_idx=1)
  (convs): ModuleList(
    (0): Conv2d(1, 150, kernel_size=(3, 50), stride=(1, 1))
    (1): Conv2d(1, 150, kernel_size=(4, 50), stride=(1, 1))
    (2): Conv2d(1, 150, kernel_size=(5, 50), stride=(1, 1))
  )
  (fc): Linear(in_features=450, out_features=12, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [13]:
from sklearn.metrics import f1_score

# Evaluation function
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    all_preds = []
    all_labels = []

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.content).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = (predictions.argmax(1) == batch.label).sum().item() / len(batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc

            all_preds.extend(predictions.argmax(1).cpu().numpy())
            all_labels.extend(batch.label.cpu().numpy())

    f1 = f1_score(all_labels, all_preds, average='weighted')

    return epoch_loss / len(iterator), epoch_acc / len(iterator), f1

# Evaluate the model
test_loss, test_acc, test_f1 = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Test F1 Score: {test_f1:.3f}')


Test Loss: 3.049 | Test Acc: 21.90% | Test F1 Score: 0.232


## Kirundi - after fine tuning

In [87]:
kir_embeddings = vocab.Vectors(name='W2V-Kir-50.txt',
                                  cache='cache',
                                  unk_init=torch.Tensor.normal_)

In [88]:
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [89]:
# Define fields to hold the data using NLTK tokenizer
LABEL = data.LabelField(dtype=torch.long, use_vocab=True)
TITLE = data.Field(tokenize=word_tokenize)
TEXT = data.Field(tokenize=word_tokenize)

fields = [('label', LABEL), ('title', TITLE), ('content', TEXT)]

In [90]:
train_data, test_data = data.TabularDataset.splits(
    path='cleaned',  # Change to the directory containing your CSV files
    train='kir_train.csv',  # Change to your Kirundi train CSV file name
    test='kir_test.csv',  # Change to your test CSV file name
    format='csv',
    fields=fields,
    skip_header=True  # dataset has a header
)

In [91]:
train_data, valid_data = train_data.split(split_ratio=0.9, random_state=random.seed(SEED))

In [92]:
TEXT.build_vocab(train_data, max_size=15000, vectors=kir_embeddings)
TITLE.vocab = TEXT.vocab  # Sharing the same vocab between TITLE and TEXT fields
LABEL.build_vocab(train_data)

In [93]:
BATCH_SIZE = 32
device = torch.device('mps' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.content),
    device=device
)

In [94]:
# Build the model
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters,
                 filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=n_filters,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        text = text.permute(1, 0)
        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

In [95]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 50  # Updated to match the GloVe embeddings dimension
N_FILTERS = 150
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = len(LABEL.vocab)
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [96]:
pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

optimizer = optim.Adam(model.parameters(),lr=1e-5)

criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [97]:
model_path = 'kinn_model.pt'
model.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

In [102]:
# Fine-tuning the model on Kirundi training data
N_EPOCHS = 100

# Initialize variables to track the best model
best_valid_loss = float('inf')
model_path = 'Cnn_tuned_model.pt'

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.content).squeeze(1)
        loss = criterion(predictions, batch.label.long())  # Convert batch.label to LongTensor
        acc = (predictions.argmax(1) == batch.label.long()).sum().item() / len(batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.content).squeeze(1)
            loss = criterion(predictions, batch.label.long())  # Convert batch.label to LongTensor
            acc = (predictions.argmax(1) == batch.label.long()).sum().item() / len(batch.label)
            
            epoch_loss += loss.item()
            epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# Training loop with validation and checkpointing
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')
    
    # Save the model if the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_path)
        print(f'Saved best model with validation loss: {best_valid_loss:.3f}')


Epoch: 01, Train Loss: 1.883, Train Acc: 43.64%, Val. Loss: 2.010, Val. Acc: 39.34%
Saved best model with validation loss: 2.010
Epoch: 02, Train Loss: 1.881, Train Acc: 43.49%, Val. Loss: 1.997, Val. Acc: 39.60%
Saved best model with validation loss: 1.997
Epoch: 03, Train Loss: 1.857, Train Acc: 43.81%, Val. Loss: 1.983, Val. Acc: 39.86%
Saved best model with validation loss: 1.983
Epoch: 04, Train Loss: 1.855, Train Acc: 45.02%, Val. Loss: 1.970, Val. Acc: 39.86%
Saved best model with validation loss: 1.970
Epoch: 05, Train Loss: 1.839, Train Acc: 45.42%, Val. Loss: 1.958, Val. Acc: 40.64%
Saved best model with validation loss: 1.958
Epoch: 06, Train Loss: 1.815, Train Acc: 44.67%, Val. Loss: 1.944, Val. Acc: 40.64%
Saved best model with validation loss: 1.944
Epoch: 07, Train Loss: 1.837, Train Acc: 43.33%, Val. Loss: 1.931, Val. Acc: 40.90%
Saved best model with validation loss: 1.931
Epoch: 08, Train Loss: 1.774, Train Acc: 45.21%, Val. Loss: 1.919, Val. Acc: 40.90%
Saved best mo

In [71]:
torch.save(model.state_dict(), 'Cnn_tuned_model.pt')

In [103]:
model.eval()

CNN(
  (embedding): Embedding(15002, 50, padding_idx=1)
  (convs): ModuleList(
    (0): Conv2d(1, 150, kernel_size=(3, 50), stride=(1, 1))
    (1): Conv2d(1, 150, kernel_size=(4, 50), stride=(1, 1))
    (2): Conv2d(1, 150, kernel_size=(5, 50), stride=(1, 1))
  )
  (fc): Linear(in_features=450, out_features=12, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [104]:
from sklearn.metrics import f1_score

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    all_preds = []
    all_labels = []

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.content).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = (predictions.argmax(1) == batch.label).sum().item() / len(batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc

            all_preds.extend(predictions.argmax(1).cpu().numpy())
            all_labels.extend(batch.label.cpu().numpy())

    f1 = f1_score(all_labels, all_preds, average='weighted')

    return epoch_loss / len(iterator), epoch_acc / len(iterator), f1

test_loss, test_acc, test_f1 = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Test F1 Score: {test_f1:.3f}')


Test Loss: 1.074 | Test Acc: 65.92% | Test F1 Score: 0.628


## Evaluating forgetting on Kinyarwanda

In [105]:
custom_embeddings = vocab.Vectors(name='W2V-Kin-50.txt',
                                  cache='cache',
                                  unk_init=torch.Tensor.normal_)

In [106]:
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Define fields to hold the data using NLTK tokenizer
LABEL = data.LabelField(dtype=torch.long, use_vocab=True)
TITLE = data.Field(tokenize=word_tokenize)
TEXT = data.Field(tokenize=word_tokenize)

fields = [('label', LABEL), ('title', TITLE), ('content', TEXT)]

In [107]:
train_data, test_data = data.TabularDataset.splits(
    path='cleaned',
    train='train.csv', 
    test='test.csv',
    format='csv',
    fields=fields,
    skip_header=True  # dataset has a header(title)
)

In [108]:
TEXT.build_vocab(train_data, max_size=15000, vectors=custom_embeddings)
TITLE.vocab = TEXT.vocab  # Sharing the same vocab between TITLE and TEXT fields
LABEL.build_vocab(train_data)

In [109]:
BATCH_SIZE = 32
device = torch.device('mps' if torch.cuda.is_available() else 'cpu')

train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.content),
    device=device
)

In [110]:
# Build the model
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters,
                 filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=n_filters,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        text = text.permute(1, 0)
        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

In [111]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 50  # Updated to match the GloVe embeddings dimension
N_FILTERS = 150
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = len(LABEL.vocab)
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [112]:
pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

optimizer = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [113]:
model_path = 'Cnn_tuned_model.pt'
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

CNN(
  (embedding): Embedding(15002, 50, padding_idx=1)
  (convs): ModuleList(
    (0): Conv2d(1, 150, kernel_size=(3, 50), stride=(1, 1))
    (1): Conv2d(1, 150, kernel_size=(4, 50), stride=(1, 1))
    (2): Conv2d(1, 150, kernel_size=(5, 50), stride=(1, 1))
  )
  (fc): Linear(in_features=450, out_features=12, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [114]:
from sklearn.metrics import f1_score

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    all_preds = []
    all_labels = []

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.content).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = (predictions.argmax(1) == batch.label).sum().item() / len(batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc

            all_preds.extend(predictions.argmax(1).cpu().numpy())
            all_labels.extend(batch.label.cpu().numpy())

    f1 = f1_score(all_labels, all_preds, average='weighted')

    return epoch_loss / len(iterator), epoch_acc / len(iterator), f1

test_loss, test_acc, test_f1 = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Test F1 Score: {test_f1:.3f}')


Test Loss: 2.697 | Test Acc: 22.07% | Test F1 Score: 0.177
