## Initial Dependencies and modules

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import RobertaTokenizer, RobertaModel
from transformers import AutoTokenizer, AutoModelForMaskedLM

## Importing dataset

In [2]:
# Relative locations of the KINNEWS train and test CSV files read by later cells.
kinn_train_path, kinn_test_path = 'KINNEWS_train.csv', 'KINNEWS_test.csv'

## Dataset preprocessing

In [None]:
import pandas as pd


# Marker inserted between the headline and the article body.
separator = " [SEP] "

# Read the training CSV, build a single "text" column from title + separator +
# content, then discard the two source columns.
df = (
    pd.read_csv(kinn_train_path)
    .assign(text=lambda frame: frame['title'] + separator + frame['content'])
    .drop(columns=['title', 'content'])
)

print(df.head())


In [None]:
# Shift the class ids down by one so they start at 0 (0 to 13 instead of 1 to 14).
# NOTE: re-running this cell shifts the labels again; run it once per load of `df`.
df['label'] = df['label'].sub(1)
df.head()

In [None]:
# Train/validation split: 20% of the rows are held out, with a fixed seed so the
# split is the same on every run. Texts are cast to str before splitting.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].astype(str),
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tokenizer published with the AfriBERTa checkpoint.
tokenizer = AutoTokenizer.from_pretrained("castorini/afriberta_large")

# Encode both splits with identical settings: truncate at 128 tokens and pad
# within each call. The training split is encoded first, then validation.
train_encodings, val_encodings = (
    tokenizer(split.tolist(), truncation=True, padding=True, max_length=128)
    for split in (train_texts, val_texts)
)

In [None]:
# Dataset wrapper handed to the Trainer
class KinyarwandaDataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with their class labels, one example per index."""

    def __init__(self, encodings, labels):
        """Keep references to the encodings mapping and the label sequence."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``; the label is stored under 'labels'."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

In [None]:
# Wrap each split's encodings together with its labels (converted to a plain list).
# The training dataset is built first, then the validation dataset.
train_dataset, val_dataset = (
    KinyarwandaDataset(split_encodings, split_labels.tolist())
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)

## Training on Kinyarwanda

In [None]:
# import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Pick the compute device: the Apple-silicon GPU backend (MPS) when PyTorch was
# built with it and it is usable, otherwise the CPU. Without the fallback,
# `device` stays undefined on machines without MPS and the `.to(device)` call
# below raises NameError.
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Load the pretrained AfriBERTa weights with a 14-way classification head
# (labels were shifted to 0..13 earlier in the notebook).
model = AutoModelForSequenceClassification.from_pretrained("castorini/afriberta_large", num_labels=14).to(device)

# Define training arguments
# NOTE(review): output_dir is an absolute path on the author's machine; adjust
# it before running elsewhere.
training_args = TrainingArguments(
    output_dir='/Users/casarulez/Projects/DDSI/AfriBERT/results',          # Output directory
    num_train_epochs=25,              # Number of training epochs
    per_device_train_batch_size=32,   # Batch size for training
    per_device_eval_batch_size=32,    # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,                # Log every 10 optimizer steps
    load_best_model_at_end=True,     # Reload the best checkpoint when training ends
    evaluation_strategy='steps',     # Evaluate on a step schedule rather than per epoch
    no_cuda=True                     # Disable CUDA
)

# Function to check dataset shapes
def check_dataset_shapes(dataset, max_items=3):
    """Print the tensor shapes of the first few items of ``dataset``.

    Two item layouts are supported:

    * dict items, which is what the Dataset classes in this notebook return
      (one tensor per encoding field plus 'labels'); every entry's shape is
      printed. The previous tuple-unpacking of such an item iterated over the
      dict's keys and crashed.
    * ``(inputs, labels)`` pairs, printed in the original format.

    Args:
        dataset: Iterable of items whose values expose a ``.shape`` attribute.
        max_items: How many leading items to report (default 3, the number the
            original loop printed). Non-positive values print nothing.
    """
    if max_items <= 0:
        return
    for i, data in enumerate(dataset):
        if isinstance(data, dict):
            shapes = ", ".join(f"{key} shape: {value.shape}" for key, value in data.items())
            print(f"Item {i} - {shapes}")
        else:
            inputs, labels = data
            print(f"Batch {i} - inputs shape: {inputs.shape}, labels shape: {labels.shape}")
        if i + 1 >= max_items:  # stop here so no further item is fetched
            break


# Bundle the model, its training configuration and the two datasets into a
# Trainer; nothing is trained until trainer.train() is called.
trainer = Trainer(
    train_dataset=train_dataset,  # examples used for optimisation
    eval_dataset=val_dataset,     # held-out examples used for evaluation
    model=model,                  # the sequence-classification model loaded above
    args=training_args,           # the TrainingArguments defined above
)

In [None]:
# Train the model
# Runs training with the model, arguments and datasets bound to `trainer`.
# As the last expression of the cell, the value returned by train() is shown
# as the cell output.
trainer.train()

## Resuming training from the 500-step checkpoint

In [None]:
# Continue training from a checkpoint directory on disk instead of starting over.
# NOTE(review): this path is under .../BantuBERT/results, whereas the Kinyarwanda
# TrainingArguments write to .../AfriBERT/results. Confirm which directory
# actually contains checkpoint-500.
trainer.train(resume_from_checkpoint='/Users/casarulez/Projects/DDSI/BantuBERT/results/checkpoint-500')

## Preparing the test dataset

In [None]:
# Read the Kinyarwanda test split and shape it like the training frame:
# one combined "text" column (title + separator + content) and 0-based labels.
separator = " [SEP] "
df_test = (
    pd.read_csv(kinn_test_path)
    .assign(text=lambda frame: frame['title'] + separator + frame['content'])
    .drop(columns=['title', 'content'])
)
df_test['label'] = df_test['label'] - 1

print(df_test.head())


## Loading the model from the optimum checkpoint and testing metrics - Kinyarwanda

In [None]:
from transformers import AutoTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# The weights come from a checkpoint directory under the Kinyarwanda training
# output_dir; the tokenizer is the stock AfriBERTa one.
checkpoint_path = '/Users/casarulez/Projects/DDSI/AfriBERT/results/checkpoint-1000'
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
tokenizer = AutoTokenizer.from_pretrained("castorini/afriberta_large")


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from tqdm import tqdm  # Import tqdm

# Every test text is truncated or padded to exactly this many tokens.
# NOTE(review): the training cells tokenize with max_length=128. Confirm that
# evaluating at 512 is intended.
max_length = 512

# Encode the test texts as fixed-length PyTorch tensors.
encodings = tokenizer(
    df_test['text'].tolist(),
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='pt',
)
labels = torch.tensor(df_test['label'].values)

# Serve (input_ids, attention_mask, label) triples in batches of 32.
dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)
dataloader = DataLoader(dataset, batch_size=32)

# Inference only: eval mode and no gradient tracking.
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for input_ids, attention_mask, label_ids in tqdm(dataloader, desc="Processing Batches"):
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        preds = logits.argmax(dim=-1)  # highest-scoring class per example
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(label_ids.cpu().numpy())

# Accuracy and weighted-average F1 over the whole test set.
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")


## Random prediction - Kinyarwanda

In [None]:
# Draw a handful of distinct test rows at random and compare the model's
# prediction with the true label for each.
num_samples = 5
random_indices = np.random.choice(len(df_test), num_samples, replace=False)

# Extract texts and labels for these indices
sample_texts = df_test.iloc[random_indices]['text'].tolist()
sample_labels = df_test.iloc[random_indices]['label'].tolist()

# Tokenize ONLY the sampled texts. Encoding the full test set here would push
# every test row through the model in a single batch, and the first
# `num_samples` predictions (rows 0..4) would be printed next to unrelated,
# randomly chosen texts. `max_length` is the value set in the evaluation cell.
encodings = tokenizer(sample_texts, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')

# Prepare inputs for the model
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']
labels = torch.tensor(sample_labels)

# Make predictions (eval mode, no gradient tracking)
model.eval()
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=-1).cpu().numpy()

# Print the results; preds[i] now corresponds to sample_texts[i]
for idx, (text, true_label, pred) in enumerate(zip(sample_texts, sample_labels, preds)):
    print(f"Sample {idx + 1}:")
    print(f"Text: {text}")
    print(f"True Label: {true_label}")
    print(f"Predicted Label: {pred}")
    print()

## Loading the Kirundi test dataset

In [None]:
# Kirundi test split: read the CSV, fold title and content into one "text"
# column (removing the two source columns), and make the labels 0-based.
kir_test = pd.read_csv('/Users/casarulez/Projects/DDSI/KIRNEWS/cleaned/test.csv')

separator = " [SEP] "
# pop() removes each column from the frame while returning it for the concatenation.
kir_test['text'] = kir_test.pop('title') + separator + kir_test.pop('content')
kir_test['label'] = kir_test['label'].sub(1)

print(kir_test.head())


## Loading the model from the optimum checkpoint and testing metrics - Kirundi (Before fine tuning)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Stock AfriBERTa tokenizer, paired with the Kinyarwanda-trained weights from
# checkpoint-1000 (no Kirundi fine-tuning has been applied at this point).
tokenizer = AutoTokenizer.from_pretrained("castorini/afriberta_large")

checkpoint_path = '/Users/casarulez/Projects/DDSI/AfriBERT/results/checkpoint-1000'
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from tqdm import tqdm  # Import tqdm

# Encode every Kirundi test text to a fixed length of 512 tokens.
max_length = 512
kirundi_texts = kir_test['text'].tolist()
encodings = tokenizer(
    kirundi_texts,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='pt',
)
labels = torch.tensor(kir_test['label'].values)

# Batches of 32 (input_ids, attention_mask, label) triples, in file order.
dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)
dataloader = DataLoader(dataset, batch_size=32)

# Switch to inference mode and collect per-example predictions and labels.
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in tqdm(dataloader, desc="Processing Batches"):
        input_ids, attention_mask, label_ids = batch
        preds = torch.argmax(
            model(input_ids=input_ids, attention_mask=attention_mask).logits,
            dim=-1,
        )
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(label_ids.cpu().numpy())

# Accuracy and weighted-average F1 on the Kirundi test set.
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")


## Random prediction - Kirundi

In [None]:
# Draw a handful of distinct Kirundi test rows at random and compare the
# model's prediction with the true label for each.
num_samples = 5
# Sample positions from kir_test itself. Drawing them from len(df_test) (the
# Kinyarwanda frame) can yield positions past the end of kir_test, or never
# reach its later rows, depending on the two frames' sizes.
random_indices = np.random.choice(len(kir_test), num_samples, replace=False)

# Extract texts and labels for these indices
sample_texts = kir_test.iloc[random_indices]['text'].tolist()
sample_labels = kir_test.iloc[random_indices]['label'].tolist()

# Tokenize the sample texts, padded to the longest text in the sample
encodings = tokenizer(sample_texts, truncation=True, padding=True, return_tensors='pt')

# Prepare inputs for the model
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']
labels = torch.tensor(sample_labels)

# Make predictions (eval mode, no gradient tracking)
model.eval()
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=-1).cpu().numpy()

# Print the results
for idx, (text, true_label, pred) in enumerate(zip(sample_texts, sample_labels, preds)):
    print(f"Sample {idx + 1}:")
    print(f"Text: {text}")
    print(f"True Label: {true_label}")
    print(f"Predicted Label: {pred}")
    print()

# Fine-tuning on Kirundi

## Loading the Kirundi training dataset

In [None]:
# Load the Kirundi training dataset
kir_train = pd.read_csv('/Users/casarulez/Projects/DDSI/KIRNEWS/cleaned/train.csv')

# Combine title and content into a single "text" column, joined by the separator
separator = " [SEP] "
kir_train['text'] = kir_train['title'] + separator + kir_train['content']

# Drop the source columns and make the labels 0-based
kir_train = kir_train.drop(columns=['title', 'content'])
kir_train['label'] = kir_train['label'] - 1

# Preview the frame that was just built (this used to print kir_test by mistake)
print(kir_train.head())

In [None]:
# Train/validation split of the Kirundi training frame: 80/20, fixed seed.
# Note that this rebinds the train_*/val_* names used for Kinyarwanda earlier.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    kir_train['text'].astype(str),
    kir_train['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Same AfriBERTa tokenizer as used for the Kinyarwanda run.
tokenizer = AutoTokenizer.from_pretrained("castorini/afriberta_large")

# Encode the Kirundi training texts, then the validation texts, truncating at
# 128 tokens and padding within each call.
train_encodings = tokenizer(
    train_texts.tolist(),
    max_length=128,
    padding=True,
    truncation=True,
)
val_encodings = tokenizer(
    val_texts.tolist(),
    max_length=128,
    padding=True,
    truncation=True,
)

In [None]:
# Dataset wrapper for the Kirundi fine-tuning run
class KirundiDataset(torch.utils.data.Dataset):
    """Index-addressable view over tokenizer encodings and their labels."""

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label sequence as given."""
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """The dataset has as many examples as there are labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the tensor dict for example ``idx``, adding the label under 'labels'."""
        tensors = {name: torch.tensor(column[idx]) for name, column in self.encodings.items()}
        return {**tensors, 'labels': torch.tensor(self.labels[idx])}

In [None]:
# Pair each Kirundi split's encodings with its labels as a plain Python list.
kirundi_train_labels = train_labels.tolist()
train_dataset = KirundiDataset(train_encodings, kirundi_train_labels)

kirundi_val_labels = val_labels.tolist()
val_dataset = KirundiDataset(val_encodings, kirundi_val_labels)

In [None]:
#Defining the training parameters
import torch

# Use the Apple-silicon GPU backend (MPS) when available, otherwise fall back
# to the CPU so `device` is always defined before the `.to(device)` call below.
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Start from the Kinyarwanda-trained checkpoint rather than the base model
# (the redundant double assignment `model = model = ...` has been removed).
model = AutoModelForSequenceClassification.from_pretrained('/Users/casarulez/Projects/DDSI/AfriBERT/results/checkpoint-1000', num_labels=14).to(device)

# Define training arguments
# NOTE(review): checkpoints are written under .../BantuBERT/results/kirundi,
# but later cells load .../AfriBERT/results/kirundi/checkpoint-500. Confirm
# which directory is intended.
training_args = TrainingArguments(
    output_dir='/Users/casarulez/Projects/DDSI/BantuBERT/results/kirundi',          # Output directory
    num_train_epochs=8,              # Number of training epochs
    per_device_train_batch_size=32,   # Batch size for training
    per_device_eval_batch_size=32,    # Batch size for evaluation
    warmup_steps=100,                # Number of warmup steps
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,                # Log every 10 optimizer steps
    load_best_model_at_end=True,     # Reload the best checkpoint when training ends
    evaluation_strategy='steps',     # Evaluate on a step schedule rather than per epoch
    no_cuda=True                     # Disable CUDA
)

# Build the Trainer for the Kirundi run from the checkpoint-initialised model,
# the Kirundi training arguments and the Kirundi train/validation datasets.
trainer = Trainer(
    args=training_args,           # TrainingArguments for this run
    model=model,                  # model to be fine-tuned
    eval_dataset=val_dataset,     # Kirundi validation split
    train_dataset=train_dataset,  # Kirundi training split
)

In [None]:
# Train the model
# Fine-tunes on the Kirundi datasets bound to `trainer`. As the last expression
# of the cell, the value returned by train() is shown as the cell output.
trainer.train()

In [None]:
# Stock AfriBERTa tokenizer plus the weights saved at step 500 of the Kirundi run.
# NOTE(review): the Kirundi TrainingArguments write to .../BantuBERT/results/kirundi,
# while this path is under .../AfriBERT/results/kirundi. Confirm the location.
tokenizer = AutoTokenizer.from_pretrained("castorini/afriberta_large")

checkpoint_path = '/Users/casarulez/Projects/DDSI/AfriBERT/results/kirundi/checkpoint-500'
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from tqdm import tqdm  # Import tqdm

# Fixed-length (512-token) encoding of the Kirundi test texts.
max_length = 512
encodings = tokenizer(
    kir_test['text'].tolist(),
    return_tensors='pt',
    max_length=max_length,
    padding='max_length',
    truncation=True,
)
labels = torch.tensor(kir_test['label'].values)

# Iterate over the test set in order, 32 examples at a time.
dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)
dataloader = DataLoader(dataset, batch_size=32)

# Run the fine-tuned model in inference mode and gather predictions and labels.
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for input_ids, attention_mask, label_ids in tqdm(dataloader, desc="Processing Batches"):
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=-1)  # predicted class id per example
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(label_ids.cpu().numpy())

# Accuracy and weighted-average F1 on the Kirundi test set.
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

## Testing forgetting on Kinyarwanda

In [7]:
# Rebuild the Kinyarwanda test frame: combined "text" column, 0-based labels,
# and the title/content source columns removed.
separator = " [SEP] "
df_test = (
    pd.read_csv(kinn_test_path)
    .assign(
        text=lambda frame: frame['title'] + separator + frame['content'],
        label=lambda frame: frame['label'] - 1,
    )
    .drop(columns=['title', 'content'])
)

print(df_test.head())


   label                                               text
0      1  ikipe y’ u rwanda amavubi yahesheje u rwanda a...
1     10  urubyiruko itorero erc giterane cy’ububyutse k...
2      3  rusizi bambaye udupfukamunwa n’ubwo bamwe bata...
3      4  abanyarwanda batatu begukanye ibihembo pam awa...
4     10  light family choir igiye gukora igitaramo cy’a...


In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
# Weights after Kirundi fine-tuning (step-500 checkpoint), loaded here to be
# evaluated on Kinyarwanda; the tokenizer is the stock AfriBERTa one.
checkpoint_path = '/Users/casarulez/Projects/DDSI/AfriBERT/results/kirundi/checkpoint-500'
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

tokenizer = AutoTokenizer.from_pretrained("castorini/afriberta_large")



In [9]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from tqdm import tqdm  # Import tqdm

# Token length every test text is truncated or padded to.
max_length = 512

# Fixed-length PyTorch encodings of the Kinyarwanda test texts.
kinyarwanda_texts = df_test['text'].tolist()
encodings = tokenizer(
    kinyarwanda_texts,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='pt',
)
labels = torch.tensor(df_test['label'].values)

# Batches of 32 (input_ids, attention_mask, label) triples, in file order.
dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)
dataloader = DataLoader(dataset, batch_size=32)

# Score the Kirundi-fine-tuned model on Kinyarwanda without tracking gradients.
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in tqdm(dataloader, desc="Processing Batches"):
        input_ids, attention_mask, label_ids = batch
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        preds = torch.argmax(logits, dim=-1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(label_ids.cpu().numpy())

# Accuracy and weighted-average F1 over the whole test set.
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processing Batches: 100%|█████████████████████| 133/133 [13:53<00:00,  6.26s/it]

Accuracy: 0.8061
F1 Score: 0.7986



