## Initial Dependencies and modules

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments


## Importing dataset

In [4]:
kinn_train_path = 'KINNEWS_train.csv'
kinn_test_path = 'KINNEWS_test.csv'
embd_path =  'W2V-Kin-100.txt'

## Dataset preprocessing

In [7]:
import pandas as pd


df = pd.read_csv(kinn_train_path)


# Concatenating title and content
separator = " [SEP] "
df['text'] = df['title'] + separator + df['content']
df = df.drop(columns=['title', 'content'])

print(df.head())


Index(['label', 'title', 'content'], dtype='object')
   label                                               text
0      3  bugesera nyuma yâ€™inzara yaharanzwe karahahira ...
1      5  mutzig beer fest itegerejwe nâ€™abantu benshi ki...
2      2  abakinnyi bamagare batanu berekeje isiganwa ny...
3      3  kireheumugabo afungiwe gufatanwa imiti irimo c...
4      7  sobanukirwa internet g izahindura ubuzima munt...


In [8]:
# Making the labels 0 based (0 to 13 instead of 1 to 14)
df['label'] = df['label'] - 1
df.head()

Unnamed: 0,label,text
0,2,bugesera nyuma yâ€™inzara yaharanzwe karahahira ...
1,4,mutzig beer fest itegerejwe nâ€™abantu benshi ki...
2,1,abakinnyi bamagare batanu berekeje isiganwa ny...
3,2,kireheumugabo afungiwe gufatanwa imiti irimo c...
4,6,sobanukirwa internet g izahindura ubuzima munt...


In [9]:
#Train-val spit
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'].astype(str), df['label'], test_size=0.2, random_state=42)

In [67]:
#Tokenizing inputs
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize the input texts
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=128)

KeyboardInterrupt: 

In [12]:
#Defining the dataset for model training
class KinyarwandaDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [14]:
train_dataset = KinyarwandaDataset(train_encodings, train_labels.tolist())
val_dataset = KinyarwandaDataset(val_encodings, val_labels.tolist())

## Training on Kinyarwanda

In [29]:
#Defining the training parameters
import torch

if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")


model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=14).to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir='/Users/casarulez/Projects/DDSI/mBERT/results',          # Output directory
    num_train_epochs=8,              # Number of training epochs
    per_device_train_batch_size=32,   # Batch size for training
    per_device_eval_batch_size=32,    # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,
    load_best_model_at_end=True,
    evaluation_strategy='steps',
    no_cuda=True                     # Disable CUDA
)

# Create Trainer instance
trainer = Trainer(
    model=model,                         # The instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                  # Training arguments, defined above
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=val_dataset,            # Evaluation dataset
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Train the model
trainer.train()

Step,Training Loss,Validation Loss
10,2.6232,2.606862
20,2.5854,2.558029
30,2.5324,2.484083
40,2.4654,2.380598
50,2.2983,2.24652
60,2.2443,2.207114
70,2.1617,2.150123
80,2.1607,2.123417
90,2.1077,2.07734
100,2.0565,2.027808


KeyboardInterrupt: 

## Preparing the test dataset

In [40]:
# Load the test dataset
df_test = pd.read_csv(kinn_test_path)

separator = " [SEP] "
df_test['text'] = df_test['title'] + separator + df_test['content']

df_test = df_test.drop(columns=['title', 'content'])
df_test['label'] = df_test['label'] - 1

print(df_test.head())


Index(['label', 'title', 'content'], dtype='object')
   label                                               text
0      1  ikipe yâ€™ u rwanda amavubi yahesheje u rwanda a...
1     10  urubyiruko itorero erc giterane cyâ€™ububyutse k...
2      3  rusizi bambaye udupfukamunwa nâ€™ubwo bamwe bata...
3      4  abanyarwanda batatu begukanye ibihembo pam awa...
4     10  light family choir igiye gukora igitaramo cyâ€™a...


## Loading the model from the optimum checkpoint and testing metrics - Kinyarwanda

In [54]:
from transformers import BertTokenizer, BertForSequenceClassification

checkpoint_path = '/Users/casarulez/Projects/DDSI/mBERT/results/checkpoint-1000'

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained(checkpoint_path)

In [57]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from tqdm import tqdm  # Import tqdm

# Tokenize the test data
encodings = tokenizer(list(df_test['text']), truncation=True, padding=True, return_tensors='pt')
labels = torch.tensor(df_test['label'].values)

# Create a DataLoader
dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)
dataloader = DataLoader(dataset, batch_size=32)

# Evaluate the model
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in tqdm(dataloader, desc="Processing Batches"):
        input_ids, attention_mask, label_ids = batch
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(label_ids.cpu().numpy())

# Calculate accuracy and F1 score
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")


Processing Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 133/133 [07:26<00:00,  3.35s/it]

Accuracy: 0.7884
F1 Score: 0.7747





## Random prediction - Kinyarwanda

In [66]:
num_samples = 5
random_indices = np.random.choice(len(df_test), num_samples, replace=False)

# Extract texts and labels for these indices
sample_texts = df_test.iloc[random_indices]['text'].tolist()
sample_labels = df_test.iloc[random_indices]['label'].tolist()

# Tokenize the sample texts
encodings = tokenizer(sample_texts, truncation=True, padding=True, return_tensors='pt')

# Prepare inputs for the model
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']
labels = torch.tensor(sample_labels)

# Make predictions
model.eval()
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=-1).cpu().numpy()

# Print the results
for idx, (text, true_label, pred) in enumerate(zip(sample_texts, sample_labels, preds)):
    print(f"Sample {idx + 1}:")
    print(f"Text: {text}")
    print(f"True Label: {true_label}")
    print(f"Predicted Label: {pred}")
    print()

Sample 1:
Text: imikorere yâ€™amahoteli rwanda ntiri rwego mpuzamahanga [SEP] mu itangazo yashyize ahagaragara cyumweru gishize rdb ivuga amahoteli abiri gusa serena hotels gorillas group mahoteli rwanda ariyo yujuje ibisabwa aka karere eric musanganya ukuriye amashyirahamwe yâ€™amahoteli rwanda yatangarije ikinyamakuru the east african uru rwego rukwiye kwisubiraho yavuze amahugurwa kwakira abantu guteka nayo yongerewe ayo mahugurwa agamije kongera ubushobozi guteka gutanga serivisi kurinda ingo kwakira abantu umwe bahawe mahugurwa odette kabarayinga yatangaje kuba mujyi kigali hagenda hagaragara amahoteli rwego mpuzamahanga bisaba guhindura imikorere haba ruhande rwâ€™abakozi ruhande rwâ€™abanyiramahoteli ati dukwiye kubaha mahugurwa tubishyizeho umwete kugira adufashe kongera ubumenyi bwâ€™uburyo duha serivisi abakiriya bacu abanyarwanda cyangwa abanyamahanga emmanuel n hitimana
True Label: 7
Predicted Label: 7

Sample 2:
Text: muhanga inyama zikomeje guhenda [SEP] mukamudenge umwe 

## Loading the Kirundi test dataset

In [71]:
# Load the test dataset
kir_test = pd.read_csv('/Users/casarulez/Projects/DDSI/KIRNEWS/cleaned/test.csv')

separator = " [SEP] "
kir_test['text'] = kir_test['title'] + separator + kir_test['content']

kir_test = kir_test.drop(columns=['title', 'content'])
kir_test['label'] = kir_test['label'] - 1

print(kir_test.head())


   label                                               text
0      1  intamba mâ€™urugamba zâ€™abakobwa zaryoheye abarun...
1      0  unviriza hano ikiganiro câ€™umukuru wâ€™igihugu ya...
2     13  ibintu bidasanzwe wokorera umukunzi wawe bikam...
3     12  na nyamwema bitwararitswe bokwiga nkâ€™abandi [S...
4      0  sentare ijejwe kubahiriza ibwirizwa nshingiro ...


## Loading the model from the optimum checkpoint and testing metrics - Kirundi (Before fine tuning)

In [72]:
from transformers import BertTokenizer, BertForSequenceClassification

checkpoint_path = '/Users/casarulez/Projects/DDSI/mBERT/results/checkpoint-1000'

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained(checkpoint_path)

In [73]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from tqdm import tqdm  # Import tqdm

# Tokenize the test data
encodings = tokenizer(list(kir_test['text']), truncation=True, padding=True, return_tensors='pt')
labels = torch.tensor(kir_test['label'].values)

# Create a DataLoader
dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)
dataloader = DataLoader(dataset, batch_size=32)

# Evaluate the model
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in tqdm(dataloader, desc="Processing Batches"):
        input_ids, attention_mask, label_ids = batch
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(label_ids.cpu().numpy())

# Calculate accuracy and F1 score
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")


Processing Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 29/29 [01:35<00:00,  3.28s/it]

Accuracy: 0.5872
F1 Score: 0.5917





## Random prediction - Kirundi

In [75]:
num_samples = 5
random_indices = np.random.choice(len(df_test), num_samples, replace=False)

# Extract texts and labels for these indices
sample_texts = kir_test.iloc[random_indices]['text'].tolist()
sample_labels = kir_test.iloc[random_indices]['label'].tolist()

# Tokenize the sample texts
encodings = tokenizer(sample_texts, truncation=True, padding=True, return_tensors='pt')

# Prepare inputs for the model
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']
labels = torch.tensor(sample_labels)

# Make predictions
model.eval()
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=-1).cpu().numpy()

# Print the results
for idx, (text, true_label, pred) in enumerate(zip(sample_texts, sample_labels, preds)):
    print(f"Sample {idx + 1}:")
    print(f"Text: {text}")
    print(f"True Label: {true_label}")
    print(f"Predicted Label: {pred}")
    print()

Sample 1:
Text: sadio man ngo bazogerageza gutsinda fc barcelona [SEP] mugihe hasigaye indwi gusa kugira liverpool yakirwe fc barcelona i camp nou mugice ca kane gishira icanyuma champions league sadio man umunya sngal akinira liverpool avugako bazogerageza gutsinda fc barcelona nubwo umurwi ukomeye fc barcelone numwe m mirwi myiza kuriyi isi ariko tuzogerageza tubatsinde ivyo vyandikwa nâ€™ikinyamakuru la vie sngalaise ni mugihe barcelona imaze inkino idatsindirwa i camp nou kwâ€™itariki rusama mwaka liverpool izoba ifise igikorwa gikomeye
True Label: 1
Predicted Label: 1

Sample 2:
Text: ikiziga câ€™umukinyi mpuzamakungu wâ€™umurundi papy fatty cashikanywe kibuga câ€™indege a bujumbura [SEP] inyuma yâ€™indwi zitatu umukinyi mpuzamakungu wâ€™umurundi yakinira umurwi nserukira gihugu wâ€™uburundi mupira wâ€™amaguruintamba mu rugamba papy fatty asandabiye mu kibuga kubera indwara yâ€™umutima mu gihe yariko arakinira umurwi wiwe malanti chiefs mu cicaro ca mbere mu gihugu a eswatini caho

# Fine tuning on Kirundi

## Loading the Kirundi training dataset

In [76]:
# Load the test dataset
kir_train = pd.read_csv('/Users/casarulez/Projects/DDSI/KIRNEWS/cleaned/train.csv')

separator = " [SEP] "
kir_train['text'] = kir_train['title'] + separator + kir_train['content']

kir_train = kir_train.drop(columns=['title', 'content'])
kir_train['label'] = kir_train['label'] - 1

print(kir_test.head())

   label                                               text
0      1  intamba mâ€™urugamba zâ€™abakobwa zaryoheye abarun...
1      0  unviriza hano ikiganiro câ€™umukuru wâ€™igihugu ya...
2     13  ibintu bidasanzwe wokorera umukunzi wawe bikam...
3     12  na nyamwema bitwararitswe bokwiga nkâ€™abandi [S...
4      0  sentare ijejwe kubahiriza ibwirizwa nshingiro ...


In [77]:
#Train-val spit
train_texts, val_texts, train_labels, val_labels = train_test_split(kir_train['text'].astype(str), kir_train['label'], test_size=0.2, random_state=42)

In [78]:
#Tokenizing inputs
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize the input texts
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=128)

In [79]:
#Defining the dataset for model training
class KirundiDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [80]:
train_dataset = KirundiDataset(train_encodings, train_labels.tolist())
val_dataset = KirundiDataset(val_encodings, val_labels.tolist())

In [87]:
#Defining the training parameters
import torch

if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")


model = BertForSequenceClassification.from_pretrained('/Users/casarulez/Projects/DDSI/mBERT/results/checkpoint-1000', num_labels=14).to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir='/Users/casarulez/Projects/DDSI/mBERT/results/kirundi',          # Output directory
    num_train_epochs=8,              # Number of training epochs
    per_device_train_batch_size=32,   # Batch size for training
    per_device_eval_batch_size=32,    # Batch size for evaluation
    warmup_steps=100,                # Number of warmup steps
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,
    load_best_model_at_end=True,
    evaluation_strategy='steps',
    no_cuda=True                     # Disable CUDA
)

# Create Trainer instance
trainer = Trainer(
    model=model,                         # The instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                  # Training arguments, defined above
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=val_dataset,            # Evaluation dataset
)

In [88]:
# Train the model
trainer.train()

Step,Training Loss,Validation Loss
10,1.6867,1.434821
20,1.3633,1.33871
30,1.1579,1.233007
40,1.2281,1.105718
50,1.0547,0.99402
60,0.909,0.990086
70,0.9877,0.916449
80,0.9117,0.860813
90,0.87,0.900843
100,0.7239,0.783481


KeyboardInterrupt: 

In [89]:
from transformers import BertTokenizer, BertForSequenceClassification

checkpoint_path = '/Users/casarulez/Projects/DDSI/mBERT/results/kirundi/checkpoint-500'

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained(checkpoint_path)

In [90]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from tqdm import tqdm  # Import tqdm

# Tokenize the test data
encodings = tokenizer(list(kir_test['text']), truncation=True, padding=True, return_tensors='pt')
labels = torch.tensor(kir_test['label'].values)

# Create a DataLoader
dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)
dataloader = DataLoader(dataset, batch_size=32)

# Evaluate the model
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in tqdm(dataloader, desc="Processing Batches"):
        input_ids, attention_mask, label_ids = batch
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(label_ids.cpu().numpy())

# Calculate accuracy and F1 score
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

Processing Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 29/29 [01:34<00:00,  3.27s/it]

Accuracy: 0.8462
F1 Score: 0.8422



