<a href="https://colab.research.google.com/github/smaliyu/AfriNLP/blob/main/hausa_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Necessary imports
try:
  import wandb
except:
  !pip install -q wandb
  import wandb
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (XLMRobertaTokenizer, XLMRobertaForSequenceClassification,
                          BertTokenizer, BertForSequenceClassification,
                          AutoTokenizer, AutoModelForSequenceClassification)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
import os


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.4/196.4 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.7/257.7 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [3]:
# Define hyperparameters in a config dictionary
config = dict(
    epochs=3,
    batch_size=16,
    learning_rate=5e-5,
    dataset="Igbo Tweets",
    architecture="Transformer-based",
    num_labels=None,
)



In [4]:
# Define LabelEncoder
label_encoder = LabelEncoder()

# Load and preprocess datasets
def load_and_preprocess_data(language):
    train_data = pd.read_csv(f'https://raw.githubusercontent.com/smaliyu/AfriNLP/main/datasets/{language}/{language}_train.csv')
    test_data = pd.read_csv(f'https://raw.githubusercontent.com/smaliyu/AfriNLP/main/datasets/{language}/{language}_test.csv')
    train_texts = train_data['tweet'].tolist()
    train_labels = label_encoder.fit_transform(train_data['label'].tolist())
    test_texts = test_data['tweet'].tolist()
    test_labels = label_encoder.transform(test_data['label'].tolist())
    return train_texts, train_labels, test_texts, test_labels

# Tokenization function
def tokenize(texts, tokenizer, max_length=128):
    return tokenizer(texts, add_special_tokens=True, max_length=max_length,
                     padding='max_length', return_attention_mask=True, truncation=True)

class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)



# Training Loop
def train(model, train_loader, optimizer, device, num_epochs):
    model.train()
    for epoch in tqdm(range(num_epochs), desc="Epochs"):
        total_loss = 0
        for batch in train_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)
            optimizer.zero_grad()
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        avg_train_loss = total_loss / len(train_loader)
        wandb.log({"epoch": epoch, "train_loss": avg_train_loss})

# Evaluation Loop
def evaluate(model, test_loader, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in test_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            outputs = model(**inputs)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
    return predictions


In [5]:
# Initialize WandB with the config
wandb.init(project="hate-models", config=config)
config = wandb.config

    # Models and languages
models = ["xlm-roberta-base", "bert-base-multilingual-cased","morit/XLM-T-full-xnli","Davlan/naija-twitter-sentiment-afriberta-large"]
languages = ["hausa"]
results = []
# Loop over models and languages
for model_name in models:
     # Update tokenizer in config based on current model
    wandb.config.update({"tokenizer": model_name}, allow_val_change=True)


    for language in languages:
            print(f"Training {model_name} on {language} dataset")
            train_texts, train_labels, test_texts, test_labels = load_and_preprocess_data(language)

            # Update number of labels in config based on current dataset
            wandb.config.update({"num_labels": len(np.unique(train_labels))}, allow_val_change=True)

            # Tokenizer and Model
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=wandb.config.num_labels)

             # Tokenize data
            train_encodings = tokenize(train_texts, tokenizer)
            test_encodings = tokenize(test_texts, tokenizer)

            # Create Datasets and DataLoaders
            train_dataset = TextDataset(train_encodings, train_labels)
            test_dataset = TextDataset(test_encodings, test_labels)
            train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
            test_loader = DataLoader(test_dataset, batch_size=config.batch_size)

            # Define optimizer
            optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)

            # Device setup
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            model.to(device)

            # Add WandB watch
            wandb.watch(model, log='all', log_freq=10)

            # Train
            train(model, train_loader, optimizer, device, config.epochs)
            predictions = evaluate(model, test_loader, device)

            # Calculate evaluation metrics
            test_accuracy = accuracy_score(test_labels, predictions)
            test_precision = precision_score(test_labels, predictions, average='macro')
            test_recall = recall_score(test_labels, predictions, average='macro')
            test_f1 = f1_score(test_labels, predictions, average='macro')

            # Store results in dictionary
            result = {
              "Model": model_name,
               "Language": language,
              "Accuracy": test_accuracy,
              "Precision": test_precision,
              "Recall": test_recall,
              "F1-score": test_f1
              }

            # Append results to list
            results.append(result)


            wandb.log({"test_accuracy": test_accuracy})

            print(f"Test Accuracy for {model_name} on {language}: {test_accuracy}")

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Save results to CSV
results_df.to_csv("hausa_model_evaluation_results.csv", index=False)



# Finish WandB run
wandb.finish()



[34m[1mwandb[0m: Currently logged in as: [33mlukmanaj[0m. Use [1m`wandb login --relogin`[0m to force relogin


Training xlm-roberta-base on hausa dataset


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Test Accuracy for xlm-roberta-base on hausa: 0.8245749613601236
Training bert-base-multilingual-cased on hausa dataset


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Test Accuracy for bert-base-multilingual-cased on hausa: 0.8176197836166924
Training morit/XLM-T-full-xnli on hausa dataset


tokenizer_config.json:   0%|          | 0.00/524 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

Test Accuracy for morit/XLM-T-full-xnli on hausa: 0.8253477588871716
Training Davlan/naija-twitter-sentiment-afriberta-large on hausa dataset


tokenizer_config.json:   0%|          | 0.00/429 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.55M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/503M [00:00<?, ?B/s]

Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

Test Accuracy for Davlan/naija-twitter-sentiment-afriberta-large on hausa: 0.8222565687789799


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▅█▁▅█▁▅█▁▅█
test_accuracy,▇▁█▅
train_loss,█▆▄▇▅▅▇▄▃▆▃▁

0,1
epoch,2.0
test_accuracy,0.82226
train_loss,0.18975
