<a href="https://colab.research.google.com/github/smaliyu/AfriNLP/blob/main/hausa_yoruba_hate_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Attempt to fine-tune hate detection models on tweets while trying out wandb.

In [1]:
# Necessary imports
try:
  import wandb
except:
  !pip install -q wandb
  import wandb
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (XLMRobertaTokenizer, XLMRobertaForSequenceClassification,
                          BertTokenizer, BertForSequenceClassification,
                          AutoTokenizer, AutoModelForSequenceClassification)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
import os


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.4/196.4 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.5/257.5 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Authenticate with Weights & Biases before any run is started.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [3]:
# Initialize WandB: start a run under the "hate-models" project.
wandb.init(project="hate-models")


[34m[1mwandb[0m: Currently logged in as: [33mlukmanaj[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Existing functions (load_dataset, preprocess_data, tokenize, TextDataset)
def load_dataset(language):
    """Download the train and test CSV splits for ``language``.

    Args:
        language: Name used both as the dataset folder and as the file-name
            prefix in the AfriNLP GitHub URL (e.g. ``'hausa'``).

    Returns:
        A ``(train_data, test_data)`` tuple of pandas DataFrames.
    """
    train_url = f'https://raw.githubusercontent.com/smaliyu/AfriNLP/main/datasets/{language}/{language}_train.csv'
    test_url = f'https://raw.githubusercontent.com/smaliyu/AfriNLP/main/datasets/{language}/{language}_test.csv'
    # The train split is fetched first, then the test split.
    return pd.read_csv(train_url), pd.read_csv(test_url)

def preprocess_data(data, label_encoder, fit=True):
    """Extract tweet texts and integer-encoded labels from a dataset frame.

    Args:
        data: DataFrame-like object with a ``'tweet'`` column and a ``'label'``
            column.
        label_encoder: Encoder exposing ``fit_transform`` (and ``transform``
            when ``fit`` is false), e.g. ``sklearn.preprocessing.LabelEncoder``.
        fit: When true (the default, which matches the previous behaviour) the
            encoder is re-fitted on this split's labels. Pass ``False`` for
            test/validation splits so they reuse the mapping already learned
            from the training split instead of learning a new one.

    Returns:
        A ``(texts, labels)`` tuple: the tweets as a list, and the encoded
        labels exactly as returned by the encoder.
    """
    texts = data['tweet'].tolist()
    raw_labels = data['label'].tolist()
    if fit:
        labels = label_encoder.fit_transform(raw_labels)
    else:
        # Reuse the existing label -> id mapping; the encoder itself decides
        # how to react to labels it has not seen before.
        labels = label_encoder.transform(raw_labels)
    return texts, labels

# Tokenization function
def tokenize(texts, tokenizer, max_length=128):
    """Encode ``texts`` by calling ``tokenizer`` with fixed-length settings.

    Args:
        texts: The texts to encode; passed to the tokenizer unchanged.
        tokenizer: Callable tokenizer that accepts the keyword options below.
        max_length: Length requested for padding and truncation.

    Returns:
        Whatever the tokenizer call returns.
    """
    # Request special tokens, padding up to `max_length`, truncation, and an
    # attention mask in the output.
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
    )
    return tokenizer(texts, **encode_options)

# Custom dataset class to prepare dataset for the encoder model
class TextDataset(Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example indexable
    sequence; ``labels`` is an indexable sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``'labels'`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


In [7]:
# Define LabelEncoder
# A single encoder instance is shared by every preprocess_data call below.
label_encoder = LabelEncoder()

# Load and preprocess datasets
hausa_train, hausa_test = load_dataset('hausa')
yoruba_train, yoruba_test = load_dataset('yoruba')

# NOTE(review): preprocess_data calls fit_transform, so the shared encoder is
# re-fitted on each split in turn (train and test alike). The integer ids are
# presumably consistent across splits only if every split contains the same
# set of label values - TODO confirm against the CSVs.
hausa_texts, hausa_labels = preprocess_data(hausa_train, label_encoder)
yoruba_texts, yoruba_labels = preprocess_data(yoruba_train, label_encoder)
hausa_test_texts, hausa_test_labels = preprocess_data(hausa_test, label_encoder)
yoruba_test_texts, yoruba_test_labels = preprocess_data(yoruba_test,label_encoder)

In [8]:
# Training Loop
def train(model, train_loader, optimizer, device, num_epochs):
    """Fine-tune ``model`` for ``num_epochs`` passes over ``train_loader``.

    Every batch is moved to ``device``; the loss returned by the model (computed
    from the batch's ``'labels'`` entry) is back-propagated, gradients are
    clipped to a norm of 1.0, and the optimizer takes a step. After each epoch
    the mean batch loss is logged to wandb as ``train_loss`` together with the
    zero-based ``epoch`` index. Nothing is returned.
    """
    model.train()
    epoch_bar = tqdm(range(num_epochs), desc="Epochs")
    for epoch in epoch_bar:
        running_loss = 0
        for batch in train_loader:
            # Move the whole batch to the device, then separate the targets
            # from the keyword inputs the model expects.
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            targets = on_device.pop('labels')

            optimizer.zero_grad()
            loss = model(**on_device, labels=targets).loss
            running_loss += loss.item()
            loss.backward()
            # Clip before stepping so the update uses the clipped gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        wandb.log({"epoch": epoch, "train_loss": running_loss / len(train_loader)})

# Evaluation Loop
def evaluate(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and collect one prediction per example.

    The model is switched to eval mode and called without gradient tracking.
    Any ``'labels'`` entry in a batch is ignored; the remaining entries are
    moved to ``device`` and passed as keyword arguments.

    Returns:
        A list with the arg-max class index (along dim 1 of the logits) for
        each example, in loader order.
    """
    model.eval()
    predicted = []
    with torch.no_grad():
        for batch in test_loader:
            features = {}
            for name, tensor in batch.items():
                if name == 'labels':
                    continue
                features[name] = tensor.to(device)
            batch_preds = model(**features).logits.argmax(dim=1)
            predicted += list(batch_preds.cpu().numpy())
    return predicted


In [9]:
# Function to perform training and evaluation for a given model and dataset
def train_and_evaluate(model_name, train_texts, train_labels, test_texts, test_labels, language,
                       num_epochs=3, batch_size=16, learning_rate=5e-5, max_length=128):
    """Fine-tune a pretrained sequence classifier and report its test accuracy.

    Args:
        model_name: Checkpoint name passed to ``AutoTokenizer`` and
            ``AutoModelForSequenceClassification``.
        train_texts: Training texts.
        train_labels: Encoded training labels; the number of distinct values
            determines ``num_labels`` of the classification head.
        test_texts: Test texts.
        test_labels: Encoded test labels, compared against the predictions.
        language: Name of the dataset language; used only in the printed summary.
        num_epochs: Number of training epochs (default 3, as before).
        batch_size: Batch size for both data loaders (default 16, as before).
        learning_rate: AdamW learning rate (default 5e-5, as before).
        max_length: Padding/truncation length passed to ``tokenize``
            (default 128, as before).

    Returns:
        The test accuracy. It is also logged to wandb as ``test_accuracy`` and
        printed; earlier versions returned nothing, so existing callers that
        ignore the result are unaffected.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(np.unique(train_labels)))

    # Tokenize data
    train_encodings = tokenize(train_texts, tokenizer, max_length)
    test_encodings = tokenize(test_texts, tokenizer, max_length)

    # Create Datasets and DataLoaders (only the training data is shuffled)
    train_dataset = TextDataset(train_encodings, train_labels)
    test_dataset = TextDataset(test_encodings, test_labels)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Define optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Device setup: use the GPU when one is available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    # Train
    train(model, train_loader, optimizer, device, num_epochs)

    # Evaluate
    predictions = evaluate(model, test_loader, device)
    test_accuracy = accuracy_score(test_labels, predictions)
    wandb.log({"test_accuracy": test_accuracy})

    print(f"Test Accuracy for {model_name} on {language}: {test_accuracy}")
    return test_accuracy



In [10]:
# Models to train
models = ["xlm-roberta-base", "bert-base-multilingual-cased"]
# Per-language tuple layout: (train_texts, train_labels, test_texts, test_labels)
languages = {"hausa": (hausa_texts, hausa_labels, hausa_test_texts, hausa_test_labels),
             "yoruba": (yoruba_texts, yoruba_labels, yoruba_test_texts, yoruba_test_labels)}

# Looping over models and languages: every model is trained and evaluated on
# every language, one combination after another.
for model_name in models:
    for language, (train_texts, train_labels, test_texts, test_labels) in languages.items():
        print(f"Training {model_name} on {language} dataset")
        train_and_evaluate(model_name, train_texts, train_labels, test_texts, test_labels, language)

# Finish WandB run (called once, after all combinations have been processed)
wandb.finish()


Training xlm-roberta-base on hausa dataset


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

Test Accuracy for xlm-roberta-base on hausa: 0.6081916537867079
Training xlm-roberta-base on yoruba dataset


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

Test Accuracy for xlm-roberta-base on yoruba: 0.8286004056795132
Training bert-base-multilingual-cased on hausa dataset


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

Test Accuracy for bert-base-multilingual-cased on hausa: 0.8153013910355487
Training bert-base-multilingual-cased on yoruba dataset


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

Test Accuracy for bert-base-multilingual-cased on yoruba: 0.8164300202839757


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▅█▁▅█▁▅█▁▅█
test_accuracy,▁███
train_loss,███▇▄▂▅▂▁▆▄▂

0,1
epoch,2.0
test_accuracy,0.81643
train_loss,0.46165


In [None]:
# @title All the code in one place
# Necessary imports
try:
  import wandb
except:
  !pip install -q wandb
  import wandb
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (XLMRobertaTokenizer, XLMRobertaForSequenceClassification,
                          BertTokenizer, BertForSequenceClassification,
                          AutoTokenizer, AutoModelForSequenceClassification)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
import os

# Initialize WandB: start a run under the "hate-models" project.
wandb.init(project="hate-models")

# Existing functions (load_dataset, preprocess_data, tokenize, TextDataset)
def load_dataset(language):
    """Download the train and test CSV splits for ``language``.

    Args:
        language: Name used both as the dataset folder and as the file-name
            prefix in the AfriNLP GitHub URL (e.g. ``'hausa'``).

    Returns:
        A ``(train_data, test_data)`` tuple of pandas DataFrames.
    """
    train_url = f'https://raw.githubusercontent.com/smaliyu/AfriNLP/main/datasets/{language}/{language}_train.csv'
    test_url = f'https://raw.githubusercontent.com/smaliyu/AfriNLP/main/datasets/{language}/{language}_test.csv'
    # The train split is fetched first, then the test split.
    return pd.read_csv(train_url), pd.read_csv(test_url)

def preprocess_data(data, label_encoder, fit=True):
    """Extract tweet texts and integer-encoded labels from a dataset frame.

    Args:
        data: DataFrame-like object with a ``'tweet'`` column and a ``'label'``
            column.
        label_encoder: Encoder exposing ``fit_transform`` (and ``transform``
            when ``fit`` is false), e.g. ``sklearn.preprocessing.LabelEncoder``.
        fit: When true (the default, which matches the previous behaviour) the
            encoder is re-fitted on this split's labels. Pass ``False`` for
            test/validation splits so they reuse the mapping already learned
            from the training split instead of learning a new one.

    Returns:
        A ``(texts, labels)`` tuple: the tweets as a list, and the encoded
        labels exactly as returned by the encoder.
    """
    texts = data['tweet'].tolist()
    raw_labels = data['label'].tolist()
    if fit:
        labels = label_encoder.fit_transform(raw_labels)
    else:
        # Reuse the existing label -> id mapping; the encoder itself decides
        # how to react to labels it has not seen before.
        labels = label_encoder.transform(raw_labels)
    return texts, labels

# Tokenization function
def tokenize(texts, tokenizer, max_length=128):
    """Encode ``texts`` by calling ``tokenizer`` with fixed-length settings.

    Args:
        texts: The texts to encode; passed to the tokenizer unchanged.
        tokenizer: Callable tokenizer that accepts the keyword options below.
        max_length: Length requested for padding and truncation.

    Returns:
        Whatever the tokenizer call returns.
    """
    # Request special tokens, padding up to `max_length`, truncation, and an
    # attention mask in the output.
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
    )
    return tokenizer(texts, **encode_options)

# Custom dataset class to prepare dataset for the encoder model
class TextDataset(Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example indexable
    sequence; ``labels`` is an indexable sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``'labels'`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample



# Training Loop
def train(model, train_loader, optimizer, device, num_epochs):
    """Fine-tune ``model`` for ``num_epochs`` passes over ``train_loader``.

    Every batch is moved to ``device``; the loss returned by the model (computed
    from the batch's ``'labels'`` entry) is back-propagated, gradients are
    clipped to a norm of 1.0, and the optimizer takes a step. After each epoch
    the mean batch loss is logged to wandb as ``train_loss`` together with the
    zero-based ``epoch`` index. Nothing is returned.
    """
    model.train()
    epoch_bar = tqdm(range(num_epochs), desc="Epochs")
    for epoch in epoch_bar:
        running_loss = 0
        for batch in train_loader:
            # Move the whole batch to the device, then separate the targets
            # from the keyword inputs the model expects.
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            targets = on_device.pop('labels')

            optimizer.zero_grad()
            loss = model(**on_device, labels=targets).loss
            running_loss += loss.item()
            loss.backward()
            # Clip before stepping so the update uses the clipped gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        wandb.log({"epoch": epoch, "train_loss": running_loss / len(train_loader)})

# Evaluation Loop
def evaluate(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and collect one prediction per example.

    The model is switched to eval mode and called without gradient tracking.
    Any ``'labels'`` entry in a batch is ignored; the remaining entries are
    moved to ``device`` and passed as keyword arguments.

    Returns:
        A list with the arg-max class index (along dim 1 of the logits) for
        each example, in loader order.
    """
    model.eval()
    predicted = []
    with torch.no_grad():
        for batch in test_loader:
            features = {}
            for name, tensor in batch.items():
                if name == 'labels':
                    continue
                features[name] = tensor.to(device)
            batch_preds = model(**features).logits.argmax(dim=1)
            predicted += list(batch_preds.cpu().numpy())
    return predicted

# Function to perform training and evaluation for a given model and dataset
def train_and_evaluate(model_name, train_texts, train_labels, test_texts, test_labels, language,
                       num_epochs=3, batch_size=16, learning_rate=5e-5, max_length=128):
    """Fine-tune a pretrained sequence classifier and report its test accuracy.

    Args:
        model_name: Checkpoint name passed to ``AutoTokenizer`` and
            ``AutoModelForSequenceClassification``.
        train_texts: Training texts.
        train_labels: Encoded training labels; the number of distinct values
            determines ``num_labels`` of the classification head.
        test_texts: Test texts.
        test_labels: Encoded test labels, compared against the predictions.
        language: Name of the dataset language; used only in the printed summary.
        num_epochs: Number of training epochs (default 3, as before).
        batch_size: Batch size for both data loaders (default 16, as before).
        learning_rate: AdamW learning rate (default 5e-5, as before).
        max_length: Padding/truncation length passed to ``tokenize``
            (default 128, as before).

    Returns:
        The test accuracy. It is also logged to wandb as ``test_accuracy`` and
        printed; earlier versions returned nothing, so existing callers that
        ignore the result are unaffected.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(np.unique(train_labels)))

    # Tokenize data
    train_encodings = tokenize(train_texts, tokenizer, max_length)
    test_encodings = tokenize(test_texts, tokenizer, max_length)

    # Create Datasets and DataLoaders (only the training data is shuffled)
    train_dataset = TextDataset(train_encodings, train_labels)
    test_dataset = TextDataset(test_encodings, test_labels)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Define optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Device setup: use the GPU when one is available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    # Train
    train(model, train_loader, optimizer, device, num_epochs)

    # Evaluate
    predictions = evaluate(model, test_loader, device)
    test_accuracy = accuracy_score(test_labels, predictions)
    wandb.log({"test_accuracy": test_accuracy})

    print(f"Test Accuracy for {model_name} on {language}: {test_accuracy}")
    return test_accuracy

# Load and preprocess datasets
# This cell is meant to hold all the code in one place, so the data preparation
# step is included here; without it the names used below (hausa_texts, ...)
# would only exist if an earlier cell had already been run.
label_encoder = LabelEncoder()

hausa_train, hausa_test = load_dataset('hausa')
yoruba_train, yoruba_test = load_dataset('yoruba')

hausa_texts, hausa_labels = preprocess_data(hausa_train, label_encoder)
yoruba_texts, yoruba_labels = preprocess_data(yoruba_train, label_encoder)
hausa_test_texts, hausa_test_labels = preprocess_data(hausa_test, label_encoder)
yoruba_test_texts, yoruba_test_labels = preprocess_data(yoruba_test, label_encoder)

# Models to train
models = ["xlm-roberta-base", "bert-base-multilingual-cased"]
# Per-language tuple layout: (train_texts, train_labels, test_texts, test_labels)
languages = {"hausa": (hausa_texts, hausa_labels, hausa_test_texts, hausa_test_labels),
             "yoruba": (yoruba_texts, yoruba_labels, yoruba_test_texts, yoruba_test_labels)}

# Loop over models and languages: every model is trained and evaluated on
# every language, one combination after another.
for model_name in models:
    for language, (train_texts, train_labels, test_texts, test_labels) in languages.items():
        print(f"Training {model_name} on {language} dataset")
        train_and_evaluate(model_name, train_texts, train_labels, test_texts, test_labels, language)

# Finish WandB run
wandb.finish()
