<a href="https://colab.research.google.com/github/smaliyu/AfriNLP/blob/main/hausa_base_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training a Base Model for Hate Speech Detection in Hausa Tweets

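In this notebook, I attempt to train a model that detects hate speech in tweets: a pretrained `xlm-roberta-base` encoder is fine-tuned to classify Hausa tweets into four classes (Hate, Offensive, Normal, Indeterminate).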

In [1]:
# Attach Google Drive to the Colab runtime so that files written under the
# mount point outlive the session.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Switch the working directory to the Drive folder so that everything created
# by the following cells (the repo clone, the saved weights) lands on Drive.
%cd /content/drive/MyDrive

/content/drive/MyDrive


In [14]:
# Clone the repo for access to the datasets and content.
# The clone target lives on Drive and therefore survives between sessions;
# `git clone` aborts when the target directory already exists, so only clone
# when no checkout is present yet. This keeps the cell safe to re-run.
![ -d AfriNLP/.git ] || git clone https://github.com/smaliyu/AfriNLP.git

Cloning into 'AfriNLP'...
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 8 (delta 0), reused 5 (delta 0), pack-reused 0[K
Receiving objects: 100% (8/8), 645.27 KiB | 2.14 MiB/s, done.


In [17]:
# Enter the cloned repository (the path is relative to the current working
# directory) so that later git commands and file writes operate on the repo.
%cd AfriNLP

/content/drive/MyDrive/AfriNLP


In [26]:
# Sanity check: print the current working directory (expected to be the AfriNLP repo root).
!pwd

/content/drive/MyDrive/AfriNLP


In [4]:
# necessary imports

from pathlib import Path

import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification



# Custom dataset class to prepare dataset for the encoder model

class TextDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: mapping from field name to an indexable collection holding
            one entry per example.
        labels: indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors.

        The dict holds one tensor per encoding field plus the label under
        the key 'labels'.
        """
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample




In [5]:
# Load the dataset.
# The CSV is looked up relative to the current working directory first (the
# repository root once the notebook has changed into the AfriNLP checkout).
# The previously hard-coded /content/AfriNLP checkout is kept as a fallback,
# so the cell works regardless of where the repo was cloned.
DATASET_RELATIVE_PATH = Path('datasets') / 'hausa(1).csv'
dataset_candidates = [
    Path.cwd() / DATASET_RELATIVE_PATH,
    Path('/content/AfriNLP') / DATASET_RELATIVE_PATH,
]
dataset_path = next((path for path in dataset_candidates if path.is_file()), None)
if dataset_path is None:
    raise FileNotFoundError(
        'Could not find the dataset; looked in: '
        + ', '.join(str(path) for path in dataset_candidates)
    )

# Rows without a tweet or a label can be neither tokenized nor label-encoded,
# so they are dropped up front instead of failing later in the pipeline.
data = pd.read_csv(dataset_path).dropna(subset=['tweet', 'label'])

# Extract texts and labels (tweets are forced to `str` for the tokenizer)
texts = data['tweet'].astype(str).tolist()
labels = data['label'].tolist()

# Label Encoding: map each distinct label value to an integer class id
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)



In [6]:
# To see the mapping of encoded labels to original labels.
# LabelEncoder encodes a label as its position in `classes_`, so enumerating
# the classes yields the mapping directly without a second `transform` call.
# `.tolist()` converts NumPy scalars into plain Python objects so the printed
# dict does not depend on the NumPy scalar repr.
label_mapping = {
    name: code for code, name in enumerate(label_encoder.classes_.tolist())
}
print(label_mapping)

{'Hate': 0, 'Indeterminate': 1, 'Normal': 2, 'Offensive': 3}


In [7]:
# Make CUDA kernel launches synchronous so that a device-side error is reported
# at the call that triggered it.
# NOTE(review): this is a debugging aid and presumably slows training down;
# confirm whether it is still needed for regular runs.
%env CUDA_LAUNCH_BLOCKING=1


env: CUDA_LAUNCH_BLOCKING=1


In [8]:
# Fine-tune xlm-roberta-base on the tweets and report accuracy on a held-out
# split. Uses `texts`, `encoded_labels` and `label_encoder` from the
# data-loading cell.

# Reproducibility: seed PyTorch so that the freshly initialised classification
# head and the shuffling order of the training DataLoader are repeatable.
SEED = 42
torch.manual_seed(SEED)

# Tokenization
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
max_length = 128

# Split the data into train and test sets first
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, encoded_labels, random_state=SEED, test_size=0.2
)


# Now tokenize each set
def tokenize(texts):
    """Tokenize a list of strings into fixed-length model inputs.

    Every sequence is truncated or padded to `max_length` tokens, and an
    attention mask is returned alongside the token ids.

    Args:
        texts: list of raw strings to encode.

    Returns:
        The tokenizer output for `texts`.
    """
    return tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
    )


train_encodings = tokenize(train_texts)
test_encodings = tokenize(test_texts)

# Create Custom Dataset
train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)

# DataLoader (only the training data is shuffled)
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Model and Optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Size the classification head from the fitted encoder instead of a literal,
# so it always matches the classes actually present in the data.
num_labels = len(label_encoder.classes_)
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=num_labels)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
model.to(device)

# Training loop
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        # Kept under its own name so the notebook-level `labels` list from the
        # data-loading cell is not overwritten by a tensor.
        batch_labels = batch['labels'].to(device)
        optimizer.zero_grad()

        # Passing labels makes the model return the classification loss.
        outputs = model(**inputs, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss}")

# Evaluation
model.eval()
predictions = []

with torch.inference_mode():
    for batch in test_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        outputs = model(**inputs)
        logits = outputs.logits
        # The predicted class is the index of the largest logit.
        preds = torch.argmax(logits, dim=1)
        # Collect plain Python ints rather than NumPy scalars.
        predictions.extend(preds.cpu().tolist())

accuracy = accuracy_score(test_labels, predictions)
print(f"Test Accuracy: {accuracy}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Average Training Loss: 0.7748873967732957
Epoch 2/5, Average Training Loss: 0.5640595891137621
Epoch 3/5, Average Training Loss: 0.4514137096814255
Epoch 4/5, Average Training Loss: 0.3820714123880685
Epoch 5/5, Average Training Loss: 0.32411224989415105
Test Accuracy: 0.7932835820895522


In [20]:
# Persist just the model's parameters (its state dictionary) to a file in the
# current working directory.
WEIGHTS_FILENAME = 'hausa_base_model_state_dict.pth'
trained_weights = model.state_dict()
torch.save(trained_weights, WEIGHTS_FILENAME)



In [21]:
# Set the author identity git will use when committing.
# NOTE(review): `--global` writes to the runtime user's git configuration rather
# than to this repository only, and the e-mail address is hard-coded in the
# notebook; confirm it is meant to be shared.
!git config --global user.email "lukman.j.aliyu@gmail.com"
!git config --global user.name "lukmanaj"


In [22]:
# Stage everything in the current directory and record it as a single commit.
# NOTE(review): `git add .` also stages hausa_base_model_state_dict.pth (the
# recorded output shows it being committed). Model weights are presumably far
# larger than what a regular GitHub push accepts; consider Git LFS or keeping
# the weights file out of the repository. TODO confirm the file size.
!git add .
!git commit -m "Add base model for hausa"


[main 745179a] Add base model for hausa
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 hausa_base_model_state_dict.pth
fatal: could not read Username for 'https://github.com': No such device or address


In [None]:
# Show the state of the working tree and index.
!git status