In [8]:
!pip install googletrans

Collecting googletrans
  Downloading googletrans-4.0.2-py3-none-any.whl.metadata (10 kB)
Downloading googletrans-4.0.2-py3-none-any.whl (18 kB)
Installing collected packages: googletrans
Successfully installed googletrans-4.0.2


In [10]:
import requests
from bs4 import BeautifulSoup
import csv

# Scrape the "1000 most common Zulu words" table and store (Zulu, English)
# pairs in a CSV file.
url = "https://www.1000mostcommonwords.com/1000-most-common-zulu-words/"

# A timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() turns HTTP error codes into an exception.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML
soup = BeautifulSoup(response.text, "html.parser")

# The word list is taken from the first <table> on the page. find() returns
# None when nothing matches, so fail with a clear message instead of an
# AttributeError on the next line.
table = soup.find("table")
if table is None:
    raise RuntimeError(f"No <table> element found at {url}; the page layout may have changed")

# Extract rows
rows = table.find_all("tr")

# Collect (zulu, english) pairs; the first row is the header and is skipped.
words = []
for row in rows[1:]:
    cols = row.find_all("td")
    # Rows with fewer than three cells are ignored. The first cell (the row
    # number) is not needed, only the second (Zulu) and third (English).
    if len(cols) >= 3:
        zulu_word = cols[1].text.strip()
        english_word = cols[2].text.strip()
        words.append((zulu_word, english_word))

# Save to CSV
output_file = "zulu_words_scraped.csv"
with open(output_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Zulu", "English"])
    writer.writerows(words)

print(f"Scraped {len(words)} words and saved to {output_file}")


Scraped 1000 words and saved to zulu_words_scraped.csv


In [11]:
import csv

# Hand-written English valence lexicon (word -> score).
# Extend it by hand or replace it with a fuller lexicon such as AFINN.
english_sentiment = dict(
    happy=3,
    joy=4,
    sad=-3,
    anger=-4,
    love=3,
    hate=-4,
)

# Read the scraped (Zulu, English) pairs back from disk.
with open("zulu_words_scraped.csv", newline="", encoding="utf-8") as src:
    zulu_words = [(rec["Zulu"], rec["English"]) for rec in csv.DictReader(src)]

# Attach a score to every pair via its lower-cased English gloss;
# glosses missing from the lexicon are treated as neutral (0).
zulu_sentiment = [
    (zulu, english, english_sentiment.get(english.lower(), 0))
    for zulu, english in zulu_words
]

# Write the three-column result.
with open("zulu_words_with_sentiment.csv", "w", newline="", encoding="utf-8") as dst:
    out = csv.writer(dst)
    out.writerow(["Zulu", "English", "Sentiment"])
    out.writerows(zulu_sentiment)




In [34]:
from datasets import load_dataset
import csv
from collections import defaultdict

# -------------------------------
# Load dataset (first 300 training rows)
# -------------------------------
zulu_dataset = load_dataset("michsethowusu/zulu-sentiments-corpus")
train_split = zulu_dataset["train"]
zulu_sentences = train_split["Zulu"][:300]
zulu_labels = train_split["sentiment"][:300]

# -------------------------------
# Collect per-word lists of sentence-level scores
# -------------------------------
# Every whitespace-separated token inherits the score of its sentence:
# +3 when the label lower-cases to "positive", -3 for any other label.
word_scores = defaultdict(list)

for text, tag in zip(zulu_sentences, zulu_labels):
    if tag.lower() == "positive":
        sentence_score = 3
    else:
        sentence_score = -3
    for token in text.split():
        word_scores[token].append(sentence_score)

# -------------------------------
# Average each word's scores over all of its occurrences
# -------------------------------
normalized_word_sentiment = [
    [token, sum(values) / len(values)] for token, values in word_scores.items()
]

# -------------------------------
# Save to CSV
# -------------------------------
with open("zulu_word_sentiment_normalized.csv", "w", newline="", encoding="utf-8") as dst:
    out = csv.writer(dst)
    out.writerow(["word", "sentiment"])  # header
    out.writerows(normalized_word_sentiment)

print(f"Saved {len(normalized_word_sentiment)} normalized word-level sentiment entries to zulu_word_sentiment_normalized.csv")


Saved 1850 normalized word-level sentiment entries to zulu_word_sentiment_normalized.csv


In [20]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [41]:
# =========================================
# Zulu Word Sentiment Regression Head with Custom NN
# =========================================

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# -------------------------------
# 1️⃣ Load lexicon CSV
# -------------------------------
# NOTE(review): no cell visible in this notebook writes "zulu_word_sentiment.csv"
# (the visible lexicon cell writes "zulu_word_sentiment_normalized.csv").
# Confirm where this file comes from before relying on a fresh run.
df = pd.read_csv("zulu_word_sentiment.csv")
print(df.head())

words = df["word"].tolist()
labels = df["sentiment"].astype(float).tolist()

# -------------------------------
# 2️⃣ Load tokenizer and embeddings
# -------------------------------
model_name = "Davlan/afro-xlmr-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModel.from_pretrained(model_name)
# eval() only switches off dropout; the encoder is kept fixed because it is
# only ever called under torch.no_grad() and its parameters are never given
# to an optimizer.
base_model.eval()

# -------------------------------
# 3️⃣ Tokenize words and get embeddings
# -------------------------------
# Encode the lexicon in mini-batches rather than one forward pass over every
# word, so peak memory no longer grows with the size of the lexicon.
EMBED_BATCH_SIZE = 64
embedding_chunks = []
with torch.no_grad():
    for start in tqdm(range(0, len(words), EMBED_BATCH_SIZE), desc="Embedding words"):
        batch_words = words[start:start + EMBED_BATCH_SIZE]
        batch_inputs = tokenizer(batch_words, return_tensors="pt", padding=True, truncation=True)
        batch_outputs = base_model(**batch_inputs)
        # Use the first-token ([CLS]-position) embedding for each word
        embedding_chunks.append(batch_outputs.last_hidden_state[:, 0, :])
embeddings = torch.cat(embedding_chunks, dim=0)  # shape [N, hidden]

labels_tensor = torch.tensor(labels).unsqueeze(1)  # shape [N,1]

# -------------------------------
# 4️⃣ Create simple regression head
# -------------------------------
class SentimentHead(nn.Module):
    """Small MLP mapping an embedding vector to one sentiment score.

    Layout: input_dim -> 128 -> 32 -> 1, with a ReLU after each of the two
    hidden linear layers and no activation on the output.
    """

    def __init__(self, input_dim):
        """Build the layer stack for embeddings of width ``input_dim``."""
        super().__init__()
        widths = [input_dim, 128, 32]
        layers = []
        # Hidden layers: each Linear is followed by a ReLU.
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Output layer: a single unbounded regression value.
        layers.append(nn.Linear(widths[-1], 1))
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the score tensor produced by the stack for input ``x``."""
        return self.fc(x)

# Head sized to the width of the precomputed embeddings.
regression_head = SentimentHead(embeddings.shape[1])

# -------------------------------
# 5️⃣ Training setup
# -------------------------------
# Pair each embedding with its score and serve shuffled mini-batches of 16.
dataset = TensorDataset(embeddings, labels_tensor)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Only the head's parameters are optimised; the objective is mean squared error.
optimizer = torch.optim.Adam(regression_head.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

# -------------------------------
# 6️⃣ Train
# -------------------------------
regression_head.train()
for epoch_no in range(1, 21):
    running_loss = 0
    for features, targets in dataloader:
        optimizer.zero_grad()
        batch_loss = loss_fn(regression_head(features), targets)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    # Reported value is the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch_no}, Loss: {running_loss/len(dataloader):.4f}")

# -------------------------------
# 7️⃣ Inference
# -------------------------------
def predict_sentiment(word):
    """Return the regression head's score for a single word.

    The word is tokenized, encoded with the notebook-level ``base_model``,
    and the first-token embedding is passed through ``regression_head``.

    Args:
        word: The text to score. Must yield exactly one row, because the
            result is extracted with ``.item()``.

    Returns:
        The predicted sentiment score as a Python float.
    """
    base_model.eval()
    regression_head.eval()
    # The encoder and the head are notebook globals that a later cell may
    # rebind or move to another device (e.g. "cuda"). Look up where they live
    # now so the inputs end up on matching devices.
    encoder_device = next(base_model.parameters()).device
    head_device = next(regression_head.parameters()).device
    with torch.no_grad():
        inputs = tokenizer(word, return_tensors="pt", padding=True, truncation=True).to(encoder_device)
        emb = base_model(**inputs).last_hidden_state[:, 0, :]
        score = regression_head(emb.to(head_device)).item()
    return score

example_word = "jabula"
print(f"Sentiment score for '{example_word}': {predict_sentiment(example_word):.2f}")


            word  sentiment
0      niyokwaba         -3
1     eniyokwaba         -3
2      niyokwaba         -3
3     aniyukwaba         -3
4  eningeyukwaba         -3


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 8.6964
Epoch 2, Loss: 8.5594
Epoch 3, Loss: 8.4661
Epoch 4, Loss: 8.4073
Epoch 5, Loss: 8.3888
Epoch 6, Loss: 8.1771
Epoch 7, Loss: 8.0946
Epoch 8, Loss: 7.9329
Epoch 9, Loss: 7.8125
Epoch 10, Loss: 7.6899
Epoch 11, Loss: 7.5546
Epoch 12, Loss: 7.4666
Epoch 13, Loss: 7.2791
Epoch 14, Loss: 7.0718
Epoch 15, Loss: 6.6732
Epoch 16, Loss: 6.4531
Epoch 17, Loss: 6.3689
Epoch 18, Loss: 6.3419
Epoch 19, Loss: 5.9967
Epoch 20, Loss: 5.7522
Sentiment score for 'jabula': 1.49


In [43]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# -------------------------------
# Load Zulu dataset (first 153 training rows)
# -------------------------------
zulu_dataset = load_dataset("michsethowusu/zulu-sentiments-corpus")
train_split = zulu_dataset["train"]
zulu_sentences = train_split["Zulu"][:153]
zulu_labels = train_split["sentiment"][:153]  # 'Positive' / 'Negative'

# -------------------------------
# Load Afro-XLMR base and the trained head
# -------------------------------
model_name = "Davlan/afro-xlmr-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModel.from_pretrained(model_name)
base_model = base_model.to(device)
base_model.eval()

# `regression_head` is not created here; it must already exist in the kernel
# from the training cell. Put it on the same device and in inference mode.
regression_head = regression_head.to(device)
regression_head.eval()

# -------------------------------
# Function to get sentence embeddings
# -------------------------------
def get_sentence_embedding(sentences, batch_size=None):
    """Encode sentences with ``base_model`` and return first-token embeddings.

    Args:
        sentences: A string or a sequence of strings to encode.
        batch_size: Optional maximum number of sentences per forward pass.
            ``None`` (the default) encodes everything in a single pass,
            exactly as before. A positive integer encodes in chunks so peak
            memory does not grow with the number of sentences.

    Returns:
        A tensor on ``device`` with one row per sentence, taken from position
        0 of the encoder's last hidden state.

    Raises:
        ValueError: If ``batch_size`` is given but is not a positive integer.
    """
    if batch_size is None:
        batches = [sentences]
    else:
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer or None")
        # A bare string is one sentence; slicing it would split characters.
        items = [sentences] if isinstance(sentences, str) else list(sentences)
        # Fall back to a single (possibly empty) batch so empty input is
        # handled by the tokenizer the same way as in the unbatched path.
        batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)] or [items]

    chunks = []
    for batch in batches:
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            outputs = base_model(**inputs)
            # Use [CLS] embedding as sentence representation
            chunks.append(outputs.last_hidden_state[:, 0, :])
    # Avoid an extra copy when there was only one forward pass.
    return chunks[0] if len(chunks) == 1 else torch.cat(chunks, dim=0)

# -------------------------------
# Zero-shot prediction
# -------------------------------
# NOTE(review): "zero-shot" only holds if the lexicon used to train the head
# was not derived from these same sentences. Another cell builds a word
# lexicon from the first 300 rows of this corpus' train split, which would
# overlap with the rows evaluated here. Confirm which lexicon the head saw.
sentence_embs = get_sentence_embedding(zulu_sentences)
with torch.no_grad():
    # Drop only the trailing feature axis ([N, 1] -> [N]) so the result stays
    # one-dimensional even when a single sentence is scored.
    scores = regression_head(sentence_embs).squeeze(-1)  # shape [num_sentences]
    # Non-negative scores count as positive (1), the rest as negative (0).
    preds = torch.where(scores >= 0, 1, 0)

# -------------------------------
# Compute accuracy
# -------------------------------
label_map = {"Negative": 0, "Positive": 1}
# Normalise spacing and capitalisation before the lookup so labels such as
# "positive" or " Positive" are accepted, matching the case-insensitive label
# handling used when the word lexicon is built. Unknown labels still raise
# KeyError rather than being silently mis-scored.
zulu_labels_int = torch.tensor(
    [label_map[l.strip().capitalize()] for l in zulu_labels]
).to(device)
accuracy = (preds == zulu_labels_int).float().mean().item()
print(f"Zero-shot accuracy on Zulu samples: {accuracy:.4f}")

# # -------------------------------
# # Optional: print predictions
# # -------------------------------
# for sent, score, pred in zip(zulu_sentences, scores, preds):
#     print(f"'{sent}' -> Score: {score:.2f}, Pred: {'Positive' if pred==1 else 'Negative'}")


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Zero-shot accuracy on Zulu samples: 0.6078
