In [41]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

# Choose the compute device once; later code moves modules and tensors to it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# ==========================================
# 1. Load Encoders + Tokenizers
# ==========================================
# One tokenizer per language: English (BERT) and Zulu (zuBERTa).
tokenizer_en = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer_zu = AutoTokenizer.from_pretrained("MoseliMotsoehli/zuBERTa")

# ------------------------------------------
# Encoder wrapper with projection
# ------------------------------------------
class EncoderWrapper(nn.Module):
    """Frozen pretrained encoder followed by a trainable linear projection.

    Args:
        model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``.
        proj_dim: Output width of the projection layer.
    """

    def __init__(self, model_name, proj_dim=256):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)

        # Freeze the backbone; only ``self.proj`` below stays trainable.
        self.encoder.requires_grad_(False)

        hidden_size = self.encoder.config.hidden_size
        self.proj = nn.Linear(hidden_size, proj_dim)

    def forward(self, ids, mask):
        """Return unit-norm projected embeddings for a batch of token ids.

        Args:
            ids: Token-id tensor forwarded to the encoder.
            mask: Attention mask forwarded as ``attention_mask``.

        Returns:
            The projection of the position-0 hidden state, L2-normalised
            along dim 1.
        """
        hidden = self.encoder(ids, attention_mask=mask).last_hidden_state
        # Position 0 of the sequence is used as the sentence representation.
        first_token = hidden[:, 0]
        projected = self.proj(first_token)
        return nn.functional.normalize(projected, dim=1)

# Build one wrapped encoder per language, then move each to the chosen device.
model_en = EncoderWrapper("bert-base-uncased")
model_en = model_en.to(device)
model_zu = EncoderWrapper("MoseliMotsoehli/zuBERTa")
model_zu = model_zu.to(device)

# ==========================================
# 2. Load SST-2 sentiment (English)
# ==========================================
sst = load_dataset("glue", "sst2")
# Only the training split is used: sentences stay as text, labels become a tensor.
sst_sentences = sst["train"]["sentence"]
sst_labels = torch.tensor(sst["train"]["label"])

print(f"Loaded SST-2: {len(sst_sentences)}")

# ==========================================
# 3. Dataset for (EN–ZU + EN sentiment)
# ==========================================
class ContrastiveSentimentDataset(Dataset):
    """Parallel EN/ZU sentence pairs, each served with a random sentiment example.

    Each item holds the tokenised English and Zulu sides of one parallel
    row plus one randomly drawn English sentiment sentence and its label.
    The dataset length is the number of usable parallel rows.

    Args:
        parallel_csv: Path or file-like object readable by ``pandas.read_csv``
            with ``en`` and ``zu`` columns.
        tokenizer_en: Callable tokenizer used for English text.
        tokenizer_zu: Callable tokenizer used for Zulu text.
        sst_sentences: Indexable collection of English sentiment sentences.
        sst_labels: Labels aligned with ``sst_sentences``; a tensor/array
            (anything exposing ``tolist``) or a plain sequence.
        max_len: Fixed length every sequence is padded/truncated to.
    """

    def __init__(self, parallel_csv, tokenizer_en, tokenizer_zu,
                 sst_sentences, sst_labels, max_len=64):

        # An empty CSV cell is read as float NaN, which is not valid tokenizer
        # input; a row missing either side cannot form a pair, so drop it.
        df = pd.read_csv(parallel_csv).dropna(subset=["en", "zu"])
        # Cast to str so cells pandas parsed as numbers are still text.
        self.en_parallel = df["en"].astype(str).tolist()
        self.zu_parallel = df["zu"].astype(str).tolist()

        self.sst_sentences = sst_sentences
        # Accept tensors / arrays as well as plain Python sequences.
        if hasattr(sst_labels, "tolist"):
            self.sst_labels = sst_labels.tolist()
        else:
            self.sst_labels = list(sst_labels)

        self.tokenizer_en = tokenizer_en
        self.tokenizer_zu = tokenizer_zu
        self.max_len = max_len

    def __len__(self):
        return len(self.en_parallel)

    def _encode(self, tokenizer, text):
        """Tokenise one sentence and return ``(input_ids, attention_mask)``."""
        enc = tokenizer(
            text, truncation=True, padding="max_length",
            max_length=self.max_len, return_tensors="pt"
        )
        # squeeze(0) removes only the leading batch axis; a bare squeeze()
        # would also collapse the sequence axis when max_len == 1.
        return enc["input_ids"].squeeze(0), enc["attention_mask"].squeeze(0)

    def __getitem__(self, idx):
        # parallel EN / ZU
        en_ids, en_mask = self._encode(self.tokenizer_en, self.en_parallel[idx])
        zu_ids, zu_mask = self._encode(self.tokenizer_zu, self.zu_parallel[idx])

        # sentiment example (random EN sentence, independent of idx)
        s_idx = torch.randint(0, len(self.sst_sentences), (1,)).item()
        sent_ids, sent_mask = self._encode(
            self.tokenizer_en, self.sst_sentences[s_idx]
        )

        return {
            "en_ids": en_ids,
            "en_mask": en_mask,

            "zu_ids": zu_ids,
            "zu_mask": zu_mask,

            "sent_ids": sent_ids,
            "sent_mask": sent_mask,
            "label": torch.tensor(self.sst_labels[s_idx], dtype=torch.long),
        }

# ==========================================
# 4. Losses
# ==========================================
def contrastive_loss(en_emb, zu_emb, temperature=0.05):
    """Symmetric in-batch contrastive loss between paired embeddings.

    Row ``i`` of ``en_emb`` and row ``i`` of ``zu_emb`` form the positive
    pair; every other row in the batch serves as a negative.

    Args:
        en_emb: English embeddings, one row per example.
        zu_emb: Zulu embeddings with the same number of rows.
        temperature: Divisor applied to the similarity scores.

    Returns:
        Scalar tensor: the mean of the EN->ZU and ZU->EN cross-entropy losses.
    """
    # The ZU->EN similarity matrix is the transpose of the EN->ZU one, so a
    # single matmul serves both directions.
    sim = en_emb @ zu_emb.t() / temperature

    # Build the targets directly on the embeddings' device (no CPU round trip).
    targets = torch.arange(en_emb.size(0), device=en_emb.device)

    loss_en_to_zu = nn.functional.cross_entropy(sim, targets)
    loss_zu_to_en = nn.functional.cross_entropy(sim.t(), targets)
    return (loss_en_to_zu + loss_zu_to_en) / 2

class SentimentHead(nn.Module):
    """Single linear layer mapping an embedding to two sentiment logits."""

    def __init__(self, dim):
        super().__init__()
        # The attribute name ``fc`` is part of the saved state_dict keys.
        self.fc = nn.Linear(in_features=dim, out_features=2)

    def forward(self, x):
        """Return the raw two-way logits for embeddings ``x``."""
        logits = self.fc(x)
        return logits

# ==========================================
# 5. Joint Training Loop
# ==========================================
def train_joint(parallel_csv, epochs=5, batch_size=16, lambda_sent=1.0, lr=2e-4):
    """Jointly train both projection layers and an English sentiment head.

    Each step combines a contrastive loss on EN/ZU parallel pairs with a
    cross-entropy sentiment loss on English sentences embedded by
    ``model_en``. Only the two ``proj`` layers and the sentiment head are
    handed to the optimizer.

    Args:
        parallel_csv: CSV with the parallel sentences, forwarded to
            ``ContrastiveSentimentDataset``.
        epochs: Number of passes over the parallel data.
        batch_size: Mini-batch size for the data loader.
        lambda_sent: Weight of the sentiment loss relative to the
            contrastive loss.
        lr: AdamW learning rate.

    Raises:
        ValueError: If the dataset built from ``parallel_csv`` is empty.

    Side effects:
        Writes ``proj_en.pt``, ``proj_zu.pt`` and ``sentiment_head.pt`` to
        the current working directory.
    """
    dataset = ContrastiveSentimentDataset(
        parallel_csv, tokenizer_en, tokenizer_zu,
        sst_sentences, sst_labels
    )
    # Fail early with a clear message; the per-epoch average below divides by
    # the number of batches.
    if len(dataset) == 0:
        raise ValueError(f"No parallel sentence pairs found in {parallel_csv!r}")
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Size the head from the projection layer so the two cannot drift apart.
    sentiment_head = SentimentHead(model_en.proj.out_features).to(device)
    ce = nn.CrossEntropyLoss()

    # Only the projections and the head are optimised.
    params = [
        *model_en.proj.parameters(),
        *model_zu.proj.parameters(),
        *sentiment_head.parameters(),
    ]

    opt = torch.optim.AdamW(params, lr=lr)

    for epoch in range(epochs):
        total_loss = 0.0

        for batch in tqdm(loader, desc=f"Epoch {epoch+1}"):
            # Move the whole batch to the device once.
            batch = {key: value.to(device) for key, value in batch.items()}

            en_emb = model_en(batch["en_ids"], batch["en_mask"])
            zu_emb = model_zu(batch["zu_ids"], batch["zu_mask"])
            loss_con = contrastive_loss(en_emb, zu_emb)

            sent_emb = model_en(batch["sent_ids"], batch["sent_mask"])
            sent_logits = sentiment_head(sent_emb)
            loss_sent = ce(sent_logits, batch["label"])

            loss = loss_con + lambda_sent * loss_sent

            opt.zero_grad()
            loss.backward()
            opt.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1} Loss: {total_loss / len(loader):.4f}")

    torch.save(model_en.proj.state_dict(), "proj_en.pt")
    torch.save(model_zu.proj.state_dict(), "proj_zu.pt")
    torch.save(sentiment_head.state_dict(), "sentiment_head.pt")

    print("Saved: proj_en.pt, proj_zu.pt, sentiment_head.pt")

# ==========================================
# 6. Run Training
# ==========================================
# Kick off joint training on the EN-ZU parallel corpus.
train_joint(
    "en-zu.training.csv",
    epochs=5,
    batch_size=16,
    lambda_sent=1.0,
)


Device: cuda
Loaded SST-2: 67349


Epoch 1: 100%|██████████| 297/297 [00:49<00:00,  6.04it/s]


Epoch 1 Loss: 2.1127


Epoch 2: 100%|██████████| 297/297 [00:46<00:00,  6.45it/s]


Epoch 2 Loss: 1.3756


Epoch 3: 100%|██████████| 297/297 [00:47<00:00,  6.26it/s]


Epoch 3 Loss: 1.0851


Epoch 4: 100%|██████████| 297/297 [00:46<00:00,  6.36it/s]


Epoch 4 Loss: 0.9047


Epoch 5: 100%|██████████| 297/297 [00:47<00:00,  6.32it/s]

Epoch 5 Loss: 0.7798
Saved: proj_en.pt, proj_zu.pt, sentiment_head.pt





In [44]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# 1. Load the Zulu sentiment dataset
ds = load_dataset("michsethowusu/zulu-sentiments-corpus")
# The text column is read from "text" when present, otherwise from "Zulu".
if "text" in ds["train"].column_names:
    zulu_texts = ds["train"]["text"]
else:
    zulu_texts = ds["train"]["Zulu"]
zulu_labels = ds["train"]["sentiment"]
# Map the string labels to integer class ids.
label_map = {"Negative": 0, "Positive": 1}
y_true = np.array([label_map[label] for label in zulu_labels])

print("Loaded Zulu dataset: total examples =", len(zulu_texts))

# 2. Load your model components
tokenizer_zu = AutoTokenizer.from_pretrained("MoseliMotsoehli/zuBERTa")
class EncoderWrapper(torch.nn.Module):
    """Frozen pretrained encoder followed by a linear projection.

    Written against ``torch.nn`` explicitly because this cell imports
    ``torch`` but not the ``nn`` alias.

    Args:
        model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``.
        proj_dim: Output width of the projection layer.
    """

    def __init__(self, model_name, proj_dim=256):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        # Freeze the backbone; only the projection carries trainable weights.
        for p in self.encoder.parameters():
            p.requires_grad = False
        self.proj = torch.nn.Linear(self.encoder.config.hidden_size, proj_dim)

    def forward(self, ids, mask):
        """Return unit-norm projected embeddings for a batch of token ids.

        Args:
            ids: Token-id tensor forwarded to the encoder.
            mask: Attention mask forwarded as ``attention_mask``.

        Returns:
            The projection of the position-0 hidden state, L2-normalised
            along dim 1.
        """
        out = self.encoder(ids, attention_mask=mask)
        # Position 0 of the sequence is used as the sentence representation.
        cls = out.last_hidden_state[:, 0, :]
        p = self.proj(cls)
        return torch.nn.functional.normalize(p, dim=1)

# Rebuild the Zulu encoder, then restore the projection weights saved as proj_zu.pt.
model_zu = EncoderWrapper("MoseliMotsoehli/zuBERTa", proj_dim=256)
model_zu = model_zu.to(device)
model_zu.proj.load_state_dict(
    torch.load("proj_zu.pt", map_location=device)
)
model_zu.eval()

class SentimentHead(torch.nn.Module):
    """Single linear layer mapping an embedding to two sentiment logits.

    Written against ``torch.nn`` explicitly because this cell imports
    ``torch`` but not the ``nn`` alias.
    """

    def __init__(self, dim):
        super().__init__()
        # The attribute name ``fc`` must match the keys in sentiment_head.pt.
        self.fc = torch.nn.Linear(dim, 2)

    def forward(self, x):
        """Return the raw two-way logits for embeddings ``x``."""
        return self.fc(x)

# Restore the sentiment head saved as sentiment_head.pt.
sentiment_head = SentimentHead(256)
sentiment_head = sentiment_head.to(device)
sentiment_head.load_state_dict(
    torch.load("sentiment_head.pt", map_location=device)
)
sentiment_head.eval()

# 3. Prediction loop (batched for speed)
preds = []
batch_size = 32
# Evaluation never needs gradients, so disable them for the whole loop.
with torch.no_grad():
    for i in range(0, len(zulu_texts), batch_size):
        batch_texts = zulu_texts[i:i+batch_size]
        enc = tokenizer_zu(
            batch_texts,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=64,
        )
        enc = enc.to(device)
        emb = model_zu(enc["input_ids"], enc["attention_mask"])
        logits = sentiment_head(emb)
        # Predicted class = index of the larger of the two logits.
        batch_preds = logits.argmax(dim=1).cpu().numpy()
        preds.extend(batch_preds)

preds = np.array(preds)

# 4. Compute accuracy
accuracy = np.mean(preds == y_true[:len(preds)])
print(f"Zero‑shot Zulu Sentiment Accuracy: {accuracy*100:.2f}%")


Loaded Zulu dataset: total examples = 187435
Zero‑shot Zulu Sentiment Accuracy: 48.04%
