In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from tqdm import tqdm

# -------------------------------
# 1. Dataset
# -------------------------------
class TranslationDataset(Dataset):
    def __init__(self, csv_path, tokenizer, max_len=64):
        df = pd.read_csv(csv_path)
        self.en = df["en"].tolist()
        self.zu = df["zu"].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.en)

    def __getitem__(self, idx):
        en_text = self.en[idx]
        zu_text = self.zu[idx]
        en_enc = self.tokenizer(
            en_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        zu_enc = self.tokenizer(
            zu_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        return {
            "en_input_ids": en_enc["input_ids"].squeeze(),
            "en_attention_mask": en_enc["attention_mask"].squeeze(),
            "zu_input_ids": zu_enc["input_ids"].squeeze(),
            "zu_attention_mask": zu_enc["attention_mask"].squeeze(),
        }

# -------------------------------
# 2. Contrastive Encoder with frozen encoder
# -------------------------------
class ContrastiveEncoderFrozen(nn.Module):
    def __init__(self, model_name="FacebookAI/xlm-roberta-large", proj_dim=256):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        # Freeze encoder weights
        for param in self.encoder.parameters():
            param.requires_grad = False
        # Trainable projection layer
        self.projection = nn.Linear(self.encoder.config.hidden_size, proj_dim)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():  # no gradients for encoder
            outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
            cls_emb = outputs.last_hidden_state[:, 0, :]
        proj = self.projection(cls_emb)
        return nn.functional.normalize(proj, p=2, dim=1)

# -------------------------------
# 3. Contrastive Loss
# -------------------------------
def contrastive_loss(a, b, temperature=0.05):
    sim_matrix = torch.matmul(a, b.T) / temperature
    labels = torch.arange(a.size(0)).to(a.device)
    return (nn.CrossEntropyLoss()(sim_matrix, labels) + nn.CrossEntropyLoss()(sim_matrix.T, labels)) / 2

# -------------------------------
# 4. Evaluation
# -------------------------------
def evaluate(model, dataloader, device):
    model.eval()
    sims = []
    with torch.no_grad():
        for batch in dataloader:
            en_emb = model(
                input_ids=batch["en_input_ids"].to(device),
                attention_mask=batch["en_attention_mask"].to(device)
            )
            zu_emb = model(
                input_ids=batch["zu_input_ids"].to(device),
                attention_mask=batch["zu_attention_mask"].to(device)
            )
            sim = torch.sum(en_emb * zu_emb, dim=1)  # cosine since embeddings are normalized
            sims.extend(sim.cpu().tolist())
    print(f"Validation Avg Cosine Similarity: {sum(sims)/len(sims):.4f}")

# -------------------------------
# 5. Training Loop
# -------------------------------
def train_contrastive_encoder(train_csv, val_csv, epochs=15, batch_size=8):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large")

    train_ds = TranslationDataset(train_csv, tokenizer)
    val_ds = TranslationDataset(val_csv, tokenizer)

    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=batch_size)

    model = ContrastiveEncoderFrozen().to(device)
    optimizer = torch.optim.AdamW(model.projection.parameters(), lr=2e-4)  # only projection is trainable

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_dl, desc=f"Epoch {epoch+1}/{epochs}"):
            en_emb = model(
                input_ids=batch["en_input_ids"].to(device),
                attention_mask=batch["en_attention_mask"].to(device)
            )
            zu_emb = model(
                input_ids=batch["zu_input_ids"].to(device),
                attention_mask=batch["zu_attention_mask"].to(device)
            )
            loss = contrastive_loss(en_emb, zu_emb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}: Train Loss = {total_loss/len(train_dl):.4f}")
        evaluate(model, val_dl, device)

    # Save trained encoder
    torch.save(model.state_dict(), "contrastive_encoder_frozen.pt")
    print("✅ Saved trained contrastive encoder as 'contrastive_encoder_frozen.pt'")
    return model, tokenizer

# -------------------------------
# 6. Run Training
# -------------------------------
if __name__ == "__main__":
    train_csv = "en-zu.training.csv"
    val_csv = "en-zu.eval.csv"
    train_contrastive_encoder(train_csv, val_csv, epochs=10, batch_size=8)


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Epoch 1/10: 100%|██████████| 593/593 [01:55<00:00,  5.15it/s]


Epoch 1: Train Loss = 1.8992
Validation Avg Cosine Similarity: 0.9449


Epoch 2/10: 100%|██████████| 593/593 [01:54<00:00,  5.19it/s]


Epoch 2: Train Loss = 1.4173
Validation Avg Cosine Similarity: 0.9052


Epoch 3/10: 100%|██████████| 593/593 [01:54<00:00,  5.16it/s]


Epoch 3: Train Loss = 1.2706
Validation Avg Cosine Similarity: 0.8850


Epoch 4/10: 100%|██████████| 593/593 [01:54<00:00,  5.19it/s]


Epoch 4: Train Loss = 1.1789
Validation Avg Cosine Similarity: 0.8940


Epoch 5/10: 100%|██████████| 593/593 [01:54<00:00,  5.19it/s]


Epoch 5: Train Loss = 1.1619
Validation Avg Cosine Similarity: 0.8857


Epoch 6/10: 100%|██████████| 593/593 [01:54<00:00,  5.19it/s]


Epoch 6: Train Loss = 1.1415
Validation Avg Cosine Similarity: 0.8865


Epoch 7/10: 100%|██████████| 593/593 [01:54<00:00,  5.18it/s]


Epoch 7: Train Loss = 1.0805
Validation Avg Cosine Similarity: 0.8478


Epoch 8/10: 100%|██████████| 593/593 [01:54<00:00,  5.19it/s]


Epoch 8: Train Loss = 1.0514
Validation Avg Cosine Similarity: 0.8715


Epoch 9/10: 100%|██████████| 593/593 [01:54<00:00,  5.19it/s]


Epoch 9: Train Loss = 1.0574
Validation Avg Cosine Similarity: 0.8664


Epoch 10/10: 100%|██████████| 593/593 [01:54<00:00,  5.20it/s]


Epoch 10: Train Loss = 1.0264
Validation Avg Cosine Similarity: 0.8507
✅ Saved trained contrastive encoder as 'contrastive_encoder_frozen.pt'


In [6]:
from datasets import load_dataset
import numpy as np
import torch.nn.functional as F

# Load SST-2 dataset
dataset = load_dataset("glue", "sst2")
train_texts = dataset["train"]["sentence"]
train_labels = np.array(dataset["train"]["label"])
val_texts = dataset["validation"]["sentence"]
val_labels = np.array(dataset["validation"]["label"])

# Encode sentences using trained contrastive encoder
def encode_texts(texts, tokenizer, encoder, device):
    all_emb = []
    encoder.eval()
    with torch.no_grad():
        for t in texts:
            enc = tokenizer(t, return_tensors="pt", truncation=True, max_length=64).to(device)
            emb = encoder(enc["input_ids"], enc["attention_mask"])
            all_emb.append(emb.squeeze(0).cpu())
    return torch.stack(all_emb)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = ContrastiveEncoderFrozen().to(device)
encoder.load_state_dict(torch.load("contrastive_encoder_frozen.pt"))
encoder.eval()
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large")

X_train = encode_texts(train_texts, tokenizer, encoder, device)
y_train = torch.tensor(train_labels)
X_val = encode_texts(val_texts, tokenizer, encoder, device)
y_val = torch.tensor(val_labels)

# Sentiment classifier
class SentimentHead(nn.Module):
    def __init__(self, emb_dim=256, hidden_dim=128):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(emb_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 2)
        )
    def forward(self, x):
        return self.fc(x)

classifier = SentimentHead().to(device)
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Train sentiment head
for epoch in range(10):
    classifier.train()
    optimizer.zero_grad()
    logits = classifier(X_train.to(device))
    loss = criterion(logits, y_train.to(device))
    loss.backward()
    optimizer.step()

    classifier.eval()
    with torch.no_grad():
        val_preds = torch.argmax(classifier(X_val.to(device)), dim=1)
        acc = (val_preds == y_val.to(device)).float().mean().item()
    print(f"Epoch {epoch+1} - Loss: {loss.item():.4f} - Val Acc: {acc:.4f}")

torch.save(classifier.state_dict(), "english_sentiment_head.pt")
print("✅ Saved English sentiment head")


Epoch 1 - Loss: 0.6904 - Val Acc: 0.5092
Epoch 2 - Loss: 0.6878 - Val Acc: 0.5092
Epoch 3 - Loss: 0.6856 - Val Acc: 0.5092
Epoch 4 - Loss: 0.6836 - Val Acc: 0.5092
Epoch 5 - Loss: 0.6819 - Val Acc: 0.5092
Epoch 6 - Loss: 0.6802 - Val Acc: 0.5092
Epoch 7 - Loss: 0.6787 - Val Acc: 0.5092
Epoch 8 - Loss: 0.6771 - Val Acc: 0.5092
Epoch 9 - Loss: 0.6754 - Val Acc: 0.5103
Epoch 10 - Loss: 0.6737 - Val Acc: 0.5115
✅ Saved English sentiment head


In [8]:
# Load contrastive encoder and sentiment head
encoder.load_state_dict(torch.load("contrastive_encoder_frozen.pt"))
classifier.load_state_dict(torch.load("english_sentiment_head.pt"))
encoder.eval()
classifier.eval()

# Load Zulu evaluation data
ds_zu = load_dataset("michsethowusu/zulu-sentiments-corpus")
zulu_texts = ds_zu["train"]["Zulu"]
true_labels = np.array([0 if l=="Negative" else 1 for l in ds_zu["train"]["sentiment"]])

# Encode Zulu with contrastive encoder
X_zu = encode_texts(zulu_texts[:2000], tokenizer, encoder, device)

with torch.no_grad():
    logits = classifier(X_zu.to(device))
    preds = torch.argmax(logits, dim=1).cpu().numpy()

from sklearn.metrics import accuracy_score
acc = accuracy_score(true_labels[:len(preds)], preds)
print(f"✅ Zulu → English Sentiment Accuracy (zero-shot): {acc*100:.2f}%")


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-28a52169101937(…):   0%|          | 0.00/11.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/187435 [00:00<?, ? examples/s]

✅ Zulu → English Sentiment Accuracy (zero-shot): 57.85%
