In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import os

# -------------------------------
# 1. Load dataset
# -------------------------------
dataset = load_dataset("glue", "sst2")
# Slice rows first and pick the column afterwards: this reads only the rows we
# keep (instead of the whole training column) and yields plain Python lists.
train_subset = dataset["train"][:2000]  # small subset for speed
train_sentences = train_subset["sentence"]
train_labels = torch.tensor(train_subset["label"])
val_split = dataset["validation"][:]
val_sentences = val_split["sentence"]
val_labels = torch.tensor(val_split["label"])

# -------------------------------
# 2. Load tokenizer & model (frozen)
# -------------------------------
model_name = "Davlan/afro-xlmr-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
afro = AutoModel.from_pretrained(model_name)
afro.eval()  # evaluation mode (e.g. dropout off); this alone does NOT freeze weights
# Freezing happens here: no encoder parameter will receive gradients.
afro.requires_grad_(False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
afro.to(device)

# -------------------------------
# 3. Small classifier head
# -------------------------------
class SentimentClassifier(nn.Module):
    """Two-layer MLP head mapping a sentence embedding to 2 class logits.

    Args:
        embedding_dim: size of the last dimension of the input embeddings.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        hidden_units = 128
        num_classes = 2
        # Kept as an nn.Sequential named `fc` so parameter names stay
        # `fc.0.*` / `fc.2.*` and previously saved state dicts still load.
        layers = [
            nn.Linear(embedding_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, num_classes),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return raw (unnormalised) logits for the embeddings in `x`."""
        class_logits = self.fc(x)
        return class_logits

# Seed PyTorch's RNG before the head is built so that its weight initialisation
# (and any later torch random draws, such as shuffling) repeat across runs.
torch.manual_seed(42)

# Trainable head sized to the encoder's hidden dimension.
classifier = SentimentClassifier(afro.config.hidden_size).to(device)
# Cross-entropy over the two logits; the default reduction averages over the batch.
criterion = nn.CrossEntropyLoss()
# Only the head's parameters are handed to the optimizer.
optimizer = optim.Adam(classifier.parameters(), lr=1e-3)

# -------------------------------
# 4. Helper: get embeddings for a batch
# -------------------------------
def get_embeddings(sentences):
    """Encode `sentences` and return the encoder output at token position 0.

    The text is tokenized with padding to the longest item in the batch and
    truncation at 64 tokens, moved to `device`, and passed through `afro`
    with gradient tracking disabled.

    Returns:
        The slice `last_hidden_state[:, 0, :]`, i.e. one vector per input
        sentence taken from the first (CLS-style) token position.
    """
    encoded = tokenizer(
        sentences,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=64,
    )
    encoded = encoded.to(device)
    with torch.no_grad():
        encoder_output = afro(**encoded)
        first_token_states = encoder_output.last_hidden_state[:, 0, :]
    return first_token_states

# -------------------------------
# 5. Training loop (frozen-encoder embeddings computed once, then reused)
# -------------------------------
epochs = 10
batch_size = 64


def _embed_in_batches(sentences, chunk_size):
    """Run `get_embeddings` over `sentences` in chunks and stack the results.

    Each encoder forward pass sees at most `chunk_size` sentences; the
    per-chunk outputs are concatenated along dimension 0 in input order.
    """
    chunks = [
        get_embeddings(list(sentences[start:start + chunk_size]))
        for start in range(0, len(sentences), chunk_size)
    ]
    return torch.cat(chunks, dim=0)


# `get_embeddings` runs the encoder under no_grad and the encoder is never
# updated in this loop, so its outputs do not change between epochs.
# Computing them once avoids repeating the (expensive) encoder pass every epoch.
train_embs = _embed_in_batches(train_sentences, batch_size)
val_embs = _embed_in_batches(val_sentences, batch_size)
train_targets = train_labels.to(device)
val_targets = val_labels.to(device)
num_train = train_embs.size(0)

for epoch in range(epochs):
    classifier.train()
    # Fresh shuffle each epoch; moved to `device` so it can index the cached tensors.
    permutation = torch.randperm(num_train).to(device)
    total_loss = 0.0

    for i in range(0, num_train, batch_size):
        idx = permutation[i:i + batch_size]
        xb = train_embs[idx]
        yb = train_targets[idx]

        optimizer.zero_grad()
        logits = classifier(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        # `criterion` returns the mean over the batch. Weight it by the batch
        # size so that dividing by the sample count below gives a true
        # per-sample average (summing batch means and dividing by the number
        # of samples under-reports the loss by roughly a factor of batch_size).
        total_loss += loss.item() * idx.numel()

    # Validation
    classifier.eval()
    with torch.no_grad():
        preds = classifier(val_embs).argmax(dim=1)
    correct = (preds == val_targets).sum().item()
    val_acc = correct / len(val_sentences)

    print(f"Epoch {epoch+1} - Loss: {total_loss/num_train:.4f} - Val Acc: {val_acc:.4f}")

# -------------------------------
# 6. Save classifier
# -------------------------------
# Persist only the classifier head's state dict; the output directory is
# created first if it does not exist yet.
checkpoint_dir = "models"
checkpoint_path = "models/sentiment_classifier.pth"
os.makedirs(checkpoint_dir, exist_ok=True)
head_state = classifier.state_dict()
torch.save(head_state, checkpoint_path)
print("Classifier saved to models/sentiment_classifier.pth")

# -------------------------------
# 7. Test demo
# -------------------------------
# Two hand-written English examples as a quick sanity check of the trained head.
demo_sentences = [
    "I loved this movie!",
    "That was the worst acting ever.",
]
classifier.eval()
demo_embs = get_embeddings(demo_sentences)
with torch.no_grad():
    logits = classifier(demo_embs)
    preds = logits.argmax(dim=1)

# Class index 1 is reported as positive, anything else as negative.
for sentence, class_id in zip(demo_sentences, preds.tolist()):
    sentiment = "positive" if class_id == 1 else "negative"
    print(f"'{sentence}' -> {sentiment}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/714 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Loss: 0.0111 - Val Acc: 0.6858
Epoch 2 - Loss: 0.0093 - Val Acc: 0.7099
Epoch 3 - Loss: 0.0084 - Val Acc: 0.7443
Epoch 4 - Loss: 0.0074 - Val Acc: 0.6950
Epoch 5 - Loss: 0.0070 - Val Acc: 0.6892
Epoch 6 - Loss: 0.0060 - Val Acc: 0.7580
Epoch 7 - Loss: 0.0053 - Val Acc: 0.7362
Epoch 8 - Loss: 0.0048 - Val Acc: 0.6995
Epoch 9 - Loss: 0.0042 - Val Acc: 0.7534
Epoch 10 - Loss: 0.0040 - Val Acc: 0.7385
Classifier saved to models/sentiment_classifier.pth
'I loved this movie!' -> positive
'That was the worst acting ever.' -> negative


In [None]:
from datasets import load_dataset

# Load the Zulu sentiment dataset
zulu_dataset = load_dataset("michsethowusu/zulu-sentiments-corpus")
print(zulu_dataset)

# Slice rows first, then pick columns: only the demo rows are read instead of
# the entire column, and the results are plain Python lists.
num_zulu_samples = 153
zulu_subset = zulu_dataset["train"][:num_zulu_samples]
zulu_sentences = zulu_subset["Zulu"]        # Zulu text
zulu_labels = zulu_subset["sentiment"]      # 'Positive' / 'Negative'

# Get embeddings in fixed-size chunks so raising `num_zulu_samples` does not
# push every sentence through the encoder in a single forward pass.
zulu_batch_size = 64
zulu_embs = torch.cat(
    [
        get_embeddings(zulu_sentences[start:start + zulu_batch_size])
        for start in range(0, len(zulu_sentences), zulu_batch_size)
    ],
    dim=0,
)

# Predict with your English-trained classifier
classifier.eval()
with torch.no_grad():
    logits = classifier(zulu_embs)
    preds = torch.argmax(logits, dim=1)

# Compute accuracy
# Maps the dataset's string labels onto the classifier's class indices
# (1 = positive, matching how the demo predictions are interpreted).
label_map = {"Negative": 0, "Positive": 1}
zulu_labels_int = torch.tensor([label_map[l] for l in zulu_labels]).to(device)
accuracy = (preds == zulu_labels_int).float().mean().item()
print(f"Zero-shot accuracy on demo Zulu samples: {accuracy:.4f}")


DatasetDict({
    train: Dataset({
        features: ['Zulu', 'sentiment', '__index_level_0__'],
        num_rows: 187435
    })
})
Zero-shot accuracy on demo Zulu samples: 0.6797
