In [6]:
!pip install gensim
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from gensim.models import KeyedVectors  # for fastText English embeddings
import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import kagglehub




In [15]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# ==========================================================
# 1. Model Utilities
# ==========================================================
def load_model(model_name: str):
    """Load a pre-trained transformer and its tokenizer, ready for inference.

    Args:
        model_name: Identifier passed unchanged to ``from_pretrained``.

    Returns:
        Tuple ``(tokenizer, model, device)``. The model is switched to eval
        mode and moved to ``device``, which is CUDA when available and CPU
        otherwise.
    """
    print(f"Loading model: {model_name} ...")
    tok = AutoTokenizer.from_pretrained(model_name)
    net = AutoModel.from_pretrained(model_name)
    net.eval()

    if torch.cuda.is_available():
        target_device = torch.device("cuda")
    else:
        target_device = torch.device("cpu")
    net.to(target_device)

    return tok, net, target_device


def get_sentence_embedding(sentence: str, tokenizer, model, device):
    """Return the mean-pooled last-layer embedding of one sentence.

    Args:
        sentence: Text to embed; tokenized with truncation at 128 tokens.
        tokenizer: Callable producing the model inputs (supports ``.to``).
        model: Callable whose output exposes ``last_hidden_state``.
        device: Device the tokenized inputs are moved to.

    Returns:
        NumPy array obtained by averaging ``last_hidden_state`` over
        dimension 1 and squeezing out singleton dimensions.
    """
    encoded = tokenizer(
        sentence, return_tensors="pt", truncation=True, max_length=128
    )
    encoded = encoded.to(device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state
        pooled = token_states.mean(dim=1).squeeze()
        return pooled.cpu().numpy()


# ==========================================================
# 2. Projection Utilities
# ==========================================================
def compute_linear_projection(X, Y):
    """Fit the least-squares linear map ``W`` such that ``X @ W`` approximates ``Y``.

    Args:
        X: Source embeddings, one row per example.
        Y: Target embeddings, one row per example (same row count as ``X``).

    Returns:
        The matrix ``pinv(X) @ Y``.
    """
    print("Computing linear projection (least squares)...")
    # The Moore-Penrose pseudo-inverse yields the least-squares solution.
    pseudo_inverse = np.linalg.pinv(X)
    projection = pseudo_inverse @ Y
    return projection


def project_sentence(sentence, W, tokenizer, model, device):
    """Embed a sentence and map it through the projection matrix ``W``.

    Args:
        sentence: Source-language (Zulu) text to embed.
        W: Projection matrix applied on the right of the embedding.
        tokenizer, model, device: Forwarded to ``get_sentence_embedding``.

    Returns:
        The sentence embedding multiplied by ``W``.
    """
    return get_sentence_embedding(sentence, tokenizer, model, device) @ W


# ==========================================================
# 3. Dataset Utilities
# ==========================================================
def load_csv_zulu_dataset(csv_path: str):
    """Load Zulu-English sentence pairs from a CSV file.

    Rows in which either sentence is missing are dropped, and the remaining
    values are returned as plain strings so that downstream tokenization and
    string slicing never receive a float ``NaN``.

    Args:
        csv_path: Path to a CSV file with a ``zu`` column (Zulu sentences)
            and an ``en`` column (English sentences).

    Returns:
        Tuple ``(zulu_sentences, english_sentences)`` of equal-length lists,
        aligned by position.

    Raises:
        ValueError: If the ``zu`` or ``en`` column is absent.
    """
    df = pd.read_csv(csv_path)
    if "zu" not in df.columns or "en" not in df.columns:
        raise ValueError("CSV must have columns 'zu' for Zulu and 'en' for English sentences")

    # pandas parses empty cells as float NaN; such rows cannot form a usable
    # sentence pair, so remove them here instead of failing later.
    pairs = df[["zu", "en"]].dropna()
    dropped = len(df) - len(pairs)
    if dropped:
        print(f"Dropped {dropped} rows with a missing 'zu' or 'en' value.")

    # Guarantee str values even if a column was inferred as numeric.
    pairs = pairs.astype(str)
    print(f"✅ Loaded {len(pairs)} sentence pairs from CSV.")

    # Preview the first five sentences of each language.
    print("\nFirst 5 Zulu sentences:")
    for i, s in enumerate(pairs["zu"].head(5), 1):
        print(f"{i}: {s}")

    print("\nFirst 5 English sentences:")
    for i, s in enumerate(pairs["en"].head(5), 1):
        print(f"{i}: {s}")

    return pairs["zu"].tolist(), pairs["en"].tolist()


# ==========================================================
# 4. Dummy Sentiment Classifier
# ==========================================================
class DummyEnglishSentimentClassifier:
    """Placeholder classifier: the sign of the summed vector decides the label."""

    def predict(self, X):
        """Return a one-element list: "positive" if ``X.sum() > 0``, else "negative"."""
        total = X.sum()
        if total > 0:
            label = "positive"
        else:
            label = "negative"
        return [label]


# ==========================================================
# 5. Main Pipeline
# ==========================================================
def run_pipeline(csv_path: str = "/content/en-zu.training.csv"):
    """Fit a Zulu-to-English embedding projection and run a small demo.

    Steps: load sentence pairs from ``csv_path``, embed both sides with their
    respective models, fit a least-squares projection ``W`` (saved to
    ``projection_W.npy``), classify one projected Zulu sentence with the
    dummy classifier, and print the mean cosine similarity between projected
    Zulu embeddings and the English embeddings.

    Args:
        csv_path: CSV file with ``zu`` and ``en`` columns.

    Raises:
        ValueError: If no sentence pair could be embedded, since a projection
            cannot be fitted from zero examples.
    """
    # Step 1: Load dataset
    zulu_sentences, english_sentences = load_csv_zulu_dataset(csv_path)

    # Step 2: Load models
    tokenizer_zu, model_zu, device_zu = load_model("MoseliMotsoehli/zuBERTa")
    tokenizer_en, model_en, device_en = load_model("bert-base-uncased")

    # Step 3: Compute embeddings
    X, Y = [], []
    embedded_zulu = []  # Zulu sentences that were embedded, aligned with X
    print("\nComputing sentence embeddings...")
    for zu, en in zip(zulu_sentences, english_sentences):
        try:
            z_emb = get_sentence_embedding(zu, tokenizer_zu, model_zu, device_zu)
            e_emb = get_sentence_embedding(en, tokenizer_en, model_en, device_en)
        except Exception as e:
            # str() first: a non-string value (e.g. float NaN from an empty
            # CSV cell) cannot be sliced and would crash this handler.
            print(f"Skipping pair: {str(zu)[:50]} / {str(en)[:50]} | {e}")
            continue
        X.append(z_emb)
        Y.append(e_emb)
        embedded_zulu.append(zu)

    if not X:
        raise ValueError("No sentence pairs could be embedded; cannot fit a projection.")

    X = np.array(X)
    Y = np.array(Y)
    print(f"Computed embeddings for {len(X)} pairs.")

    # Step 4: Compute linear projection
    W = compute_linear_projection(X, Y)
    np.save("projection_W.npy", W)

    # Step 5: Project a Zulu sentence and predict sentiment.
    # Use the first sentence that was actually embedded, so the demo cannot
    # pick a sentence that was skipped above.
    test_sentence = embedded_zulu[0]
    projected_embedding = project_sentence(test_sentence, W, tokenizer_zu, model_zu, device_zu)
    classifier = DummyEnglishSentimentClassifier()
    sentiment = classifier.predict(projected_embedding)

    print("\n========================================")
    print(f"Zulu sentence: {test_sentence}")
    print("Projected into English embedding space.")
    print("Predicted Sentiment:", sentiment[0])
    print("========================================")

    # Step 6: Cosine similarity between projected and English embeddings.
    # This is measured on the same pairs used to fit W (a training-set fit).
    projected_all = X @ W
    sims = [cosine_similarity([pz], [e])[0, 0] for pz, e in zip(projected_all, Y)]
    print("Mean cosine similarity between projected Zulu and English embeddings:", np.mean(sims))


# ==========================================================
# Entry Point
# ==========================================================
# Run the pipeline with its default CSV path when this file is executed
# directly rather than imported.
if __name__ == "__main__":
    run_pipeline()



























# import torch
# import numpy as np
# import pandas as pd
# import kagglehub
# from kagglehub import KaggleDatasetAdapter
# from transformers import AutoTokenizer, AutoModel


# # ==========================================================
# # 1. Model Utilities
# # ==========================================================
# def load_model(model_name: str):
#     """Load a pre-trained transformer model and tokenizer."""
#     print(f"Loading model: {model_name} ...")
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     model = AutoModel.from_pretrained(model_name)
#     model.eval()
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     model.to(device)
#     return tokenizer, model, device


# def get_sentence_embedding(sentence: str, tokenizer, model, device):
#     """Compute mean-pooled sentence embedding."""
#     inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=128).to(device)
#     with torch.no_grad():
#         outputs = model(**inputs)
#         embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
#     return embedding


# # ==========================================================
# # 2. Projection Utilities
# # ==========================================================
# def compute_linear_projection(X, Y):
#     """Compute least-squares linear projection matrix W."""
#     print("Computing linear projection (least squares)...")
#     return np.linalg.pinv(X) @ Y


# def project_sentence(sentence, W, tokenizer, model, device):
#     """Project a Zulu sentence into English embedding space."""
#     emb = get_sentence_embedding(sentence, tokenizer, model, device)
#     return emb @ W


# # ==========================================================
# # 3. Dataset Utilities
# # ==========================================================
# def load_kaggle_zulu_dataset():
#     """Download and load Zulu-English dataset using pandas for safety."""
#     print("Downloading Kaggle dataset...")
#     df = kagglehub.load_dataset(
#         KaggleDatasetAdapter.PANDAS,
#         "olaniyanjulius/zulu-to-english-phrases",
#         "zulu_english_phrases.jsonl",
#         pandas_kwargs={"lines": True}
#     )
#     print(f"Loaded {len(df)} sentence pairs from Kaggle dataset.")

#     # Print first 5 Zulu sentences
#     print("\nFirst 5 Zulu sentences:")
#     for i, s in enumerate(df["Zulu"][:5], 1):
#         print(f"{i}: {s}")

#     # Print first 5 English sentences
#     print("\nFirst 5 English sentences:")
#     for i, s in enumerate(df["English"][:5], 1):
#         print(f"{i}: {s}")

#     return df["Zulu"].tolist(), df["English"].tolist()


# # ==========================================================
# # 4. Dummy Sentiment Classifier
# # ==========================================================
# class DummyEnglishSentimentClassifier:
#     """A trivial sentiment classifier based on sum of vector elements."""
#     def predict(self, X):
#         return ["positive" if X.sum() > 0 else "negative"]


# # ==========================================================
# # 5. Main Pipeline
# # ==========================================================
# def run_pipeline():
#     # Step 1: Load dataset
#     zulu_sentences, english_sentences = load_kaggle_zulu_dataset()

#     # Step 2: Load models
#     tokenizer_zu, model_zu, device_zu = load_model("MoseliMotsoehli/zuBERTa")
#     tokenizer_en, model_en, device_en = load_model("bert-base-uncased")

#     # Step 3: Compute embeddings
#     X, Y = [], []
#     print("Computing sentence embeddings...")
#     for zu, en in zip(zulu_sentences, english_sentences):
#         try:
#             z_emb = get_sentence_embedding(zu, tokenizer_zu, model_zu, device_zu)
#             e_emb = get_sentence_embedding(en, tokenizer_en, model_en, device_en)
#             X.append(z_emb)
#             Y.append(e_emb)
#         except Exception as e:
#             print(f"Skipping pair: {zu} / {en} | {e}")

#     X = np.array(X)
#     Y = np.array(Y)
#     print(f"Computed embeddings for {len(X)} pairs.")

#     # Step 4: Compute linear projection
#     W = compute_linear_projection(X, Y)
#     np.save("projection_W.npy", W)

#     # Step 5: Test on a Zulu sentence
#     test_sentence = zulu_sentences[0]
#     projected_embedding = project_sentence(test_sentence, W, tokenizer_zu, model_zu, device_zu)

#     # Step 6: Sentiment prediction
#     classifier = DummyEnglishSentimentClassifier()
#     sentiment = classifier.predict(projected_embedding)

#     print("\n========================================")
#     print(f"Zulu sentence: {test_sentence}")
#     print("Projected into English embedding space.")
#     print("Predicted Sentiment:", sentiment[0])
#     print("========================================")
#     projected = X @ W
#     from sklearn.metrics.pairwise import cosine_similarity
#     sims = [cosine_similarity([pz], [e])[0,0] for pz, e in zip(projected, Y)]
#     print("Mean cosine similarity between projected Zulu and English embeddings:", np.mean(sims))

# # ==========================================================
# # Entry Point
# # ==========================================================
# if __name__ == "__main__":
#     run_pipeline()


✅ Loaded 4739 sentence pairs from CSV.

First 5 Zulu sentences:
1: Ama-albhamu akhe e-solo eminyaka yawo-60 angamanye anezingoma ezimnandi kakhulu kwengake ngazizwa.
2: Ukusuka lapho kuzobe sekulandela ukuthi uma ufuna ukusindisa inyonyana, uzobe uphoqelela ukuthi zonke izindawo ezingamazinki zenze isiqinisekiso sokuthi iBhrithani ishiye ngomhla zingama-31 ku-Okthoba.
3: Ngemuva kweminyaka engamashumi amathathu u-St.John Paul II enxuse abaseMozambique ukuthi bayiqede impi yabo yombango, uFrancis kulindeleke ukuthi asisayine lesi sivumelwano esisha sangomhla lulu-1 ku-Agasti aphinde akhuthaze ukuthi sifezekiswe ngokugcwele lapho ehlangana neziphathimandla zikahulumeni ngoLwesine, okuwusuku lwakhe lokuqala olugcwele esifundeni. Ufike ngoLwesithathu kusihlwa kodwa ubengenayo imicimbi yomphakathi ebihleliwe ngemuva komcimbi wakhe omfushane wokwamukelwa esikhumulweni sezindiza lapho bekushaywa khona izigubhu zomdabu, kudanswa kukikizelwa kunjeya.
4: Wynford yw sylfaenydd Stafell Fyw yng Ngh

In [21]:
# Install dependencies (if not already)
# !pip install transformers datasets torch scikit-learn

import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import numpy as np
from sklearn.model_selection import train_test_split

# ==========================================================
# 1. Load SST-2 dataset
# ==========================================================
# Load the "sst2" configuration of the "glue" dataset.
dataset = load_dataset("glue", "sst2")
train_data = dataset["train"]
# The "validation" split is what the rest of this cell calls the test set.
test_data = dataset["validation"]

# Pull the text and label columns out of each split.
train_sentences = train_data["sentence"]
train_labels = train_data["label"]  # 0=negative, 1=positive
test_sentences = test_data["sentence"]
test_labels = test_data["label"]

print(f"Train examples: {len(train_sentences)}, Test examples: {len(test_sentences)}")

# ==========================================================
# 2. Load English embedding model (BERT)
# ==========================================================
# Tokenizer and encoder used by get_sentence_embedding below.
tokenizer_en = AutoTokenizer.from_pretrained("bert-base-uncased")
model_en = AutoModel.from_pretrained("bert-base-uncased")
# The encoder is only used for inference, so switch it to eval mode.
model_en.eval()
# Prefer CUDA when available; otherwise run on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_en.to(device)

# ==========================================================
# 3. Compute embeddings
# ==========================================================
def get_sentence_embedding(sentence):
    """Return the mean-pooled last-layer embedding of ``sentence``.

    Uses the module-level ``tokenizer_en``, ``model_en`` and ``device``.
    Input is truncated to 128 tokens; the result is a NumPy array obtained
    by averaging ``last_hidden_state`` over dimension 1 and squeezing.
    """
    encoded = tokenizer_en(
        sentence, return_tensors="pt", truncation=True, max_length=128
    )
    encoded = encoded.to(device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        token_states = model_en(**encoded).last_hidden_state
        pooled = token_states.mean(dim=1).squeeze()
        return pooled.cpu().numpy()

# Embed every training sentence, one get_sentence_embedding call per
# sentence, and stack the results into a single array.
print("Computing embeddings for training data...")
X_train = np.array([get_sentence_embedding(s) for s in train_sentences])
y_train = np.array(train_labels)

# Same procedure for the held-out sentences.
print("Computing embeddings for test data...")
X_test = np.array([get_sentence_embedding(s) for s in test_sentences])
y_test = np.array(test_labels)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# ==========================================================
# 4. Define classifier head
# ==========================================================
class EnglishClassifierHead(nn.Module):
    """Feed-forward head: Linear(embedding_dim, 128) -> ReLU -> Linear(128, 2)."""

    def __init__(self, embedding_dim):
        """Build the head for input vectors of size ``embedding_dim``."""
        super().__init__()
        hidden_dim, num_classes = 128, 2
        layers = [
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes),
        ]
        # The attribute name ``fc`` and the layer order fix the state_dict
        # keys (fc.0.*, fc.2.*) used when saving and loading weights.
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the two class logits for each row of ``x``."""
        logits = self.fc(x)
        return logits

# ==========================================================
# 5. Train classifier
# ==========================================================
# Convert the precomputed embeddings and labels to tensors. No .to(device)
# is applied here, so the head and these tensors stay on the default device.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Head input width is taken from the embedding matrix.
classifier = EnglishClassifierHead(X_train.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=1e-3)

# Full-batch training: each epoch is a single optimizer step computed on the
# whole training tensor.
for epoch in range(30):  # short demo; increase for better performance
    classifier.train()
    optimizer.zero_grad()
    outputs = classifier(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    # Measure accuracy on the held-out tensors after every step.
    classifier.eval()
    with torch.no_grad():
        preds = torch.argmax(classifier(X_test_tensor), dim=1)
        acc = (preds == y_test_tensor).float().mean().item()
    print(f"Epoch {epoch+1} - Loss: {loss.item():.4f} - Val Accuracy: {acc:.4f}")

# Persist only the weights (state_dict); loading requires rebuilding the
# same EnglishClassifierHead architecture first.
torch.save(classifier.state_dict(), "english_sentiment_classifier.pt")
print("✅ Saved trained English sentiment classifier to 'english_sentiment_classifier.pt'")

# ==========================================================
# 6. Test predictions
# ==========================================================
# Sentences to classify as a quick manual check of the trained head.
test_sentences_demo = [
    "This is horrible!",
]

for s in test_sentences_demo:
    emb = get_sentence_embedding(s)
    # unsqueeze(0) adds the batch dimension the classifier input needs.
    emb_tensor = torch.tensor(emb, dtype=torch.float32).unsqueeze(0)
    classifier.eval()
    with torch.no_grad():
        logits = classifier(emb_tensor)
        pred = torch.argmax(logits, dim=1).item()
        # Class index 1 is reported as positive, anything else as negative.
        print(f"Sentence: '{s}' -> Sentiment: {'positive' if pred==1 else 'negative'}")


Train examples: 67349, Test examples: 872
Computing embeddings for training data...
Computing embeddings for test data...
Train shape: (67349, 768), Test shape: (872, 768)
Epoch 1 - Loss: 0.6885 - Val Accuracy: 0.5092
Epoch 2 - Loss: 0.6635 - Val Accuracy: 0.5161
Epoch 3 - Loss: 0.6421 - Val Accuracy: 0.6743
Epoch 4 - Loss: 0.6139 - Val Accuracy: 0.8119
Epoch 5 - Loss: 0.5848 - Val Accuracy: 0.7924
Epoch 6 - Loss: 0.5590 - Val Accuracy: 0.7993
Epoch 7 - Loss: 0.5327 - Val Accuracy: 0.8119
Epoch 8 - Loss: 0.5059 - Val Accuracy: 0.8222
Epoch 9 - Loss: 0.4824 - Val Accuracy: 0.8142
Epoch 10 - Loss: 0.4613 - Val Accuracy: 0.8142
Epoch 11 - Loss: 0.4411 - Val Accuracy: 0.8211
Epoch 12 - Loss: 0.4242 - Val Accuracy: 0.8268
Epoch 13 - Loss: 0.4102 - Val Accuracy: 0.8234
Epoch 14 - Loss: 0.3972 - Val Accuracy: 0.8211
Epoch 15 - Loss: 0.3861 - Val Accuracy: 0.8211
Epoch 16 - Loss: 0.3769 - Val Accuracy: 0.8268
Epoch 17 - Loss: 0.3684 - Val Accuracy: 0.8303
Epoch 18 - Loss: 0.3615 - Val Accuracy

In [26]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score
import torch.nn as nn

# ===============================================
# 1. Load saved projection matrix and English classifier
# ===============================================
# Load the projection matrix from "projection_W.npy"; it is applied below
# as z_emb @ W to map Zulu embeddings into the English embedding space.
W = np.load("projection_W.npy")  # linear projection matrix
print("Projection matrix W shape:", W.shape)

class EnglishClassifierHead(nn.Module):
    """Feed-forward head: Linear(embedding_dim, 128) -> ReLU -> Linear(128, 2)."""

    def __init__(self, embedding_dim):
        """Build the head for input vectors of size ``embedding_dim``."""
        super().__init__()
        hidden_dim, num_classes = 128, 2
        layers = [
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes),
        ]
        # The attribute name ``fc`` and the layer order fix the state_dict
        # keys (fc.0.*, fc.2.*), which must match the saved checkpoint.
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the two class logits for each row of ``x``."""
        logits = self.fc(x)
        return logits

# Input width of the head. load_state_dict below requires it to match the
# layer shapes stored in the checkpoint.
embedding_dim = 768
classifier = EnglishClassifierHead(embedding_dim)
# Weights are first mapped to CPU, then the module is moved to `device`.
# NOTE(review): torch.load unpickles the file; if the checkpoint could come
# from an untrusted source, consider passing weights_only=True — confirm the
# installed torch version supports it.
classifier.load_state_dict(torch.load("english_sentiment_classifier.pt", map_location="cpu"))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
classifier.to(device)
# Inference only from here on.
classifier.eval()

# ===============================================
# 2. Load Zulu model
# ===============================================
# Zulu tokenizer and encoder, placed on the same device as the classifier
# and switched to eval mode for inference.
tokenizer_zu = AutoTokenizer.from_pretrained("MoseliMotsoehli/zuBERTa")
model_zu = AutoModel.from_pretrained("MoseliMotsoehli/zuBERTa")
model_zu.to(device)
model_zu.eval()

# ===============================================
# 3. Helper: Sentence embedding
# ===============================================
def get_sentence_embedding(sentence, tokenizer, model, device):
    """Return the mean-pooled last-layer embedding of one sentence.

    Args:
        sentence: Text to embed; tokenized with truncation at 128 tokens.
        tokenizer: Callable producing the model inputs (supports ``.to``).
        model: Callable whose output should expose ``last_hidden_state``.
        device: Device the tokenized inputs are moved to.

    Returns:
        NumPy array obtained by averaging ``last_hidden_state`` over
        dimension 1 and squeezing out singleton dimensions.

    Raises:
        ValueError: If the model output has no ``last_hidden_state``.
    """
    encoded = tokenizer(
        sentence, return_tensors="pt", truncation=True, max_length=128
    ).to(device)
    with torch.no_grad():
        outputs = model(**encoded)
        # Guard: pooling below needs the per-token hidden states.
        if not hasattr(outputs, "last_hidden_state"):
            raise ValueError("Model output missing last_hidden_state")
        token_states = outputs.last_hidden_state
        return token_states.mean(dim=1).squeeze().cpu().numpy()

# ===============================================
# 4. Load Zulu dataset
# ===============================================
# Read the Zulu text and sentiment columns from the "train" split.
ds = load_dataset("michsethowusu/zulu-sentiments-corpus")
zulu_texts = ds["train"]["Zulu"]
true_labels = ds["train"]["sentiment"]
# Encode labels with the same 0/1 scheme the predictions below use
# (1 is treated as Positive). Any label other than these two keys raises
# KeyError in the comprehension.
label_map = {"Negative": 0, "Positive": 1}
true_labels_numeric = [label_map[l] for l in true_labels]

# ===============================================
# 5. Predict
# ===============================================
# Predictions and their gold labels are collected together so the two lists
# stay aligned even when some examples cannot be scored.
pred_labels = []
eval_labels = []
skipped = 0

print("Processing Zulu phrases and predicting sentiment...\n")

for idx, text in enumerate(zulu_texts[:2000]):  # demo limit
    try:
        # 1. Get Zulu embedding
        z_emb = get_sentence_embedding(text, tokenizer_zu, model_zu, device)

        # 2. Project to English embedding space
        if z_emb.shape[0] != W.shape[0]:
            print(f"⚠️ Shape mismatch: z_emb {z_emb.shape}, W {W.shape}")
            skipped += 1
            continue

        e_emb = np.dot(z_emb, W)

        # 3. Predict using English classifier
        e_emb_tensor = torch.tensor(e_emb, dtype=torch.float32).unsqueeze(0).to(device)
        with torch.no_grad():
            logits = classifier(e_emb_tensor)
            pred = torch.argmax(logits, dim=1).item()
    except Exception as e:
        # Leave failed examples out of the evaluation. Recording a default
        # "Negative" prediction would count as correct whenever the gold
        # label happens to be Negative and so distort the accuracy.
        print(f"Skipping example {idx} due to error: {e}")
        skipped += 1
        continue

    pred_labels.append(pred)
    eval_labels.append(true_labels_numeric[idx])

# ===============================================
# 6. Compute accuracy
# ===============================================
if skipped:
    print(f"Skipped {skipped} examples that could not be scored.")

if pred_labels:
    acc = accuracy_score(eval_labels, pred_labels)
    print(f"\n✅ Zulu → English → Sentiment Accuracy: {acc * 100:.2f}%")
else:
    # Nothing was scored, so there is no meaningful accuracy to report.
    acc = float("nan")
    print("\nNo Zulu examples could be scored; accuracy is undefined.")


Projection matrix W shape: (768, 768)
Processing Zulu phrases and predicting sentiment...


✅ Zulu → English → Sentiment Accuracy: 57.20%
