In [1]:
# --- Cell 2: LaBSE Embedding Generation ---
import torch
import pandas as pd
import numpy as np
import os
import gc
from sentence_transformers import SentenceTransformer

# --- Pipeline configuration ---
# Hugging Face id of the sentence encoder (LaBSE output dimension is 768).
EMBEDDING_MODEL_NAME = "sentence-transformers/LaBSE"

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Values of the `language` column kept for training. These are plain language
# names compared by exact match, not language codes such as 'kas_Arab'.
LANG_FILTER = [
    'Kashmiri',
    'Santali',
    'Manipuri',
]

def cleanup_memory():
    """Run Python garbage collection, then ask PyTorch to drop its cached CUDA memory.

    Returns:
        None.
    """
    # Collect first so tensors that just became unreachable are freed before
    # the CUDA cache is emptied.
    for release in (gc.collect, torch.cuda.empty_cache):
        release()

def get_embeddings_managed(df, cache_path, is_train=True):
    """Return LaBSE sentence embeddings for ``df``, cached on disk at ``cache_path``.

    Args:
        df: DataFrame with a ``Sentence`` column and, when ``is_train`` is true,
            a ``language`` column.
        cache_path: File the embedding tensor is loaded from (when present and
            consistent with ``df``) or saved to after encoding.
        is_train: When true, only rows whose ``language`` is listed in
            ``LANG_FILTER`` are embedded.

    Returns:
        A CPU ``torch.Tensor`` holding one normalised embedding per (filtered)
        row of ``df``, in row order.

    Raises:
        ValueError: If no rows are left to embed and no usable cache exists.
    """
    # 1. Filter Data first, so the reported sample count, the cache validation
    #    and the encoded sentences all refer to the same rows.
    if is_train:
        # Keep only the target languages
        df = df[df['language'].isin(LANG_FILTER)]
    expected_rows = len(df)

    # 2. Check Cache
    if os.path.exists(cache_path):
        print(f"ðŸ“‚ Found cached embeddings at {cache_path}. Loading...")
        cached = torch.load(cache_path, map_location='cpu')
        if len(cached) == expected_rows:
            return cached
        # A cache built from a different version of the data would silently
        # misalign embeddings with labels/ids, so it is rebuilt instead.
        print(f"   -> Cache has {len(cached)} rows but {expected_rows} are expected. Regenerating...")

    if expected_rows == 0:
        raise ValueError(
            "No rows left to embed; check the 'language' values against LANG_FILTER."
        )

    print(f"âš¡ Generating LaBSE embeddings for {expected_rows} samples...")

    # 3. Collect sentences. Missing values become empty strings so that every
    #    row still gets an embedding and the tokenizer only ever sees `str`.
    sentences = df['Sentence'].fillna("").astype(str).tolist()

    # 4. Load Model (inside the function so it can be deleted afterwards)
    print("   -> Loading LaBSE Model...")
    model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=DEVICE)

    try:
        # 5. Generate Embeddings (encode() batches the input itself)
        print("   -> Encoding sentences (Batching handled automatically)...")
        embeddings_numpy = model.encode(
            sentences,
            batch_size=64,
            show_progress_bar=True,
            convert_to_numpy=True,
            normalize_embeddings=True,
        )

        # 6. Save
        final_tensor = torch.tensor(embeddings_numpy)

        print(f"ðŸ’¾ Saving to {cache_path}...")
        torch.save(final_tensor, cache_path)
    finally:
        # 7. Cleanup runs even when encoding or saving fails, so a failed cell
        #    does not leave the encoder occupying memory in the kernel.
        del model
        cleanup_memory()
        print("   -> Model unloaded. Memory cleared.")

    return final_tensor

print("âœ… Embedding Logic Ready (LaBSE).")

  from .autonotebook import tqdm as notebook_tqdm


âœ… Embedding Logic Ready (LaBSE).


In [2]:
# --- Cell 3: LaBSE-Gemma Classifier Definition ---
import torch
import torch.nn as nn
from transformers import AutoModel, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

class LaBSEGemmaClassifier(nn.Module):
    """Classifier that passes a projected sentence embedding through a LoRA-adapted Gemma backbone.

    Each input vector is projected to the backbone's hidden size, presented to
    the backbone as a sequence of length one, and the resulting hidden state
    is mapped to ``num_labels`` logits by a linear head.

    NOTE(review): with a single-position sequence the attention softmax has
    only one key to attend to, so the q_proj/k_proj adapters may not receive a
    useful training signal - confirm before tuning ``target_modules``.
    """

    def __init__(self, input_dim=768, num_labels=6, device="cuda"):
        """Build the quantised backbone, its LoRA adapters, the projector and the head.

        Args:
            input_dim: Size of the incoming embedding vectors.
            num_labels: Number of output classes.
            device: Device the backbone, projector and head are placed on.
        """
        super().__init__()
        self.device = device

        # Backbone: 4-bit NF4 quantisation with double quantisation, fp16 compute.
        quantization = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )

        print("   -> Loading Gemma-3-1B Backbone...")
        backbone = AutoModel.from_pretrained(
            "google/gemma-3-1b-it",
            quantization_config=quantization,
            device_map={"": device},
            trust_remote_code=True,
        )

        # Prepare the quantised model for training before attaching adapters.
        backbone = prepare_model_for_kbit_training(backbone)

        # LoRA on the four attention projections.
        adapter_spec = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type=TaskType.FEATURE_EXTRACTION,
        )
        self.gemma_backbone = get_peft_model(backbone, adapter_spec)

        # Projector: input_dim -> backbone hidden size.
        width = self.gemma_backbone.config.hidden_size
        self.hidden_size = width

        projector_layers = [
            nn.Linear(input_dim, width),
            nn.LayerNorm(width),
            nn.GELU(),
            nn.Linear(width, width),
        ]
        self.projector = nn.Sequential(*projector_layers).to(device)

        # Classification head and its loss.
        self.classifier = nn.Linear(width, num_labels).to(device)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, embeddings, labels=None):
        """Compute class logits and, when ``labels`` is given, the cross-entropy loss.

        Args:
            embeddings: Batch of input vectors, one row per sample.
            labels: Optional class indices for the batch.

        Returns:
            ``(logits, loss)``; ``loss`` is ``None`` when ``labels`` is ``None``.
        """
        # [Batch, input_dim] -> [Batch, Gemma_Dim] -> [Batch, 1, Gemma_Dim]
        single_step = self.projector(embeddings).unsqueeze(1)

        hidden_states = self.gemma_backbone(inputs_embeds=single_step).last_hidden_state
        # Only one position exists, so position 0 is the pooled representation.
        logits = self.classifier(hidden_states[:, 0, :])

        if labels is None:
            return logits, None
        return logits, self.criterion(logits, labels)

print("âœ… Model Class Ready (768-Dim Input).")

âœ… Model Class Ready (768-Dim Input).


In [4]:
# --- Cell 4: Main Execution ---
import pandas as pd
import numpy as np
import tqdm
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Label name -> class index, and the inverse lookup used when writing predictions.
EMOTION_MAP = {
    'anger': 0,
    'disgust': 1,
    'fear': 2,
    'happy': 3,
    'sad': 4,
    'surprise': 5,
}
ID_TO_EMOTION = dict(zip(EMOTION_MAP.values(), EMOTION_MAP.keys()))

# Languages kept for training, and the device everything runs on.
LANG_FILTER = [
    'Kashmiri',
    'Santali',
    'Manipuri',
]
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def _predict_class_ids(model, loader):
    """Return the argmax class id for every sample yielded by ``loader``.

    The first element of each batch is taken as the embedding tensor; any
    further elements (e.g. labels) are ignored.
    """
    model.eval()
    predicted = []
    with torch.no_grad():
        for batch in loader:
            logits, _ = model(batch[0].to(DEVICE))
            predicted.extend(torch.argmax(logits, dim=1).cpu().tolist())
    return predicted


def main():
    """Run the pipeline end to end: embed, train, validate, write ``submission.csv``.

    Reads the competition CSVs, obtains (possibly cached) LaBSE embeddings,
    trains ``LaBSEGemmaClassifier`` on a stratified 85/15 split, prints the
    loss and validation accuracy per epoch, and writes test predictions to
    ``submission.csv``.

    Raises:
        ValueError: If the training data contains emotion labels missing from
            ``EMOTION_MAP``, or if the embeddings do not line up with the
            labels / test rows (for example because a cache file is stale).
    """
    print("=== STARTING LaBSE + GEMMA PIPELINE ===")

    # Seed PyTorch so weight initialisation and batch shuffling are repeatable.
    torch.manual_seed(42)

    # 1. Load Data
    try:
        df_train = pd.read_csv("dataset/competition_train.csv")
        df_test = pd.read_csv("dataset/competition_test.csv")
    except FileNotFoundError:
        # Fallback for local testing if needed
        df_train = pd.read_csv("competition_train.csv")
        df_test = pd.read_csv("competition_test.csv")

    # 2. Get Embeddings (LaBSE)
    # Note: is_train=True restricts the training rows to LANG_FILTER
    X_all = get_embeddings_managed(df_train, "train_labse.pt", is_train=True)
    X_test = get_embeddings_managed(df_test, "test_labse.pt", is_train=False)

    # Ensure Memory is Clean
    cleanup_memory()

    # 3. Prepare Training Data
    # Apply the same language filter so the labels match the rows of X_all
    df_train_clean = df_train[df_train['language'].isin(LANG_FILTER)].copy()
    label_ids = df_train_clean['emotion'].map(EMOTION_MAP)
    if label_ids.isna().any():
        # An unmapped label becomes NaN, and casting NaN to long yields a
        # meaningless class id, so fail loudly instead.
        unknown = sorted(
            df_train_clean.loc[label_ids.isna(), 'emotion'].astype(str).unique()
        )
        raise ValueError(f"Emotion labels missing from EMOTION_MAP: {unknown}")
    y_all = torch.tensor(label_ids.values, dtype=torch.long)

    # Cached embeddings may predate the current CSVs; verify the alignment.
    if len(X_all) != len(y_all):
        raise ValueError(
            f"Train embeddings ({len(X_all)}) and labels ({len(y_all)}) differ in length; "
            "delete 'train_labse.pt' to rebuild the cache."
        )
    if len(X_test) != len(df_test):
        raise ValueError(
            f"Test embeddings ({len(X_test)}) and test rows ({len(df_test)}) differ in length; "
            "delete 'test_labse.pt' to rebuild the cache."
        )

    # Split
    X_train, X_val, y_train, y_val = train_test_split(
        X_all, y_all, test_size=0.15, random_state=42, stratify=y_all
    )

    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
    val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=32)

    # 4. Initialize Classifier
    print("\n=== Initializing Gemma Classifier ===")
    # Input dim is 768 for LaBSE
    model = LaBSEGemmaClassifier(input_dim=768, num_labels=6, device=DEVICE)

    # Only parameters that require gradients are handed to the optimizer.
    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=3e-4)

    # 5. Training Loop
    EPOCHS = 5
    print(f"\n=== Training for {EPOCHS} Epochs ===")

    # val_loader is not shuffled, so predictions come back in y_val order.
    val_targets = y_val.tolist()

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0

        for batch_X, batch_y in tqdm.tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            batch_X, batch_y = batch_X.to(DEVICE), batch_y.to(DEVICE)

            optimizer.zero_grad()
            logits, loss = model(batch_X, labels=batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation
        val_preds = _predict_class_ids(model, val_loader)
        acc = accuracy_score(val_targets, val_preds)
        print(f"Epoch {epoch+1} | Loss: {total_loss/len(train_loader):.4f} | Val Acc: {acc:.4f}")

    # 6. Generate Submission
    print("\n=== Generating Submission ===")
    test_loader = DataLoader(TensorDataset(X_test), batch_size=32)
    all_preds = _predict_class_ids(model, tqdm.tqdm(test_loader, desc="Predicting"))

    submission_df = pd.DataFrame({
        'id': df_test['id'],
        'emotion': [ID_TO_EMOTION[p] for p in all_preds]
    })

    submission_df.to_csv('submission.csv', index=False)
    print("\nâœ… Success! 'submission.csv' is ready.")

# Run the whole pipeline (embedding, training, submission) when this cell executes.
main()

=== STARTING LaBSE + GEMMA PIPELINE ===
âš¡ Generating LaBSE embeddings for 7176 samples...
   -> Loading LaBSE Model...


   -> Encoding sentences (Batching handled automatically)...


Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 113/113 [00:09<00:00, 12.48it/s]


ðŸ’¾ Saving to train_labse.pt...
   -> Model unloaded. Memory cleared.
âš¡ Generating LaBSE embeddings for 2392 samples...
   -> Loading LaBSE Model...
   -> Encoding sentences (Batching handled automatically)...


Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 38/38 [00:02<00:00, 13.98it/s]


ðŸ’¾ Saving to test_labse.pt...
   -> Model unloaded. Memory cleared.

=== Initializing Gemma Classifier ===
   -> Loading Gemma-3-1B Backbone...


: 

In [None]:
# Load the submission file written by main() to inspect the predictions.
# read_csv() requires a path; calling it without one raises TypeError.
sub = pd.read_csv('submission.csv')
sub.head()