In [1]:
!pip install -q transformers accelerate datasets evaluate scikit-learn

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:


import os
import torch
import pandas as pd
import numpy as np
from google.colab import drive
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification, # <-- CHANGED: Optimized for Sentence tasks
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
# 1. Mount Google Drive so the files stored under it are reachable from this runtime
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [4]:
# 2. Data Loading (sentence-level task, so no token splitting is needed)
# Drive folder that holds the three TSV splits.
data_path = "/content/drive/MyDrive/Code-Switching data/"
# Split name -> TSV file name inside data_path.
files = dict(train="train.tsv", validation="val.tsv", test="test.tsv")

def load_data(filepath):
    """Read one tab-separated split and return a clean ``text``/``label`` frame.

    Args:
        filepath: Path to a TSV file that has at least ``text`` and ``sentiment`` columns.

    Returns:
        A DataFrame with exactly two columns: ``text`` (cast to str) and ``label``
        (the raw ``sentiment`` values). Rows with a missing value in either column
        are removed and the index is rebuilt as 0..n-1.

    Raises:
        KeyError: if ``text`` or ``sentiment`` is not a column of the file.
    """
    df = pd.read_csv(filepath, sep='\t')
    print(f"Columns in {filepath}: {df.columns.tolist()}") # Added for debugging

    # Fail early with a message that names the file and the missing column(s).
    required = ['text', 'sentiment']
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{filepath} is missing required column(s): {missing}")

    # We only need the text and the label.
    # 'sentiment' is renamed to 'label', the column name the rest of the notebook uses.
    df = df[required].rename(columns={'sentiment': 'label'})

    # Drop rows with missing values to prevent errors, then rebuild the index:
    # a gapped index would otherwise be carried into the HuggingFace Dataset
    # as an extra '__index_level_0__' column.
    df = df.dropna().reset_index(drop=True)
    # Ensure text is string
    df['text'] = df['text'].astype(str)
    return df

# Load every split into its own DataFrame, keyed by split name.
dfs = {split: load_data(os.path.join(data_path, f)) for split, f in files.items()}

# Create Label Mappings (derived from the training split only, in sorted order)
label_list = sorted(dfs['train']['label'].unique())
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

print(f"Labels found: {label2id}")

# Convert string labels to integers
for split in dfs:
    # .map() would silently turn a label that never occurs in train into NaN,
    # so refuse to continue if any split contains one.
    unknown = set(dfs[split]['label']) - set(label2id)
    if unknown:
        raise ValueError(
            f"Split '{split}' contains labels not present in train: {sorted(unknown, key=str)}"
        )
    dfs[split]['label'] = dfs[split]['label'].map(label2id)

# Create HuggingFace Datasets
# preserve_index=False keeps the pandas index out of the dataset columns.
dataset = DatasetDict({
    split: Dataset.from_pandas(df, preserve_index=False) for split, df in dfs.items()
})

Columns in /content/drive/MyDrive/Code-Switching data/train.tsv: ['id', 'text', 'sentiment']
Columns in /content/drive/MyDrive/Code-Switching data/val.tsv: ['id', 'text', 'sentiment']
Columns in /content/drive/MyDrive/Code-Switching data/test.tsv: ['id', 'text', 'sentiment']
Labels found: {'Negative': 0, 'Neutral': 1, 'Positive': 2}


In [5]:
# 3. Tokenization (Sentence Level)
model_checkpoint = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of rows.

    Args:
        examples: A batch passed by ``dataset.map(..., batched=True)``; only its
            ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens.
    """
    # Truncate to 512. We don't pad here; DataCollator will do it dynamically (faster)
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Remove raw text columns to avoid "too many dimensions" error.
# The pandas index column ("__index_level_0__") may exist in only some splits,
# so each split drops just the columns it actually has.
columns_to_drop = ("text", "__index_level_0__")
tokenized_datasets = DatasetDict({
    split: split_ds.remove_columns(
        [col for col in columns_to_drop if col in split_ds.column_names]
    )
    for split, split_ds in tokenized_datasets.items()
})

tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

Map:   0%|          | 0/8561 [00:00<?, ? examples/s]

Map:   0%|          | 0/1071 [00:00<?, ? examples/s]

Map:   0%|          | 0/1070 [00:00<?, ? examples/s]

In [6]:
# 4. Model Initialization
# Classification-head settings: one output per label in label_list, plus both
# directions of the label <-> id mapping.
classifier_head_config = dict(
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, **classifier_head_config
)

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# 5. Metrics
def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and weighted P/R/F1.

    Args:
        eval_pred: A two-item object that unpacks into the logits and the
            reference labels.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        Precision, recall and F1 use ``average='weighted'`` with
        ``zero_division=0``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)

    prf = precision_recall_fscore_support(
        gold, predicted, average='weighted', zero_division=0
    )
    weighted_precision, weighted_recall, weighted_f1 = prf[0], prf[1], prf[2]

    metrics = {'accuracy': accuracy_score(gold, predicted)}
    metrics['f1'] = weighted_f1
    metrics['precision'] = weighted_precision
    metrics['recall'] = weighted_recall
    return metrics

In [8]:
# 6. Training Arguments
# We can use a higher learning rate for the CLS head, but 2e-5 is safe for MuRIL
batch_size = 32

args = TrainingArguments(
    output_dir="muril_sentiment_sequence_final",
    eval_strategy="epoch",           # evaluate once per epoch ...
    save_strategy="epoch",           # ... and checkpoint on the same schedule
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=15,             # Upper bound; early stopping below may end sooner
    weight_decay=0.01,
    lr_scheduler_type="cosine",      # Cosine is great for convergence
    warmup_ratio=0.1,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    metric_for_best_model="f1",      # the 'f1' key returned by compute_metrics
    load_best_model_at_end=True,
    save_total_limit=2,
    logging_steps=50,
    report_to="none"
)

# Pads each batch dynamically, since tokenization did not pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `tokenizer=` is deprecated on Trainer.__init__; `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop once the tracked metric has not improved for 4 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)

  trainer = Trainer(


In [9]:
# 7. Train
print("Starting Training...")
train_output = trainer.train()
train_output  # last expression of the cell, so the notebook displays the training summary

Starting Training...


model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.0468,1.02034,0.526611,0.429426,0.55318,0.526611
2,0.8912,0.856674,0.590103,0.510889,0.595915,0.590103
3,0.7643,0.769006,0.690943,0.679956,0.684635,0.690943
4,0.626,0.707817,0.716153,0.706119,0.711574,0.716153
5,0.5042,0.668841,0.730159,0.732538,0.736919,0.730159
6,0.4438,0.676119,0.745098,0.748211,0.760013,0.745098
7,0.3558,0.743916,0.741363,0.740932,0.740779,0.741363
8,0.3041,0.793203,0.735761,0.734815,0.741425,0.735761
9,0.2508,0.811314,0.738562,0.739143,0.740896,0.738562
10,0.1895,0.879284,0.735761,0.73842,0.747784,0.735761


TrainOutput(global_step=2680, training_loss=0.5524265184331296, metrics={'train_runtime': 956.0287, 'train_samples_per_second': 134.321, 'train_steps_per_second': 4.205, 'total_flos': 2863984926928734.0, 'train_loss': 0.5524265184331296, 'epoch': 10.0})

In [10]:
# 8. Final Evaluation & Save
best_model_dir = "muril_sentiment_best_model"
banner = "=" * 30

# Write the model and its tokenizer into the same folder.
print("\nSaving best model...")
trainer.save_model(best_model_dir)
tokenizer.save_pretrained(best_model_dir)

print("\nEvaluating on Test Set...")
test_results = trainer.evaluate(tokenized_datasets["test"])

# Report the two headline metrics from the evaluation results.
print("\n" + banner)
print("FINAL TEST RESULTS")
print(banner)
print(f"Accuracy:  {test_results['eval_accuracy']:.4f}")
print(f"F1 Score:  {test_results['eval_f1']:.4f}")
print(banner)


Saving best model...

Evaluating on Test Set...



FINAL TEST RESULTS
Accuracy:  0.7028
F1 Score:  0.7063


In [13]:
from transformers import pipeline

# Define the path to your saved model
save_path = "./muril_sentiment_best_model"

# Use the first GPU when one is available, otherwise fall back to CPU (-1),
# mirroring the torch.cuda.is_available() guard used for fp16 during training.
inference_device = 0 if torch.cuda.is_available() else -1

# Load your saved model
classifier = pipeline(
    "text-classification",
    model=save_path,
    tokenizer=save_path,
    device=inference_device,
)

# Test with some Nepanglish sentences (trailing comment = expected sentiment)
examples = [
    "Yo movie ekdam ramro chha",             # Positive
    "Plot ali weak thiyo tara acting babal", # Mixed/Positive
    "Time waste matra bho yar",              # Negative
    "This was the waste of time"             # Negative
]

print("\n--- Predictions ---")
for text in examples:
    # The pipeline returns a list; its first item holds the label and score printed here.
    result = classifier(text)
    print(f"Text: {text}")
    print(f"Label: {result[0]['label']}, Score: {result[0]['score']:.4f}\n")

The tokenizer you are loading from './muril_sentiment_best_model' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
Device set to use cuda:0



--- Predictions ---
Text: Yo movie ekdam ramro chha
Label: Positive, Score: 0.9553

Text: Plot ali weak thiyo tara acting babal
Label: Negative, Score: 0.8773

Text: Time waste matra bho yar
Label: Negative, Score: 0.9071

Text: This was the waste of time
Label: Negative, Score: 0.9110



In [14]:
# Diagnostic Test: probe the fragments of the mixed sentence separately
examples = [
    "Acting babal",             # Test if it even understands 'babal'
    "Acting ramro",             # Test standard Nepali
    "Plot ali weak thiyo",      # Test the negative part
    "Plot weak thiyo tara acting babal" # The full sentence
]

print("--- Diagnostics ---")
for sentence in examples:
    # Take the first prediction returned for this sentence.
    top = classifier(sentence)[0]
    predicted_label = top['label']
    confidence = top['score']
    print(f"'{sentence}' -> {predicted_label} ({confidence:.4f})")

--- Diagnostics ---
'Acting babal' -> Positive (0.9552)
'Acting ramro' -> Positive (0.9534)
'Plot ali weak thiyo' -> Negative (0.6134)
'Plot weak thiyo tara acting babal' -> Negative (0.7625)
