In [1]:
pip install transformers datasets scikit-learn pandas



In [2]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

In [3]:
# Step 1: Load and parse the text file
file_path = "codemix_sentiment_data.txt"
with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

data = []
for line in lines:
    if line.strip() == "" or ":" not in line:
        continue
    match = re.match(r"(POS|NEG|NTL):\s*(.+)", line.strip())
    if match:
        label, text = match.groups()
        data.append({"text": text.strip(), "label": label})

df = pd.DataFrame(data)

In [4]:
df.head()

Unnamed: 0,text,label
0,We need Mr chari 's review on master,NTL
1,worst government . #YSRCP chala chethha ga par...,NEG
2,bayya nuvvu emina cheppu kani bagoledu ani che...,NEG
3,Dube gadini vadilesi manchhi Pani chesaru @RCB...,POS
4,I came to watch thyview 's review crying after...,POS


In [5]:
# Step 2: Map labels to integers
label_map = {"NEG": 0, "NTL": 1, "POS": 2}
df["label"] = df["label"].map(label_map)

In [6]:
# Step 3: Convert to Hugging Face Dataset
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [8]:
# Step 4: Load tokenizer and tokenize
model_name = "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)


Map:   0%|          | 0/17881 [00:00<?, ? examples/s]

Map:   0%|          | 0/1987 [00:00<?, ? examples/s]

In [10]:
# Step 5: Load model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Step 6: Training setup
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

In [12]:
# Step 7: Evaluation function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1_macro": f1_score(labels, predictions, average="macro"),
    }


In [13]:
# Step 8: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [14]:
# Step 9: Train and evaluate
trainer.train()
trainer.evaluate()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msubramanyams9999[0m ([33msubramanyams9999-vellore-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.6315,0.629034,0.721188,0.707386
2,0.5253,0.530855,0.771012,0.752097
3,0.4595,0.499561,0.792149,0.775396


{'eval_loss': 0.49956148862838745,
 'eval_accuracy': 0.7921489682939105,
 'eval_f1_macro': 0.7753961466347743,
 'eval_runtime': 13.5035,
 'eval_samples_per_second': 147.147,
 'eval_steps_per_second': 9.257,
 'epoch': 3.0}

In [15]:
model.save_pretrained("sentiment_model")
tokenizer.save_pretrained("sentiment_model")

('sentiment_model/tokenizer_config.json',
 'sentiment_model/special_tokens_map.json',
 'sentiment_model/spiece.model',
 'sentiment_model/added_tokens.json',
 'sentiment_model/tokenizer.json')

In [16]:
example_sentences = [
    # Positive
    "Movie chala bagundi bro, visuals are amazing!",
    "RCB team super ga aadindi today 👏",
    "Nuvvu cheppina song naku chala nachindi ❤️",
    "Chaala clean and funny movie, recommended to everyone.",
    "Super acting by the hero! Full goosebumps moment 🔥",

    # Negative
    "Worst movie I have seen in years, total time waste.",
    "Idhi review aa? chala chetta ga undi bro.",
    "E roju service chala poor ga undi, staff respond cheyyaledu.",
    "Nuv cheppina app lo bugs ekkuva, totally disappointed.",
    "Dislike this actor’s performance – emotion ledu at all.",

    # Neutral
    "Movie release date is next Friday.",
    "RCB vs MI match starts at 7:30 PM.",
    "I watched the trailer yesterday night.",
    "Class ki 10 members attend ayyaru.",
    "Andaru review chustunnaru YouTube lo."
]


In [18]:
# Label index to name mapping
label_map = {
    "LABEL_0": "Negative",
    "LABEL_1": "Neutral",
    "LABEL_2": "Positive"
}

# Run prediction with readable labels
for sentence in example_sentences:
    result = sentiment_pipeline(sentence)[0]
    label_name = label_map[result['label']]
    print(f"Text: {sentence}")
    print(f"Predicted Sentiment: {label_name} (Confidence: {result['score']:.2f})")
    print("-" * 60)


Text: Movie chala bagundi bro, visuals are amazing!
Predicted Sentiment: Positive (Confidence: 0.97)
------------------------------------------------------------
Text: RCB team super ga aadindi today 👏
Predicted Sentiment: Positive (Confidence: 0.98)
------------------------------------------------------------
Text: Nuvvu cheppina song naku chala nachindi ❤️
Predicted Sentiment: Neutral (Confidence: 0.51)
------------------------------------------------------------
Text: Chaala clean and funny movie, recommended to everyone.
Predicted Sentiment: Positive (Confidence: 0.97)
------------------------------------------------------------
Text: Super acting by the hero! Full goosebumps moment 🔥
Predicted Sentiment: Positive (Confidence: 0.99)
------------------------------------------------------------
Text: Worst movie I have seen in years, total time waste.
Predicted Sentiment: Negative (Confidence: 0.69)
------------------------------------------------------------
Text: Idhi review aa? ch