In [None]:
#loading and preparing data
from datasets import load_dataset,Dataset
import pandas as pd

In [None]:
# Hugging Face checkpoint name of the pretrained encoder used in this notebook.
base_model = "xlm-roberta-base"

In [None]:
from datasets import load_dataset, ClassLabel

# Read the cleaned reviews CSV and keep the "train" split returned by load_dataset.
dataset = load_dataset(
    "csv",
    data_files="../Data/Cleaned/review.csv",
)["train"]
# Ratings are binarised below: neutral is folded into positive.
def map_rating(rating):
    """Collapse a rating into a binary sentiment id.

    Returns 0 (Negative) when `rating` compares equal to 0, and 1 (Positive)
    for every other value.
    """
    return 0 if rating == 0 else 1
    
# Collapse ratings to binary ids via map_rating (0 -> Negative, anything else -> Positive).
dataset = dataset.map(lambda row: {"rating": map_rating(row["rating"])})

# Re-type the column as a two-class ClassLabel ahead of the stratified split.
class_labels = ClassLabel(names=["Negative", "Positive"], num_classes=2)
dataset = dataset.cast_column("rating", class_labels)

# 80/20 train/test split, stratified on the rating label, with a fixed seed.
dataset = dataset.train_test_split(seed=42, test_size=0.2, stratify_by_column="rating")

# Expose the target under "labels" (the column name later cells read) and drop "rating".
dataset = dataset.map(lambda row: {"labels": row["rating"]}).remove_columns(["rating"])

# Keep only rows that actually have review text, separately for each split.
train_dataset, test_dataset = (
    dataset[split_name].filter(lambda example: example["review"] is not None)
    for split_name in ("train", "test")
)

train_dataset[0]

In [None]:
# Peek at the "test" split held in the DatasetDict. Note this is `dataset["test"]`, not
# `test_dataset`, so the None-review filter applied to `test_dataset` is not reflected here.
dataset["test"]

In [None]:
from transformers import (
    XLMRobertaTokenizerFast,
    XLMRobertaForSequenceClassification,
    TrainingArguments,
    Trainer
)

# Checkpoint identifier used to fetch the pretrained tokenizer.
# NOTE: `model` holds the checkpoint *name* here; a later cell rebinds it to the loaded network.
model = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizerFast.from_pretrained(model)



In [None]:
def tokenize(text):
    """Tokenise the "review" field of `text` with the notebook-level `tokenizer`.

    Truncation is on and padding is set to "max_length" with max_length=128,
    so the encoded sequences come back at a fixed length of 128.
    """
    return tokenizer(
        text["review"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

In [None]:
def preprocess(ds):
    """Tokenise `ds` and expose it as torch tensors.

    Maps `tokenize` over the dataset, dropping the raw "review" column (saves memory),
    then restricts the output format to the input_ids, attention_mask and labels
    columns as torch tensors. Returns the tokenised dataset.
    """
    tensor_columns = ["input_ids", "attention_mask", "labels"]
    tokenized = ds.map(tokenize, remove_columns=["review"])
    tokenized.set_format(type="torch", columns=tensor_columns)
    return tokenized


# Tokenise both splits. The names are rebound to the tokenised versions, so this cell
# cannot be re-run without first rebuilding the raw splits (preprocess drops "review").
train_dataset, test_dataset = preprocess(train_dataset), preprocess(test_dataset)



In [None]:
import numpy as np
import torch
from sklearn.utils.class_weight import compute_class_weight

# Training targets as a NumPy array, plus the distinct class ids present in them.
labels = np.array(train_dataset["labels"])
all_classes = np.unique(labels)

# One "balanced" weight per class from scikit-learn, stored as a float tensor
# so the custom trainer can hand it to the loss function.
class_weights = torch.tensor(
    compute_class_weight(class_weight="balanced", classes=all_classes, y=labels),
    dtype=torch.float,
)
print("Class weights:", class_weights)


In [None]:

# Label vocabulary for the binary classifier. label2id is derived from id2label so the two
# mappings cannot drift apart (a stale three-class label2id previously pointed "Positive"
# at id 2, which does not exist when num_labels=2).
id2label = {
    0: "Negative",
    1: "Positive",
}
label2id = {name: idx for idx, name in id2label.items()}

# Load from the checkpoint name in `base_model` rather than `model`: this cell rebinds `model`
# to the network, so reading the name from `model` would break if the cell is run a second time.
model = XLMRobertaForSequenceClassification.from_pretrained(
    base_model,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Freeze the pretrained encoder (model.base_model) so its weights receive no gradient updates;
# parameters outside base_model are left untouched.
for encoder_param in model.base_model.parameters():
    encoder_param.requires_grad_(False)


In [None]:
# Tally parameter counts: all parameters vs. only those still flagged requires_grad.
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Trainable params: {trainable:,}")
print(f"Total params: {total:,}")
print(f"Trainable %: {100 * trainable / total:.2f}%")


In [None]:
import torch.nn as nn
from transformers import Trainer

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the notebook-level `class_weights`."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: the model to run the forward pass with.
            inputs: batch mapping passed to the model as keyword arguments;
                the targets are read from its "labels" entry.
            return_outputs: when true, return `(loss, outputs)` instead of the loss alone.
            **kwargs: accepted and ignored, so extra arguments from the caller do not fail.
        """
        targets = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # The weight tensor has to live on the same device as the logits.
        criterion = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))

        # Flatten to (N, num_labels) logits and (N,) targets before scoring.
        n_classes = self.model.config.num_labels
        loss = criterion(logits.view(-1, n_classes), targets.view(-1))

        if return_outputs:
            return (loss, outputs)
        return loss


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

def compute_metrics(eval_pred):
    """Return macro-averaged F1, precision and recall for an evaluation pass.

    `eval_pred` unpacks into `(logits, labels)`; the predicted class is the
    argmax of the logits over the last axis.
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)

    # Metric name -> scikit-learn scorer; each is called with average="macro".
    scorers = {
        "f1_macro": f1_score,
        "precision_macro": precision_score,
        "recall_macro": recall_score,
    }
    return {
        metric_name: scorer(labels, preds, average="macro")
        for metric_name, scorer in scorers.items()
    }


In [None]:
from transformers import TrainingArguments

# Training configuration consumed by the WeightedTrainer built in the next cell.
# NOTE(review): eval_steps is set but no evaluation strategy is passed here — confirm that
# periodic evaluation is actually enabled for the installed transformers version.
training_args = TrainingArguments(
    output_dir="./results",
    save_steps=500,
    eval_steps=500,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    gradient_accumulation_steps=4, # batch 4 x 4 accumulation steps = 16 examples per update, per device
    learning_rate=2e-5,            # NOTE(review): model.base_model is frozen in an earlier cell, so only the remaining parameters train at this rate — confirm it is intended
)


In [None]:
# Assemble the class-weighted trainer: model and arguments, the tokenised train/test
# splits, the tokenizer, and the macro-averaged metric callback.
trainer = WeightedTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


In [None]:
# Run fine-tuning with the trainer and training_args configured in the cells above.
trainer.train()

In [None]:
# Evaluate on the trainer's eval_dataset (test_dataset) and print the resulting metrics,
# which include the values returned by compute_metrics.
eval_results = trainer.evaluate()
print(eval_results)

In [None]:
# Save the model and tokenizer into the same directory, so both can be reloaded
# from that single path with from_pretrained.
model_path = "./Models/xlm-finetuned-sentiment-save"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)


In [None]:
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizerFast

# Directory the fine-tuned model and tokenizer were saved to; reload both from it.
model_path = "./Models/xlm-finetuned-sentiment-save"

model = XLMRobertaForSequenceClassification.from_pretrained(model_path)
tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_path)

# Put the reloaded model into evaluation mode before running inference.
model.eval()



In [None]:
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Hand-written sample reviews used as a quick smoke test (one is mixed-script).
texts = [
    "this is not good product donot buy",
    "I love this product!",
    "Average quality, nothing special.",
    "This is absolutely terrible! I hate it, worst purchase ever!",  # very negative
    "fraud seller. Samaan त ekdum naramro रहेछ।",
    'delivery late and product was damaged. very bad experience.',
    ' good but delivery boy was very rude , product is damaged not working'
]

# Encode the whole batch at once (padding and truncation on) and move it to the device.
inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True).to(device)

# Forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model(**inputs)

# Turn the raw logits into per-class probabilities and show them.
logits = outputs.logits
probs = logits.softmax(dim=-1)
print(probs)

# Highest-probability class id per text, mapped to a display name.
label_map = {0: "negative", 1: "positive"}
labels = probs.argmax(dim=1)

for text, label in zip(texts, labels):
    print(f"Text: {text}")
    print(f"Sentiment: {label_map[label.item()]}\n")


In [None]:
# Report the prediction for the first sample text from the inference cell above.
# `pred` was never assigned anywhere in the notebook (NameError on a fresh run), and the old
# three-entry map included a "Neutral" class that the binary model does not have.
pred = logits.argmax(dim=-1)[0]
label_map = {0: "Negative", 1: "Positive"}
print("Predicted sentiment:", label_map[pred.item()])
