In [203]:
#loading and preparing data
from datasets import load_dataset,Dataset
import pandas as pd

In [204]:
base_model='xlm-roberta-base'

In [None]:
from datasets import load_dataset, ClassLabel

dataset = load_dataset("csv", data_files="../Data/Cleaned/review.csv")["train"]
#chage neutral to positive
def map_rating(rating):
    if rating == 0:
        return 0  # Negative
    else:
        return 1  # Positive
    
dataset = dataset.map(lambda x: {"rating": map_rating(x["rating"])})

class_labels = ClassLabel(
    num_classes=2,
    names=["Negative", "Positive"]
)

dataset = dataset.cast_column("rating", class_labels)
# Stratified split using rating label
dataset = dataset.train_test_split(
    test_size=0.2,
    stratify_by_column="rating",
    seed=42
)

dataset = dataset.map(lambda x: {"labels": x["rating"]})
dataset = dataset.remove_columns(["rating"])


#drop none values
train_dataset = dataset["train"]
test_dataset = dataset["test"]

train_dataset = train_dataset.filter(lambda example: example["review"] is not None)
test_dataset = test_dataset.filter(lambda example: example["review"] is not None)

train_dataset[0]

AttributeError: 'Value' object has no attribute 'names'

In [206]:
dataset["test"]

Dataset({
    features: ['review', 'labels'],
    num_rows: 486
})

In [207]:
from transformers import (
    XLMRobertaTokenizerFast,
    XLMRobertaForSequenceClassification,
    TrainingArguments,
    Trainer
)

model="xlm-roberta-base"


tokenizer = XLMRobertaTokenizerFast.from_pretrained(
    model
)



In [208]:
def tokenize(text):
    return tokenizer(text['review'],padding='max_length',truncation=True,max_length=128)

In [209]:
def preprocess(ds):
    ds = ds.map(tokenize, remove_columns=["review"])  # remove raw text (saves memory
    ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return ds


train_dataset = preprocess(train_dataset)
test_dataset  = preprocess(test_dataset)



Map: 100%|██████████| 1940/1940 [00:00<00:00, 2042.02 examples/s]
Map: 100%|██████████| 486/486 [00:00<00:00, 2688.89 examples/s]


In [210]:
import numpy as np
import torch
from sklearn.utils.class_weight import compute_class_weight

labels = train_dataset["labels"]
labels = np.array(labels)
all_classes = np.unique(labels)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=all_classes,
    y=labels
)

class_weights = torch.tensor(class_weights, dtype=torch.float)
print("Class weights:", class_weights)


Class weights: tensor([1.2065, 0.8539])


In [211]:

id2label = {
  0: "Negative",
  1: "Positive"
}

label2id = {
  "Negative": 0,
    "Neutral": 1,
    "Positive": 2
}
model=XLMRobertaForSequenceClassification.from_pretrained(model,num_labels=2, id2label=id2label,
    label2id=label2id)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [212]:
#show all layers of model

for param in model.base_model.parameters():
    param.requires_grad = False


In [213]:
trainable, total = 0, 0
for p in model.parameters():
    total += p.numel()
    if p.requires_grad:
        trainable += p.numel()

print(f"Trainable params: {trainable:,}")
print(f"Total params: {total:,}")
print(f"Trainable %: {100 * trainable / total:.2f}%")


Trainable params: 592,130
Total params: 278,045,186
Trainable %: 0.21%


In [214]:
import torch.nn as nn
from transformers import Trainer

class WeightedTrainer(Trainer):
    # Add **kwargs to the end of the arguments list
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        
        # Pull weights to the correct device
        weights = class_weights.to(logits.device)
        
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        
        # Flatten the logits and labels for safety
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss


In [215]:
from sklearn.metrics import f1_score, precision_score, recall_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)

    return {
        "f1_macro": f1_score(labels, preds, average="macro"),
        "precision_macro": precision_score(labels, preds, average="macro"),
        "recall_macro": recall_score(labels, preds, average="macro"),
    }


In [216]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    save_steps=500,
    eval_steps=500,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    gradient_accumulation_steps=4, # 4x4 = 16
    learning_rate=2e-5,            # Standard for transformers
)


In [217]:
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = WeightedTrainer(


In [218]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=366, training_loss=0.6777826528080174, metrics={'train_runtime': 2866.1468, 'train_samples_per_second': 2.031, 'train_steps_per_second': 0.128, 'total_flos': 382826585548800.0, 'train_loss': 0.6777826528080174, 'epoch': 3.0})

In [219]:
# This will use your test_dataset and compute_metrics function
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.6637184023857117, 'eval_f1_macro': 0.7015230920406158, 'eval_precision_macro': 0.8463332237137133, 'eval_recall_macro': 0.7022257135375752, 'eval_runtime': 85.7252, 'eval_samples_per_second': 5.669, 'eval_steps_per_second': 1.423, 'epoch': 3.0}


In [220]:
# Save the model and tokenizer
model_path = "./Models/xlm-finetuned-sentiment-save"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)


('./Models/xlm-finetuned-sentiment-save\\tokenizer_config.json',
 './Models/xlm-finetuned-sentiment-save\\special_tokens_map.json',
 './Models/xlm-finetuned-sentiment-save\\tokenizer.json')

In [221]:
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizerFast

model_path = "./Models/xlm-finetuned-sentiment-save"

# Load tokenizer and model
tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_path)
model = XLMRobertaForSequenceClassification.from_pretrained(model_path)

# Make sure model is in evaluation mode
model.eval()



The tokenizer you are loading from './Models/xlm-finetuned-sentiment-save' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [234]:
import torch

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

texts = [
    "this is not good product donot buy",
    "I love this product!",
    "Average quality, nothing special.",
    "This is absolutely terrible! I hate it, worst purchase ever!",  # very negative
    "fraud seller. Samaan त ekdum naramro रहेछ।",
    'delivery late and product was damaged. very bad experience.',
]

# Tokenize
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)

# Inference
with torch.no_grad():
    outputs = model(**inputs)

# Model outputs logits
logits = outputs.logits

# Convert logits to probabilities
probs = torch.softmax(logits, dim=-1)
print(probs)

# Get predicted labels
labels = torch.argmax(probs, dim=1)
label_map = {0: "negative", 1: "positive"}

for text, label in zip(texts, labels):
    print(f"Text: {text}")
    print(f"Sentiment: {label_map[label.item()]}\n")


tensor([[0.4761, 0.5239],
        [0.4886, 0.5114],
        [0.4947, 0.5053],
        [0.5178, 0.4822],
        [0.5131, 0.4869],
        [0.5210, 0.4790]], device='cuda:0')
Text: this is not good product donot buy
Sentiment: positive

Text: I love this product!
Sentiment: positive

Text: Average quality, nothing special.
Sentiment: positive

Text: This is absolutely terrible! I hate it, worst purchase ever!
Sentiment: negative

Text: fraud seller. Samaan त ekdum naramro रहेछ।
Sentiment: negative

Text: delivery late and product was damaged. very bad experience.
Sentiment: negative



In [223]:
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
print("Predicted sentiment:", label_map[pred.item()])


Predicted sentiment: Positive
