In [None]:
# Install required packages
!pip install transformers
!pip install torch
!pip install nltk
!pip install tqdm

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import re
import nltk
from tqdm import tqdm
import warnings
from google.colab import files
warnings.filterwarnings('ignore')

# Seed PyTorch's RNG so repeated runs draw the same random numbers
torch.manual_seed(42)

# Model checkpoint and tokenization settings
MODEL_NAME = "xlm-roberta-base"
MAX_LEN = 128

# Optimization settings
EPOCHS = 5
LEARNING_RATE = 2e-5
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 32

# Raw dataset label (-1 negative, 0 neutral, 1 positive) -> class index
LABEL_MAPPING = {-1: 0, 0: 1, 1: 2}

# Class index -> human-readable sentiment name
REVERSE_LABEL_MAPPING = dict(enumerate(("Negative", "Neutral", "Positive")))

class MarathiSentimentDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with a class index.

    Args:
        texts: Indexable, sized collection of raw texts. Each item is passed
            through ``str()`` before tokenization.
        targets: Iterable of raw labels; every value must be a key of
            ``LABEL_MAPPING``.
        tokenizer: Callable Hugging Face tokenizer.
        max_len: Length every encoded example is padded or truncated to.

    Raises:
        KeyError: If a target is not a key of ``LABEL_MAPPING``.
        ValueError: If ``texts`` and ``targets`` differ in length.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        # Translate raw labels into contiguous class indices once, up front
        self.targets = [LABEL_MAPPING[target] for target in targets]
        if len(self.texts) != len(self.targets):
            raise ValueError(
                f"texts and targets differ in length: "
                f"{len(self.texts)} != {len(self.targets)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tensors for example ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (1-D tensors as
            produced by the tokenizer with the batch axis removed) and
            ``targets`` (scalar ``torch.long`` class index).
        """
        text = str(self.texts[idx])
        target = self.targets[idx]

        # Call the tokenizer directly: encode_plus is documented as deprecated
        # in favour of __call__, which takes the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # return_tensors='pt' adds a leading batch axis of size 1; drop it
            # so the DataLoader can stack examples itself.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'targets': torch.tensor(target, dtype=torch.long)
        }

def clean_text(text):
    """Strip links and normalize whitespace in a tweet.

    Removes ``http...``/``www...`` URLs and ``pic.twitter.com`` links, collapses
    every run of whitespace to a single space and trims both ends.

    Args:
        text: The raw text. ``None`` and NaN (how pandas represents an empty
            CSV cell) are treated as empty; any other non-string value is
            converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    # NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Basic cleaning
    text = re.sub(r'http\S+|www\S+|pic\.twitter\.com\S*', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def _load_uploaded_csv(uploaded, stem, fallback_path):
    """Read the uploaded CSV whose name starts with ``stem`` into a DataFrame.

    The bytes returned by the upload are parsed directly, so the result does
    not depend on the name the file ends up with on disk (e.g. a browser- or
    Colab-added " (1)" suffix). If nothing matching was uploaded, the file at
    ``fallback_path`` is read from disk instead.
    """
    import io  # local import: only needed to wrap the uploaded bytes

    matches = sorted(
        name for name in (uploaded or {})
        if name.startswith(stem) and name.lower().endswith('.csv')
    )
    if not matches:
        return pd.read_csv(fallback_path)

    preferred = f"{stem}.csv"
    chosen = preferred if preferred in matches else matches[0]
    return pd.read_csv(io.BytesIO(uploaded[chosen]))


def _train_one_epoch(model, loader, optimizer, scheduler, device):
    """Run one optimization pass over ``loader`` and return the mean batch loss."""
    model.train()
    losses = []
    for batch in tqdm(loader, desc='Training'):
        optimizer.zero_grad()

        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['targets'].to(device)
        )

        loss = outputs.loss
        losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm before stepping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()  # the schedule is defined per optimizer step

    return sum(losses) / max(len(losses), 1)


def _evaluate(model, loader, device):
    """Score ``model`` on ``loader``; return (mean loss, predictions, true labels)."""
    model.eval()
    losses = []
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(loader, desc='Validation'):
            targets = batch['targets'].to(device)

            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=targets
            )

            losses.append(outputs.loss.item())
            predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            true_labels.extend(targets.cpu().numpy())

    return sum(losses) / max(len(losses), 1), predictions, true_labels


def train_model():
    """Fine-tune XLM-RoBERTa for 3-class sentiment and keep the best checkpoint.

    Prompts for the CSV uploads (Colab), merges them, cleans the ``tweet``
    column, holds out a stratified 10% for validation, trains for ``EPOCHS``
    epochs and writes the weights with the highest validation accuracy to
    ``best_marathi_sentiment_model.pth``. The checkpoint is offered for
    download once, after training finishes.

    Returns:
        The best validation accuracy reached (0 if no epoch scored above 0).
    """
    # Upload data files
    print("Please upload your CSV files (tweets-train.csv, tweets-valid.csv, tweets-extra.csv)")
    uploaded = files.upload()

    # Load data from the uploaded bytes, matching on the file-name stem so a
    # renamed copy such as "tweets-train (1).csv" is still picked up.
    df_train = _load_uploaded_csv(uploaded, "tweets-train", "tweets-train (1).csv")
    df_valid = _load_uploaded_csv(uploaded, "tweets-valid", "tweets-valid.csv")
    df_extra = _load_uploaded_csv(uploaded, "tweets-extra", "tweets-extra.csv")

    # Combine all data.
    # NOTE(review): the provided validation file is merged in here, so the
    # held-out split below is a random 10% of everything rather than the
    # original validation set - confirm this is intended.
    df = pd.concat([df_train, df_valid, df_extra], ignore_index=True)

    # Rows without text or label cannot be cleaned or mapped to a class
    df = df.dropna(subset=['tweet', 'label']).reset_index(drop=True)
    df['label'] = df['label'].astype(int)

    # Clean texts
    df['tweet'] = df['tweet'].apply(clean_text)

    # Split data
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['tweet'].values,
        df['label'].values,
        test_size=0.1,
        random_state=42,
        stratify=df['label'].values
    )

    # Initialize tokenizer and model
    tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
    model = XLMRobertaForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=3,
        problem_type="single_label_classification"
    )

    # Create datasets
    train_dataset = MarathiSentimentDataset(
        texts=train_texts,
        targets=train_labels,
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )

    val_dataset = MarathiSentimentDataset(
        texts=val_texts,
        targets=val_labels,
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )

    # Create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        shuffle=True,
        num_workers=0  # Changed for Colab compatibility
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=VALID_BATCH_SIZE,
        shuffle=False,
        num_workers=0  # Changed for Colab compatibility
    )

    # Setup training
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    model = model.to(device)

    # Use PyTorch's own AdamW; the transformers re-implementation is deprecated
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    checkpoint_path = 'best_marathi_sentiment_model.pth'
    class_ids = list(REVERSE_LABEL_MAPPING.keys())
    class_names = list(REVERSE_LABEL_MAPPING.values())

    # Training loop
    best_accuracy = 0
    checkpoint_saved = False
    for epoch in range(EPOCHS):
        print(f'\nEpoch {epoch + 1}/{EPOCHS}')

        train_loss = _train_one_epoch(model, train_loader, optimizer, scheduler, device)
        val_loss, predictions, true_labels = _evaluate(model, val_loader, device)

        # Calculate metrics
        accuracy = accuracy_score(true_labels, predictions)
        print(f'\nTrain Loss: {train_loss:.4f} | Validation Loss: {val_loss:.4f}')
        print(f'\nValidation Accuracy: {accuracy:.4f}')
        print('\nClassification Report:')
        # Passing labels explicitly keeps the report aligned with target_names
        # even if a class never shows up in this split's labels or predictions.
        print(classification_report(
            true_labels,
            predictions,
            labels=class_ids,
            target_names=class_names
        ))

        # Save best model
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            # Only tensors and plain Python values are stored. The tokenizer
            # object is left out: it is the unmodified pretrained tokenizer
            # (re-creatable from config['model_name']), and pickling it makes
            # the file unreadable by torch.load when weights_only=True.
            torch.save({
                'model_state_dict': model.state_dict(),
                'label_mapping': LABEL_MAPPING,
                'config': {
                    'max_len': MAX_LEN,
                    'model_name': MODEL_NAME
                }
            }, checkpoint_path)
            checkpoint_saved = True
            print(f'Best model saved with accuracy: {best_accuracy:.4f}')

    # Download the best model once, instead of re-downloading the full
    # checkpoint after every epoch that improves on the previous best.
    if checkpoint_saved:
        files.download(checkpoint_path)

    return best_accuracy

# Entry point: run the full upload -> train -> validate -> save pipeline when
# this code is executed directly rather than imported.
if __name__ == "__main__":
    train_model()

Please upload your CSV files (tweets-train.csv, tweets-valid.csv, tweets-extra.csv)


Saving tweets-train (1).csv to tweets-train (1).csv
Saving tweets-test.csv to tweets-test.csv
Saving tweets-valid.csv to tweets-valid.csv
Saving tweets-extra.csv to tweets-extra.csv


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda

Epoch 1/5


Training: 100%|██████████| 908/908 [06:11<00:00,  2.45it/s]
Validation: 100%|██████████| 51/51 [00:10<00:00,  4.94it/s]



Validation Accuracy: 0.8171

Classification Report:
              precision    recall  f1-score   support

    Negative       0.81      0.89      0.85       470
     Neutral       0.80      0.68      0.73       454
    Positive       0.83      0.86      0.84       689

    accuracy                           0.82      1613
   macro avg       0.81      0.81      0.81      1613
weighted avg       0.82      0.82      0.81      1613

Best model saved with accuracy: 0.8171


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Epoch 2/5


Training: 100%|██████████| 908/908 [06:15<00:00,  2.42it/s]
Validation: 100%|██████████| 51/51 [00:10<00:00,  4.97it/s]



Validation Accuracy: 0.8320

Classification Report:
              precision    recall  f1-score   support

    Negative       0.84      0.90      0.87       470
     Neutral       0.81      0.70      0.75       454
    Positive       0.84      0.87      0.86       689

    accuracy                           0.83      1613
   macro avg       0.83      0.82      0.82      1613
weighted avg       0.83      0.83      0.83      1613

Best model saved with accuracy: 0.8320


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Epoch 3/5


Training: 100%|██████████| 908/908 [06:15<00:00,  2.42it/s]
Validation: 100%|██████████| 51/51 [00:10<00:00,  4.95it/s]



Validation Accuracy: 0.8326

Classification Report:
              precision    recall  f1-score   support

    Negative       0.88      0.87      0.87       470
     Neutral       0.78      0.74      0.76       454
    Positive       0.83      0.87      0.85       689

    accuracy                           0.83      1613
   macro avg       0.83      0.83      0.83      1613
weighted avg       0.83      0.83      0.83      1613

Best model saved with accuracy: 0.8326


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Epoch 4/5


Training: 100%|██████████| 908/908 [06:15<00:00,  2.42it/s]
Validation: 100%|██████████| 51/51 [00:10<00:00,  4.94it/s]



Validation Accuracy: 0.8456

Classification Report:
              precision    recall  f1-score   support

    Negative       0.85      0.91      0.88       470
     Neutral       0.80      0.76      0.78       454
    Positive       0.87      0.86      0.87       689

    accuracy                           0.85      1613
   macro avg       0.84      0.84      0.84      1613
weighted avg       0.84      0.85      0.84      1613

Best model saved with accuracy: 0.8456


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Epoch 5/5


Training: 100%|██████████| 908/908 [06:15<00:00,  2.42it/s]
Validation: 100%|██████████| 51/51 [00:10<00:00,  5.01it/s]


Validation Accuracy: 0.8363

Classification Report:
              precision    recall  f1-score   support

    Negative       0.85      0.90      0.88       470
     Neutral       0.81      0.70      0.75       454
    Positive       0.84      0.88      0.86       689

    accuracy                           0.84      1613
   macro avg       0.83      0.83      0.83      1613
weighted avg       0.84      0.84      0.83      1613






In [None]:
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
import re

def clean_text(text):
    """Strip links and normalize whitespace in a piece of text.

    Removes ``http...``/``www...`` URLs and ``pic.twitter.com`` links, collapses
    every run of whitespace to a single space and trims both ends.

    Args:
        text: The raw text. ``None`` and NaN are treated as empty; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    # NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    text = re.sub(r'http\S+|www\S+|pic\.twitter\.com\S*', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Define sentiment labels (class index -> name); also fixes the size of the
# classification head created below.
sentiment_labels = {
    0: "Negative",
    1: "Neutral",
    2: "Positive"
}

# Load the model and configurations
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# SECURITY NOTE: weights_only=False lets torch.load unpickle arbitrary Python
# objects, which can execute code. It is set explicitly because checkpoints
# written by the training cell may contain a pickled tokenizer object, which
# the weights-only loader rejects. Only load checkpoint files you created.
checkpoint = torch.load(
    'best_marathi_sentiment_model.pth',
    map_location=device,
    weights_only=False
)

# Settings recorded at training time, with the original values as fallbacks
config = checkpoint.get('config', {})
model_name = config.get('model_name', "xlm-roberta-base")

# Initialize tokenizer and model from the same base checkpoint used in training
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
model = XLMRobertaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(sentiment_labels)
)

# Load the saved model state (overwrites the freshly initialized classifier head)
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)
model.eval()

# Get max length from checkpoint
max_len = config.get('max_len', 128)

def analyze_sentiment(text):
    """Classify the sentiment of one text with the loaded model.

    Args:
        text: Raw input text; it is passed through ``clean_text`` first.

    Returns:
        dict with:
            ``text``: the cleaned text that was actually classified,
            ``sentiment``: the name from ``sentiment_labels`` for the
                highest-scoring class,
            ``confidence``: softmax probability of that class, formatted as a
                percentage string with two decimals (e.g. ``"99.76%"``).
    """
    # Clean the text
    text = clean_text(text)

    # Tokenize by calling the tokenizer directly: encode_plus is documented as
    # deprecated in favour of __call__, which takes the same arguments.
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Move to device
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Get prediction
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    probabilities = torch.nn.functional.softmax(logits, dim=1)
    prediction = torch.argmax(logits, dim=1).item()
    confidence = probabilities[0][prediction].item()

    return {
        'text': text,
        'sentiment': sentiment_labels[prediction],
        'confidence': f"{confidence:.2%}"
    }

def _print_result(result):
    """Print the fields of one ``analyze_sentiment`` result."""
    print(f"\nText: {result['text']}")
    print(f"Sentiment: {result['sentiment']}")
    print(f"Confidence: {result['confidence']}")


# Example usage
if __name__ == "__main__":
    # Example texts - replace with your own Marathi text
    test_texts = [
        "आज खूप छान दिवस होता",
        "मला हे पुस्तक आवडले नाही",
        "आज हवामान ढगाळ आहे"
    ]

    print("Marathi Sentiment Analysis")
    print("-" * 50)

    # Interactive mode
    while True:
        print("\nChoose an option:")
        print("1. Analyze example texts")
        print("2. Enter your own text")
        print("3. Exit")

        try:
            # Strip so that "1 " or " 2" is still recognized as a valid choice
            choice = input("\nEnter your choice (1-3): ").strip()

            if choice == '1':
                print("\nAnalyzing example texts:")
                for text in test_texts:
                    _print_result(analyze_sentiment(text))

            elif choice == '2':
                text = input("\nEnter your Marathi text: ").strip()
                if not text:
                    # Nothing to classify; do not run the model on empty input
                    print("\nNo text entered. Please try again.")
                    continue
                _print_result(analyze_sentiment(text))

            elif choice == '3':
                print("\nThank you for using Marathi Sentiment Analysis!")
                break

            else:
                print("\nInvalid choice. Please try again.")

        except (EOFError, KeyboardInterrupt):
            # Closed input stream or interrupted prompt: leave the menu
            # cleanly instead of ending with a traceback.
            print("\nThank you for using Marathi Sentiment Analysis!")
            break

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Marathi Sentiment Analysis
--------------------------------------------------

Choose an option:
1. Analyze example texts
2. Enter your own text
3. Exit

Enter your choice (1-3): 1

Analyzing example texts:

Text: आज खूप छान दिवस होता
Sentiment: Positive
Confidence: 99.76%

Text: मला हे पुस्तक आवडले नाही
Sentiment: Neutral
Confidence: 91.32%

Text: आज हवामान ढगाळ आहे
Sentiment: Neutral
Confidence: 99.08%

Choose an option:
1. Analyze example texts
2. Enter your own text
3. Exit

Enter your choice (1-3): 2

Enter your Marathi text: जनता रस्त्यावर आली तर सरकारला नमवू शकते. गरज आहे ती फक्त एका विचाराची आता अति होत चाललंय नितीन गडकरी साहेब. रस्ते तुम्हीच बांधणार आणि RTO व ट्रॅफिक पोलीस यांचे लगाम तुमच्याच हाती. नको आम्हांला पुणे बंगलोर सात तासात. सध्याचे 12 तास चालेल आम्हांला. अगदीच वाटलं तर विमानाने जाऊ

Text: जनता रस्त्यावर आली तर सरकारला नमवू शकते. गरज आहे ती फक्त एका विचाराची आता अति होत चाललंय नितीन गडकरी साहेब. रस्ते तुम्हीच बांधणार आणि RTO व ट्रॅफिक पोलीस यांचे लगाम तुमच्याच हाती. न