In [None]:
# Install necessary libraries.
# %pip (instead of !pip) installs into the environment of the running kernel,
# so the packages are guaranteed to be importable by the cells below.
%pip install transformers torch sentencepiece

import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm

# Upload your training and dev datasets to Colab and load them.
# The CSV files are expected to have 'TEXT DATA' and 'LABEL' columns.
# Replace the file names passed to pd.read_csv below with your actual file names.

from google.colab import files


# Assuming you have uploaded files named 'TELUGU_TRAINING_DATA.csv' and 'TELUGU_DEVELOPMENT DATA.csv'
import io
import pandas as pd

# Read the training and development splits from the uploaded CSV files
train_data = pd.read_csv("TELUGU_TRAINING_DATA.csv")
dev_data = pd.read_csv("TELUGU_DEVELOPMENT DATA.csv")

# Preview the first rows of each split (training first, then development)
for split_frame in (train_data, dev_data):
    print(split_frame.head())

# Load the tokenizer that matches the "xlm-roberta-base" checkpoint
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

# Encode the 'TEXT DATA' column of each split with identical settings
# (truncation and padding enabled, max_length=64, return_tensors='pt')
tokenizer_kwargs = dict(truncation=True, padding=True, max_length=64, return_tensors='pt')
train_encodings = tokenizer(train_data['TEXT DATA'].tolist(), **tokenizer_kwargs)
dev_encodings = tokenizer(dev_data['TEXT DATA'].tolist(), **tokenizer_kwargs)


Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
                                           TEXT DATA         LABEL
0                         Chakri anna thopuheeeyyyyy  Non stressed
1  No matter how much money someone has, they use...  Non stressed
2                         Sunny i really enjoyed him  Non stressed
3            ఫైన్ వేశారు కాని, నేను ఎప్పుడు కట్టలేదు  Non stressed
4  Good msg annadhi kottinandhuku kaadhu bro, bha...  Non stressed
                                           TEXT DATA         LABEL
0                         Kajal you are so beautiful  Non stressed
1                                  emana chepav broo  Non stressed
2  Although this movie is being remaked now but s...  N

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [None]:
label_mapping = {'stressed': 1, 'Non stressed': 0}  # Adjust based on your actual labels


def _encode_label_column(frame, mapping, source_col='LABEL', target_col='label'):
    """Return `frame` with the string column `source_col` replaced by integer `target_col`.

    Parameters:
        frame: DataFrame holding the raw label strings in `source_col`.
        mapping: dict from raw label string to integer class id.
        source_col: name of the raw label column (removed from the result).
        target_col: name of the integer label column added to the result.

    Returns:
        A new DataFrame without `source_col` and with an int64 `target_col`.
        If `source_col` is already gone but `target_col` exists (the cell was
        run before), `frame` is returned unchanged so the cell can be re-run.

    Raises:
        KeyError: if neither `source_col` nor `target_col` is present.
        ValueError: if some value in `source_col` has no entry in `mapping`.
    """
    if source_col not in frame.columns:
        if target_col in frame.columns:
            # Already encoded by a previous run of this cell; nothing to do.
            return frame
        raise KeyError(
            f"Neither '{source_col}' nor '{target_col}' found in columns: {list(frame.columns)}"
        )

    encoded = frame[source_col].map(mapping)

    # Series.map yields NaN for values missing from the mapping; fail loudly here
    # instead of letting NaN turn the label tensor into floats further down.
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(frame.loc[unmapped, source_col].astype(str).unique())
        raise ValueError(
            f"Values in '{source_col}' missing from label_mapping: {unexpected}"
        )

    return frame.drop(columns=[source_col]).assign(**{target_col: encoded.astype('int64')})


# Replace the 'LABEL' column of each split with its integer-encoded 'label' column
train_data = _encode_label_column(train_data, label_mapping)
dev_data = _encode_label_column(dev_data, label_mapping)

# Convert the 'label' column to integer (long) tensors
train_labels = torch.tensor(train_data['label'].tolist(), dtype=torch.long)
dev_labels = torch.tensor(dev_data['label'].tolist(), dtype=torch.long)


In [None]:


# Create PyTorch datasets
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Parameters:
        encodings: mapping (anything with `.items()`) from field name to an
            indexable container holding one entry per example.
        labels: indexable container with one label per example; its length
            defines the length of the dataset.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of `value`.

        Existing tensors are copied with `clone().detach()`; calling
        `torch.tensor` on a tensor emits a UserWarning for every item.
        Any other value (list, int, ...) goes through `torch.tensor`.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field at `idx` plus a 'labels' entry."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the labels container."""
        return len(self.labels)

# Wrap the tokenized splits and their label tensors as datasets
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)
dev_dataset = CustomDataset(encodings=dev_encodings, labels=dev_labels)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load pre-trained XLM-RoBERTa configured for two labels and move it to the device
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=2)
model.to(device)

# AdamW over all model parameters with a learning rate of 1e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Fine-tune the model on the training split, reshuffled each epoch, in batches of 8
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)

for epoch in range(3):  # adjust the number of epochs as needed
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass; passing labels makes the model return the loss
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )
        loss = outputs.loss

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Note: the reported value is the sum of the per-batch losses over the epoch
    print(f"Epoch {epoch + 1}, Loss: {total_loss}")

# Evaluate on the development split (order preserved so predictions line up with dev_labels)
dev_loader = DataLoader(dev_dataset, shuffle=False, batch_size=8)
model.eval()
all_preds = []

for batch in tqdm(dev_loader):
    # Gradients are not needed for inference
    with torch.no_grad():
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )

    # Predicted class = index of the largest logit for each example
    batch_preds = outputs.logits.argmax(dim=1).cpu().numpy()
    all_preds.extend(batch_preds)

# Fraction of dev examples whose predicted class matches the true label
accuracy = accuracy_score(dev_labels, all_preds)
print(f"Accuracy on Dev Set: {accuracy}")


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/638 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Epoch 1, Loss: 44.09469955356326


  0%|          | 0/638 [00:00<?, ?it/s]

Epoch 2, Loss: 3.958871173497755


  0%|          | 0/638 [00:00<?, ?it/s]

Epoch 3, Loss: 6.191436002434784


  0%|          | 0/155 [00:00<?, ?it/s]

Accuracy on Dev Set: 0.9983857949959645


In [None]:
from google.colab import files
import pandas as pd

# Load the test split from the CSV file uploaded through the Colab interface
test_data = pd.read_csv("full_telugu_data_test - full_telugu_data_test (1).csv")

# Tokenize the 'Text data' column with the same settings as the train/dev splits
test_texts = test_data['Text data'].tolist()
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=64, return_tensors='pt')

# CustomDataset needs a labels container; the test split has none, so use zeros as placeholders
dummy_labels = torch.zeros(len(test_encodings['input_ids']), dtype=torch.long)
test_dataset = CustomDataset(test_encodings, labels=dummy_labels)

# Keep the original row order so predictions line up with test_data
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=8)

# Predict a class for every test example
model.eval()
predictions = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        # Predicted class = index of the largest logit for each example
        batch_predictions = outputs.logits.argmax(dim=1)
        predictions.extend(batch_predictions.cpu().numpy())

# Translate class ids back to the original label strings
reverse_label_mapping = {0: 'Non stressed', 1: 'stressed'}
test_data['predicted_label'] = [reverse_label_mapping[class_id] for class_id in predictions]

# Write the test rows together with their predicted labels
test_data.to_csv('YOUR_TEST_DATA_WITH_PREDICTIONS.csv', index=False)


  0%|          | 0/132 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
