In [None]:
!pip install transformers
!pip install torch
!pip install sentencepiece
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
from google.colab import files
import io
import pandas as pd
# Read the training and development splits from CSV files in the working directory.
train_data = pd.read_csv("TAMIL_TRAINING_DATA.csv")
dev_data = pd.read_csv("TAMIL_DEVELOPMENT_DATA.csv")

# One tokenizer instance is shared by every split (and by the later test cell).
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")


def _encode_texts(frame):
    """Tokenize the 'TEXT DATA' column of ``frame``.

    Truncation and padding are enabled with ``max_length=64`` and the result is
    returned as PyTorch tensors.
    """
    texts = list(frame['TEXT DATA'])
    return tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=64,
        return_tensors='pt',
    )


train_encodings = _encode_texts(train_data)
dev_encodings = _encode_texts(dev_data)


In [None]:
# Integer class ids for the label strings stored in the LABELS column.
label_mapping = {'stressed': 1, 'Non stressed': 0}  # Adjust based on your actual labels

# Series.map() silently produces NaN for any value that is not a key of the
# mapping, which would turn the label tensors into floats containing NaN.
# Fail fast with the offending values instead.
for _split_name, _frame in (('train', train_data), ('dev', dev_data)):
    _unknown = set(_frame['LABELS'].unique()) - set(label_mapping)
    if _unknown:
        raise ValueError(
            f"Unmapped values in {_split_name} LABELS column: {sorted(map(str, _unknown))}"
        )

train_data['label'] = train_data['LABELS'].map(label_mapping)
dev_data['label'] = dev_data['LABELS'].map(label_mapping)

# The raw string column is dropped once the integer ids exist.
# NOTE: because LABELS is removed, this cell cannot be re-run without reloading the CSVs.
train_data = train_data.drop(columns=['LABELS'])
dev_data = dev_data.drop(columns=['LABELS'])

# Explicit int64 dtype so the labels are always class indices, even for an empty split.
train_labels = torch.tensor(train_data['label'].tolist(), dtype=torch.long)
dev_labels = torch.tensor(dev_data['label'].tolist(), dtype=torch.long)

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to an indexable
            collection holding one entry per example. Values may already be
            tensors or plain Python sequences.
        labels: Indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor copy of ``value``.

        Calling ``torch.tensor`` on something that is already a tensor emits a
        copy-construct UserWarning on every access, so tensors are copied with
        ``detach().clone()`` and only non-tensor values go through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field for example ``idx`` plus ``'labels'``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the length of the labels."""
        return len(self.labels)
# Wrap the tokenized splits so DataLoaders can serve them batch by batch.
train_dataset = CustomDataset(train_encodings, train_labels)
dev_dataset = CustomDataset(dev_encodings, dev_labels)

# Seed before the model is built: the classification head is newly initialised
# and the training loader shuffles, so both depend on the RNG state.
# (Seeding alone does not guarantee bit-identical GPU runs.)
torch.manual_seed(42)

model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# The dev split is only scored, so its order is kept fixed.
dev_loader = DataLoader(dev_dataset, batch_size=8, shuffle=False)

for epoch in range(3):  # adjust the number of epochs as needed
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Passing labels makes the model return the training loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    # total_loss is the sum of per-batch losses over the epoch, not a mean.
    print(f"Epoch {epoch + 1}, Loss: {total_loss}")

    # Score the held-out dev split after every epoch so the falling training
    # loss can be checked against accuracy on data the model has not seen.
    model.eval()
    dev_predictions, dev_targets = [], []
    with torch.no_grad():
        for batch in dev_loader:
            logits = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits
            dev_predictions.extend(torch.argmax(logits, dim=1).tolist())
            dev_targets.extend(batch['labels'].tolist())
    if dev_targets:  # skip the metric when the dev split is empty
        print(f"Epoch {epoch + 1}, Dev accuracy: {accuracy_score(dev_targets, dev_predictions):.4f}")




model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/688 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Epoch 1, Loss: 46.144804268551525


  0%|          | 0/688 [00:00<?, ?it/s]

Epoch 2, Loss: 6.066102024284191


  0%|          | 0/688 [00:00<?, ?it/s]

Epoch 3, Loss: 1.4255153364138096


In [None]:
from google.colab import files
import pandas as pd

# Load the test split and tokenize it with the same settings used for train/dev.
test_data = pd.read_csv("full_tamil_data_test - full_tamil_data_test.csv")
test_encodings = tokenizer(list(test_data['Text data']), truncation=True, padding=True, max_length=64, return_tensors='pt')

# CustomDataset requires labels, so all-zero placeholders are supplied; they are never read below.
test_dataset = CustomDataset(test_encodings, labels=torch.zeros(len(test_encodings['input_ids']), dtype=torch.long))  # Dummy labels
# shuffle=False keeps predictions aligned with the rows of test_data.
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

model.eval()
predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        logits = model(input_ids, attention_mask=attention_mask).logits
        # .tolist() yields plain Python ints, which are used as dictionary keys below.
        predictions.extend(torch.argmax(logits, dim=1).tolist())

# Derive the id -> name lookup from label_mapping so the two can never disagree
# when the mapping is adjusted. Assumes each id belongs to exactly one label string.
reverse_label_mapping = {label_id: label_name for label_name, label_id in label_mapping.items()}
test_data['predicted_label'] = [reverse_label_mapping[label] for label in predictions]
test_data.to_csv('YOUR_TEST_DATA_WITH_PREDICTIONS.csv', index=False)

  0%|          | 0/128 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


In [None]:

# Download the final CSV file automatically.
# `output_file_name` was referenced here without being defined anywhere in the
# notebook (NameError on a fresh run); bind it to the path the inference cell writes.
output_file_name = 'YOUR_TEST_DATA_WITH_PREDICTIONS.csv'
files.download(output_file_name)