In [1]:
from collections import Counter
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hinglish (Hindi-English code-mixed) RoBERTa checkpoint used for both the
# tokenizer and the sequence-classification model, so the two always match.
PRETRAINED_CHECKPOINT = "l3cube-pune/hing-roberta"

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
# The classification head is freshly initialised (see the warning below) and
# is trained later in this notebook.
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_CHECKPOINT)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/hing-roberta and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
class HSDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per example.

    Args:
        encodings: mapping from field name to an indexable batch of values
            (anything exposing ``.items()``, e.g. a tokenizer output).
        labels: indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoding field at ``idx`` plus a ``'labels'`` tensor."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = values[idx]
        example['labels'] = torch.tensor(self.labels[idx])
        return example

In [4]:
# Tokenizer settings shared by every split: pad to the longest text in the
# split, truncate at 512 tokens, and return PyTorch tensors.
TOKENIZER_KWARGS = dict(padding=True, truncation=True, max_length=512, return_tensors="pt")


def _load_split(csv_path):
    """Read one split CSV and return ``(dataframe, encodings, dataset)``.

    Text comes from the ``tweet_text`` column and labels from ``offense``.
    """
    frame = pd.read_csv(csv_path)
    encodings = tokenizer(frame['tweet_text'].tolist(), **TOKENIZER_KWARGS)
    dataset = HSDataset(encodings, frame['offense'].tolist())
    return frame, encodings, dataset


train_df, train_encodings, train_dataset = _load_split("data/splits/train.csv")
val_df, val_encodings, val_dataset = _load_split("data/splits/val.csv")
test_df, test_encodings, test_dataset = _load_split("data/splits/test.csv")

In [5]:
# Pick the GPU when one is available. Always wrap the name in torch.device so
# `device` has the same type on CPU-only machines as on GPU machines (the CPU
# fallback used to be the bare string "cpu").
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
# Inverse-frequency class weights for the loss, indexed by label id.
# torch.bincount puts the count for label k at position k. Iterating a Counter
# instead yields counts in first-seen order, which matches the label ids only
# by luck of how the CSV rows happen to be ordered.
train_labels = torch.tensor(train_df['offense'].tolist(), dtype=torch.long)
class_instance_counts = torch.bincount(train_labels)
assert (class_instance_counts > 0).all(), "every class id must appear in the training split"
class_instance_probs = class_instance_counts / class_instance_counts.sum().item()
class_weights = 1 / class_instance_probs
# Tensor.to() returns a new tensor instead of moving in place, so keep the result.
class_weights = class_weights.to(device)
class_weights

tensor([1.4299, 3.3263], device='cuda:0')

In [7]:
# Sanity check: the per-class counts summed give the number of training rows counted.
class_instance_counts.sum().item()

6809

In [8]:
class CustomTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    Args:
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
        class_weights: keyword-only 1-D tensor of per-class weights, passed to
            ``nn.CrossEntropyLoss(weight=...)``.
    """

    def __init__(self, *args, class_weights, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = nn.CrossEntropyLoss(weight=class_weights)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model without labels and score its logits with the weighted loss.

        Args:
            model: the model being trained or evaluated.
            inputs: batch dict; its ``"labels"`` entry is popped (the dict is mutated).
            return_outputs: when true, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: accepted because newer Transformers releases pass it
                to ``compute_loss``; not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # Pop the labels so the model does not also compute its own, unweighted loss.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Keep the loss weights on the same device as the logits, wherever the
        # caller created `class_weights`.
        self.loss_fn.to(logits.device)
        # Reshape explicitly to (N, num_classes) / (N,). A bare squeeze() also
        # removes the batch dimension when a batch holds a single example, which
        # makes CrossEntropyLoss reject the (num_classes,) vs (1,) shapes.
        loss = self.loss_fn(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
        return (loss, outputs) if return_outputs else loss

In [9]:
# Fine-tuning configuration, grouped by purpose.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir='./logs',
    # Optimisation schedule.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Log the training loss and run validation every 50 steps.
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    # Name of the batch entry that holds the targets.
    label_names=["labels"],
)

In [10]:
# Confirm the evaluation strategy was parsed as step-based.
training_args.evaluation_strategy

<IntervalStrategy.STEPS: 'steps'>

In [11]:
# Standard Trainer arguments; validation data is used for the periodic evals.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
# The loss weights must live on the same device the model trains on.
trainer = CustomTrainer(class_weights=class_weights.to(device), **trainer_kwargs)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [12]:
# Fine-tune the model; training and validation loss are reported every 50 steps.
trainer.train()

Step,Training Loss,Validation Loss
50,0.6904,0.686846
100,0.681,0.672515
150,0.6647,0.610471
200,0.5955,0.589489
250,0.5876,0.645466
300,0.5561,0.561115
350,0.5001,0.650849
400,0.5424,0.729119
450,0.6855,0.718532
500,0.5894,0.88256


TrainOutput(global_step=1278, training_loss=0.47333437437555975, metrics={'train_runtime': 703.0576, 'train_samples_per_second': 29.055, 'train_steps_per_second': 1.818, 'total_flos': 3789491405370660.0, 'train_loss': 0.47333437437555975, 'epoch': 3.0})

In [13]:
from sklearn.metrics import f1_score

def compute_metrics(pred):
    """Compute the binary F1 score for a Trainer evaluation result.

    Args:
        pred: object exposing ``predictions`` (per-class scores; argmax over
            the last axis gives the predicted class) and ``label_ids``.

    Returns:
        dict with a single key ``'f1'``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'f1': f1_score(pred.label_ids, predicted_classes, average='binary')}


In [14]:
# compute_metrics was not passed to the constructor, so attach it now;
# evaluate() will then report the F1 score alongside the loss.
trainer.compute_metrics = compute_metrics

In [15]:
# Evaluate on the held-out test split and keep only the F1 entry.
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
test_f1_score = test_metrics["eval_f1"]

In [16]:
# Binary F1 on the test split.
test_f1_score

0.6937119675456389