In [1]:
import pandas as pd

# Read the training parquet file and keep only its first 1000 rows so the
# rest of the notebook runs on a small sample.
df = pd.read_parquet("data/train.parquet")[:1000]

In [25]:
from datasets import Dataset

# Wrap the sampled DataFrame in a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)

# Hold out 10% of the rows for evaluation. The seed is fixed so that a
# Restart & Run All yields the same train/test partition (and therefore
# comparable metrics) every time.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Display the first training example as a sanity check.
dataset['train'][0]

{'text': '::: Uploaded on - 02/08/2018 ::: Downloaded on - 03/08/2018 00:44:52 ::: The case of the appellant/prosecution against the respondent(accused), in short, is as under :- PW-1 (victim) and accused were having love affairs. On06.09.2011, after dinner father of PW-1 and other family members weresleeping in the night. At about 02:00 a.m. in the night, his elder sonPankaj got up from sleep for urination. Pankaj noticed that his sister(PW-1) was talking with accused under the tree. Pankaj came back fortaking torch. In the meantime, PW-1 and accused ran away. Pankaj woke PW-2 and his family members and he narratedincident to them. Mother of PW-1 went to the house of accused. Shecame to know that accused came to house, but lateron, he left. Motherof PW-1 did not get phone number of accused. Due to fear ofdefamation, report was not lodged. Complainant took search of hisdaughter. Accused was not responding to the calls on his mobile phone. Accused was contacted from the mobile of anothe

In [3]:
from utils import get_all_ipc_codes

# Display name for every value yielded by get_all_ipc_codes().
# NOTE(review): the keys are whatever get_all_ipc_codes() returns -- presumably
# IPC section codes rather than consecutive 0..N-1 class positions; confirm
# before using them as tensor indices or model class ids.
id2label = dict(
    (code, f'Section {code} in The Indian Penal Code')
    for code in get_all_ipc_codes()
)

# Reverse lookup: display name -> the key it came from.
label2id = dict(zip(id2label.values(), id2label.keys()))

In [4]:
from transformers import AutoTokenizer
import torch

# Checkpoint used for the tokenizer in this cell.
model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Fast tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)



In [28]:

def tokenize(batch):
    """Tokenize a batch of texts and encode its labels as multi-hot vectors.

    Args:
        batch: Mapping with a 'text' entry (list of strings) and a 'labels'
            entry (one iterable of labels per example).

    Returns:
        The tokenizer output for ``batch['text']`` with an added 'labels'
        entry: one list of floats per example, of length ``len(id2label)``,
        holding 1.0 at the position of every label the example carries.

    Column positions follow the insertion order of the module-level
    ``id2label``. A label is matched either against an ``id2label`` key
    (compared via ``str`` so 302 and '302' are the same class) or against its
    display name. Labels that match neither are ignored.

    The previous implementation tested ``label in label2id`` -- a dict keyed by
    display name -- and then indexed the vector with ``label2id[label]``, which
    is an ``id2label`` key rather than a column position, so labels were
    dropped or written to the wrong slot.
    """
    # Tokenize the text (padded to the longest sequence in the batch,
    # truncated to at most 512 tokens).
    tokenized = tokenizer(batch['text'], padding=True, truncation=True, max_length=512)

    # label (as key or as display name) -> column position in the vector
    positions = {}
    for position, (key, name) in enumerate(id2label.items()):
        positions[str(key)] = position
        positions[name] = position

    num_classes = len(id2label)
    labels = []
    for label_list in batch['labels']:
        # Floats, not ints: the multi-label loss expects float targets.
        multi_hot = [0.0] * num_classes
        # `or ()` tolerates an example whose label list is missing (None).
        for label in (label_list if label_list is not None else ()):
            position = positions.get(str(label))
            if position is not None:
                multi_hot[position] = 1.0
        labels.append(multi_hot)

    # Add labels to the tokenized output
    tokenized['labels'] = labels

    return tokenized

# Run `tokenize` over every split. With batched=True and batch_size=None each
# split is handed to `tokenize` as a single batch; the raw 'text' and 'labels'
# columns are removed and the encoded output takes their place.
dataset = dataset.map(
    tokenize,
    remove_columns=['text', 'labels'],
    batched=True,
    batch_size=None,
)

# Return PyTorch tensors when the dataset is indexed.
dataset.set_format('torch')

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [29]:
# Every label that appears in at least one row of df['labels'].
distinct_labels = {label for row_labels in df['labels'] for label in row_labels}

In [30]:
# Rebuild the label maps from the labels actually present in `df`.
#
# The labels are sorted before use: `distinct_labels` is a set, and set
# iteration order (for strings) changes between kernel restarts, which would
# silently change which class sits at which position from run to run.
# key=str keeps the sort from raising if the labels are of mixed types.
#
# NOTE(review): this rebinds `id2label`/`label2id`. Anything already computed
# from the earlier mapping built from get_all_ipc_codes() -- in particular
# label vectors produced by `tokenize` before this cell ran -- still has the
# earlier width and ordering and must be recomputed to match.
id2label = {
    ipc: f'Section {ipc} in The Indian Penal Code'
    for ipc in sorted(distinct_labels, key=str)
}

# Reverse lookup: display name -> label.
label2id = {name: ipc for ipc, name in id2label.items()}

In [37]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Multi-label sequence classifier on top of the pretrained checkpoint.
#
# * The model config expects id2label to map class *indices* (0..N-1, the
#   positions of the output logits) to names, and converts its keys with
#   int(). The notebook's `id2label` is keyed by IPC code, so index-keyed
#   mappings are derived here from its insertion order instead of passing it
#   through unchanged.
# * The weights are loaded in the default float32. Loading them with
#   torch_dtype=torch.float16 makes the optimizer update half-precision
#   parameters directly, which is numerically unstable and is not supported
#   for many ops on CPU. For mixed precision, keep float32 weights and request
#   it through TrainingArguments (fp16=True) instead.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(id2label),
    id2label=dict(enumerate(id2label.values())),
    label2id={name: index for index, name in enumerate(id2label.values())},
    problem_type="multi_label_classification",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# Training configuration: two epochs with per-device batches of 3, evaluating
# and checkpointing once per epoch (the two strategies match, as
# load_best_model_at_end requires), then restoring the best checkpoint.
training_args = TrainingArguments(
    output_dir="model/v1",
    # schedule
    num_train_epochs=2,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    # optimiser
    learning_rate=2e-5,
    weight_decay=0.01,
    # evaluation / checkpointing
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [33]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of each batch with the notebook's tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

In [34]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score
import evaluate

def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    Args:
        x: Scalar or array-like of logits.

    Returns:
        float64 probabilities with the same shape as ``x`` (a NumPy scalar
        for scalar input).

    The naive form evaluates ``exp(-x)``, which overflows for large negative
    logits (and very early for float16 inputs). Here only ``exp(-|x|)`` is
    evaluated, which lies in (0, 1] and cannot overflow; the two algebraically
    equivalent branches are then selected by the sign of ``x``.
    """
    x = np.asarray(x, dtype=np.float64)
    z = np.exp(-np.abs(x))
    probs = np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
    # [()] turns a 0-d result back into a scalar and leaves arrays untouched.
    return probs[()]

def compute_metrics(eval_pred):
    """Compute multi-label classification metrics for the Trainer.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds
            raw logits and ``labels`` the multi-hot reference vectors, both
            of shape (num_examples, num_labels).

    Returns:
        Dict with micro/macro F1, precision and recall (logits passed through
        a sigmoid and thresholded at 0.5) and the macro-averaged ROC AUC under
        the key 'auroc'. 'auroc' is NaN when no label has both a positive and
        a negative reference in the evaluated split.
    """
    predictions, labels = eval_pred
    # Apply sigmoid to get probabilities
    probs = np.asarray(sigmoid(predictions))
    # Use 0.5 as the threshold for positive prediction
    y_pred = (probs > 0.5).astype(int)
    # References arrive as floats (0.0 / 1.0); the metrics want integer classes.
    y_true = np.asarray(labels).astype(int)

    # zero_division=0 yields the same value as the default but without a
    # warning for every label that has no predicted / true samples.
    f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    precision_micro = precision_score(y_true, y_pred, average='micro', zero_division=0)
    precision_macro = precision_score(y_true, y_pred, average='macro', zero_division=0)
    recall_micro = recall_score(y_true, y_pred, average='micro', zero_division=0)
    recall_macro = recall_score(y_true, y_pred, average='macro', zero_division=0)

    # ROC AUC is undefined for a label whose references are all 0 or all 1,
    # so the macro average is taken over the remaining labels only.
    scorable = y_true.min(axis=0) != y_true.max(axis=0)
    if scorable.any():
        # The "multilabel" configuration is required for 2-D inputs; the
        # default configuration of this metric is the binary one.
        auroc = evaluate.load("roc_auc", "multilabel")
        auroc_score = auroc.compute(
            references=y_true[:, scorable].tolist(),
            prediction_scores=probs[:, scorable].tolist(),
            average='macro',
        )['roc_auc']
    else:
        auroc_score = float('nan')

    return {
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'precision_micro': precision_micro,
        'precision_macro': precision_macro,
        'recall_micro': recall_micro,
        'recall_macro': recall_macro,
        'auroc': auroc_score
    }

In [38]:
# Assemble the Trainer from the pieces built in the cells above: the model and
# its training arguments, the two dataset splits, and the tokenizer, collator
# and metric function used while batching and evaluating.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

In [39]:
# Start fine-tuning with the Trainer configured above.
# NOTE(review): the recorded run of this cell ended in the NumPy 2.0
# `np.array(obj, copy=False)` ValueError shown in the output below. That
# message is raised by code calling np.array with copy=False -- presumably an
# installed library release that predates NumPy 2; confirm which one from the
# full traceback and either upgrade it or pin numpy<2.
trainer.train()

ValueError: Unable to avoid copy while creating an array as requested.
If using `np.array(obj, copy=False)` replace it with `np.asarray(obj)` to allow a copy when needed (no behavior change in NumPy 1.x).
For more details, see https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword.