# Rhetorical Roles
In this project, we try to segment Indian Case documents into semantically coherent units. 

### The Dataset is organised in the following format

In [3]:
# Dataset Format
# {
#   "id": 4180,
#   "annotations": [
#     {
#       "result": [
#         {
#           "id": "d4814190a8ab41e98029ce8aded54acc",
#           "value": {
#             "start": 0,
#             "end": 95,
#             "text": "PETITIONER:\nTHE COMMISSIONER OF INCOME-TAXNEW DELHI\n\nVs.\n\nRESPONDENT:\nM/s. CHUNI LAL MOONGA RAM",
#             "labels": [
#               "PREAMBLE"
#             ]}
#           },
#         ]
#      }],
#   "data": {
#     "text": "      "
#   },
#   "meta": {
#     "group": "Tax"
#   }
#  }

In [5]:
# Necessary Imports
import json
import random
import torch
import gc
import re

import pandas as pd
import numpy as np

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer
from datasets import Dataset, DatasetDict
from sklearn.metrics import f1_score
from transformers import DataCollatorWithPadding

In [2]:
# The text is clustered in the following labels
encoded_rr = {
    "PREAMBLE":0,
    "FAC":1,
    "RLC":2,
    "ISSUE":3,
    "ARG_PETITIONER":4,
    "ARG_RESPONDENT":5,
    "ANALYSIS":6,
    "STA":7,
    "PRE_RELIED":8,
    "PRE_NOT_RELIED":9,
    "RATIO":10,
    "RPC":11,
    "NONE":12
}

In [3]:
d = open('rr_dev.json')
t = open('train.json')
dev_dataset = json.load(d)
train_dataset = json.load(t)

In [4]:
# Combining the dev and train dataset and then splitting it into train and test of 70:30 ratio
train_dataset.extend(dev_dataset)
random.shuffle(train_dataset)
print("Total size:  %d"%(len(train_dataset)))
split = (len(train_dataset)*7)//10 
print('Splitting data from 0 to %d for train' % (split))
print('Splitting data from %d to %d for test' % (split,len(train_dataset)))
train = train_dataset[:split]
dev = train_dataset[split:]

Total size:  277
Splitting data from 0 to 193 for train
Splitting data from 193 to 277 for test


In [8]:
train_data = []
dev_data = []

for rec in dev:
  for ele in rec['annotations'][0]['result']: 
    processed_text = ele['value']['text']
    processed_text = processed_text.replace('\n', ' ')
    processed_text = re.sub(' {2,}', ' ', processed_text)
    processed_text = processed_text.lower()
    dev_data.append({"label":encoded_rr[ele['value']['labels'][0]],"text": processed_text})

for rec in train:
  for ele in rec['annotations'][0]['result']:
    processed_text = ele['value']['text']
    processed_text = processed_text.replace('\n', ' ')
    processed_text = re.sub(' {2,}', ' ', processed_text)
    processed_text = processed_text.lower() 
    train_data.append({"label":encoded_rr[ele['value']['labels'][0]],"text": processed_text})

In [10]:
model = AutoModelForSequenceClassification.from_pretrained("nlpaueb/legal-bert-base-uncased", num_labels=13)
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

In [11]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [12]:
dev_df = Dataset.from_pandas(pd.DataFrame.from_records(dev_data))
train_df = Dataset.from_pandas(pd.DataFrame.from_records(train_data))
dataset = DatasetDict({"train":train_df,"dev":dev_df})
tokenized_dataset = dataset.map(preprocess_function, batched=True)

100%|██████████| 22/22 [00:00<00:00, 22.97ba/s]
100%|██████████| 11/11 [00:00<00:00, 25.84ba/s]


In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 21342
    })
    dev: Dataset({
        features: ['label', 'text'],
        num_rows: 10523
    })
})

In [14]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return {'f1':f1_score(y_true=labels, y_pred=predictions,average='micro')}

In [16]:
training_args = TrainingArguments(
    output_dir="./results",
    # overwrite_output_dir=True,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy='steps',
    save_steps=2000,
    evaluation_strategy='steps',
    eval_steps=1000,
    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['dev'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

gc.collect()
torch.cuda.empty_cache()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 21342
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 13340


Step,Training Loss,Validation Loss,F1
1000,1.4111,1.318136,0.589281
2000,1.1484,1.188574,0.617505
3000,0.998,1.234505,0.61893
4000,0.8983,1.247883,0.624442
5000,0.9201,1.196241,0.622731
6000,0.7114,1.313335,0.61893
7000,0.683,1.33412,0.624062
8000,0.6848,1.349922,0.622256
9000,0.495,1.532845,0.614749
10000,0.4985,1.498666,0.6195


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10523
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10523
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-2000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-2000/special_tokens_map.json
T

In [15]:
gc.collect()
torch.cuda.empty_cache()

In [18]:
model1 = AutoModelForSequenceClassification.from_pretrained("saibo/legal-roberta-base", num_labels=13)
tokenizer1 = AutoTokenizer.from_pretrained("saibo/legal-roberta-base")

https://huggingface.co/saibo/legal-roberta-base/resolve/main/config.json not found in cache or force_download set to True, downloading to /h/users/ssure044/.cache/huggingface/transformers/tmpi7u0mkoh
Downloading: 100%|██████████| 578/578 [00:00<00:00, 535kB/s]
storing https://huggingface.co/saibo/legal-roberta-base/resolve/main/config.json in cache at /h/users/ssure044/.cache/huggingface/transformers/dd1c0baa51f82ca8fc2586b6e1795e24c85057b7f86d42bd3148eedc68af37e8.2d768c59353f4c171f33ecad8a88f66bdf404fcd40279fe632de9d7802e1d9d4
creating metadata file for /h/users/ssure044/.cache/huggingface/transformers/dd1c0baa51f82ca8fc2586b6e1795e24c85057b7f86d42bd3148eedc68af37e8.2d768c59353f4c171f33ecad8a88f66bdf404fcd40279fe632de9d7802e1d9d4
loading configuration file https://huggingface.co/saibo/legal-roberta-base/resolve/main/config.json from cache at /h/users/ssure044/.cache/huggingface/transformers/dd1c0baa51f82ca8fc2586b6e1795e24c85057b7f86d42bd3148eedc68af37e8.2d768c59353f4c171f33ecad8a88f6

In [19]:
training_args = TrainingArguments(
    output_dir="./results1",
    # overwrite_output_dir=True,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy='steps',
    save_steps=2000,
    evaluation_strategy='steps',
    eval_steps=1000,
    
)

trainer = Trainer(
    model=model1,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['dev'],
    tokenizer=tokenizer1,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

gc.collect()
torch.cuda.empty_cache()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 21342
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 13340


Step,Training Loss,Validation Loss,F1
1000,1.7415,1.684011,0.450442
2000,1.5423,1.493979,0.530172
3000,1.3856,1.417865,0.546517
4000,1.249,1.422216,0.55526
5000,1.2292,1.341946,0.576642
6000,1.0712,1.374896,0.572175
7000,1.0733,1.392038,0.568754
8000,1.0374,1.353413,0.577117
9000,0.8995,1.407402,0.576737
10000,0.8953,1.399892,0.579778


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10523
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10523
  Batch size = 8
Saving model checkpoint to ./results1/checkpoint-2000
Configuration saved in ./results1/checkpoint-2000/config.json
Model weights saved in ./results1/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in ./results1/checkpoint-2000/tokenizer_config.json
Special tokens file saved in ./results1/checkpoint-2000/special_