In [22]:
import pandas as pd

In [3]:
df = pd.read_json('/Users/shreyasb/worskpace/shreyas/python/mtech-dissertation/data/train.jsonl', lines=True)

In [4]:
from utils import extract_ipc_sections

In [5]:
df.head()

Unnamed: 0,id,text,labels
0,1000008,"[(a), Section 5 r/w 27 of the Arms Act. The gi...","[Section 395 in The Indian Penal Code, Section..."
1,100009671,"[05.09.13 Item No. 44 Court No.17 A.B., Item N...","[Section 438 in The Indian Penal Code, Section..."
2,1000196,"[JUDGMENT R.K. Chowdhry, J., For offences unde...","[Section 120B in The Indian Penal Code, Sectio..."
3,100120460,"[It is further alleged that present applicant,...","[Section 420 in The Indian Penal Code, Section..."
4,100122782,[Petitioner is a medical practitioner having a...,"[Section 308 in The Indian Penal Code, Section..."


In [6]:
df['labels'] = df['labels'].apply(lambda x: extract_ipc_sections(x))

In [7]:
df['labels']

0                                            [395, 120, 5]
1                                          [438, 34, 498A]
2                                    [120B, 161, 467, 109]
3                           [420, 406, 155, 415, 409, 156]
4        [308, 336, 338, 337, 384, 325, 326, 323, 482, ...
                               ...                        
42830                             [325, 379, 307, 323, 34]
42831                              [506, 342, 376(2), 313]
42832                               [304B, 302, 498A, 304]
42833                             [34, 341, 325, 307, 438]
42834                                      [341, 323, 452]
Name: labels, Length: 42835, dtype: object

In [8]:
df = df.drop(columns=['id'])

In [9]:
df.head()

Unnamed: 0,text,labels
0,"[(a), Section 5 r/w 27 of the Arms Act. The gi...","[395, 120, 5]"
1,"[05.09.13 Item No. 44 Court No.17 A.B., Item N...","[438, 34, 498A]"
2,"[JUDGMENT R.K. Chowdhry, J., For offences unde...","[120B, 161, 467, 109]"
3,"[It is further alleged that present applicant,...","[420, 406, 155, 415, 409, 156]"
4,[Petitioner is a medical practitioner having a...,"[308, 336, 338, 337, 384, 325, 326, 323, 482, ..."


In [11]:
df['text'] = df['text'].apply(lambda x: " ".join(x))

In [12]:
df.head()

Unnamed: 0,text,labels
0,"(a), Section 5 r/w 27 of the Arms Act. The gis...","[395, 120, 5]"
1,05.09.13 Item No. 44 Court No.17 A.B. Item No....,"[438, 34, 498A]"
2,"JUDGMENT R.K. Chowdhry, J. For offences under ...","[120B, 161, 467, 109]"
3,"It is further alleged that present applicant, ...","[420, 406, 155, 415, 409, 156]"
4,Petitioner is a medical practitioner having a ...,"[308, 336, 338, 337, 384, 325, 326, 323, 482, ..."


In [13]:
df.to_parquet('data/train.parquet',index=False, engine='pyarrow')

In [14]:
df = pd.read_json('/Users/shreyasb/worskpace/shreyas/python/mtech-dissertation/data/test.jsonl', lines=True)

In [15]:
df['labels'] = df['labels'].apply(lambda x: extract_ipc_sections(x))
df['text'] = df['text'].apply(lambda x: " ".join(x))
df = df.drop(columns=['id'])

In [16]:
df.to_parquet('data/test.parquet',index=False, engine='pyarrow')

(13039, 2)

In [23]:
df = pd.read_parquet('data/train.parquet')

In [24]:
unique_labels = sorted(set([label for sublist in df['labels'] for label in sublist]))
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}
num_labels = len(unique_labels)

In [25]:
def encode_labels(labels, label2id):
    encoding = [0] * len(label2id)
    for label in labels:
        encoding[label2id[label]] = 1
    return encoding

df['encoded_labels'] = df['labels'].apply(lambda x: encode_labels(x, label2id))

In [26]:
df.head()

Unnamed: 0,text,labels,encoded_labels
0,"(a), Section 5 r/w 27 of the Arms Act. The gis...","[395, 120, 5]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,05.09.13 Item No. 44 Court No.17 A.B. Item No....,"[438, 34, 498A]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"JUDGMENT R.K. Chowdhry, J. For offences under ...","[120B, 161, 467, 109]","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
3,"It is further alleged that present applicant, ...","[420, 406, 155, 415, 409, 156]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
4,Petitioner is a medical practitioner having a ...,"[308, 336, 338, 337, 384, 325, 326, 323, 482, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [27]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader

In [28]:
tokenizer = BertTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')



In [30]:
import torch
class CourtCaseDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.float)
        }

dataset = CourtCaseDataset(df['text'], df['encoded_labels'], tokenizer)

In [31]:
from transformers import Trainer, TrainingArguments, BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained(
    'huawei-noah/TinyBERT_General_4L_312D',
    num_labels=num_labels,
    problem_type="multi_label_classification"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
)

In [35]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions
    
    # Apply sigmoid and threshold at 0.5 to get binary predictions
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(preds))
    y_pred = (probs > 0.5).int().numpy()
    
    # Calculate accuracy
    accuracy = accuracy_score(labels, y_pred)

    # Calculate precision, recall, f1
    precision, recall, f1, _ = precision_recall_fscore_support(labels, y_pred, average='macro')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [34]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,  # Replace with a proper validation dataset
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1447,0.136554
2,0.1341,0.130074
3,0.1225,0.125292


TrainOutput(global_step=8034, training_loss=0.1485160802590007, metrics={'train_runtime': 8241.4019, 'train_samples_per_second': 15.593, 'train_steps_per_second': 0.975, 'total_flos': 1854741803765760.0, 'train_loss': 0.1485160802590007, 'epoch': 3.0})

In [41]:
import torch

def predict(text):
    encoding = tokenizer(
        text,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )

    # Move the model to the same device as the input
    model.to(encoding['input_ids'].device)

    output = model(**{k: v.to(encoding['input_ids'].device) for k, v in encoding.items()})
    logits = output.logits
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits)
    print(probs)
    predictions = (probs > 0.2).int()

    predicted_labels = [id2label[i] for i, p in enumerate(predictions[0]) if p == 1]
    return predicted_labels

In [44]:
predict(
    """
    (a), Section 5 r/w 27 of the Arms Act. The gist of theprosecution case relevant for the purpose of this proceedingmay be stated thus: With the growth of industry, commerceand trade in and around the city of Mumbai which generatessubstantial quantity of wealth, there has been increase oforganised activities by gangs of anti-socials to extractmoney from affluent sections of society like developers,hoteliers and other businessmen by putting them in fear ofdeath and then to demand substantial sums of money commonlyknown as "Khadani" i.e. protection money.One such gangwas operating in the city under Amar Naik @ Bhai, who dieda couple of years before the decision in the case at anencounter with the police.The prosecution alleged that inpursuance of a criminal conspiracy between 15.1.1994 to16.5.1994 the accused persons and others of the gangembarked upon preparatory acts like procuring theinformation about the names of the builders of M/s KalpataruConstruction Company which was engaged in developing aproperty at Pali Hill, named Nakshatra Building.PW-7Sudhir Tambe was the Senior Vice-President of the companywith its head office at Nariman Point.He used to sit inthe head- office.PW 6 Pachapur, Civil Engineer, was anemployee of the company who used to remain at the site tosupervise the construction.As the prosecution story runs,on 15.4.1994 between 11.30 a.m. and 12.00 noon while PW 6was on duty at the construction site, accused no.3, NitinVasant Venugurlekar armed with revolver and accused No.4Rajindera @ Rajan Mahadeo Margaj armed with a chopper andaccused no.5 Jayendra @ Jai Anandrao Jadhav also armed witha chopper visited the site of Nakshatra Building; theythreatened the workers at the site, forcibly brought PW 6Pachapur in a room on the ground floor and man-handled him.Accused no.3, pointing a revolver at him demanded the name,address and telephone number of the builders.PW 6disclosed the name of PW 7 Tambe and gave his telephonenumber to them.The accused then asked him to go to theoffice of the builders at Nariman Point and make thearrangement for a telephonic talk with Tambe.PW 6 rushedto the office and told Tambe of what had happened at theconstruction site.This was followed by telephonic callsfrom the accused who wanted to speak to Tambe.Attemptswere made by PW 6 and PW 7 to avoid any discussion with thegangsters.Two or three days thereafter when the accusedgot Tambe on the telephone he (Tambe) gave them some othertelephone numbers and asked them to contact those personsincluding one D.N.Ghosh, the Security Contractor.Eight/tendays thereafter again a telephone call was made to theoffice of Tambe which was received by PW 6 who was informedby the person making the call that they could not get D. N.Ghosh on the telephone numbers furnished by Tambe.Thereafter PW 6 handed over the receiver to Tambe.Thisincident was followed by several threats given by thegangsters to workers and also repeated telephone calls madeto the Head Office of the company to contact Tambe.Thestaff of the site office absented from work resulting invirtual closure of construction activity.On 11.5.1994 thedeceased Sanjay Patil telephoned to Tambe and warned himthat he is wasting time and should meet him without furtherdelay.After some days there was one more similar call fromSanjay Patil and he asked Tambe that he should talk to Bhaiand saying so he handed over the receiver to another personwho gave his identity as Amar Naik (since deceased), whotold Tambe that he should pay Rs.10 lacs.The later pleadedhis inability to pay such a heavy sum and after somediscussion agreed to pay Rs.5 lacs.He was asked to come toNakshatra Building site on 16.5.1994 along with money.Inthe meantime Tambe informed all the happenings to the Addl.Commissioner of Police Mr.Sanjeev Dayal and the then Dy.Commissioner of Police of Zone VII Mr. Rajanish Shethwithin whose jurisdiction Khar Police Station fell.On 16.5.1994 at about 12.00 noon the deceased SanjayPatil telephoned Tambe and inquired from him as to what hewas going to do about the payment and then Tambe repliedthat he will be leaving office at about 2.00 p.m. for PaliHill.Sanjay Patil cautioned him that he should not makeany haste and he should wait for his call so that he willtake necessary instructions from his boss i.e. Amar Naik.At about 2.00 p.m. on that day there was a telephone callfrom Sanjay Patil telling that Tambe should not meet him atthe Nakshatra Building site but instead he should meet himnear the Ceaser Palace Hotel.This telephonic conversationwas tape-recorded.Tambe was instructed on telephone thathis man shall carry a white plastic bag containing theamount of Rs.5 lacs and shall wait near the entrance gate ofCeaser Palace Hotel and the person coming to collect thesaid bag will introduce himself as Me Rawanacha Manus Hai.Tambe informed to the DCP all these happenings and handedover the tape in which the telephonic conversation wasrecorded by him.The DCP had made the arrangements to keepa regular watch near the building site.PW 1 Sunil Deshmukhwas deployed to wait in cognito near the gate of the CeaserPalace Hotel and to carry the white plastic bag containingbundles of papers which would give an appearance like thebundles of currency notes.The other officers, who werealso in cognito, had taken their position at strategicpoints near the hotel.At about 4.05 p.m. Sunil Deshmukhnoticed that one red coloured Maruti van halted in front ofthe Ceaser Palace Hotel.He noticed three persons gettingdown from the said van.Those three persons were coming inhis direction, and the van went ahead 50 to 60 feets andhalted there.The deceased Sanjay Patil and the accusedno.7 Bapu Sidhram Gaikwad got down from the said van andaccused no.6 Mohamed Ismail was sitting on the driver seatin the van.Heenquired from PW1 about his identity and when PW 1 repliedthat he has been sent by Tambe Sahib.PW 1 Sunil Deshmukhthen asked that person who are you (Tum Kaun Hai) and thenthe accused no.2 Umesh Bhatt told him that Hum Rawan KeAadmi Hai.L.....I.........T.......T.......T.......T.......T.......T..J J U D G M E N T D.P. MOHAPATRA,J This appeal, filed by accused no.1 Babu KuttanRamkrishna Pillai and accused no.2 Umesh @ Babu PurshottamBhatt of TADA ACT Spl.Thereafter accused no.1 Babu Kuttan extendedhis hand towards PW 1 who delivered the bag to him.At thisjuncture the police officers who were standing nearby incognito rushed to the place and surrounded the threepersons.When the police officers were trying to overpowerthem the deceased Sanjay Patil @ Avinash Amanna and theaccused no .7 Bapu Sidhram Gaikwad came forward withrevolvers in their hands and threatened the police party bysaying they should leave their men or else the policemenwill be killed.Saying so they fired in the direction ofthe police party.At this point PW 1 took out his revolverand pointed it in the direction of the accused and told themwe are all policemen and you should throw away yourrevolvers else we will fire.Even then the accused personsfired some rounds in the direction of the police party, thenPW 1 and one other officer tried to rush towards them butthey sat in the said Maruti van and sped away from theplace.After the situation calmed down, the police drew thepanchnamas Ex.22 in presence of some witnesses andconducted personal search of the three culprits.On suchsearch accused no.1 Babu Kuttan Pillai was found to possessthe plastic bag containing the paper bundles (Art.1),accused no.2 Umesh Bhatt was found to possess a big Rampuriknife which was hidden at the waist under the pant by leftside.After completion of investigation the police submittedthe charge-sheet.The three persons at the spot wereremanded to the police custody.Subsequently, the otheraccused persons were also arrested.They were put to testidentification parade.The learned Trial Judge onappreciation of the evidence on record convicted accusedno.1 Babu Kuttan Ramkrishna Pillai and the accused no.2Umesh @ Babu Purshottam Bhatt for the offence punishableunder section 395 of the Indian Penal Code and sentencedeach of them to suffer rigorous imprisonment of 5 years andto pay a fine of Rs.500, in default of payment of fine toundergo further Rigorous Imprisonment for 6 months.Theywere also convicted under Section 120 B of the IPC but noseparate sentence was passed.They were acquitted of theother offences with which they were charged.The remainingaccused persons i.e. accused nos. 3,4,5,6 and 7 wereacquitted of all the charges framed against them.1 and 2, have filed this appeal assailing the judgmentpassed by the Designated Court at Brihan Mumbai,convicting/sentencing them as above.On a reading of the judgment under challenge, we findthat the learned trial Judge has considered the entire caseled by the prosecution in great detail and after discussingthe charges framed against the appellants under sections3(2), 3(3) and 3(5) of TADA Act, rejected the prosecutioncase on that count.Thereafter the learned trial Judge inparagraph 17 onwards considered the question of what offencewas made out against the appellants.After a detaileddiscussion of the relevant evidence placed by theprosecution and after examining it in the light of thecontentions on behalf of the defence, the learned trialJudge believed the testimony of PW 1- Sunil Deshmukh, PW 7 -Tambe and PW 9 - L.J. Kamble and came to hold that theappellants are guilty of the offence of criminal conspiracypunishable under section 120-B and the offence of dacoitypunishable under section 395 IPC and convicted themthereunder and imposed the punishment as noted earlier.We have perused the evidence of these witnesses.
    """
)

tensor([[0.0108, 0.0197, 0.0091, 0.0188, 0.0881, 0.0068, 0.0281, 0.3253, 0.2975,
         0.2547, 0.0094, 0.0183, 0.0155, 0.0072, 0.0144, 0.0118, 0.0334, 0.0201,
         0.0093, 0.0117, 0.0095, 0.0071, 0.0225, 0.0083, 0.0161, 0.0118, 0.1682,
         0.0534, 0.0075, 0.0248, 0.0266, 0.2120, 0.0517, 0.0107, 0.0140, 0.0170,
         0.3326, 0.0208, 0.0158, 0.0153, 0.4377, 0.2036, 0.1597, 0.1358, 0.0456,
         0.0249, 0.0093, 0.0094, 0.3209, 0.1898, 0.0237, 0.0707, 0.0529, 0.0253,
         0.0118, 0.0099, 0.0195, 0.0062, 0.0060, 0.0415, 0.0076, 0.0689, 0.0181,
         0.0107, 0.0070, 0.0256, 0.0155, 0.0176, 0.0327, 0.0093, 0.0390, 0.0207,
         0.0110, 0.0078, 0.0110, 0.0131, 0.0665, 0.1002, 0.0192, 0.0157, 0.0463,
         0.0253, 0.0439, 0.0078, 0.0622, 0.0095, 0.0164, 0.0377, 0.0464, 0.0463,
         0.0364, 0.0060, 0.0093, 0.0317, 0.0095, 0.0088, 0.0676, 0.3393, 0.0119,
         0.0167]], grad_fn=<SigmoidBackward0>)


['147', '148', '149', '302', '307', '323', '324', '34', '506']