In [80]:
import numpy as np
import pandas as pd
import json

In [81]:
# Load the raw annotated judgement corpora (train and test) from Drive.
train_json_path = '/content/drive/MyDrive/NLP/Assignment-2/NER_TRAIN_JUDGEMENT.json'
test_json_path = '/content/drive/MyDrive/NLP/Assignment-2/NER_TEST_JUDGEMENT.json'

with open(train_json_path, 'r') as train_file:
    df_train = json.load(train_file)

with open(test_json_path, 'r') as test_file:
    df_test = json.load(test_file)

In [82]:
# Peek at the first five raw training records, then report the size of each corpus.
print(df_train[:5])
for corpus in (df_train, df_test):
    print(len(corpus))

[{'id': '90d9a97c7b7749ec8a4f460fda6f937e', 'annotations': [{'result': [{'value': {'start': 90, 'end': 103, 'text': 'Hongkong Bank', 'labels': ['ORG']}, 'id': 'C8HPTIM1', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}, {'value': {'start': 267, 'end': 278, 'text': 'Rahul & Co.', 'labels': ['ORG']}, 'id': 'KOWE3RAM', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}]}], 'data': {'text': "\n\n(7) On specific query by the Bench about an entry of Rs. 1,31,37,500 on deposit side of Hongkong Bank account of which a photo copy is appearing at p. 40 of assessee's paper book, learned authorised representative submitted that it was related to loan from broker, Rahul & Co. on the basis of his submission a necessary mark is put by us on that photo copy."}, 'meta': {'source': 'tax_districtcourts judgement https://indiankanoon.org/doc/1556717/'}}, {'id': 'a325c57ba5b84c6fa46bee65e6616633', 'annotations': [{'result': [{'value': {'start': 26, 'end': 30, 'text': 'Agya', 'labels': ['

In [83]:
from sklearn.model_selection import train_test_split

# Splitting Training Data into training and validation sets with an 85:15 ratio

In [84]:
# Hold out 15% of the annotated training records for validation.
# The fixed random_state keeps the split identical across re-runs.
train_data, val_data = train_test_split(
    df_train,
    random_state=42,
    test_size=0.15,
)

In [86]:
# For each split, show its first five records followed by its size.
for split in (train_data, val_data):
    print(split[:5])
    print(len(split))

[{'id': 'b29019e16fc64e5da48f20706b152fae', 'annotations': [{'result': []}], 'data': {'text': 'Therefore, while interpreting statutory provisions, the courts should keep in mind the objectives or purpose for which statute has been enacted.'}, 'meta': {'source': 'tax_kerala_high_court judgement https://indiankanoon.org/doc/1186655'}}, {'id': 'b8f6ac625fb9435db21c94644bf7784f', 'annotations': [{'result': [{'value': {'start': 18, 'end': 38, 'text': 'W.P.No.15821 of 2008', 'labels': ['CASE_NUMBER']}, 'id': 'X6BRP5D2', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}, {'value': {'start': 86, 'end': 128, 'text': 'National Rural Employment Guarantee Scheme', 'labels': ['ORG']}, 'id': 'Q2GE2LIK', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}]}], 'data': {'text': 'The petitioner in W.P.No.15821 of 2008 was never considered for appointment under the National Rural Employment Guarantee Scheme either through Employment Exchange sponsorship or by Outsourcing Agencies.'}, 'met

# BIO Encoding + Preprocessing

In [87]:
def _token_spans(text):
    """Return the (start, end) character span of each whitespace-delimited token.

    Tokens are exactly those produced by ``text.split()``, so the number of
    spans always equals ``len(text.split())``. ``end`` is exclusive.
    """
    spans = []
    cursor = 0
    for token in text.split():
        # A token contains no whitespace, so its first occurrence at or after
        # the cursor is the token itself.
        begin = text.index(token, cursor)
        cursor = begin + len(token)
        spans.append((begin, cursor))
    return spans


def preprocess_data(data):
    """BIO-encode annotated records at whitespace-token granularity.

    Parameters
    ----------
    data : iterable of dict
        Records with the keys ``'id'``, ``'data'`` (containing ``'text'``) and
        ``'annotations'``. The first element of ``'annotations'`` holds a
        ``'result'`` list; each result has a ``'value'`` dict with ``'start'``,
        ``'end'`` (character offsets into the text, end exclusive) and
        ``'labels'``.

    Returns
    -------
    dict
        ``{case_id: {'text': text, 'labels': labels}}`` where ``labels`` has one
        entry per token of ``text.split()``: ``'B_<LABEL>'`` for the first token
        of an entity, ``'I_<LABEL>'`` for its remaining tokens, ``'O'`` otherwise.

    Notes
    -----
    Tokens are aligned to entities through the annotation's character offsets
    rather than by comparing token strings. This labels the annotated
    occurrence when the same word appears more than once, tolerates
    punctuation attached to a token (``"India."``, ``"(Agya"``), and cannot
    index past the end of the token list. A token is part of an entity when its
    span overlaps ``[start, end)``. A token already claimed by an earlier
    annotation keeps its label.
    """
    preprocessed_data = {}
    for case in data:
        case_id = case['id']
        text = case['data']['text']
        spans = _token_spans(text)
        bio_encoding = ['O'] * len(spans)

        # Records with a missing or empty annotation list are encoded as all 'O'.
        annotation_sets = case.get('annotations') or []
        annotations = (annotation_sets[0].get('result') or []) if annotation_sets else []

        for annotation in annotations:
            value = annotation['value']
            labels = value.get('labels')
            if not labels:
                continue
            label = labels[0]
            start = value['start']
            end = value['end']
            if end <= start:
                # Empty or inverted span: nothing to label.
                continue

            prefix = 'B_'
            for idx, (tok_start, tok_end) in enumerate(spans):
                if tok_start >= end:
                    # Spans are in text order; no later token can overlap.
                    break
                if tok_end <= start or bio_encoding[idx] != 'O':
                    continue
                bio_encoding[idx] = prefix + label
                prefix = 'I_'

        preprocessed_data[case_id] = {'text': text, 'labels': bio_encoding}
    return preprocessed_data

In [88]:
preprocessed_train = preprocess_data(train_data)

# Preview the first five encoded cases (fewer if the split is smaller), then report the total.
for case_id, data in list(preprocessed_train.items())[:5]:
    print(
        f"Case ID: {case_id}",
        f"Text: {data['text']}",
        f"Labels: {data['labels']}",
        "",
        sep="\n",
    )
print(len(preprocessed_train))

Case ID: b29019e16fc64e5da48f20706b152fae
Text: Therefore, while interpreting statutory provisions, the courts should keep in mind the objectives or purpose for which statute has been enacted.
Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Case ID: b8f6ac625fb9435db21c94644bf7784f
Text: The petitioner in W.P.No.15821 of 2008 was never considered for appointment under the National Rural Employment Guarantee Scheme either through Employment Exchange sponsorship or by Outsourcing Agencies.
Labels: ['O', 'O', 'O', 'B_CASE_NUMBER', 'I_CASE_NUMBER', 'I_CASE_NUMBER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_ORG', 'I_ORG', 'I_ORG', 'I_ORG', 'I_ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Case ID: 4bbb0629e66146edaf4ac7bde47062fb
Text: The factum of accident, allegation of rash and negligent driving causing death of Sukendra Pal Singh were denied.
Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_

In [89]:
# BIO-encode the validation split, then print the full mapping followed by its size.
preprocessed_val = preprocess_data(val_data)
print(preprocessed_val, len(preprocessed_val), sep='\n')

{'35a7bbe300734dffae0740ab00356e1d': {'text': 'Clause 18(1), (2) and (3)\n(a) & (b) were transposed in Article 23 of the Draft Constitution of India.', 'labels': ['B_PROVISION', 'I_PROVISION', 'I_PROVISION', 'I_PROVISION', 'I_PROVISION', 'I_PROVISION', 'I_PROVISION', 'I_PROVISION', 'O', 'O', 'O', 'B_PROVISION', 'I_PROVISION', 'O', 'O', 'B_STATUTE', 'I_STATUTE', 'I_STATUTE', 'I_STATUTE']}, 'b6d5aee6565043c799ece82bc2b05e43': {'text': 'The order cannot be said to be wrong when the only ground mentioned for impleading the Chief Minister as a party was to make it incumbent on him to file an affidavit, which he was not legally obliged to, if he was not a party.', 'labels': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}, 'e7023716ce814eb28c69e184e2bc8207': {'text': 'Ajit Kumar Guha (D. W. 1), who was himself a memb

In [90]:
# BIO-encode the held-out test corpus, then print the full mapping followed by its size.
preprocessed_test = preprocess_data(df_test)
print(preprocessed_test, len(preprocessed_test), sep='\n')

949


In [91]:
# Write each encoded split to its own JSON file, in train / val / test order.
encoded_outputs = (
    ('/content/drive/MyDrive/NLP/Assignment-2/NER_train.json', preprocessed_train),
    ('/content/drive/MyDrive/NLP/Assignment-2/NER_val.json', preprocessed_val),
    ('/content/drive/MyDrive/NLP/Assignment-2/NER_test.json', preprocessed_test),
)
for out_path, encoded_split in encoded_outputs:
    with open(out_path, 'w') as f:
        json.dump(encoded_split, f)