In [1]:
from spacy.lang.en import English
from random import shuffle
import pandas as pd
from utils import load_training_data, convert_biluo_scheme

import random  # cell-local: only needed to seed the shuffles below

# Fixed seed so that a fresh "Restart & Run All" produces the same example
# order every time; without it each run shuffles differently and any output
# that depends on row order cannot be reproduced. Change the value to draw a
# different (but still repeatable) permutation.
SHUFFLE_SEED = 42
random.seed(SHUFFLE_SEED)

# Annotated corpus files: one judgement file and one preamble file per split.
train_judgement_path = './NER_TRAIN/NER_TRAIN_JUDGEMENT.json'
train_preamble_path = './NER_TRAIN/NER_TRAIN_PREAMBLE.json'

dev_judgement_path = './NER_DEV/NER_DEV_JUDGEMENT.json'
dev_preamble_path = './NER_DEV/NER_DEV_PREAMBLE.json'

# The second argument is False for judgement files and True for preamble files
# (presumably an "is preamble" flag carried into each example -- confirm in
# utils.load_training_data).
train_judgement_data = load_training_data(train_judgement_path, False)
train_preamble_data = load_training_data(train_preamble_path, True)
# Merge both sources into one list, then reorder it in place so judgement and
# preamble examples are interleaved.
all_training_data = train_judgement_data + train_preamble_data
shuffle(all_training_data)

# Same procedure for the development split.
dev_judgement_data = load_training_data(dev_judgement_path, False)
dev_preamble_data = load_training_data(dev_preamble_path, True)
all_dev_data = dev_judgement_data + dev_preamble_data
shuffle(all_dev_data)

In [2]:
# Tabulate the shuffled training examples, one row per example, naming the
# three fields of each record; then preview the first five rows.
training_df = pd.DataFrame(
    data=all_training_data,
    columns=["TEXT", "PREAMBLE", "ENTITIES"],
)
training_df.head(n=5)

Unnamed: 0,TEXT,PREAMBLE,ENTITIES
0,"It is also clear that on 17th May, 2011, a ten...",False,"[(25, 39, DATE)]"
1,We would accordingly bold that the High Court ...,False,"[(161, 167, PROVISION)]"
2,"He described the location of ""Tamarind Court"" ...",False,[]
3,The prosecution has to prove beyond reasonable...,False,"[(160, 176, PROVISION), (177, 183, STATUTE)]"
4,He was relying on the Apex Court judgment repo...,False,"[(22, 32, COURT), (54, 101, PRECEDENT)]"


In [3]:
# Tabulate the shuffled development examples with the same three columns used
# for the training frame; then preview the first five rows.
dev_df = pd.DataFrame(
    data=all_dev_data,
    columns=["TEXT", "PREAMBLE", "ENTITIES"],
)
dev_df.head(n=5)

Unnamed: 0,TEXT,PREAMBLE,ENTITIES
0,"Obviously, all the legal action were started i...",False,"[(123, 133, DATE), (197, 211, OTHER_PERSON)]"
1,"One of the arguments raised in Sibbia, as also...",False,[]
2,The absence of a proper witness protection pro...,False,"[(136, 149, COURT), (158, 218, PRECEDENT), (23..."
3,The post of Army Commander/Vice Chief of Army ...,False,"[(134, 145, ORG)]"
4,The standard of proof beyond reasonable doubt ...,False,[]


In [4]:
# Instantiate spaCy's English language class; it is handed to
# convert_biluo_scheme together with the training examples (presumably for
# tokenisation -- confirm in utils).
nlp = English()

# The helper returns two parallel collections of per-example tag sequences:
# one in the BILUO scheme and one in the BIO scheme.
biluo_labels, bio_labels = convert_biluo_scheme(all_training_data, nlp)

# Attach each tag collection to the training frame as its own column.
for column_name, column_values in (
    ("BILUO_LABELS", biluo_labels),
    ("BIO_LABELS", bio_labels),
):
    training_df[column_name] = column_values

training_df.head(n=5)

Unnamed: 0,TEXT,PREAMBLE,ENTITIES,BILUO_LABELS,BIO_LABELS
0,"It is also clear that on 17th May, 2011, a ten...",False,"[(25, 39, DATE)]","[O, O, O, O, O, O, B-DATE, I-DATE, I-DATE, L-D...","[O, O, O, O, O, O, B-DATE, I-DATE, I-DATE, I-D..."
1,We would accordingly bold that the High Court ...,False,"[(161, 167, PROVISION)]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"He described the location of ""Tamarind Court"" ...",False,[],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
3,The prosecution has to prove beyond reasonable...,False,"[(160, 176, PROVISION), (177, 183, STATUTE)]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,He was relying on the Apex Court judgment repo...,False,"[(22, 32, COURT), (54, 101, PRECEDENT)]","[O, O, O, O, O, B-COURT, L-COURT, O, O, O, B-P...","[O, O, O, O, O, B-COURT, I-COURT, O, O, O, B-P..."


In [5]:
import numpy as np
from utils import encode_label_ids

# Flatten the per-example BILUO tag lists stored in the DataFrame column into
# one flat list of tags.
#
# The tags are read from training_df itself rather than from the loose
# `biluo_labels` variable created in an earlier cell: previously the column
# was read and then immediately overwritten by a flatten of that other
# variable, so the first statement was dead code and the cell depended on
# leftover kernel state. A plain comprehension also copes with an empty frame
# (yielding []) and keeps the tags as ordinary Python strings.
train_biluo_labels = [
    tag
    for example_tags in training_df["BILUO_LABELS"]
    for tag in example_tags
]

# Derive the two lookup tables from the flat tag list (presumably
# tag -> integer id and integer id -> tag -- confirm in utils.encode_label_ids).
label_to_ids, ids_to_label = encode_label_ids(train_biluo_labels)

{'B-CASE_NUMBER',
 'B-COURT',
 'B-DATE',
 'B-GPE',
 'B-JUDGE',
 'B-LAWYER',
 'B-ORG',
 'B-OTHER_PERSON',
 'B-PETITIONER',
 'B-PRECEDENT',
 'B-PROVISION',
 'B-RESPONDENT',
 'B-STATUTE',
 'B-WITNESS',
 'I-CASE_NUMBER',
 'I-COURT',
 'I-DATE',
 'I-GPE',
 'I-JUDGE',
 'I-LAWYER',
 'I-ORG',
 'I-OTHER_PERSON',
 'I-PETITIONER',
 'I-PRECEDENT',
 'I-PROVISION',
 'I-RESPONDENT',
 'I-STATUTE',
 'I-WITNESS',
 'L-CASE_NUMBER',
 'L-COURT',
 'L-DATE',
 'L-GPE',
 'L-JUDGE',
 'L-LAWYER',
 'L-ORG',
 'L-OTHER_PERSON',
 'L-PETITIONER',
 'L-PRECEDENT',
 'L-PROVISION',
 'L-RESPONDENT',
 'L-STATUTE',
 'L-WITNESS',
 'O',
 'U-CASE_NUMBER',
 'U-COURT',
 'U-DATE',
 'U-GPE',
 'U-JUDGE',
 'U-LAWYER',
 'U-ORG',
 'U-OTHER_PERSON',
 'U-PETITIONER',
 'U-PRECEDENT',
 'U-PROVISION',
 'U-RESPONDENT',
 'U-STATUTE',
 'U-WITNESS'}