In [1]:
from spacy.lang.en import English
from random import shuffle
import pandas as pd
from utils import load_training_data, convert_biluo_scheme

import random

# Seed the global RNG before any shuffle so the row order of the training and
# dev sets (and everything derived from it) is reproducible after a kernel
# restart. `shuffle` below draws from this same module-level generator.
SEED = 42
random.seed(SEED)

train_judgement_path = './NER_TRAIN/NER_TRAIN_JUDGEMENT.json'
train_preamble_path = './NER_TRAIN/NER_TRAIN_PREAMBLE.json'

dev_judgement_path = './NER_DEV/NER_DEV_JUDGEMENT.json'
dev_preamble_path = './NER_DEV/NER_DEV_PREAMBLE.json'

# The second argument is False for the judgement files and True for the
# preamble files (presumably an "is preamble" flag -- confirm in utils).
train_judgement_data = load_training_data(train_judgement_path, False)
train_preamble_data = load_training_data(train_preamble_path, True)
all_training_data = train_judgement_data + train_preamble_data
shuffle(all_training_data)

dev_judgement_data = load_training_data(dev_judgement_path, False)
dev_preamble_data = load_training_data(dev_preamble_path, True)
all_dev_data = dev_judgement_data + dev_preamble_data
shuffle(all_dev_data)

In [2]:
# One row per training example: raw text, preamble flag, entity spans.
training_df = pd.DataFrame(
    data=all_training_data,
    columns=["TEXT", "PREAMBLE", "ENTITIES"],
)
training_df.head()

Unnamed: 0,TEXT,PREAMBLE,ENTITIES
0,"Arguably, MOHUN is a common and/or a household...",False,"[(10, 15, ORG)]"
1,"\n5.2 CW3 Mr Vijay Mishra , Deputy Manager, H...",False,"[(13, 25, WITNESS), (44, 60, ORG), (62, 64, GP..."
2,It was alleged that the meeting was held irreg...,False,"[(110, 131, OTHER_PERSON), (171, 193, OTHER_PE..."
3,The Will was also witnessed by another attesti...,False,"[(57, 70, WITNESS), (106, 118, WITNESS), (160,..."
4,In this connection it would be relevant to ref...,False,"[(52, 67, PRECEDENT), (101, 114, COURT), (152,..."


In [3]:
# One row per dev example: raw text, preamble flag, entity spans.
dev_df = pd.DataFrame(
    data=all_dev_data,
    columns=["TEXT", "PREAMBLE", "ENTITIES"],
)
dev_df.head()

Unnamed: 0,TEXT,PREAMBLE,ENTITIES
0,"The amount of Rs.1,03,000/­, which was receive...",False,"[(51, 62, OTHER_PERSON)]"
1,High Court Of Judicature At Allahabad\n \n \n\...,True,"[(0, 37, COURT), (315, 354, PETITIONER), (371,..."
2,"For the purposes of this sub-section, certific...",False,"[(98, 128, PROVISION), (166, 181, PROVISION), ..."
3,"Challenging the same, the State of Telangana f...",False,"[(26, 44, ORG), (51, 82, CASE_NUMBER), (90, 11..."
4,"[see Koteshwar Vittal Kamath v, K. Rangappa & ...",False,"[(5, 63, PRECEDENT)]"


In [4]:
# Derive per-token BILUO and BIO tag sequences for every training example
# and attach them to the frame as two new columns.
nlp = English()
biluo_labels, bio_labels = convert_biluo_scheme(all_training_data, nlp)
training_df = training_df.assign(
    BILUO_LABELS=biluo_labels,
    BIO_LABELS=bio_labels,
)
training_df.head()

Unnamed: 0,TEXT,PREAMBLE,ENTITIES,BILUO_LABELS,BIO_LABELS
0,"Arguably, MOHUN is a common and/or a household...",False,"[(10, 15, ORG)]","[O, O, U-ORG, O, O, O, O, O, O, O, O, O, O, O,...","[O, O, B-ORG, O, O, O, O, O, O, O, O, O, O, O,..."
1,"\n5.2 CW3 Mr Vijay Mishra , Deputy Manager, H...",False,"[(13, 25, WITNESS), (44, 60, ORG), (62, 64, GP...","[O, O, O, O, B-WITNESS, L-WITNESS, O, O, O, O,...","[O, O, O, O, B-WITNESS, I-WITNESS, O, O, O, O,..."
2,It was alleged that the meeting was held irreg...,False,"[(110, 131, OTHER_PERSON), (171, 193, OTHER_PE...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,The Will was also witnessed by another attesti...,False,"[(57, 70, WITNESS), (106, 118, WITNESS), (160,...","[O, O, O, O, O, O, O, O, O, B-WITNESS, L-WITNE...","[O, O, O, O, O, O, O, O, O, B-WITNESS, I-WITNE..."
4,In this connection it would be relevant to ref...,False,"[(52, 67, PRECEDENT), (101, 114, COURT), (152,...","[O, O, O, O, O, O, O, O, O, O, B-PRECEDENT, I-...","[O, O, O, O, O, O, O, O, O, O, B-PRECEDENT, I-..."


In [5]:
# Same conversion for the dev split. Note that this rebinds `biluo_labels`
# and `bio_labels`, so from here on they hold the DEV tag sequences.
biluo_labels, bio_labels = convert_biluo_scheme(all_dev_data, nlp)
dev_df = dev_df.assign(
    BILUO_LABELS=biluo_labels,
    BIO_LABELS=bio_labels,
)
dev_df.head()

 
 

       ..." with entities "[(0, 37, 'COURT'), (315, 354, 'PETITIONER'), (371,...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.

                             ..." with entities "[(63, 97, 'COURT'), (295, 306, 'JUDGE'), (426, 444...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


                                              ..." with entities "[(120, 142, 'COURT'), (269, 283, 'PETITIONER'), (4...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.

                            Before The Madurai ..." with entities "[(42, 76, 'COURT'), (268, 280, 'JUDGE'), (543, 573...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alig

Unnamed: 0,TEXT,PREAMBLE,ENTITIES,BILUO_LABELS,BIO_LABELS
0,"The amount of Rs.1,03,000/­, which was receive...",False,"[(51, 62, OTHER_PERSON)]","[O, O, O, O, O, O, O, O, O, U-OTHER_PERSON, O,...","[O, O, O, O, O, O, O, O, O, B-OTHER_PERSON, O,..."
1,High Court Of Judicature At Allahabad\n \n \n\...,True,"[(0, 37, COURT), (315, 354, PETITIONER), (371,...","[B-COURT, I-COURT, I-COURT, I-COURT, I-COURT, ...","[B-COURT, I-COURT, I-COURT, I-COURT, I-COURT, ..."
2,"For the purposes of this sub-section, certific...",False,"[(98, 128, PROVISION), (166, 181, PROVISION), ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"Challenging the same, the State of Telangana f...",False,"[(26, 44, ORG), (51, 82, CASE_NUMBER), (90, 11...","[O, O, O, O, O, B-ORG, I-ORG, L-ORG, O, B-CASE...","[O, O, O, O, O, B-ORG, I-ORG, I-ORG, O, B-CASE..."
4,"[see Koteshwar Vittal Kamath v, K. Rangappa & ...",False,"[(5, 63, PRECEDENT)]","[O, O, B-PRECEDENT, I-PRECEDENT, I-PRECEDENT, ...","[O, O, B-PRECEDENT, I-PRECEDENT, I-PRECEDENT, ..."


In [6]:
import numpy as np
from utils import encode_label_ids
import spacy

nlp = spacy.load("en_core_web_sm")

# Flatten the per-sentence TRAINING tag lists into one flat list of tags.
# The flattening must read `train_biluo_labels`: the bare name `biluo_labels`
# was rebound to the dev-set tags by the dev conversion cell, so using it here
# would build the label vocabulary from dev data instead of training data.
# NOTE(review): the label ids are derived from BILUO tags while the training
# cell encodes the BIO_LABELS column -- confirm this mismatch is intended.
train_biluo_labels = training_df["BILUO_LABELS"].values.tolist()
train_biluo_labels = list(np.concatenate(train_biluo_labels).flat)

labels2i, ids_to_label = encode_label_ids(train_biluo_labels)


def _tokens_and_pos(examples, pipeline):
    """Run `pipeline` on each example's text and collect tokens and POS tags.

    Args:
        examples: Sequence of examples whose first element is the raw text.
        pipeline: Callable returning an iterable of tokens exposing `.pos_`.

    Returns:
        A pair `(sents, pos_tags)`: two parallel lists holding, per example,
        the token strings and the corresponding `pos_` values.
    """
    sents = []
    pos_tags = []
    for example in examples:
        doc = pipeline(example[0])
        sents.append([str(token) for token in doc])
        pos_tags.append([token.pos_ for token in doc])
    return sents, pos_tags


training_sents, training_pos = _tokens_and_pos(all_training_data, nlp)
dev_sents, dev_pos = _tokens_and_pos(all_dev_data, nlp)


In [7]:
def get_observation_dict(data=None, tokenizer=None):
    """Build a vocabulary mapping every distinct token to an integer id.

    Ids are assigned by position in the sorted list of distinct tokens.

    Args:
        data: Iterable of examples whose first element is the raw text.
            Defaults to the notebook-level `all_training_data`.
        tokenizer: Callable turning a text into an iterable of tokens; each
            token is converted with `str`. Defaults to the notebook-level
            `nlp`.

    Returns:
        dict: token string -> id, with ids running from 0 to len(vocab) - 1.
    """
    if data is None:
        data = all_training_data
    if tokenizer is None:
        tokenizer = nlp

    # Accumulate into a set directly: the previous list concatenation copied
    # the whole running list once per example (quadratic in corpus size).
    vocabulary = set()
    for example in data:
        vocabulary.update(str(token) for token in tokenizer(example[0]))

    return {word: i for i, word in enumerate(sorted(vocabulary))}

In [8]:
from typing import List

UNK_TOKEN = '<unk>'

# Build the vocabulary, then give the unknown-word placeholder the next free
# id (the current vocabulary size).
observation_dict = get_observation_dict()
observation_dict.update({UNK_TOKEN: len(observation_dict)})
print("id of the <unk> token:", observation_dict[UNK_TOKEN])

def encode(sentences: List[List[str]]) -> List[List[int]]:
    """
    Map every token of every sentence to its id in observation_dict.
    Tokens missing from observation_dict receive the id of UNK_TOKEN.
    """
    encoded = []
    for sentence in sentences:
        ids = []
        for token in sentence:
            if token in observation_dict:
                ids.append(observation_dict[token])
            else:
                ids.append(observation_dict[UNK_TOKEN])
        encoded.append(ids)
    return encoded

id of the <unk> token: 45230


In [9]:
import nltk
# Fetch the NLTK resources used by this notebook (presumably backing the
# pos_tag / word_tokenize imports in this cell).
for nltk_resource in ('averaged_perceptron_tagger', 'punkt'):
    nltk.download(nltk_resource)

from nltk import pos_tag
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Anuj Bhavani\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to C:\Users\Anuj
[nltk_data]     Bhavani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
from typing import List

def make_features(text: List[str], tags: List[str], is_preamble: bool) -> List[List[str]]:
    """Turn one tokenized sentence into a list of string features per token.

    Each token yields five features, in this order: the word itself, its tag,
    the previous word (`<S>` for the first token), the next word (`<E>` for
    the last token) and the sentence type (PREAMBLE or JUDGEMENT).

    Args:
        text (List[str]): List of tokens.
        tags (List[str]): Tag for each token; indexed in parallel with `text`.
        is_preamble (bool): Truthy selects `sentence_type=PREAMBLE`, otherwise
            `sentence_type=JUDGEMENT`.

    Returns:
        List[List[str]]: One feature list per token, each feature formatted
        as `name=value`.

    Raises:
        IndexError: If `tags` is shorter than `text`.
    """
    # The sentence type is the same for every token, so compute it once.
    sentence_type = "PREAMBLE" if is_preamble else "JUDGEMENT"
    last = len(text) - 1

    feature_lists = []
    for i, token in enumerate(text):
        prev_word = text[i - 1] if i > 0 else "<S>"
        next_word = text[i + 1] if i < last else "<E>"
        feature_lists.append([
            f"word={token}",
            f"pos={tags[i]}",
            f"prev_word={prev_word}",
            f"next_word={next_word}",
            f"sentence_type={sentence_type}",
        ])
    return feature_lists

In [11]:
def featurize(sents: List[List[str]], tags: List[List[str]], is_preamble_list: List[bool]) -> List[List[List[str]]]:
    """Build the feature lists for a batch of tokenized sentences.

    Every sentence is passed to `make_features` together with the tag list
    and preamble flag found at the same index.

    Eg.: For an input of 1 sentence:
         [['I','am','a','student','at','CU','Boulder']]
        the result holds one feature list per token per sentence, like:
        [[
         ['word=I',  'pos=PRON', 'prev_word=<S>', ...],
         ['word=am', 'pos=AUX',  'prev_word=I',   ...],
         [...]
        ]]

    Args:
        sents (List[List[str]]): A List of sentences, which are Lists of tokens.
        tags (List[List[str]]): Per-sentence tag lists, indexed like `sents`.
        is_preamble_list (List[bool]): Per-sentence preamble flags, indexed
            like `sents`.

    Returns:
        List[List[List[str]]]: A List of sentences, which are Lists of feature Lists
    """
    return [
        make_features(sentence, tags[idx], is_preamble_list[idx])
        for idx, sentence in enumerate(sents)
    ]

In [12]:
# File referenced from https://github.com/csci5832-f22/assignment_3
from crf import *
import random
from tqdm.autonotebook import tqdm


def training_loop(
    num_epochs,
    batch_size,
    train_features,
    train_labels,
    dev_features,
    dev_labels,
    optimizer,
    model,
    labels2i,
    pad_feature_idx
):
    """Train `model` for `num_epochs` epochs and report dev F1 after each one.

    The (features, labels) pairs are shuffled once and cut into fixed batches
    that are reused, in the same order, for every epoch.

    Args:
        num_epochs: Number of passes over the training batches.
        batch_size: Maximum number of samples per batch.
        train_features: Encoded feature sequences, one per training sentence.
        train_labels: Encoded label sequences, parallel to `train_features`.
        dev_features: Encoded feature sequences passed to `predict`.
        dev_labels: Encoded gold labels passed to `f1_score`.
        optimizer: Optimizer exposing `zero_grad()` and `step()`.
        model: Model whose output for (features, labels, mask) is negated to
            form the loss (presumably a log-likelihood -- confirm in crf).
        labels2i: Mapping from label to id; must contain PAD_SYMBOL and 'O'.
        pad_feature_idx: Feature id used to pad feature sequences.

    Returns:
        The same `model` object after training.
    """
    samples = list(zip(train_features, train_labels))
    random.shuffle(samples)
    # NOTE(review): batches are built once, so their composition and order do
    # not change between epochs.
    batches = [
        samples[start:start + batch_size]
        for start in range(0, len(samples), batch_size)
    ]
    print("Training...")
    for epoch in range(num_epochs):
        losses = []
        for batch in tqdm(batches):
            features, labels = zip(*batch)
            features = torch.stack(pad_features(features, pad_feature_idx))
            labels = torch.stack(pad_labels(labels, labels2i[PAD_SYMBOL]))
            # Padded positions are masked out of the loss.
            mask = (labels != labels2i[PAD_SYMBOL])
            optimizer.zero_grad()
            # Call the module itself rather than .forward() so that any
            # registered hooks are honoured.
            loss = -model(features, labels, mask)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        # With no training batches there is no loss to average; report NaN
        # instead of failing with ZeroDivisionError.
        mean_loss = sum(losses) / len(losses) if losses else float("nan")
        print(f"epoch {epoch}, loss: {mean_loss}")
        dev_predictions = predict(model, dev_features)
        dev_f1 = f1_score(dev_predictions, dev_labels, labels2i['O'])
        print(f"Dev F1 {dev_f1}")

    return model

In [None]:
from crf import build_features_set
from crf import make_features_dict
from crf import encode_features, encode_labels
from crf import NERTagger

# Per-sentence preamble flags, taken from the frames built from the same
# example lists that were tokenized above.
train_sents_is_preamble = training_df["PREAMBLE"].tolist()
dev_sents_is_preamble = dev_df["PREAMBLE"].tolist()

# String features for every token of every sentence.
train_features = featurize(training_sents, training_pos, train_sents_is_preamble)
dev_features = featurize(dev_sents, dev_pos, dev_sents_is_preamble)

# The feature vocabulary is built from the training features only.
all_features = build_features_set(train_features)
features_dict = make_features_dict(all_features)
model = NERTagger(len(features_dict), len(labels2i))

# Encode features and BIO tag sequences for both splits.
encoded_train_features = encode_features(train_features, features_dict)
encoded_dev_features = encode_features(dev_features, features_dict)
train_tag_sents = training_df["BIO_LABELS"].tolist()
encoded_train_labels = encode_labels(train_tag_sents, labels2i)
dev_tag_sents = dev_df["BIO_LABELS"].tolist()
encoded_dev_labels = encode_labels(dev_tag_sents, labels2i)

# Training hyperparameters.
num_epochs = 45
batch_size = 16
LR=0.05
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
model = training_loop(
    num_epochs=num_epochs,
    batch_size=batch_size,
    train_features=encoded_train_features,
    train_labels=encoded_train_labels,
    dev_features=encoded_dev_features,
    dev_labels=encoded_dev_labels,
    optimizer=optimizer,
    model=model,
    labels2i=labels2i,
    pad_feature_idx=features_dict[PAD_SYMBOL],
)

Building features set!


100%|█████████████████████████████████████████████████████████████████████████| 10995/10995 [00:00<00:00, 13249.23it/s]


Found 135236 features
Training...


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 0, loss: 30.037935635724732


  score = torch.where(mask[i].unsqueeze(1), next_score, score)


Dev F1 tensor([0.3165])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 1, loss: 18.84659894330557
Dev F1 tensor([0.3923])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 2, loss: 15.04430908311245
Dev F1 tensor([0.4258])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 3, loss: 13.334273163445813
Dev F1 tensor([0.4496])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 4, loss: 11.963907626695757
Dev F1 tensor([0.4641])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 5, loss: 11.231360014094863
Dev F1 tensor([0.4834])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 6, loss: 10.808272881346733
Dev F1 tensor([0.4926])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 7, loss: 10.478986611199934
Dev F1 tensor([0.5009])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 8, loss: 10.14004073032113
Dev F1 tensor([0.5043])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 9, loss: 9.573550282522689
Dev F1 tensor([0.5094])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 10, loss: 9.816281018107263
Dev F1 tensor([0.5182])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 11, loss: 9.050171446661617
Dev F1 tensor([0.5202])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 12, loss: 8.817352151693127
Dev F1 tensor([0.5233])


  0%|          | 0/688 [00:00<?, ?it/s]