In [1]:
from spacy.lang.en import English
from random import shuffle
import pandas as pd
from utils import load_training_data, convert_biluo_scheme

# Annotated corpus files: one judgement file and one preamble file per split.
train_judgement_path, train_preamble_path = (
    './NER_TRAIN/NER_TRAIN_JUDGEMENT.json',
    './NER_TRAIN/NER_TRAIN_PREAMBLE.json',
)
dev_judgement_path, dev_preamble_path = (
    './NER_DEV/NER_DEV_JUDGEMENT.json',
    './NER_DEV/NER_DEV_PREAMBLE.json',
)

# Training split: judgement + preamble examples, mixed together in random order.
# The second argument is presumably the "is preamble" flag -- confirm in utils.
train_judgement_data = load_training_data(train_judgement_path, False)
train_preamble_data = load_training_data(train_preamble_path, True)
all_training_data = train_judgement_data + train_preamble_data
shuffle(all_training_data)

# Development split, assembled the same way.
dev_judgement_data = load_training_data(dev_judgement_path, False)
dev_preamble_data = load_training_data(dev_preamble_path, True)
all_dev_data = dev_judgement_data + dev_preamble_data
shuffle(all_dev_data)

In [2]:
# Tabular view of the shuffled training examples for quick inspection.
training_df = pd.DataFrame(
    all_training_data,
    columns=["TEXT", "PREAMBLE", "ENTITIES"],
)
training_df.head()

Unnamed: 0,TEXT,PREAMBLE,ENTITIES
0,This writ petition appears to have been drafte...,False,"[(51, 68, DATE), (150, 167, DATE), (202, 217, ..."
1,Constitution Bench of the Apex Court in The St...,False,"[(26, 36, COURT), (44, 95, PRECEDENT)]"
2,"He may choose to exercise, his. discretion to ...",False,[]
3,"Under these circumstances, the Tribunal was ju...",False,[]
4,"In this fog of authorities, however, a beacon ...",False,"[(110, 182, PRECEDENT)]"


In [3]:
# Tabular view of the shuffled development examples for quick inspection.
dev_df = pd.DataFrame(
    all_dev_data,
    columns=["TEXT", "PREAMBLE", "ENTITIES"],
)
dev_df.head()

Unnamed: 0,TEXT,PREAMBLE,ENTITIES
0,"At best, it can be said that because of the na...",False,"[(187, 198, PROVISION)]"
1,The observation in Md.Mohar Ali (supra) statin...,False,"[(19, 31, OTHER_PERSON)]"
2,W.P.No.9267 of 2017\n\n ...,True,"[(63, 97, COURT), (295, 306, JUDGE), (426, 444..."
3,"Sri Sunil B.Ganu, learned counsel, would howev...",False,"[(4, 16, OTHER_PERSON), (68, 84, PROVISION)]"
4,"Suo Motu Writ Petition (C) No.1 of 2020, dated...",False,"[(0, 39, CASE_NUMBER), (47, 57, DATE), (58, 85..."


In [4]:
# Blank English pipeline, passed to the label converter together with the data.
nlp = English()

# Per-sentence tag sequences in both the BILUO and the BIO scheme.
biluo_labels, bio_labels = convert_biluo_scheme(all_training_data, nlp)
training_df = training_df.assign(
    BILUO_LABELS=biluo_labels,
    BIO_LABELS=bio_labels,
)
training_df.head()

Unnamed: 0,TEXT,PREAMBLE,ENTITIES,BILUO_LABELS,BIO_LABELS
0,This writ petition appears to have been drafte...,False,"[(51, 68, DATE), (150, 167, DATE), (202, 217, ...","[O, O, O, O, O, O, O, O, O, B-DATE, I-DATE, I-...","[O, O, O, O, O, O, O, O, O, B-DATE, I-DATE, I-..."
1,Constitution Bench of the Apex Court in The St...,False,"[(26, 36, COURT), (44, 95, PRECEDENT)]","[O, O, O, O, B-COURT, L-COURT, O, O, B-PRECEDE...","[O, O, O, O, B-COURT, I-COURT, O, O, B-PRECEDE..."
2,"He may choose to exercise, his. discretion to ...",False,[],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"Under these circumstances, the Tribunal was ju...",False,[],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,"In this fog of authorities, however, a beacon ...",False,"[(110, 182, PRECEDENT)]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [5]:
# Same conversion for the development split. Note that this rebinds the
# `biluo_labels` / `bio_labels` names to the *dev* tag sequences.
biluo_labels, bio_labels = convert_biluo_scheme(all_dev_data, nlp)
dev_df = dev_df.assign(
    BILUO_LABELS=biluo_labels,
    BIO_LABELS=bio_labels,
)
dev_df.head()


                             ..." with entities "[(63, 97, 'COURT'), (295, 306, 'JUDGE'), (426, 444...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
    Commercial Complex, Raj Bhavan Road, Hyderaba..." with entities "[(42, 51, 'GPE')]". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.

       ..." with entities "[(7, 41, 'COURT'), (232, 244, 'JUDGE'), (342, 432,...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.

    D..." with entities "[(7, 43, 'COURT'), (138, 159, 'JUDGE'), (291, 307,...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


             

 Likewise, In the power of attorney (exhibit P/11..." with entities "[(70, 84, 'OTHER_PERSON'), (98, 109, 'OTHER_PERSON...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.

 (c) The aforesaid cheque was presented by the c..." with entities "[(93, 104, 'ORG'), (118, 127, 'GPE'), (226, 236, '...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.

Digitally signed by:RAJENDER SINGH KARKI Signing..." with entities "[(22, 42, 'OTHER_PERSON'), (56, 66, 'DATE')]". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


Unnamed: 0,TEXT,PREAMBLE,ENTITIES,BILUO_LABELS,BIO_LABELS
0,"At best, it can be said that because of the na...",False,"[(187, 198, PROVISION)]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,The observation in Md.Mohar Ali (supra) statin...,False,"[(19, 31, OTHER_PERSON)]","[O, O, O, B-OTHER_PERSON, I-OTHER_PERSON, L-OT...","[O, O, O, B-OTHER_PERSON, I-OTHER_PERSON, I-OT..."
2,W.P.No.9267 of 2017\n\n ...,True,"[(63, 97, COURT), (295, 306, JUDGE), (426, 444...","[O, O, O, O, O, O, B-COURT, I-COURT, I-COURT, ...","[O, O, O, O, O, O, B-COURT, I-COURT, I-COURT, ..."
3,"Sri Sunil B.Ganu, learned counsel, would howev...",False,"[(4, 16, OTHER_PERSON), (68, 84, PROVISION)]","[O, B-OTHER_PERSON, L-OTHER_PERSON, O, O, O, O...","[O, B-OTHER_PERSON, I-OTHER_PERSON, O, O, O, O..."
4,"Suo Motu Writ Petition (C) No.1 of 2020, dated...",False,"[(0, 39, CASE_NUMBER), (47, 57, DATE), (58, 85...","[B-CASE_NUMBER, I-CASE_NUMBER, I-CASE_NUMBER, ...","[B-CASE_NUMBER, I-CASE_NUMBER, I-CASE_NUMBER, ..."


In [6]:
import numpy as np
from utils import encode_label_ids
import spacy

nlp = spacy.load("en_core_web_sm")

# Flatten the per-sentence BILUO tags of the *training* split into one list.
# The previous version flattened `biluo_labels`, which at this point holds the
# dev-split tags, so the label inventory was built from the wrong split.
# A plain comprehension is used instead of np.concatenate so that ragged or
# empty tag lists are handled without any dtype promotion.
train_biluo_labels = training_df["BILUO_LABELS"].values.tolist()
train_biluo_labels = [
    label
    for sentence_labels in train_biluo_labels
    for label in sentence_labels
]

labels2i, ids_to_label = encode_label_ids(train_biluo_labels)


def _tokens_and_pos(examples):
    """Run `nlp` over the text (first element) of every example.

    Args:
        examples: Iterable of records whose element 0 is the raw text.

    Returns:
        tuple: (list of token-string lists, list of `token.pos_` lists),
        both aligned with `examples`.
    """
    sents, pos_tags = [], []
    for example in examples:
        doc = nlp(example[0])
        sents.append([str(token) for token in doc])
        pos_tags.append([token.pos_ for token in doc])
    return sents, pos_tags


training_sents, training_pos = _tokens_and_pos(all_training_data)
dev_sents, dev_pos = _tokens_and_pos(all_dev_data)


In [7]:
def get_observation_dict():
    """Build a word -> integer id vocabulary from the training texts.

    The text of every example in the global ``all_training_data`` (element 0
    of each record) is processed with the global ``nlp`` pipeline. The
    distinct token strings are sorted and numbered consecutively from 0.

    Returns:
        dict: Maps each distinct token string to its index in sorted order.
    """
    # Accumulate into a set: repeated list concatenation copied the whole
    # token list on every iteration (quadratic in corpus size).
    vocabulary = set()
    for data in all_training_data:
        vocabulary.update(str(token) for token in nlp(data[0]))
    return {word: i for i, word in enumerate(sorted(vocabulary))}

In [8]:
from typing import List

# Vocabulary built from the training tokens.
observation_dict = get_observation_dict()

# Reserve an id for unknown words (<unk>); it is set to the vocabulary size
# at the moment of insertion.
UNK_TOKEN = '<unk>'
observation_dict.update({UNK_TOKEN: len(observation_dict)})
print("id of the <unk> token:", observation_dict[UNK_TOKEN])

def encode(sentences: List[List[str]]) -> List[List[int]]:
    """Map every token of every sentence to its id in ``observation_dict``.

    Tokens that are not in the vocabulary receive the id stored under
    ``UNK_TOKEN``.

    Args:
        sentences: Sentences given as lists of token strings.

    Returns:
        One list of integer ids per input sentence, in the same order.
    """
    def token_id(token: str) -> int:
        # Fall back to the <unk> id only when the token is missing.
        try:
            return observation_dict[token]
        except KeyError:
            return observation_dict[UNK_TOKEN]

    return [list(map(token_id, sentence)) for sentence in sentences]

id of the <unk> token: 45230


In [9]:
import nltk
# Fetch the NLTK resources for the perceptron tagger and punkt tokenizer.
for resource in ('averaged_perceptron_tagger', 'punkt'):
    nltk.download(resource)

from nltk import pos_tag
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Anuj Bhavani\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to C:\Users\Anuj
[nltk_data]     Bhavani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
from typing import List

# TODO: Update this function to add more features
#      You can check crf.py for how they are encoded, if interested.
def make_features(text: List[str], tags: List[str]) -> List[List[str]]:
    """Turn a tokenized sentence into one list of feature strings per token.

    Each token receives, in this order:
      * ``word=<token>``       -- the token itself,
      * ``pos=<tag>``          -- the tag at the same position in ``tags``,
      * ``prev_word=<token>``  -- the preceding token, ``<S>`` for the first,
      * ``next_word=<token>``  -- the following token, ``<E>`` for the last.

    Args:
        text (List[str]): List of tokens.
        tags (List[str]): One tag per token, aligned with ``text``.

    Returns:
        List[List[str]]: One list of feature strings per token; empty when
        ``text`` is empty.

    Raises:
        IndexError: If ``tags`` has fewer entries than ``text``.
    """
    last_index = len(text) - 1
    feature_lists = []
    for i, token in enumerate(text):
        # Sentence-boundary markers stand in for the missing neighbours.
        prev_word = "<S>" if i == 0 else text[i - 1]
        next_word = "<E>" if i == last_index else text[i + 1]
        feature_lists.append([
            f"word={token}",
            f"pos={tags[i]}",
            f"prev_word={prev_word}",
            f"next_word={next_word}",
        ])
    return feature_lists

In [11]:
def featurize(sents: List[List[str]], tags: List[List[str]]) -> List[List[List[str]]]:
    """Build the feature lists for a batch of tokenized sentences.

    Sentence ``k`` is paired with ``tags[k]`` and handed to ``make_features``.

    Eg.: For an input of 1 sentence:
         [['I','am','a','student','at','CU','Boulder']]
        Return list of features for every token for every sentence like:
        [[
         ['word=I',  'pos=PRON', 'prev_word=<S>', ...],
         ['word=am', 'pos=AUX',  'prev_word=I',   ...],
         [...]
        ]]

    Args:
        sents (List[List[str]]): A List of sentences, which are Lists of tokens.
        tags (List[List[str]]): For each sentence, the List of tags of its tokens.

    Returns:
        List[List[List[str]]]: A List of sentences, which are Lists of feature Lists
    """
    return [
        make_features(sentence, tags[idx])
        for idx, sentence in enumerate(sents)
    ]

In [1]:
# File referenced from https://github.com/csci5832-f22/assignment_3
from crf import *
import random
from tqdm.autonotebook import tqdm

# TODO: Implement the training loop
# HINT: Build upon what we gave you for HW2.
# See cell below for how we call this training loop.

def training_loop(
    num_epochs,
    batch_size,
    train_features,
    train_labels,
    dev_features,
    dev_labels,
    optimizer,
    model,
    labels2i,
    pad_feature_idx
):
    """Train ``model`` with mini-batches and report dev F1 after every epoch.

    Args:
        num_epochs: Number of passes over the training data.
        batch_size: Maximum number of sentences per batch.
        train_features: Encoded features, one entry per training sentence.
        train_labels: Encoded labels, aligned with ``train_features``.
        dev_features: Encoded features of the development sentences.
        dev_labels: Encoded labels of the development sentences.
        optimizer: Optimizer updating the parameters of ``model``.
        model: Model whose ``forward(features, labels, mask)`` returns the
            positive log-likelihood of the batch.
        labels2i: Mapping from label to id; must contain ``PAD_SYMBOL``
            and ``'O'``.
        pad_feature_idx: Feature id used to pad feature sequences.

    Returns:
        The trained ``model`` (the same object, updated in place by the
        optimizer).
    """
    # Keep every feature sequence paired with its label sequence.
    samples = list(zip(train_features, train_labels))
    print("Training...")
    for epoch in range(num_epochs):
        # Reshuffle and re-batch every epoch so that batch composition and
        # order differ between epochs instead of being fixed once up front.
        random.shuffle(samples)
        batches = [
            samples[start:start + batch_size]
            for start in range(0, len(samples), batch_size)
        ]
        losses = []
        for batch in tqdm(batches):
            # Here we get the features and labels, pad them,
            # and build a mask so that our model ignores PADs.
            features, labels = zip(*batch)
            features = pad_features(features, pad_feature_idx)
            features = torch.stack(features)
            # Pad the label sequences to all be the same size, so we
            # can form a proper matrix.
            labels = pad_labels(labels, labels2i[PAD_SYMBOL])
            labels = torch.stack(labels)
            mask = (labels != labels2i[PAD_SYMBOL])
            # Clear gradients accumulated by the previous step.
            optimizer.zero_grad()
            # forward() returns the positive log-likelihood; negate it to
            # obtain a loss to minimize.
            loss = -model.forward(features, labels, mask)
            # Backpropagate the loss through the model.
            loss.backward()
            # Update the parameters in the direction of the gradient.
            optimizer.step()
            losses.append(loss.item())
        # Average loss for the epoch; NaN instead of a ZeroDivisionError when
        # there were no training batches.
        mean_loss = sum(losses) / len(losses) if losses else float("nan")
        print(f"epoch {epoch}, loss: {mean_loss}")
        # Evaluate on the dev set after every epoch.
        dev_predictions = predict(model, dev_features)
        dev_f1 = f1_score(dev_predictions, dev_labels, labels2i['O'])
        print(f"Dev F1 {dev_f1}")

    # Return the trained model
    return model

In [None]:
from crf import build_features_set
from crf import make_features_dict
from crf import encode_features, encode_labels
from crf import NERTagger

# The numbered prints are coarse progress markers for the slow steps below.
print("1")
train_features = featurize(training_sents, training_pos)
print("2")
dev_features = featurize(dev_sents, dev_pos)

# Get the full inventory of possible features
print("3")
all_features = build_features_set(train_features)
# Hash all features to a unique int.
print("4")
features_dict = make_features_dict(all_features)
# Initialize the model.
print("5")
model = NERTagger(len(features_dict), len(labels2i))

print("6")
encoded_train_features = encode_features(train_features, features_dict)
print("7")
encoded_dev_features = encode_features(dev_features, features_dict)
print("8")
# NOTE(review): the BIO tag sequences are encoded here with `labels2i`;
# confirm that `labels2i` covers every BIO tag that occurs in both splits.
train_tag_sents = training_df["BIO_LABELS"].values.tolist()
encoded_train_labels = encode_labels(train_tag_sents, labels2i)
print("9")
dev_tag_sents = dev_df["BIO_LABELS"].values.tolist()
# Fixed: the call was previously split as `e ncode_labels`, a syntax error.
encoded_dev_labels = encode_labels(dev_tag_sents, labels2i)

# TODO: Play with hyperparameters here.
num_epochs = 45
batch_size = 16
LR=0.05
optimizer = torch.optim.SGD(model.parameters(), LR)
print("10")
model = training_loop(
    num_epochs,
    batch_size,
    encoded_train_features,
    encoded_train_labels,
    encoded_dev_features,
    encoded_dev_labels,
    optimizer,
    model,
    labels2i,
    features_dict[PAD_SYMBOL]
)

1
2
3
Building features set!


100%|█████████████████████████████████████████████████████████████████████████| 10995/10995 [00:00<00:00, 38091.77it/s]


4
Found 135234 features
5
6
7
8
9
10
Training...


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 0, loss: 34.07039005188055


  score = torch.where(mask[i].unsqueeze(1), next_score, score)


Dev F1 tensor([0.1112])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 1, loss: 19.238854822031286
Dev F1 tensor([0.1335])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 2, loss: 16.218920898298883
Dev F1 tensor([0.1809])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 3, loss: 14.805929558221685
Dev F1 tensor([0.2228])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 4, loss: 13.69424912957258
Dev F1 tensor([0.2539])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 5, loss: 13.251020854988763
Dev F1 tensor([0.2994])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 6, loss: 12.184567036323768
Dev F1 tensor([0.3270])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 7, loss: 11.829620380041211
Dev F1 tensor([0.3884])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 8, loss: 11.617172444975653
Dev F1 tensor([0.4300])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 9, loss: 11.06280397190604
Dev F1 tensor([0.4445])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 10, loss: 10.588711757299512
Dev F1 tensor([0.4590])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 11, loss: 10.65020798597225
Dev F1 tensor([0.4770])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 12, loss: 10.05143143271291
Dev F1 tensor([0.4868])


  0%|          | 0/688 [00:00<?, ?it/s]

epoch 13, loss: 10.046565933976062
Dev F1 tensor([0.4929])


  0%|          | 0/688 [00:00<?, ?it/s]