Learning entity recognition using legal texts

We are attempting to create a procurement tool, so mastery of vocabulary models from a diverse group of sources is a plus. You just take the skills learned in tuning one model and apply them to another model. 

Here, I am going to update the entity recognizer on the Obamacare opinion "National Federation of Independent Business v. Sebelius" to see if we can train it to make fewer mistakes.

Our goal here is to extract several things:

1) a list of cases
2) a list of judges
3) a list of government entities
4) a list of organizations



In [73]:
#test using BlackstoneNLP

#import standard library modules
import sys
import json
import re
from typing import List, Any

#modules from the community
import spacy
from dataclasses import dataclass #backported this module from 3.7
#import lxml

from spacy import displacy
from blackstone.displacy_palette import ner_displacy_options

#blackstone improved citations
from blackstone.pipeline.sentence_segmenter import SentenceSegmenter
from blackstone.rules import CITATION_PATTERNS

import en_core_web_sm


#BeautifulSoup modules

In [74]:
nlp = spacy.load('en_blackstone_proto') #the blackstone model
#nlp2 = en_core_web_sm.load()
#load sentence segmenter
sentence_segmenter = SentenceSegmenter(nlp.vocab, CITATION_PATTERNS)
nlp.add_pipe(sentence_segmenter, before="parser")

#nlp=spacy.load('en_core_web_sm') #the default spacy model

In [75]:
#Task 1: make a function to load the data from a filename

def doc_from_file(filename, model):
    with open(filename, 'r') as in_file:
        data = in_file.read()
        doc = model(data)
        return doc
    
def doc_from_json(filename: str, model: Any, dict_value: str) -> dict:
    with open(filename, 'r') as in_file:
        data=json.load(in_file)
        doc = model(data[dict_value])
        return doc
        

#Task #2: make sure everything with a 'v.' in the middle is tokenized together


In [76]:
bx = doc_from_json(filename='nfib_v_sebelius.json', model=nlp, dict_value='plain_text')

In [130]:
@dataclass
class Sentence:
    def __init__(self, num, text):
        self.num=num
        self.text=text
        self.doc=nlp(text)
        self.text_as_list = [i for i in self.text]
    
    #@property
   # def doc(self):
    #    return nlp(self.text)

i = 1
sents = list()
for sent in bx.sents:
    cleaned_sent = re.sub('\n|\t', '', str(sent)).strip()
    if(cleaned_sent):
        sents.append(Sentence(i, text=str(sent)))
        i += 1

<b>In the section above we can see the law's text displayed using displaCy. We have highlighted the cases, provisions, laws, etc in the text.</b>


In [131]:
sents = sents[130:] #eliminated the syllabus from the document since its not existing case law


In [132]:
for i in sents:
    print(i)

Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()
Sentence()

In [9]:
#display_casename_citations_filtered(doc)
def get_casename_citations_filtered(doc):
    '''Takes a spacy doc object and returns a dictionary of cases using the blackstone nlp model
    args:
        doc: the spacy doc object
    returns:
        
    '''
    cases = (i for i in doc.ents if i.label_ == 'CASENAME')
    actual_cases=[]
    results = {case:[item for item in case] for case in cases}
    return results

def get_actual_cases(case_list: dict) -> List:
    actual_cases = []
    for k, v in case_list.items():
        for i in v:
            if i.pos_ == 'ADP' and i.dep_ == 'prep':
                actual_cases.append(k)
    return actual_cases

def get_cases_from_doc(doc):
    return get_actual_cases(get_casename_citations_filtered(doc))

In [12]:
result = get_cases_from_doc(bx)

In [30]:
#how to get the cases
#get_actual_cases(get_casename_citations_filtered(doc))

In [376]:
cases_found = get_cases_from_doc(doc)
print('Cases found: ', [(i, i.label_) for i in cases_found])

Cases found:  [(United States v. Standard Oil Co., 'CASENAME'), (Dalehite v. United States,, 'CASENAME')]


In [21]:
#print(dir(spacy.tokens.Token))
#for i in cases_found:
#    for token in i:
#        print(token.text, token.pos_, token.dep_, [i for i in token.lefts], [i for i in token.rights])

In [385]:
@dataclass
class CaseWithNLP:
    case_name: str
    name_token: spacy.tokens.Token
    plaintiff: spacy.tokens.Token
    respondant: spacy.tokens.Token
    

In [388]:
def get_case_list(doc):
    result = get_cases_from_doc(doc)
    return result

In [390]:
res = make_case_with_nlp(doc)
print(res)

[United States v. Standard Oil Co., Dalehite v. United States,]


In [25]:
dir(spacy.tokens)

['Doc',
 'Span',
 'Token',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_retokenize',
 'doc',
 'span',
 'token',
 'underscore',
 'unicode_literals']