In [5]:
#test using BlackstoneNLP

#import standard library modules
import sys
from typing import List

#modules from the community
import spacy
from dataclasses import dataclass #backported this module from 3.7
import en_core_web_sm

from spacy import displacy
from blackstone.displacy_palette import ner_displacy_options

from blackstone.pipeline.sentence_segmenter import SentenceSegmenter
from blackstone.rules import CITATION_PATTERNS

In [6]:
# Load the two pipelines used by this notebook.
nlp = spacy.load('en_blackstone_proto') #the blackstone model
# NOTE(review): nlp2 (the small general-purpose English model) is not referenced
# in any other visible cell -- confirm it is still needed.
nlp2 = en_core_web_sm.load()
#load sentence segmenter
# Built from the Blackstone model's vocab and the citation patterns it ships with.
sentence_segmenter = SentenceSegmenter(nlp.vocab, CITATION_PATTERNS)
# Inserted before the "parser" component of the Blackstone pipeline.
nlp.add_pipe(sentence_segmenter, before="parser")

#nlp=spacy.load('en_core_web_sm') #the default spacy model

In [27]:
#get the text into a variable from the .txt file
def text_from_file(filename: str) -> str:
    '''Open `filename` in text mode and return its entire content as one string.'''
    with open(filename, 'r') as source:
        return source.read()

In [28]:
#NOTE: we still need to load the doc into a model

def text_from_json(filename: str, dict_value: str) -> str:
    '''Load the JSON file at `filename` and return the value stored under `dict_value`.

    args:
        filename: path of the JSON file to read
        dict_value: the key to look up in the loaded JSON data
    returns:
        whatever is stored under `dict_value` (annotated as str; the type is
        not checked here)
    raises:
        KeyError: if the loaded data is a dict without the key `dict_value`
    '''
    # The notebook's import cell does not import json, so bring it into scope here.
    import json

    with open(filename, 'r') as in_file:
        data = json.load(in_file)
    # Look the key up only after the file has been closed.
    return data[dict_value]
    
#NOTE: we don't use this function until later

In [29]:
# Read the whole of 'booker2.txt' (path relative to the working directory) into one string.
booker_string = text_from_file('booker2.txt')

In [30]:
# Run the Blackstone pipeline over the text; bx is the resulting doc
# that the later cells pass to get_cases_from_doc.
bx = nlp(booker_string)

<b>In the section above, we can see the text of the law displayed using displaCy. The cases, provisions, laws, etc. are highlighted in the text.</b>


In [31]:
#display_casename_citations_filtered(doc)
def get_casename_citations_filtered(doc):
    '''Takes a spacy doc object and returns a dictionary of cases using the blackstone nlp model
    args:
        doc: the spacy doc object (only its `ents` attribute is read)
    returns:
        a dict mapping the text of every entity labelled 'CASENAME' to the list
        of items obtained by iterating that entity (its tokens). Entities with
        identical text share one key; the last one encountered wins.
    '''
    cases = (ent for ent in doc.ents if ent.label_ == 'CASENAME')
    return {case.text: list(case) for case in cases}

def get_actual_cases(case_list: dict) -> List:
    '''Takes the cases and removes some of the ones that are not cases
    like the ones without a v in them

    args:
        case_list: dict mapping a case name to the list of tokens in that name
            (the shape produced by get_casename_citations_filtered)
    returns:
        the names, in dict order, that contain at least one token whose text is
        'v.', whose pos_ is 'ADP' and whose dep_ is 'prep'. Each name appears at
        most once, even when several of its tokens match.
    '''
    actual_cases = []
    for name, tokens in case_list.items():
        # any() stops at the first matching token, so a name with more than one
        # matching 'v.' is not appended repeatedly.
        # (A looser filter would drop the text check and keep pos_/dep_ only.)
        if any(tok.text == 'v.' and tok.pos_ == 'ADP' and tok.dep_ == 'prep'
               for tok in tokens):
            actual_cases.append(name)
    return actual_cases

def get_cases_from_doc(doc):
    '''Collect the CASENAME entities of `doc`, then keep only the names that
    get_actual_cases accepts.'''
    casename_tokens = get_casename_citations_filtered(doc)
    return get_actual_cases(casename_tokens)

In [32]:
# Case names found in the processed document bx.
result_one = get_cases_from_doc(bx)

In [33]:
def clean_case_text(case_list: List[str]) -> List[str]:
    '''Takes a list of cases and removes the newline characters at the end

    args:
        case_list: the case names; each item is converted with str() first, so
            non-string items are accepted as well
    returns:
        a new list with every trailing newline character stripped from each
        name; other whitespace and punctuation are left untouched
    '''
    # Convert and strip in one pass so the str() conversion is actually applied
    # to the values that get stripped, and return a real list as annotated.
    return [str(case).rstrip('\n') for case in case_list]

In [35]:
# Materialise the cleaned names as a list and show them.
# NOTE(review): this rebinds result_one, replacing the uncleaned list from the
# earlier cell -- a separate name would keep both versions available.
result_one = list(clean_case_text(result_one))
print(result_one)

['Blakely v. Washington', 'Mistretta v. United States,', 'Griffith v. Kentucky', 'Mistretta v. United States', 'Stinson v. United States,', 'McMillan v. Pennsylvania', 'Witte v. United States,', '" United States v. Watts', 'Edwards v. United States,', 'Welsh v. United States', 'Califano v. Westcott', 'Sloan v. Lemon', 'United States v. Watts', 'Koon v. United States', 'United States v. Tsosie', 'United States v. Salinas', 'United States v. Cook', 'United States v. Olabanji', 'Harper v. Virginia Dept.', 'El Paso & Northeastern R. Co. v. Gutierrez', 'Hearings on Blakely v. Washington', 'Patton v. United States', '" Barrows v. Jackson', '" United States v. Raines', 'Marbury v. Madison', 'Rust v. Sullivan', '" Sloan v. Lemon', 'CompareChicago v. Morales', 'Webster v. Reproductive Health Services', 'NLRB v. Jones & Laughlin Steel Corp.,', 'Sixth Amendment violationGriffith v. Kentucky', 'Koon v. United States,', 'Renne v. Geary', 'Tennessee v. Garner', 'CfStinson v. United States,', 'Willia

In [24]:
#print(bx)
#displacy.render(bx, style='ent', options=ner_displacy_options)

In [16]:
#Now that we've gotten the cases, let's view the document in displacy

In [17]:
#judges = [i for i in get_judges(bx, doc)]
#for i in judges:
#    for j in i:
#        print(i.text, [j.text for j in i], end='\t')

In [None]:
def remove_statutes(case_list: List):
    '''Removes statutes, each of which has a numeric element.

    NOTE: not implemented yet -- the body consists of this docstring only, so
    calling the function returns None and leaves case_list untouched.
    '''

In [None]:
#how to get the cases
#get_actual_cases(get_casename_citations_filtered(doc))

In [29]:
#a = ''
#dir(a.ljust(20))

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmod__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'capitalize',
 'casefold',
 'center',
 'count',
 'encode',
 'endswith',
 'expandtabs',
 'find',
 'format',
 'format_map',
 'index',
 'isalnum',
 'isalpha',
 'isdecimal',
 'isdigit',
 'isidentifier',
 'islower',
 'isnumeric',
 'isprintable',
 'isspace',
 'istitle',
 'isupper',
 'join',
 'ljust',
 'lower',
 'lstrip',
 'maketrans',
 'partition',
 'replace',
 'rfind',
 'rindex',
 'rjust',
 'rpartition',
 'rsplit',
 'rstrip',
 'split',
 'splitlines',
 'startswith',
 'strip',
 'swapcase',
 'title',
 'translate',
 'upper',
 'zfill']

In [None]:
# NOTE(review): `doc` is not assigned in any visible cell; it must be defined
# before this cell runs (the processed booker text is held in `bx`).
cases_found = get_cases_from_doc(doc)
# get_cases_from_doc returns plain strings (case names), which have no .label_
# attribute, so print the names themselves.
print('Cases found: ', cases_found)

In [21]:
#print(dir(spacy.tokens.Token))
#for i in cases_found:
#    for token in i:
#        print(token.text, token.pos_, token.dep_, [i for i in token.lefts], [i for i in token.rights])

In [385]:
@dataclass
class CaseWithNLP:
    '''Groups a case name with spaCy tokens related to that case.'''
    # the case name as plain text
    case_name: str
    # NOTE(review): no visible cell constructs this class, so what each token
    # field is expected to hold is an assumption -- confirm against the code
    # that populates it.
    name_token: spacy.tokens.Token
    plaintiff: spacy.tokens.Token
    # spelled "respondant" (sic); renaming the field would change the interface
    respondant: spacy.tokens.Token
    

In [388]:
def get_case_list(doc):
    '''Thin alias: hand `doc` to get_cases_from_doc and return its result unchanged.'''
    return get_cases_from_doc(doc)

In [390]:
# NOTE(review): neither make_case_with_nlp nor doc is defined in any visible
# cell; this cell relies on kernel state from cells that are not shown here.
res = make_case_with_nlp(doc)
print(res)

[United States v. Standard Oil Co., Dalehite v. United States,]


In [None]:
def 