In [2]:
#test using BlackstoneNLP

#import standard library modules
import sys
import json
from collections import Counter
from typing import List, Any

#modules from the community
import spacy
from dataclasses import dataclass #backported this module from 3.7

#import lxml

from spacy import displacy
from blackstone.displacy_palette import ner_displacy_options

#blackstone improved citations
from blackstone.pipeline.sentence_segmenter import SentenceSegmenter
from blackstone.rules import CITATION_PATTERNS

import en_core_web_sm


#BeautifulSoup modules

In [3]:
# Two pipelines are used in this notebook: `nlp` (Blackstone) and `nlp2` (spaCy's en_core_web_sm).
nlp = spacy.load('en_blackstone_proto') #the blackstone model
nlp2 = en_core_web_sm.load()
# Build Blackstone's sentence segmenter from the Blackstone vocab and its citation patterns
sentence_segmenter = SentenceSegmenter(nlp.vocab, CITATION_PATTERNS)
# Register the segmenter in the Blackstone pipeline ahead of the "parser" component
nlp.add_pipe(sentence_segmenter, before="parser")

#nlp=spacy.load('en_core_web_sm') #the default spacy model

In [4]:
#NOTE: we still need to load the doc into a model

def text_from_json(filename: str, dict_value: str) -> str:
    '''Load a JSON file and return the value stored under one top-level key.

    args:
        filename: path of the JSON file to read
        dict_value: top-level key whose value is returned
    returns:
        the value stored at ``dict_value`` (this notebook uses it to fetch text)
    raises:
        KeyError: if ``dict_value`` is not a key of the decoded JSON object
    '''
    # Decode explicitly as UTF-8 so non-ASCII characters in the text do not
    # depend on the platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as in_file:
        data = json.load(in_file)
    return data[dict_value]

In [5]:
# Read the value stored under the 'plain_text' key of nfib_v_sebelius.json
nfib_v_sebelius = text_from_json(filename='nfib_v_sebelius.json', dict_value='plain_text')

#bx = nlp(nfib_data)
#lay_doc = nlp2(nfib_data)

In [11]:
#nfib_v_sebelius = nfib_v_sebelius.split('\n')
#nfib_v_sebelius = [i.lstrip() for i in nfib_v_sebelius]

def make_clean_text_one(text_file):
    '''Collapse multi-line text into one line.

    The text is split on newline characters, leading whitespace is removed
    from every line, and the lines are joined back together with single
    spaces. Trailing whitespace on a line is left untouched.

    args:
        text_file: the text to flatten, as a single string
    returns:
        the flattened text as one string
    '''
    # TODO: lines that start with a number are not filtered out yet.
    left_trimmed = map(str.lstrip, text_file.split('\n'))
    return ' '.join(left_trimmed)

In [12]:
# Flatten the text into a single line (note: this rebinds the same name to the cleaned text)
nfib_v_sebelius = make_clean_text_one(nfib_v_sebelius)

# Run the Blackstone pipeline over the cleaned text; `bx` is the resulting doc
bx = nlp(nfib_v_sebelius)

In [17]:
#print(nfib_v_sebelius)

#when we have x and y, we create a new example of something to train the model
#and we feed it into the model

#DISPLAY ENTITIES
#for ent in bx.ents:
#    print(ent.text, ent.label_)
    

OCTOBER TERM JUDGE
United States v. Detroit Timber & Lumber Co. CASENAME
200 U. S. 321 CITATION
SUPREME COURT COURT
NATIONAL FEDERATION OF INDEPENDENT JUDGE
BUSINESS ET AL. CASENAME
UNITED STATES COURT COURT
Florida et al. CASENAME
2            NATIONAL FEDERATION CITATION
BUSINESS v. SEBELIUS CASENAME
Federal District Court COURT
Court of Appeals COURT
648 F. 3d 1235 CITATION
CHIEF JUSTICE ROBERTS JUDGE
Anti-Injunction Act. INSTRUMENT
Affordable Care Act INSTRUMENT
Anti-Injunction Act INSTRUMENT
Anti-Injunction Act INSTRUMENT
CHIEF JUSTICE ROBERTS JUDGE
§8 PROVISION
(2012)                      3 CITATION
United States v. Lopez CASENAME
514 U. S. 549 CITATION
Affordable Care Act INSTRUMENT
Affordable Care Act INSTRUMENT
United States v. Comstock CASENAME
Affordable Care Act INSTRUMENT
CHIEF JUSTICE ROBERTS JUDGE
§8 PROVISION
Hooper v. California CASENAME
155 U. S. 648 CITATION
NATIONAL FEDERATION OF INDEPENDENT JUDGE
BUSINESS v. SEBELIUS CASENAME
Crowell v. Benson CASENAME
285 U. S. 22

<b>In the section above, we can see the law's text displayed using displaCy. We have highlighted the cases, citations, provisions, laws, etc., in the text.</b>


In [9]:
#display_casename_citations_filtered(doc)
def get_casename_citations_filtered(doc):
    '''Takes a spacy doc object and returns a dictionary of cases using the blackstone nlp model
    args:
        doc: the spacy doc object
    returns:
        a dict mapping the text of every entity labelled CASENAME to the list
        of tokens in that entity; if the same text occurs more than once, the
        tokens of the last occurrence are kept
    '''
    return {
        ent.text: list(ent)
        for ent in doc.ents
        if ent.label_ == 'CASENAME'
    }

def get_actual_cases(case_list: dict) -> List:
    '''Takes the cases and removes some of the ones that are not cases,
    like the ones without a v in them.

    args:
        case_list: dict mapping a case name to the list of its tokens
    returns:
        the case names (dict keys, in dict order) that contain at least one
        token whose text is 'v.', tagged ADP with dependency 'prep'; each
        name appears at most once
    '''
    actual_cases = []
    for case_name, tokens in case_list.items():
        # any() stops at the first match, so a name holding several 'v.'
        # tokens is no longer appended once per matching token.
        if any(
            tok.text == 'v.' and tok.pos_ == 'ADP' and tok.dep_ == 'prep'
            for tok in tokens
        ):
            actual_cases.append(case_name)
    return actual_cases

def get_cases_from_doc(doc):
    '''Return the case names found in a spaCy doc.

    Collects the CASENAME entities with get_casename_citations_filtered and
    then filters them with get_actual_cases.
    '''
    casename_tokens = get_casename_citations_filtered(doc)
    return get_actual_cases(casename_tokens)

In [100]:
# Case names extracted from the Blackstone doc `bx`; later cells read `result_one`
result_one = get_cases_from_doc(bx)

In [99]:
def clean_case_text(case_list: List[str]) -> List[str]:
    '''Takes a list of cases and removes the newline characters at the end.

    args:
        case_list: the cases; every item is converted with str() first
    returns:
        a new list with the text of each case, trailing newline characters
        stripped
    '''
    # Convert and strip in one pass: previously the str() conversion was
    # thrown away and a one-shot generator was returned instead of a list.
    return [str(case).rstrip('\n') for case in case_list]

In [15]:
#make a dictionary with the text, and a retokenized version of the result
# Re-tokenize every case name with nlp2: maps the case name to the list of its nlp2 tokens
result_two = {i: [j for j in nlp2(i)] for i in result_one}




# Lower-cased token texts / part-of-speech tags used by the filter below.
# NOTE(review): 'financial outlook' contains a space but is compared against the
# text of one token at a time -- presumably it never matches; confirm.
barred_words = {'brief', 'financial outlook'}
barred_pos = {'NUM'}

clean_cases = []
for i, j in result_two.items():
   #print(i)
    #print(i, [(token.text, token.pos_, token.dep_) for token in j])
    for token in j:
        # The case name `i` is appended once for EVERY token that passes the
        # checks below, so a name is kept as soon as one token passes, and a
        # NUM token only skips that token (it does not reject the whole name).
        if token.pos_ not in barred_pos:
            if token.text.lower() not in barred_words:
                clean_cases.append(i)
            elif token.text.lower() in barred_words:
                # a barred word is only tolerated when it is not the ROOT of the name
                if token.dep_ != 'ROOT':
                    clean_cases.append(i)

case_counter = Counter(clean_cases)
# NOTE(review): result_two has one key per distinct name, so these counts are the
# number of tokens that passed the filter for each name, not how many times the
# case is cited in the document -- confirm what is intended.
print(case_counter)
#print(clean_cases)
# De-duplicate the names (set order is arbitrary, so the printed order can vary)
clean_cases = list(set([str(i) for i in clean_cases]))
print(clean_cases)

Counter({'Office of Management and Budget, Historical Tables, Budget of\n': 12, 'Heart of Atlanta Motel, Inc. v. United\n': 9, 'American Power & Light Co. v. SEC,\n': 9, 'Heart of Atlanta Motel, Inc. v. United States': 9, 'Table 12.3—Total Outlays for Grants to State and Local Gov-': 9, 'United States v. Detroit Timber & Lumber Co.': 8, 'Ayotte v. Planned Parenthood of Northern New Eng': 8, 'College Savings Bank v. Florida Prepaid Postsecond-\n': 8, 'Center for Applied Ethics, Voluntary Health\n': 8, 'United States v. Reorganized CF&I Fabricators of Utah': 8, 'Brief for National Health Law Program et al.': 8, 'Metropolitan Washington Airports Authority v. Citizens for\n': 8, 'Temporary Assistance for Needy Families (TANF)': 8, 'Enochs v. Williams Packing & Nav.': 7, 'Department of Revenue of Mont. v.\n': 7, 'Nelson v. Sears, Roebuck & Co.': 7, 'Cherokee Nation v. Southern Kansas R. Co.': 7, 'Perez v. United States, O. T.': 7, 'Bowen v. Public Agencies Opposed to\n': 7, 'NLRB v. Jones &

In [16]:
#def remove_statues_from_cases(case_list: List, law_doc: Any, lay_doc: Any):
#    '''Takes a list of cases and removes the ones that are actually statutes'''
#    case_dict = {case: []}
#    pass
#print(result_two)

In [17]:
#judges = [i for i in get_judges(bx, doc)]
#for i in judges:
#    for j in i:
#        print(i.text, [j.text for j in i], end='\t')

In [None]:
def remove_statutes(case_list: List):
    '''Removes statutes, each of which has a numeric element.

    NOTE: not implemented yet -- the body consists of this docstring only,
    so calling the function currently returns None.
    '''

In [None]:
#how to get the cases
#get_actual_cases(get_casename_citations_filtered(doc))

In [29]:
#a = ''
#dir(a.ljust(20))

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmod__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'capitalize',
 'casefold',
 'center',
 'count',
 'encode',
 'endswith',
 'expandtabs',
 'find',
 'format',
 'format_map',
 'index',
 'isalnum',
 'isalpha',
 'isdecimal',
 'isdigit',
 'isidentifier',
 'islower',
 'isnumeric',
 'isprintable',
 'isspace',
 'istitle',
 'isupper',
 'join',
 'ljust',
 'lower',
 'lstrip',
 'maketrans',
 'partition',
 'replace',
 'rfind',
 'rindex',
 'rjust',
 'rpartition',
 'rsplit',
 'rstrip',
 'split',
 'splitlines',
 'startswith',
 'strip',
 'swapcase',
 'title',
 'translate',
 'upper',
 'zfill']

In [None]:
# NOTE(review): `doc` is not assigned in any visible cell (the Blackstone doc
# built earlier is `bx`) -- confirm which document is meant here.
cases_found = get_cases_from_doc(doc)
# get_cases_from_doc returns the case names as plain strings, which have no
# `.label_`; every name comes from an entity labelled CASENAME, so pair each
# name with that label explicitly.
print('Cases found: ', [(name, 'CASENAME') for name in cases_found])

In [21]:
#print(dir(spacy.tokens.Token))
#for i in cases_found:
#    for token in i:
#        print(token.text, token.pos_, token.dep_, [i for i in token.lefts], [i for i in token.rights])

In [385]:
@dataclass
class CaseWithNLP:
    '''A case name bundled with spaCy tokens for the name and its parties.

    NOTE(review): no visible cell constructs this class, so the field notes
    below are inferred from the field names -- confirm against the caller.
    '''
    case_name: str  # the case name as plain text
    name_token: spacy.tokens.Token  # presumably a token taken from the case name -- TODO confirm which one
    plaintiff: spacy.tokens.Token  # presumably the party named before 'v.' -- TODO confirm
    respondant: spacy.tokens.Token  # presumably the party named after 'v.'; spelling (sic) kept because renaming the field would change the interface
    

In [388]:
def get_case_list(doc):
    '''Return the case names found in ``doc``.

    Thin wrapper that delegates to get_cases_from_doc.
    '''
    return get_cases_from_doc(doc)

In [390]:
# NOTE(review): neither `make_case_with_nlp` nor `doc` is defined in any cell
# visible here -- confirm they are defined elsewhere, otherwise this cell fails
# on a fresh kernel.
res = make_case_with_nlp(doc)
print(res)

[United States v. Standard Oil Co., Dalehite v. United States,]


In [None]:
def 