In [1]:
#default things to import

#test using BlackstoneNLP

#import standard library modules
import sys
import json
from collections import Counter
from typing import List, Any

#modules from the community
import spacy
from dataclasses import dataclass #backported this module from 3.7

#import lxml

from spacy import displacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler



from blackstone.displacy_palette import ner_displacy_options

#blackstone improved citations
from blackstone.pipeline.sentence_segmenter import SentenceSegmenter
from blackstone.rules import CITATION_PATTERNS

import en_core_web_sm


In [2]:
# Primary pipeline: the 'en_blackstone_proto' model, loaded through spaCy.
nlp = spacy.load('en_blackstone_proto')

In [3]:
# Second pipeline, loaded from the en_core_web_sm package; kept separate from `nlp`.
nlp2 = en_core_web_sm.load()

In [4]:
# Blackstone sentence segmenter, built from the model's vocab and CITATION_PATTERNS.
sentence_segmenter = SentenceSegmenter(nlp.vocab, CITATION_PATTERNS)

# Rule-based entity matcher bound to `nlp`; the single active pattern labels any
# token whose lower-cased form is "congress" as LEGISLATURE.
ruler = EntityRuler(nlp)
#patterns = [{"label": "LEGISLATURE", "pattern": "Congress"}]
patterns2 = [{"label": "LEGISLATURE", "pattern": [{"LOWER": "congress"}], "id": "LEGISLATURE"}]
            #{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]}]
ruler.add_patterns(patterns2)
# Insert the custom components into `nlp`: the segmenter ahead of the parser and
# the ruler ahead of the NER component.
# NOTE(review): this cell mutates `nlp` in place, so running it a second time on
# the same kernel tries to add both components again — presumably spaCy rejects
# the duplicate names; reload `nlp` (or restart the kernel) first. TODO confirm.
nlp.add_pipe(sentence_segmenter, before="parser")
nlp.add_pipe(ruler, before='ner')

In [5]:
#get the text into a variable from the .txt file
def text_from_file(filename: str, encoding: str = None) -> str:
    """Read a whole text file and return its contents as a single string.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform's default encoding, which is what this function
            always used before; pass e.g. ``'utf-8'`` when the file contains
            non-ASCII characters and must read the same on every machine.

    Returns:
        The complete file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid in the chosen encoding.
    """
    with open(filename, 'r', encoding=encoding) as in_file:
        return in_file.read()

# Raw text of the case, read from 'quanta_case_text.txt' relative to the
# working directory.
quanta=text_from_file(filename='quanta_case_text.txt')
#bx=spacy.load(booker)

In [6]:
#del booker
# Parse the same case text with both pipelines: `bx` comes from the Blackstone
# pipeline (`nlp`), `doc` from the en_core_web_sm pipeline (`nlp2`).
bx = nlp(quanta)
doc = nlp2(quanta)

In [14]:
def find_chunk(doc):
    """Return the phrase that ends at the first direct object in ``doc``.

    The phrase runs from the leftmost token of the direct object's syntactic
    subtree up to and including the direct object itself (e.g. "a green apple"
    for "I want a green apple from you").

    Args:
        doc: A parsed spaCy ``Doc`` (or any sliceable sequence of tokens that
            expose ``dep_``, ``i`` and ``left_edge``).

    Returns:
        The slice of ``doc`` covering the phrase, or the empty string ``''``
        when no token has the ``dobj`` dependency label. Callers detect the
        "not found" case with ``str(chunk) == ''``.
    """
    for position, token in enumerate(doc):
        if token.dep_ == 'dobj':
            # Distance from the direct object back to the leftmost token of its
            # subtree. Counting *all* children (as was done before) over-shoots
            # when the object has modifiers to its right and under-shoots when
            # a modifier has modifiers of its own.
            offset = token.i - token.left_edge.i
            start = max(position - offset, 0)
            # Stop at the first direct object: that is the one the question
            # generator rewrites.
            return doc[start:position + 1]
    return ''

def determine_question_type(chunk):
    """Classify the question to ask about a direct-object phrase.

    Args:
        chunk: Iterable of tokens exposing ``dep_`` (an empty iterable, such as
            the empty string, is accepted).

    Returns:
        ``'info'`` if any token in ``chunk`` is an adjectival modifier
        (``dep_ == 'amod'``), otherwise ``'yesno'``.
    """
    has_adjective = any(tok.dep_ == 'amod' for tok in chunk)
    return 'info' if has_adjective else 'yesno'

def generate_question(doc, question_type, nlp_pipeline=None):
    """Turn a first-person statement into a question.

    The statement is rewritten in stages, re-parsing the text after each one:

    1. The text from the first "<pronoun> <verb>" pair (tags ``PRP`` + ``VBP``)
       onwards is prefixed with "do" ("I want ..." -> "do I want ...").
    2. The first pronoun "I" is replaced by "you".
    3. For ``question_type == 'info'`` the first direct object is replaced by
       "one" and the sentence is prefixed with "why".
    4. The first word is capitalised and the final token is replaced by "?".

    Args:
        doc: Parsed statement; tokens must expose ``tag_``, ``dep_`` and
            ``text``, and slices must expose ``text``.
        question_type: ``'info'`` or ``'yesno'`` (any value other than
            ``'info'`` skips stage 3).
        nlp_pipeline: Optional callable that parses a string into a document.
            Defaults to the notebook-level ``nlp`` pipeline.

    Returns:
        The generated question, or ``''`` when ``doc`` contains no
        "<pronoun> <verb>" pair to build a question from.
    """
    # Only touch the notebook-level pipeline when the caller did not supply one.
    parse = nlp if nlp_pipeline is None else nlp_pipeline

    # Stage 1: stop one token short so doc[i + 1] always exists.
    sent = ''
    for i in range(len(doc) - 1):
        if doc[i].tag_ == 'PRP' and doc[i + 1].tag_ == 'VBP':
            # Keep the pronoun itself (doc[i]); the rest of the sentence
            # follows from the verb onwards.
            sent = 'do ' + doc[i].text + ' ' + doc[i + 1:].text
            break
    if not sent:
        # Nothing matched: there is no text to rewrite, and indexing into an
        # empty parse further down would fail.
        return ''

    # Stage 2: indices must come from the freshly parsed text, not from any
    # other document.
    doc = parse(sent)
    for i, token in enumerate(doc):
        if token.tag_ == 'PRP' and token.text == 'I':
            sent = doc[:i].text + ' you ' + doc[i + 1:].text
            break

    # Stage 3.
    doc = parse(sent)
    if question_type == 'info':
        for i, token in enumerate(doc):
            if token.dep_ == 'dobj':
                sent = 'why ' + doc[:i].text + ' one ' + doc[i + 1:].text
                break

    # Stage 4: the last token is dropped on the assumption that it is the
    # sentence-final punctuation mark.
    doc = parse(sent)
    sent = doc[0].text.capitalize() + ' ' + doc[1:len(doc) - 1].text + '?'
    return sent

#for ent in bx.ents:
#    print(ent.text, ent.label_)
#dir(spacy.lang)

#booker_string = text_from_file('booker_train.txt')
#import os
#os.chdir(model_data_path)
#os.listdir()
#test_string = """There was before us no dispute as to the relevant statutory scheme or the law as the judge had to apply it. There was no dispute but that the judge had to consider in particular the circumstances in which the evidence came to be made (see section 114(2)(d)), the reliability of the witness Wilson (section 114(2)(e)) and how reliable the making of the statement appears to be (section 114(2)(f)). There was no dispute between the parties that the judge was bound to apply section 114(2) in considering the propriety of reading the transcripts pursuant to section 116 (see R v Cole & Ors [2008] 1 Cr App R No 5, paragraph 6, 7 and 21). Quite apart from those specific provisions the ultimate consideration had to be and remains the fairness of allowing that course to be adopted as Pitchford LJ said in R v Ibrahim [2010] EWCA Crim 1176"""

In [8]:
#sents = [i for i in bx.sents]
def check_sentence(sents):
    """Generate and print a question for one sentence.

    Args:
        sents: The sentence as a string, or a sequence of sentence strings, in
            which case only the first item is used.

    Returns:
        None. The outcome is printed: the generated question, a notice that
        the sentence has no direct object, or a notice that no sentence was
        supplied (empty or whitespace-only input).
    """
    # A plain string IS the sentence; indexing it would yield only its first
    # character, which can never contain a direct object.
    if isinstance(sents, str):
        sentence = sents
    else:
        sentence = str(sents[0]) if len(sents) > 0 else ''

    if not sentence.strip():
        print('You did not submit a sentence!')
        return

    # Use the notebook-level pipeline (the same one generate_question parses
    # with) rather than reloading the model from disk on every call.
    doc = nlp(sentence)
    chunk = find_chunk(doc)
    if str(chunk) == '':
        print('The sentence does not contain a direct object.')
        return
    question_type = determine_question_type(chunk)
    question = generate_question(doc, question_type)
    print(question)

In [10]:
# Every sentence of the Blackstone-parsed document as a plain string.
sents = [str(i) for i in bx.sents]
# Drop the first 25 sentences.
# NOTE(review): 25 is an unexplained offset — presumably skips front matter; confirm.
sents = sents[25:]

In [13]:
#print(sents[1])
#for token in sents[1]:
    #print(token.text, token.dep_, token.pos_)
# Run the question generator on a single sentence string (index 12 of the
# trimmed sentence list).
check_sentence(sents=sents[12])

The sentence does not contain a direct object.


In [49]:
#for i in sents[:10]:
#    print(i)
   # print(determine_question_type(i))
    #print(generate_question(doc=nlp, sent=i))

In [62]:
#display_casename_citations_filtered(doc)
def get_casename_citations_filtered(doc):
    '''Collect the CASENAME entities of a parsed document.

    args:
        doc: the spacy doc object (its ``ents`` are scanned)
    returns:
        dict mapping each CASENAME entity's text to the list of tokens that
        make up that entity; when the same text occurs more than once, the
        tokens of the last occurrence are kept
    '''
    tokens_by_case = {}
    for ent in doc.ents:
        if ent.label_ != 'CASENAME':
            continue
        tokens_by_case[ent.text] = list(ent)
    return tokens_by_case

def get_actual_cases(case_list: dict, separators=('v.',)) -> List:
    '''Keep only the case names that contain a party separator such as "v.".

    A name qualifies when at least one of its tokens has a text listed in
    ``separators`` and is tagged as an adposition (``pos_ == 'ADP'``) with the
    ``prep`` dependency label.

    args:
        case_list: dict mapping a case name to the tokens that make it up
        separators: collection (e.g. tuple or set) of token texts accepted as
            the separator between the parties; defaults to ``('v.',)``. Pass
            e.g. ``('v', 'v.')`` to also accept names written without the
            full stop.
    returns:
        list of qualifying case names, in the order of ``case_list``; each
        name appears once, however many separator tokens it contains
    '''
    actual_cases = []
    for name, tokens in case_list.items():
        # any() stops at the first match, so a name with several separator
        # tokens is not appended more than once.
        if any(tok.text in separators and tok.pos_ == 'ADP' and tok.dep_ == 'prep'
               for tok in tokens):
            actual_cases.append(name)
    return actual_cases

def get_cases_from_doc(doc):
    '''Return the filtered case names found in a parsed document.

    Collects the CASENAME entities of ``doc`` with
    get_casename_citations_filtered and passes them through get_actual_cases.
    '''
    casename_tokens = get_casename_citations_filtered(doc)
    return get_actual_cases(casename_tokens)

In [68]:
# Filtered case names found in the Blackstone-parsed document.
result_one = get_cases_from_doc(bx)
# NOTE(review): clean_case_text is not defined in the visible part of this
# notebook — confirm it exists before re-enabling the line below.
#result_one = list(clean_case_text(result_one))
#print(result_one)

In [71]:
# List every entity of the Blackstone-parsed document with its label.
for ent in bx.ents:
    print(ent.text, ent.label_)

section 114(2)(d) PROVISION
section 114(2)(e) PROVISION
section 114(2)(f) PROVISION
section 114(2) PROVISION
section 116 PROVISION
R v Cole & Ors CASENAME
[2008] 1 Cr App R No 5 CITATION
Pitchford LJ JUDGE
R v Ibrahim CASENAME
[2010] EWCA Crim 1176 CITATION
