In [15]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init()

In [17]:
# !pip install -q -r ../requirements.txt

In [43]:
import metapack as mp
pkg = mp.open_package('http://library.metatab.org/sdcta.org-hl_contracts-1.1.1.zip')

# Create Dataframes
ann = pkg.resource('annotations').dataframe()
ann['length'] = ann.text.str.len()
ctx = pkg.resource('contexts').dataframe().set_index('part')

In [40]:
ann.head().T

Unnamed: 0,0,1,2,3,4
classid,e_7,e_19,e_1,e_6,e_4
part,s1v1,s1v1,s1v1,s1v1,s1v1
offset_start,33,33,127,153,340
text,COUNTY OF SAN DIEGO,COUNTY OF SAN DIEGO- DEPARTMENT OF PURCHASINGA...,Interfaith Shelter Network,.,Statement ofWork
coordinates,[],,[],[],[]
confidence,,pre-added,,pre-added,
confidence_prob,0.975784,1.0,0.813213,0.5,0.900681
fields,{},{},{},{},{}
normalizations,{},{},{},{},{}
who,ml:nalaf,user:SDTEF,ml:nalaf,ml:regex,ml:nalaf


In [61]:
# Word tokenization

from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from difflib import SequenceMatcher
import re


# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

# Add the component to the pipeline
nlp.add_pipe('sentencizer')

def mark_sentences(idx,r, ctx):

    
    ctx_r = ctx.loc[r.part]

    text = re.sub(r"\n+","\n", ctx_r.context).lower()

    #  "nlp" Object is used to create documents with linguistic annotations.
    doc = nlp(text)

    spn = doc[r.offset_start: r.offset_start+r.length]

    # Create list of word tokens
    #token_list = [tok for tok in doc if tok.text not in STOP_WORDS]
    #print(token_list[:30])

    #print(spn)
    #print('++++')

    # Minimum size of block to match
    msize = int(len(spn)*.2)

    rows = []
    # Extract sentences and compute the overlap with the tagged text
    for s in doc.sents:

        sm = SequenceMatcher(None, spn.text, s.text)
        mb = [e for e in sm.get_matching_blocks() if e.size >= msize]

        anno_type = r.anno_type if len(mb) > 0 else None
       
        
        rows.append( (idx, r.part, s.text, anno_type) )
                        
    return rows

mark_sentences(4, ann.iloc[4], ctx)


[(4,
  's1v1',
  "sd cnty purch '15 :fp 14pm04:12\ncounty of san diego- department of purchasingand contracting contract 539655 amendment 10\nto interfaith shelter network.",
  'service'),
 (4,
  's1v1',
  'pursuant to the contract changes clause, you are directed to make the changes described herein to the contract or do the following described work not included in the previous agreed on statement ofwork.',
  None),
 (4,
  's1v1',
  '\ntitle of contract, temporary shelter network services\neffective date: 09/15115 description of contract change(s) and/or work to be done:\n• modify exhibit a and exhibit c to reflect the level of effort associated with fy 15-16 funding. •',
  None),
 (4,
  's1v1',
  'modify exhibit a to reflect a revision in language regarding health insurance.',
  'service'),
 (4,
  's1v1',
  '\nstatement of work:\n• add section 3.8 to read in its entirety as follows:\n3.8 bed nights: contractor shall provide a minimum of two thousand sixty nine (2068) nights of lodgin

In [62]:
from itertools import chain
x = [mark_sentences(idx, row, ctx) for idx, row in ann.iterrows()]

In [64]:
df = pd.DataFrame( chain(*x), columns='idx part text anno_type'.split()) 

In [22]:

import en_core_web_lg
# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_lg.load()

nlp.add_pipe('textcat', last=True)
textcat = nlp.get_pipe('textcat')

for at in ann.anno_type.unique():
    textcat.add_label(at)

nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'textcat']

In [None]:

def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 1e-8  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

#("Number of texts to train from","t" , int)
n_texts=30000
#You can increase texts count if you have more computational power.

#("Number of training iterations", "n", int))
n_iter=10

In [None]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat

In [19]:
# add label to text classifier
textcat.add_label('POSITIVE')

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [9]:
doc= nlp(text)
entities=[(i, i.label_, i.label) for i in doc.ents if i.label_ not in ('CARDINAL',) ]
entities

[(san diego, 'ORG', 383),
 (a-79, 'ORG', 383),
 ($14 million, 'MONEY', 394),
 (san diego, 'GPE', 384),
 (california, 'GPE', 384),
 (california, 'GPE', 384),
 (san diego, 'GPE', 384),
 (3) year period, 'DATE', 391),
 (3) year period, 'DATE', 391),
 (five business days, 'DATE', 391),
 (irs, 'ORG', 383)]

In [10]:
import spacy
spacy.displacy.render(doc, style='ent', jupyter=True)

In [11]:
ents = []
for idx, row in ctx.iterrows():
    text = row.context
    doc = nlp(text.lower())
    entities=[(i, i.label_, i.label) for i in doc.ents if i.label_ not in ('CARDINAL',) ]
    ents.extend(entities)

In [12]:
t = set([ e[0:2] for e in ents])
set(label  for tok, label in t)

{'DATE',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART'}

In [13]:
set([tok.text  for tok, label in t if label == 'NORP'])

{'african american',
 'american',
 'americans',
 'catholic',
 'd.',
 'hawaiian',
 'hispanic',
 'hispanics',
 'lgbtq',
 'non-hispanic',
 'spanish'}

In [14]:
set([tok.text  for tok, label in t if label == 'GPE'])

{'2950 el cajon bl~',
 '6,600.00',
 '\\)\\',
 'ab',
 'accountina',
 'angeles',
 'angeles county',
 'ave',
 'ave los angeles',
 'ave.',
 'beverly hills',
 'budget2014-15',
 'california',
 'california landlord!tenant',
 'camino del rio south address',
 'carlsbad',
 'culver city',
 'delaware',
 'diego',
 'east county',
 'el cajon',
 'el cajon blvd',
 'el cajon city',
 "el cajon's",
 'escondido',
 'fremont',
 'fresno',
 'fresno county',
 'ga',
 'http://www.sdcounty.ca.gov/hhsa/programs/phs/tuberculosis_control_program/guidelines_additional_resources.html',
 'los angeles',
 'los angeles county',
 'm.d.',
 'marengo st',
 'marengo st\n\n',
 'maryland',
 'md',
 'n.a.',
 'north county',
 'north county lifeline',
 'oceanside',
 'oflos angeles',
 'p.l.',
 'p.o.',
 'pennsylvania',
 'philadelphia',
 'portland',
 'poway',
 'redondo beach',
 'san',
 'san dibqo',
 'san diego',
 'san diego -health',
 'san diego city',
 'san diego county',
 'san diego county\n\n',
 "san diego's",
 'san francisco',
 'san