In [1]:
import pandas as pd 
import numpy as np 
import spacy

from spacy.tokens import Doc, DocBin
from spacy.matcher import PhraseMatcher
from spacy import displacy

from tqdm.notebook import tqdm
from collections import Counter, defaultdict
import pickle
from pathlib import Path
import random
from joblib import Parallel, delayed

from collections import Counter
import phonetics 


from itertools import islice
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
%matplotlib inline

Doc.set_extension("pdf_path", default=None)


spacy.__version__

'3.1.1'

In [2]:
# The documents saved in a previous notebook 

#!  python -m spacy download en_core_web_sm
model = "en_core_web_lg"

text_path  = Path('texts.pkl')
docs_path  = Path(f'docs-{model}.spacy')
nlp_path = Path(f'nlp-{model}.spacy')

nlp = spacy.load(model)
nlp.max_length = 3000000

nlp = nlp.from_disk(nlp_path)
doc_bin = DocBin().from_disk(docs_path)
docs = list(doc_bin.get_docs(nlp.vocab))

In [3]:
%%time 

import re
p = re.compile(r'\W+')

docv = list(docs)

wd = defaultdict(list)

for d in docv:
    for e in  d.ents:
        if not str(e).isnumeric():
            wd[e.label_].append(p.sub(' ', str(e.lemma_)).lower() )

cd = defaultdict(Counter)

for k, v in wd.items():
    cd[k].update(v)


CPU times: user 7.14 s, sys: 26.1 ms, total: 7.17 s
Wall time: 7.2 s


In [4]:
%%time
# Make a single combined document
from spacy.tokens import Doc
cdocs = Doc.from_docs(docv)



CPU times: user 37.9 s, sys: 3.39 s, total: 41.3 s
Wall time: 41.7 s


In [5]:
ents = cdocs.ents
labels = { e.label_ for e in ents }
len(labels), labels

(18,
 {'CARDINAL',
  'DATE',
  'EVENT',
  'FAC',
  'GPE',
  'LANGUAGE',
  'LAW',
  'LOC',
  'MONEY',
  'NORP',
  'ORDINAL',
  'ORG',
  'PERCENT',
  'PERSON',
  'PRODUCT',
  'QUANTITY',
  'TIME',
  'WORK_OF_ART'})

In [6]:
# Create a full entities dataset
rows = []
for ent in ents:
    p = tuple(t.lemma_.lower() for t in list(ent) if  t.is_alpha and not t.is_stop)
    if p:
        rows.append([ent.label_, ' '.join(p), p, len(p), ent, ent.start, ent.end, ent.sent])
    
ent_df = pd.DataFrame(rows,columns='label pstr p plen ent start end sentence'.split())
#t['sentence']  = t.sentence.str.replace('\n',' ')

In [7]:
ent_df.head()

Unnamed: 0,label,pstr,p,plen,ent,start,end,sentence
0,GPE,county san diego,"(county, san, diego)",3,"(the, County, of, San, Diego)",53,58,"(This, agreement, (, "", Agreement, "", ), is, m..."
1,GPE,state california,"(state, california)",2,"(the, State, of, California)",63,67,"(This, agreement, (, "", Agreement, "", ), is, m..."
2,ORG,alpha project,"(alpha, project)",2,"(Alpha, Project)",73,75,"(This, agreement, (, "", Agreement, "", ), is, m..."
3,NORP,homeless,"(homeless,)",1,(Homeless),77,78,"(This, agreement, (, "", Agreement, "", ), is, m..."
4,FAC,avenue,"(avenue,)",1,"(3737, 5th, Avenue)",88,91,"(This, agreement, (, "", Agreement, "", ), is, m..."


In [8]:

def ents_by_type(typ):
    return [ e for e in ents if str(e.label_) == typ ]

orig_orgs = ents_by_type('DATE')
#x =random.sample(list(sorted(set())), 40)
orgs = list(set([ ' '.join([phonetics.metaphone(str(t).lower()) for t in list(e) if  t.is_alpha and not t.is_stop]) for e in orig_orgs ]))

orig_ents = ents_by_type('GPE')

p_ents = [ ' '.join([t.lemma_.lower() for t in list(e) if  t.is_alpha and not t.is_stop]) for e in orig_ents ]
u_ents = list(set(p_ents))

print(len(orig_ents), len(p_ents), len(u_ents))

Counter(p_ents).most_common(50)

91483 91483 3425


[('county', 37664),
 ('san diego', 10691),
 ('county san diego', 4815),
 ('california', 3761),
 ('san diego county', 3449),
 ('', 2366),
 ('state california', 2053),
 ('united states', 1266),
 ('escondido', 894),
 ('county san', 806),
 ('el cajon', 763),
 ('diego', 667),
 ('oceanside', 575),
 ('east county', 472),
 ('north county', 466),
 ('san', 465),
 ('ymca', 441),
 ('los angeles', 420),
 ('carlsbad', 390),
 ('united states america', 341),
 ('usa', 327),
 ('san marcos', 279),
 ('city', 269),
 ('md', 257),
 ('diego county', 235),
 ('qar', 233),
 ('city san', 224),
 ('del mar', 214),
 ('tenn', 195),
 ('city san diego', 195),
 ('conjunction', 195),
 ('san diego county code', 184),
 ('camino del rio south', 169),
 ('encinitas', 158),
 ('vista hill', 154),
 ('san bernardino', 146),
 ('america', 145),
 ('san diego city', 141),
 ('national city', 136),
 ('city el cajon', 136),
 ('city vista', 134),
 ('county contractors', 129),
 ('fresno', 129),
 ('walden', 126),
 ('la mesa', 117),
 ('nort

# Matching Requirements

In [9]:
%%time
import spacy
from spacy.matcher import PhraseMatcher
from itertools import islice
from tqdm.notebook import tqdm

phrase_matcher = PhraseMatcher(nlp.vocab)
phrases = ['contractor', 'sub-contractor', 'sub-recipient', 'recipient', 'party','offeror'
           'shall','must','will','require','ensure']
patterns = [nlp(text) for text in phrases]
phrase_matcher.add('contr_shall', None, *patterns)

contr_shall = {}

i = 0

contr_matches = {}
for doc in tqdm(list(docs)):
  
    for match_id, start, end in phrase_matcher(doc):
        span = doc[start:end]
        contr_matches[(id(doc), start, end)] = span.sent

len(contr_matches)
      

  0%|          | 0/876 [00:00<?, ?it/s]

CPU times: user 1.11 s, sys: 104 ms, total: 1.22 s
Wall time: 1.21 s


98261

In [10]:
import re
p = re.compile(r'(contractor|recipient|party|offeror).*(shall|must|will)')
cm = { k:v for k, v in contr_matches.items() if p.search(v.text.replace('\n',' ').lower()) }
        
len(   contr_matches), len(cm)

(98261, 23869)

In [11]:
from random import sample

samp = sample(list(contr_matches.values()), 40)

print('\n\n'.join(e.text.replace('\n',' ') for e in samp if 'shall' in e.text))

Nothing herein shall be construed to require access to any privileged or confidential information as set forth in Federal or State law.  (e)  SUBRECIPIENT shall allow CITY to annually audit all CDBG funds associated with the Project, pursuant to federal regulations found in Title 24 of the Code of Federal Regulations and other applicable federal laws and regulations.

Subrecipient understands that the Project Compensation will be paid from CDBG funds and such amounts shall be received and utilized  Only actual expenses incurred by Subrecipient are eligible for reimbursement, and then only to the extent solely for the Project pursuant to all of the terms and conditions of this Agreement.

Contractor shall also ensure that their staff will be trained in Copeland’s ‘Wellness Recovery Action Planning’ (WRAP®), SAMHSA’s ‘Illness Management and Recovery’ Implementation Resource Kit, and evidence-based social skills training, and the Contractor will incorporate these and other recoveryoriente

# Matching Emails and URLs

In [12]:
# Find emails and URLS
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

matcher.add("Email", [[{'LIKE_EMAIL': True}]])
matcher.add("Url", [[{'LIKE_URL': True, "TEXT": {"REGEX": "^https?:\/\/[^\/]+\.\w\w\w\/"}}]])

from itertools import islice 

matches = []

for doc in docs:
    for match_id, start, end in matcher(doc):
        string_id = nlp.vocab.strings[match_id]  # Get string representation
        span = doc[start:end]  # The matched span
        matches.append([match_id, string_id, start, end, span.text])

t = pd.DataFrame(matches, columns = 'match_id string_id start end text'.split())
t.head()
        

Unnamed: 0,match_id,string_id,start,end,text
0,12303433538686809003,Url,3780,3781,https://www.ecfr.gov/cgi-bin/text-idx?tpl=/ecf...
1,12303433538686809003,Url,3886,3887,http://www.gsa.gov/portal/category/21287
2,12303433538686809003,Url,11803,11804,http://www.sandiegocounty.gov/content/sdc/cao/...
3,11010771136823990775,Email,14508,14509,ACS.HHSA@sdcounty.ca.gov
4,12303433538686809003,Url,20662,20663,https://www.coveredca.com/


# Cities and The County


In [13]:
cities = """
Carlsbad
Chula Vista
Coronado
Del Mar
El Cajon
Encinitas
Escondido
Imperial Beach
La Mesa
Lemon Grove
National City
Oceanside
Poway
San Diego
San Marcos
Santee
Solana Beach
Vista
San Diego County
County of San Diego
"""


phrase_matcher = PhraseMatcher(nlp.vocab)
phrases = [ e for e in cities.splitlines() if e ]
patterns = [nlp(text) for text in phrases]
phrase_matcher.add('gpe', None, *patterns)

In [14]:
gpes = []
for doc in tqdm(list(islice(docs, 100))):
  
    for match_id, start, end in phrase_matcher(doc):
        span = doc[start:end]
        gpes.append(span.sent)

len(gpes)

  0%|          | 0/100 [00:00<?, ?it/s]

8535

In [15]:
list(gpes[10].doc.sents)[2].text.replace('\n',' ')

"RECITALS A.  Pursuant to Administrative Code section 401, the County's Director of the Department of Purchasing and Contracting is authorized to award a contract for Housing Navigation and Case Management Services for the Homeless program."

# Agreement Statement

In [16]:
cities = """
agreement
contract
ammendment
by and between
made and entered
corporation
enter into
entered into
as described herein
"""

phrase_matcher = PhraseMatcher(nlp.vocab)
phrases = [ e for e in cities.splitlines() if e ]
patterns = [nlp(text) for text in phrases]
phrase_matcher.add('agreement', None, *patterns)


agr_matches = {}
for doc in tqdm(list(docs)):
  
    for match_id, start, end in phrase_matcher(doc):
        span = doc[start:end]
        agr_matches[(id(doc), start, end)] = span.sent

print( f" {len(agr_matches)} Matches ")


  0%|          | 0/876 [00:00<?, ?it/s]

 26678 Matches 


In [17]:

c = Counter([ ' '.join(list(e.text for e in v[:5] if e.is_alpha)[:3])\
             .replace('\n','').strip().lower() 
             for k, v in agr_matches.items() ])
c.most_common(40)


[('any subcontract or', 578),
 ('contractor shall submit', 525),
 ('the following information', 433),
 ('this agreement agreement', 366),
 ('if contractor becomes', 352),
 ('', 303),
 ('contractor shall not', 292),
 ('if contractor after', 280),
 ('in the event', 280),
 ('contractor shall provide', 265),
 ('amendment no', 253),
 ('amendment d', 249),
 ('contractor shall include', 212),
 ('if contractor is', 203),
 ('background instructions the', 195),
 ('the name contract', 186),
 ('this indemnification agreement', 180),
 ('the contractor shall', 167),
 ('no subcontract utilizing', 163),
 ('this budget will', 157),
 ('this agreement together', 148),
 ('without limiting contractor', 131),
 ('contractor shall', 130),
 ('as a requirement', 123),
 ('county may in', 117),
 ('if coverage provided', 116),
 ('contractor shall notify', 114),
 ('gift cards that', 113),
 ('however if this', 110),
 ('no federal appropriated', 110),
 ('contractor', 108),
 ('you or', 105),
 ('byrd anti lobbying', 10

In [18]:
import re
p = re.compile(r'(agreement|contract).*\".*(made|entered|by|between).*'
               '(county|city|corporation|non-profit|nonprofit|municipal)')

mm = {}
for k, v in agr_matches.items():
    txt = v.text.replace('\n',' ').lower()
    first_three = list(e.text for e in v[:5] if e.is_alpha)[:3]
    
    if p.search(txt) and ('agreement' in first_three or 'contract' in first_three ):
        mm[k] = v


In [19]:
from random import choices 
values = list(mm.values())
k = len(values) / 10 
sample = choices(values, k=int(k))
len(sample)

41

In [20]:
import re
rows   = []
for (doc_id, start, end), sent in mm.items():
    rows.append({
        'text': re.sub(r'\s+', ' ', str(sent).strip()),
        'start': start,
        'end': end,
        'path':  sent.doc._.pdf_path
    })
    
t = pd.DataFrame(rows)
t = t.drop_duplicates(subset=['text','path'])
t.to_csv('agreement_statements.csv')

# Make docs for the agreement statements
ag_docs =  Doc.from_docs([nlp(v.text) for k,v in mm.items() ])

ag_ents = [ ' '.join([t.lower_ for t in e if t.is_alpha]) for e in ag_docs.ents if e.label_ == 'ORG' ]

c = Counter(ag_ents)
c.most_common(40)

[('offeror', 220),
 ('offeror company organization name', 112),
 ('the california public records act government code section', 68),
 ('the california public records act', 44),
 ('telecare corporation', 28),
 ('interfaith community services', 27),
 ('mcalister institute', 24),
 ('walden family services', 15),
 ('pan asian communities', 12),
 ('mental health systems', 10),
 ('deaf community services of san diego', 9),
 ('state', 9),
 ('interfaith shelter network', 9),
 ('vista hill', 8),
 ('national alliance', 8),
 ('community research foundation', 8),
 ('ac', 8),
 ('corporation for supportive housing', 8),
 ('serving seniors', 8),
 ('carroll', 7),
 ('mission treatment services', 6),
 ('escondido', 6),
 ('llc', 6),
 ('the state of californi', 6),
 ('contractor southern indian health council', 6),
 ('el dorado community service centers', 6),
 ('san diego health alliance', 6),
 ('south bay community services', 6),
 ('n madison avenue', 6),
 ('center for community solutions', 6),
 ('alpha p

In [21]:
contractors = {
 'alpha project',
 'anthem compassionate',
 'casa de amparo',
 'casa de oro ',
 'catholic charities',
 'center for community solutions',
 'community health improvement partners',
 'community research foundation',
 'community resource center',
 'southern indian health council',
 'healthright',
 'corporation for supportive housing',
 'deaf community services of san diego',
 'diocese of san diego',
 'el dorado community service centers',
 'family health centers of san diego',
 'fancor guest home',
 'home start',
 'interfaith',
 'legal aid society of san',
 'mcalister institute',
 'mental health systems',
 'mission treatment services',
 'nami',
 'national alliance',
 'new alternatives',
 'orlando guest home',
 'pan asian communities',
 'pathways community services',
 'public consulting group',
 'rescare workforce services',
 'san diego health alliance',
 'san diego youth services',
 'serving seniors',
 'south bay community services',
 'telecare corporation',
 'neighborhood house association',
 'upac',
 'urban street angels',
 'vista hill',
 'walden environment',
 'walden family services',
 'wing street',
 'offeror',
'YMCA',
'National Conflict Resolution Center',
'Prosperity Way',
'PATH',
'Exodus Recovery',
'Regional Task Force on the Homeless',
"Carroll's Community Care"}


In [22]:

def find_contractor(v):
    for c in contractors:
        if c.lower() in v.lower():
            return c
    return None

t['contractor'] = t.text.apply(find_contractor)
for idx, r in t[t.contractor.isnull()].iterrows()    :
    print(r.text)

This agreement ("Agreement") is made and entered into effective as of the date of the last signature on the signature page by and between the County of San Diego, a political subdivision of the State of California ("County") and Prime Healthcare Paradise Valley LLC, located at 2400 East 4th Street National City, CA 91950 ("Contractor"), with reference to the following facts:
This indemnification agreement ("Agreement") is made and entered into by and between the County of San Diego
This agreement ("Agreement") is made and entered into effective as of the date of the last signature on the signature p age by and between the County of San Diego, a political subdivision of the State of California ("County") and Casa El Cajon located at 306 Shady Lane, El Cajon, CA 92021 ("Contractor"), with reference to the following facts:
Let This agreement ("Agreement") is made and entered into effective July I, 2019, by and between the County of San Diego, a political subdivision of the State of Califo

# Funding Sources

In [23]:
%%time
import spacy
from spacy.matcher import PhraseMatcher
from itertools import islice
from tqdm.notebook import tqdm

phrase_matcher = PhraseMatcher(nlp.vocab)
phrases = ['HUD', 'HRSA', 'SAMHSA', 'CDBG','Medi-Cal','medicare']
patterns = [nlp(text) for text in phrases]
phrase_matcher.add('contr_shall', None, *patterns)

fund = {}

i = 0

matches = {}
for doc in tqdm(list(docs)):
  
    for match_id, start, end in phrase_matcher(doc):
        span = doc[start:end]
        matches[(id(doc), start, end)] = span.sent

len(matches)

  0%|          | 0/876 [00:00<?, ?it/s]

CPU times: user 567 ms, sys: 6.44 ms, total: 574 ms
Wall time: 570 ms


12290

In [24]:
samp = sample(list(matches.values()), 40)

for e in samp:
    print(e)
    print('-----')

TypeError: 'list' object is not callable