In [1]:
import pandas as pd 
import numpy as np 
import spacy

from spacy.tokens import Doc, DocBin
from spacy.matcher import PhraseMatcher
from spacy import displacy

from tqdm.notebook import tqdm
from collections import Counter, defaultdict
import pickle
from pathlib import Path
import random
from joblib import Parallel, delayed

from collections import Counter
import phonetics 


from itertools import islice
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
%matplotlib inline

Doc.set_extension("pdf_path", default=None)


spacy.__version__

'3.1.1'

In [2]:
# The documents saved in a previous notebook 

#!  python -m spacy download en_core_web_sm
model = "en_core_web_lg"

text_path  = Path('texts.pkl')
docs_path  = Path(f'docs-{model}.spacy')
nlp_path = Path(f'nlp-{model}.spacy')

nlp = spacy.load(model)
nlp.max_length = 3000000

nlp = nlp.from_disk(nlp_path)
doc_bin = DocBin().from_disk(docs_path)
docs = list(doc_bin.get_docs(nlp.vocab))

In [3]:
# Map of types of entities. 
import re
p = re.compile(r'\W+')

docv = list(docs)

wd = defaultdict(list)

for d in docv:
    for e in  d.ents:
        if not str(e).isnumeric():
            wd[e.label_].append(p.sub(' ', str(e.lemma_)).lower() )

cd = defaultdict(Counter)

for k, v in wd.items():
    cd[k].update(v)


In [4]:
%%time
# Make a single combined document
from spacy.tokens import Doc
cdocs = Doc.from_docs(docv)



CPU times: user 48.7 s, sys: 10.8 s, total: 59.5 s
Wall time: 1min 4s


In [7]:
agrmt = pd.read_csv('agreement_statements.csv').iloc[:,1:]
agrmt.head()

Unnamed: 0,type,contractor,text,hash,path
0,contract,Alpha Project,"This agreement (""Agreement"") is made and enter...",469993d856132848e5ad869941392ed13cbf6c17297a18...,/Users/eric/proj/data-projects/text-classifica...
1,contract,Nami,"This agreement (""Agreement"") is made and enter...",6676b44941458a4809511372282f59e46080ebacd87911...,/Users/eric/proj/data-projects/text-classifica...
2,contract,,"This Agreement (""AgreemenC') is made and enter...",43d2490bda4f4203580b70f41bcc32b1b9be419a91b34c...,/Users/eric/proj/data-projects/text-classifica...
3,indemnification,Mcalister Institute,"This indemnification agreement (""Agreement"") i...",46e4bfeaa01c20f6e5d240872ade6464f69f331610fd55...,/Users/eric/proj/data-projects/text-classifica...
4,contract,Deaf Community Services Of San Diego,"This agreement (""Agreement"") is made and enter...",5bcf9f12da1486f1c934dea3236b0bba80fc0e80551757...,/Users/eric/proj/data-projects/text-classifica...


In [8]:
types = agrmt[agrmt.type=='contract'][['type','contractor','path']].drop_duplicates()

In [9]:
paths = set(agrmt['path'])
not_agr = set()
i = 0
for doc in docs:
    pp  = doc._.pdf_path
    if pp not in paths:
        i += 1
        not_agr.add(pp)
        
        if i < 10:
            print(list(doc.sents)[:5])
        

[COUNTY OF SAN DIEGO- DEPARTMENT OF PURCHASING AND CONTRACTING
CONTRACT NO., 552704 AMENDMENT NO., M
Community Health Improvement Pu1nen ("Contractor") and the County ofSan Diego «'County'') enter ioto this amendment
("Amendment") to emend the above·refcrenced conuact ("Contract") es described herein., 

, Title of Contract: Jndependent Living Association (lLA) and Recovery Residence Auodadon (RRA)
Amendment Effecdve Date: Date Signed by die Department of Purcha1lng & Conlracdog
Description ofCooCiact Cbnnge(s}:
1.

]
[COUNTY OF SAN DIEGO - DEPARTMENT OF PURCHASING AND CONTRACTING
CONTRACT NO. 556355 AMENDMENT NO. 3
VISTA HILL FOUNDATION ("Contractor") and the County of San Diego ("County") enter into this amendment ("Amendment")
to amend the above-referenced contract ("Contract") as described herein., 
, Title of Contract: SUD Outpatient Treatment Program

, Amendment Effective Date: January 1, 2020

, Description of Contract Change(s):
1.

]
[COUNTY OF SAN DIEGO - DEPARTMENT OF PURCH

In [10]:
ents = cdocs.ents
labels = { e.label_ for e in ents }
len(labels), labels

(18,
 {'CARDINAL',
  'DATE',
  'EVENT',
  'FAC',
  'GPE',
  'LANGUAGE',
  'LAW',
  'LOC',
  'MONEY',
  'NORP',
  'ORDINAL',
  'ORG',
  'PERCENT',
  'PERSON',
  'PRODUCT',
  'QUANTITY',
  'TIME',
  'WORK_OF_ART'})

In [11]:
# Create a full entities dataset
rows = []
for ent in ents:
    p = tuple(t.lemma_.lower() for t in list(ent) if  t.is_alpha and not t.is_stop)
    if p:
        rows.append([ent.label_, ' '.join(p), p, len(p), ent, ent.start, ent.end, ent.sent])
    
ent_df = pd.DataFrame(rows,columns='label pstr p plen ent start end sentence'.split())
#t['sentence']  = t.sentence.str.replace('\n',' ')

In [12]:
ent_df.head()

Unnamed: 0,label,pstr,p,plen,ent,start,end,sentence
0,GPE,county san diego,"(county, san, diego)",3,"(the, County, of, San, Diego)",53,58,"(This, agreement, (, "", Agreement, "", ), is, m..."
1,GPE,state california,"(state, california)",2,"(the, State, of, California)",63,67,"(This, agreement, (, "", Agreement, "", ), is, m..."
2,ORG,alpha project,"(alpha, project)",2,"(Alpha, Project)",73,75,"(This, agreement, (, "", Agreement, "", ), is, m..."
3,NORP,homeless,"(homeless,)",1,(Homeless),77,78,"(This, agreement, (, "", Agreement, "", ), is, m..."
4,FAC,avenue,"(avenue,)",1,"(3737, 5th, Avenue)",88,91,"(This, agreement, (, "", Agreement, "", ), is, m..."


In [13]:

def ents_by_type(typ):
    return [ e for e in ents if str(e.label_) == typ ]

orig_orgs = ents_by_type('DATE')
#x =random.sample(list(sorted(set())), 40)
orgs = list(set([ ' '.join([phonetics.metaphone(str(t).lower()) for t in list(e) if  t.is_alpha and not t.is_stop]) for e in orig_orgs ]))

orig_ents = ents_by_type('GPE')

p_ents = [ ' '.join([t.lemma_.lower() for t in list(e) if  t.is_alpha and not t.is_stop]) for e in orig_ents ]
u_ents = list(set(p_ents))

print(len(orig_ents), len(p_ents), len(u_ents))

Counter(p_ents).most_common(50)

91483 91483 3425


[('county', 37664),
 ('san diego', 10691),
 ('county san diego', 4815),
 ('california', 3761),
 ('san diego county', 3449),
 ('', 2366),
 ('state california', 2053),
 ('united states', 1266),
 ('escondido', 894),
 ('county san', 806),
 ('el cajon', 763),
 ('diego', 667),
 ('oceanside', 575),
 ('east county', 472),
 ('north county', 466),
 ('san', 465),
 ('ymca', 441),
 ('los angeles', 420),
 ('carlsbad', 390),
 ('united states america', 341),
 ('usa', 327),
 ('san marcos', 279),
 ('city', 269),
 ('md', 257),
 ('diego county', 235),
 ('qar', 233),
 ('city san', 224),
 ('del mar', 214),
 ('tenn', 195),
 ('city san diego', 195),
 ('conjunction', 195),
 ('san diego county code', 184),
 ('camino del rio south', 169),
 ('encinitas', 158),
 ('vista hill', 154),
 ('san bernardino', 146),
 ('america', 145),
 ('san diego city', 141),
 ('national city', 136),
 ('city el cajon', 136),
 ('city vista', 134),
 ('county contractors', 129),
 ('fresno', 129),
 ('walden', 126),
 ('la mesa', 117),
 ('nort

# Matching Requirements

In [14]:
# Match to words that indicate referencing a contractor
import spacy
from spacy.matcher import PhraseMatcher
from itertools import islice
from tqdm.notebook import tqdm

phrase_matcher = PhraseMatcher(nlp.vocab)
phrases = ['contractor', 'sub-contractor', 'sub-recipient', 'recipient', 'party','offeror'
           'shall','must','will','require','ensure']
patterns = [nlp(text) for text in phrases]
phrase_matcher.add('contr_shall', None, *patterns)

contr_shall = {}

i = 0

contr_matches = {}
for doc in tqdm(list(docs)):
  
    for match_id, start, end in phrase_matcher(doc):
        span = doc[start:end]
        contr_matches[(id(doc), start, end)] = span.sent

len(contr_matches)
      

  0%|          | 0/876 [00:00<?, ?it/s]

98261

In [15]:
# From the contractor matches, look for words that indicate a requirement
import re
p = re.compile(r'(contractor|recipient|party|offeror).*(shall|must|will)')
cm = { k:v for k, v in contr_matches.items() if p.search(v.text.replace('\n',' ').lower()) }
        
len(   contr_matches), len(cm)

(98261, 23869)

In [16]:

samp = [ {'path': e.doc._.pdf_path, 'requirement':e.text.replace('\n',' ').strip() } 
        or e in cm.values()]

t = pd.DataFrame(samp).merge(types)

t = t[['contractor','requirement', 'path']]
t.sort_values('contractor')
t = t.drop_duplicates()

t['path'] = t.path.apply(lambda v: v.split('/')[-1] )

t.head()


Unnamed: 0,contractor,requirement,path


In [17]:
t.to_csv('requirements.csv')

t.drop_duplicated(subset='requirement').to_csv('requirements_dedupe.csv')

t.head()

AttributeError: 'DataFrame' object has no attribute 'drop_duplicated'

# Matching Emails and URLs

In [None]:
# Find emails and URLS
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

matcher.add("Email", [[{'LIKE_EMAIL': True}]])
matcher.add("Url", [[{'LIKE_URL': True, "TEXT": {"REGEX": "^https?:\/\/[^\/]+\.\w\w\w\/"}}]])

from itertools import islice 

matches = []

for doc in docs:
    for match_id, start, end in matcher(doc):
        string_id = nlp.vocab.strings[match_id]  # Get string representation
        span = doc[start:end]  # The matched span
        matches.append([match_id, string_id, start, end, span.text])

t = pd.DataFrame(matches, columns = 'match_id string_id start end text'.split())
t.head()
        

# Cities and The County


In [None]:
cities = """
Carlsbad
Chula Vista
Coronado
Del Mar
El Cajon
Encinitas
Escondido
Imperial Beach
La Mesa
Lemon Grove
National City
Oceanside
Poway
San Marcos
Santee
Solana Beach
Vista """

x = """
San Diego County
County of San Diego
San Diego
"""

phrase_matcher = PhraseMatcher(nlp.vocab)
phrases = [ e for e in cities.splitlines() if e ]
patterns = [nlp(text) for text in phrases]
phrase_matcher.add('gpe', None, *patterns)

In [None]:
# Find sentences that reference cities or the county
gpes = []
for doc in tqdm(list(docs)):
  
    for match_id, start, end in phrase_matcher(doc):
        span = doc[start:end]
        gpes.append(span.sent)

len(gpes)

In [None]:
list(gpes[10].doc.sents)[2].text.replace('\n',' ')

In [None]:
gpes[50].text.replace('\n',' ')


In [None]:
phrases
