# Contractors and Agreement Statements

Extract the agreement statements, such as: 

    "This agreement (""Agreement"") is made and entered into effective as of the date of the last signature on the signature page by and between the County of San Diego, a political subdivision of the State of California (""County"") and Center for Community Solutions, 4508 Mission Bay Drive, San Diego, CA 92109 (""Contractor""), with reference to the following facts:"
    
Then determine the names of the contractors.


In [1]:
import pandas as pd 
import numpy as np 
import spacy

from spacy.tokens import Doc, DocBin
from spacy.matcher import PhraseMatcher
from spacy import displacy
import re
from tqdm.notebook import tqdm
from collections import Counter, defaultdict
import pickle
from pathlib import Path
import random
from joblib import Parallel, delayed

from collections import Counter
import phonetics 

import textract
from itertools import islice
import hashlib 

from itertools import islice
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
%matplotlib inline

Doc.set_extension("pdf_path", default=None)

spacy.__version__

'3.1.1'

In [2]:
# The documents saved in a previous notebook 

#!  python -m spacy download en_core_web_sm
model = "en_core_web_lg"

text_path  = Path('texts.pkl')
docs_path  = Path(f'docs-{model}.spacy')
nlp_path = Path(f'nlp-{model}.spacy')

nlp = spacy.load(model)
nlp.max_length = 3000000

nlp = nlp.from_disk(nlp_path)

doc_bin = DocBin().from_disk(docs_path)
docs = list(doc_bin.get_docs(nlp.vocab))

In [3]:
docd = { Path(doc._.pdf_path).name:doc for doc in docs }
len(docd)

876

# Agreement Statement

In [4]:
phrases = """
memorandum
agreement
contract
ammendment
by and between
made and entered
corporation
enter into
entered into
as described herein
"""

phrase_matcher = PhraseMatcher(nlp.vocab)
phrases = [ e for e in phrases.splitlines() if e ]
patterns = [nlp(text) for text in phrases]
phrase_matcher.add('agreement', None, *patterns)

# The quotes filters out a whole lot of false positives, but also a few true positives

agr_pattern = re.compile(r'(agreement|contract|memorandum).*["“].*(made|entered|by|between).*'
                   '(county|city|corporation|non-profit|nonprofit|municipal)')


def match_agreement(v, p=None):

    try:
        txt = v.text
    except AttributeError:
        txt = v

    if p is None:
        p = agr_pattern
        
    txt = txt.replace('\n',' ').lower()
    #initial = list(str(e.text).lower() for e in v[:15] if e.is_alpha)

    if agr_pattern.search(txt) \
        and not txt.startswith('in addition') \
        and not 'shall' in txt:
        return True
    else:
        return False

def find_agreements(docs):

    agr_matches = {}
    for doc in tqdm(list(docs)):

        for match_id, start, end in phrase_matcher(doc):
            span = doc[start:end]
            h = hashlib.sha224(str(span.sent).encode('utf8')).hexdigest()
            agr_matches[(id(doc), h)] = span.sent

    print( f" {len(agr_matches)} Matches ")

    mm = {}
    misses = {}

    for k, s in agr_matches.items():

        if match_agreement(s) :
            mm[k] = s
        else:
            misses[k] = s
            
    return mm, misses
        
mm, misses = find_agreements(docs)
len(mm), len(misses) 

  0%|          | 0/876 [00:00<?, ?it/s]

 20896 Matches 


(259, 20637)

In [5]:
# Build a dataframe of matches

import re

rows   = []
for (_id, h),  sent in mm.items():
    rows.append({
        'text': re.sub(r'\s+', ' ', str(sent).strip()),
        'hash': h,
        'path':  sent.doc._.pdf_path
    })
    
t = pd.DataFrame(rows)
t = t.drop_duplicates(subset=['text','path'])

def categorize_as(v):
    v = v.lower()
    
    if 'indem' in v:
        return 'indemnification'
    elif 'ammend' in v:
        return 'ammendment'
    else:
        return 'contract'
        
t['type'] = t.text.apply(categorize_as)

agrmt = t



In [6]:
# Cleaned up list of contractors
contractors = ["Carroll's Community Care", "Carroll's Residential Care", 'Casa El Cajon', 'Exodus Recovery', 
               'National Conflict Resolution Center', 'Path', 'People Assisting', 'Prime Healthcare Paradise Valley', 
               'Prosperity Way', 'Regional Task Force On The Homeless', 'Ymca', 'Alpha Project', 'Anthem Compassionate', 
               'Casa De Amparo', 'Casa De Oro ', 'Catholic Charities', 'Center For Community Solutions', 
               'Community Health Improvement Partners', 'Community Research Foundation', 'Community Resource Center', 
               'Corporation For Supportive Housing', 'Deaf Community Services Of San Diego', 'Diocese Of San Diego', 
               'El Dorado Community Service Centers', 'Family Health Centers Of San Diego', 'Fancor Guest Home', 
               'Healthright', 'Home Start', 'Interfaith', 'Legal Aid Society Of San', 'Mcalister Institute', 
               'Mental Health Systems', 'Mission Treatment Services', 'Nami', 'National Alliance', 
               'Neighborhood House Association', 'New Alternatives', 'Orlando Guest Home', 'Pan Asian Communities', 
               'Pathways Community Services', 'Public Consulting Group', 'Rescare Workforce Services', 
               'San Diego Health Alliance', 'San Diego Youth Services', 'Serving Seniors', 
               'South Bay Community Services', 'Southern Indian Health Council', 'Telecare Corporation', 
               'Upac', 'Urban Street Angels', 'Vista Hill', 'Walden Environment', 'Walden Family Services', 
               'Wing Street', 'Pan Asian', 'Crisis House',
               'North County Lifeline', 'Operation Hope', 'Episcopal Community Services', 'Palomar health',
               'New Alternatives','Recovery Innovations','Santee Food Bank','2-1-1 San Diego',
               'Elderhelp of San Diego', ]
              

pd.DataFrame({'contractors':contractors}).to_csv('contractors.csv')

def find_contractor(v):
               
    from thefuzz import process
    from thefuzz import fuzz
               
    for c in contractors:
        if c.lower() in v.lower():
            return c
               
    # Try a fuzzy string match        
    m, score = process.extractOne(v, contractors,  scorer=fuzz.token_sort_ratio)
       
    if score > 75:
        return m
               
    return None

agrmt['contractor'] = agrmt.text.apply(find_contractor)

agrmt = agrmt[['type', 'contractor', 'text', 'hash', 'path']]

agrmt.to_csv('agreement_statements.csv')


In [7]:


# Make docs for the agreement statements
ag_docs =  Doc.from_docs([nlp(v.text) for k,v in mm.items() ])

# Find the organizatino entities -- the contractors. 
ag_ents = set([ ' '.join([t.lower_ for t in e if t.is_alpha]) for e in ag_docs.ents if e.label_ == 'ORG' ])

In [8]:
for e in ag_ents:
    e = e.replace('contractor','').replace('offeror','')
    
    fc = find_contractor(e)
    
    #if not fc:
    #        print(e)
  



In [9]:
# Paths that were not selected for agreement lines. 
paths = set(agrmt['path'])
not_agr = set() # Not an agreement document
is_agr = set()
rows = []
i = 0
for doc in docs:
    pp  = doc._.pdf_path

    if pp not in paths:
        not_agr.add(pp)
    else:
        is_agr.add(pp)


In [10]:
# Find documents that were not classified as agreements that have 'agree' in the title. 
# a copy them for additional analysis. 

# Some documents, like 557758-09_Amendment.pdf are contracts, and have text, but the text is not complete. 
# Part of the document will be an unconverted image ( the part with the agreement line ) and the remainder
# is convertable text, so the NLP is working on incomplete text. 

import shutil

from pathlib import Path
root = Path('/Users/eric/proj/data-projects/text-classification/source-data/homelessness-contracts/tmp')

pagr_path = root / 'processed_agree'
sagr_path = root / 'short_agree'
lagr_path = root / 'long_agree'
amd_path = root / 'amendment'
other_path = root / 'other'

for p in [sagr_path, lagr_path, other_path, amd_path, pagr_path]:
    p.mkdir( parents=True, exist_ok=True)

rows = []

for e in not_agr:
    
    e = Path(e)
    d = docd[e.name]
    
    r = {
        's': len(str(d).strip()),
        'agr': 'agree' in str(e).lower(),
        'name': e.name,
        'path': str(e),
    }
    
    rows.append(r)
    
    if 'agree' in str(e).lower():
        if r['s'] < 10_000:
            shutil.copy(e, sagr_path)
        else:
            shutil.copy(e, lagr_path)
    elif 'amend' in str(e).lower():
        shutil.copy(e, amd_path)
    else:
        shutil.copy(e, other_path)
    
for e in is_agr:
    
    e = Path(e)
    d = docd[e.name]
    shutil.copy(e, pagr_path)

In [11]:
t = pd.DataFrame(rows)
t[t.agr].sort_values('s')

Unnamed: 0,s,agr,name,path
16,0,True,Mental Health Systems Transitional Storage Agr...,/Users/eric/proj/data-projects/text-classifica...
345,0,True,Copy of 20-21 Agreement - Home Start Motel Sta...,/Users/eric/proj/data-projects/text-classifica...
371,0,True,Copy of CDBG Agreement - Crisis House PY18.pdf,/Users/eric/proj/data-projects/text-classifica...
677,0,True,ESG-CV First Amendment to HP Subrecipient Agre...,/Users/eric/proj/data-projects/text-classifica...
378,0,True,Copy of 20-21 Agreement - ECTLC Guard rails pr...,/Users/eric/proj/data-projects/text-classifica...
...,...,...,...,...
222,76464,True,ESG Executed Subrecipient Agreement FY 17-18 c...,/Users/eric/proj/data-projects/text-classifica...
278,77644,True,CDBG Emergency Shelter Subrecipient Agreement ...,/Users/eric/proj/data-projects/text-classifica...
465,88481,True,CRC 19-20 Agreement (executed) - Facility Impr...,/Users/eric/proj/data-projects/text-classifica...
403,95286,True,Centerplate Agreement with Exhibits-executed.pdf,/Users/eric/proj/data-projects/text-classifica...


In [12]:
t = agr_pattern.match("Contractor shall provide Contracting Officer Representative with copies of all other "
                  "subcontracts relating to this Agreement entered intoby Contractor within 30 days after "
                  "the effective date of the subcontract.")

agr_pattern = re.compile(r'(agreement|contract).*(made|entered|by|between).*'
                   '(county|city|corporation|non-profit|nonprofit|municipal)')

p2 = re.compile(r'(agreement|contract).*(made|entered|by|between)')


match_agreement("""This agreement (“Agreement”) is made and entered into effective as of the date of the last signature on the signature page by and
 between the County of San Diego, a political subdivision of the State of California (“County”) and Community Research
 Foundation, Inc. 1202 Morena Blvd., Suite 300, San Diego, CA 92110 (“Contractor”), with reference to the following facts:,
 """.lower())

True

In [13]:


h, m = find_agreements([docd['Executed Agreement CDBG CRC.pdf']])
m

  0%|          | 0/1 [00:00<?, ?it/s]

 24 Matches 


{(140188000898688,
  '00a3248fcd0a904728dcbc1add342bd1e70a334dc860f914263bb09f'): The Application
 and assurances form is hereby incorporated by reference into this agreement
 fully as if set forth hereino
 National Objectives: All activities funded with CDGB funds must meet one of
 the CDBG program's National Objectives: benefit low- and moderate-income
 persons; aid in the preventian or elimination of slums or blight; ar meet
 community development needs having a particular urgency, as defined in 24
 CFR 570.208.,
 (140188000898688,
  '7232d6b87fbcaa280ba5e4ac5b010440d05534c3dd5e89165570019e'): If action to correct such substandard
 performance is not taken by the SUBRECIPIENTwithin areasonable period of
 time after being natified by the CITY, contract suspension or termination
 procedures will be initiated.,
 (140188000898688,
  '40c7138669a7596db830f0bbc5517cb3660a64f7c3e2a66d0e29de6d'): The contract budget
 
 
 for this project is described in Exhibit B.,
 (140188000898688,
  'aa0