In [14]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 
import spacy 
from spacy.tokens import DocBin

%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init()

In [2]:
# !pip install -q -r ../requirements.txt

In [3]:
import metapack as mp
pkg = mp.open_package('http://library.metatab.org/sdcta.org-hl_contracts-1.1.1.zip')

# Create Dataframes
ann = pkg.resource('annotations').dataframe()
ann['length'] = ann.text.str.len()
ctx = pkg.resource('contexts').dataframe().set_index('part')

In [4]:
ann.head().T

Unnamed: 0,0,1,2,3,4
classid,e_7,e_19,e_1,e_6,e_4
part,s1v1,s1v1,s1v1,s1v1,s1v1
offset_start,33,33,127,153,340
text,COUNTY OF SAN DIEGO,COUNTY OF SAN DIEGO- DEPARTMENT OF PURCHASINGA...,Interfaith Shelter Network,.,Statement ofWork
coordinates,[],,[],[],[]
confidence,,pre-added,,pre-added,
confidence_prob,0.975784,1.0,0.813213,0.5,0.900681
fields,{},{},{},{},{}
normalizations,{},{},{},{},{}
who,ml:nalaf,user:SDTEF,ml:nalaf,ml:regex,ml:nalaf


In [5]:
# Word tokenization

from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from difflib import SequenceMatcher
import re


# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

# Add the component to the pipeline
nlp.add_pipe('sentencizer')

def mark_sentences(idx,r, ctx):

    
    ctx_r = ctx.loc[r.part]

    text = re.sub(r"\n+","\n", ctx_r.context).lower()

    #  "nlp" Object is used to create documents with linguistic annotations.
    doc = nlp(text)

    spn = doc[r.offset_start: r.offset_start+r.length]

    # Create list of word tokens
    #token_list = [tok for tok in doc if tok.text not in STOP_WORDS]
    #print(token_list[:30])

    #print(spn)
    #print('++++')

    # Minimum size of block to match
    msize = int(len(spn)*.2)

    rows = []
    # Extract sentences and compute the overlap with the tagged text
    for s in doc.sents:

        sm = SequenceMatcher(None, spn.text, s.text)
        mb = [e for e in sm.get_matching_blocks() if e.size >= msize]

        anno_type = r.anno_type if len(mb) > 0 else None
       
        
        rows.append( (idx, r.part, s.text, anno_type) )
                        
    return rows

mark_sentences(4, ann.iloc[4], ctx)


[(4,
  's1v1',
  "sd cnty purch '15 :fp 14pm04:12\ncounty of san diego- department of purchasingand contracting contract 539655 amendment 10\nto interfaith shelter network.",
  'service'),
 (4,
  's1v1',
  'pursuant to the contract changes clause, you are directed to make the changes described herein to the contract or do the following described work not included in the previous agreed on statement ofwork.',
  None),
 (4,
  's1v1',
  '\ntitle of contract, temporary shelter network services\neffective date: 09/15115 description of contract change(s) and/or work to be done:\n• modify exhibit a and exhibit c to reflect the level of effort associated with fy 15-16 funding. •',
  None),
 (4,
  's1v1',
  'modify exhibit a to reflect a revision in language regarding health insurance.',
  'service'),
 (4,
  's1v1',
  '\nstatement of work:\n• add section 3.8 to read in its entirety as follows:\n3.8 bed nights: contractor shall provide a minimum of two thousand sixty nine (2068) nights of lodgin

In [6]:
from itertools import chain
x = [mark_sentences(idx, row, ctx) for idx, row in ann.iterrows()]

In [7]:
df = pd.DataFrame( chain(*x), columns='idx part text anno_type'.split()) 

In [8]:
df.anno_type.value_counts(dropna=False)

years                93726
payor                57560
location             47978
goal-service         47291
service              33708
NaN                  32376
organization         29613
compensation         28734
target-population    11931
contract              6278
Name: anno_type, dtype: int64

In [10]:
# Another way to do test/train split
#from sklearn.model_selection import train_test_split
#train_df, test_df = train_test_split(df, test_size=0.2)
#tain_data = train_df[['text','cats']].values.tolist()
#test_data = test_df[['text','cats']].values.tolist() 

In [None]:
from spacy.tokens import DocBin 

import en_core_web_lg
# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_lg.load()

nlp.add_pipe('textcat', last=True)
textcat = nlp.get_pipe('textcat')

for at,_ in proto_d:
    textcat.add_label(at)


In [12]:

# If this fails, run :
# !python -m spacy download en_core_web_md
nlp = spacy.load('en_core_web_md')

# Create the categories data
proto_d = [ (e, 'F') for e in df.anno_type.unique() if e ]
def make_cat(cat):
    d = dict(proto_d)
    d[cat] = 'T'
    return d

df['cats'] = [make_cat(r.anno_type) for idx, r in df.iterrows() ]


def write_docs(df, path):
    
    docs = []
    for idx, row in df.iterrows():
   
        doc = nlp(row.text)
        doc.cats[str(row.anno_type)] = 1.0
        docs.append(doc)
        
    db = DocBin(docs=docs)
    db.to_disk(path)
    
write_docs(train_df.sample(500), 'train.spacy')
write_docs(test_df.sample(500), 'test.spacy')




In [17]:
# Check that save file makes sense
doc_bin = DocBin().from_disk('train.spacy')
docs = list(doc_bin.get_docs(nlp.vocab))
docs[0].cats

{'service': 1.0}

In [18]:
# Pre-training docs
from tqdm.notebook import tqdm
pt_docs = [nlp(row.text) for idx, row in tqdm(list(df.sample(5000).iterrows()))]
db = DocBin(docs=pt_docs)
db.to_disk('raw_text.spacy')

  0%|          | 0/5000 [00:00<?, ?it/s]

In [19]:
# Pretrain

!python -m spacy pretrain config.cfg ./output 


[38;5;3m⚠ Output directory is not empty.[0m
It is better to use an empty directory or refer to a new output path, then the
new directory will be created for you.
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ Loading config from: config.cfg[0m
[38;5;2m✔ Saved config file in the output directory[0m
[1m
  #      # Words   Total Loss     Loss    w/s
[38;5;2m✔ Successfully finished pretrain[0m


In [None]:
# Train
!python -m spacy train config.cfg --output ./output 

[38;5;4mℹ Using CPU[0m
[1m
[2021-08-17 17:39:09,050] [INFO] Set up nlp object from config
[2021-08-17 17:39:09,059] [INFO] Pipeline: ['tok2vec', 'textcat']
[2021-08-17 17:39:09,063] [INFO] Created vocabulary
[2021-08-17 17:39:11,237] [INFO] Added vectors: en_core_web_lg
[2021-08-17 17:39:13,486] [INFO] Finished initializing nlp object
[2021-08-17 17:39:14,331] [INFO] Initialized pipeline components: ['tok2vec', 'textcat']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ------------  ----------  ------
  0       0          0.00          0.02        0.00    0.00
  1     200       1766.03         33.11        1.25    0.01
  3     400        868.79         21.64        4.11    0.04
  6     600       3676.73         17.39        4.32    0.04
 10     800      97719.45          6.54        4.13    0.04
 14    1000      6696