In [3]:
import os
import re
import shutil
import pandas as pd
import numpy as np

### Preprocess

In [2]:
def preprocess(text):        
    text = text.replace('\n',' ')
    text = re.sub(r'[^.0-9a-zA-Z]', ' ', text)
    text = re.sub(r'[^.\w\s]', ' ', text)
    text = text.lower()
    return text

### Dataset

In [3]:
ADDENDUM = 0
MSA = 1
NDA = 2
OTHERS = 3
SOW = 4

In [148]:
path = '/home/user/Shyam/Code/Release_6.0/Dev/Snorkel/data/filtered/'
docs = []
filenames = []
labels = []

for root, dirs, files in os.walk(path):
    for file in files:
        with open (os.path.join(root+'/'+file), encoding='utf8') as f:            

            text = f.read()
            text = preprocess(text)                            
            docs.append(text)
            filenames.append(file)
            
            if 'msa' in root:
                labels.append('MSA')
            
            if 'sow' in root:
                labels.append('SOW')
            
            if 'addendum' in root:
                labels.append('Addendum')
                
            if 'nda' in root:
                labels.append('NDA')
                
            if 'other' in root:
                labels.append('Others')

print(len(docs))

1400


In [149]:
df = pd.DataFrame(list(zip(filenames, docs, labels)), columns=['filename','text', 'label'])
df.head()

Unnamed: 0,filename,text,label
0,D27186.pdf.out.html.txt,loch woltexs kinwer busines statement o...,Addendum
1,D12738.pdf.out.html.txt,amendment two to the outsource services agr...,Addendum
2,D27843.pdf.out.html.txt,addendum 3 re new schedule c pricing of ja...,Addendum
3,D04193.pdf.out.html.txt,addendum no. 3 to master services agreeme...,Addendum
4,D01834.pdf.out.html.txt,addendum 4 to toner contract agreement...,Addendum


In [116]:
y = df.label.map({'Addendum': 0, 'MSA': 1, 'SOW': 4, 'NDA': 2, 'Others': 3})
y = np.array(y)
np.bincount(y)

array([278, 467, 134, 257, 264])

# MSA

In [120]:
df_msa = df[df.label == 'MSA'][['filename','text']]
df_msa.head()

Unnamed: 0,filename,text
535,D23598.pdf.out.html.txt,CCH a Wollers Khewer business Statement o...
536,D34429.pdf.out.html.txt,HAR 23 2010 TUE 01 42 PM Woltere Kluwer FAX N...
537,D01930.pdf.out.html.txt,y Wo Ite rs Klu wer North America Shared Serv...
538,D35062.pdf.out.html.txt,AMENDMENT SCHEDULE A DUTIES TERM AND CO...
539,D39419.pdf.out.html.txt,AMENDMENT SCHEDULE A To Independent Con...


In [127]:
y_msa = df[df.label == 'MSA']['label'].map({'Addendum': 0, 'MSA': 1, 'SOW': 4, 'NDA': 2, 'Others': 3}).values

In [144]:
from snorkel.labeling import LabelingFunction
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier

"""msa_keywords = ["indemnification", "warranties", "force majeure", "governing law", 
                "confidential information", "project management", "remedies", "injunctive relief", 
                "conflicts of interest", "right to injunction", "dispute resolution", "confidentiality", 
                "limitation of liability", "usage right", "remuneration", "choice of law", "inter company",
                "validity and enforceability", "this agreement"]
"""

msa_keywords = ['meaning set forth', 'including without limitation', 'prior written consent', 
                'intellectual industrial property', 'prior written notice', 'mutually agreed upon', 
                'force majeure event', 'subject matter hereof', 'forth applicable statement']

ABSTAIN = -1
MSA = 1

def keyword_lookup(x, keywords, label):
    if any(word in x.text.lower() for word in keywords):
        return label
    return ABSTAIN


def make_keyword_lf(keywords, label=MSA):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label))

labl_functions = []
for key in msa_keywords:
    labl_functions.append(make_keyword_lf([key]))

@labeling_function()
def regex_agreement(x):
    return MSA if re.search(r"agreement (.+?) between (.+?) and (.+?)", x.text) else ABSTAIN
    
labl_functions.append(regex_agreement)
print(labl_functions)

applier = PandasLFApplier(lfs=labl_functions)
L_train = applier.apply(df=df_msa)

'msa_keywords = ["indemnification", "warranties", "force majeure", "governing law", \n                "confidential information", "project management", "remedies", "injunctive relief", \n                "conflicts of interest", "right to injunction", "dispute resolution", "confidentiality", \n                "limitation of liability", "usage right", "remuneration", "choice of law", "inter company",\n                "validity and enforceability", "this agreement"]\n'

  from pandas import Panel

  0%|          | 0/467 [00:00<?, ?it/s][A
100%|██████████| 467/467 [00:00<00:00, 3504.96it/s][A

[LabelingFunction keyword_meaning set forth, Preprocessors: [], LabelingFunction keyword_including without limitation, Preprocessors: [], LabelingFunction keyword_prior written consent, Preprocessors: [], LabelingFunction keyword_intellectual industrial property, Preprocessors: [], LabelingFunction keyword_prior written notice, Preprocessors: [], LabelingFunction keyword_mutually agreed upon, Preprocessors: [], LabelingFunction keyword_force majeure event, Preprocessors: [], LabelingFunction keyword_subject matter hereof, Preprocessors: [], LabelingFunction keyword_forth applicable statement, Preprocessors: [], LabelingFunction regex_agreement, Preprocessors: []]





In [145]:
from snorkel.labeling import LFAnalysis
#y = df.label.values
LFAnalysis(L=L_train, lfs=labl_functions).lf_summary(y_msa).sort_values(by='Emp. Acc.')

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
keyword_intellectual industrial property,3,[],0.0,0.0,0.0,0,0,0.0
keyword_forth applicable statement,8,[],0.0,0.0,0.0,0,0,0.0
keyword_meaning set forth,0,[1],0.025696,0.008565,0.0,12,0,1.0
keyword_including without limitation,1,[1],0.06424,0.049251,0.0,30,0,1.0
keyword_prior written consent,2,[1],0.074946,0.068522,0.0,35,0,1.0
keyword_prior written notice,4,[1],0.036403,0.029979,0.0,17,0,1.0
keyword_mutually agreed upon,5,[1],0.051392,0.034261,0.0,24,0,1.0
keyword_force majeure event,6,[1],0.019272,0.017131,0.0,9,0,1.0
keyword_subject matter hereof,7,[1],0.027837,0.023555,0.0,13,0,1.0
regex_agreement,9,[1],0.1606,0.094218,0.0,75,0,1.0


In [131]:
#Check how many documents from other classes classified as MSA
from snorkel.analysis import get_label_buckets

buckets = get_label_buckets(y, L_train[:, 1])

for key in buckets.keys():
    if key[1] == 1 and key[0] != 1:        
        print(df.iloc[buckets[key]].label.value_counts())    

ValueError: Arrays must all have the same number of elements

# SOW

In [132]:
df_sow = df[df.label == 'SOW'][['filename','text']]
df_sow.head()

Unnamed: 0,filename,text
1002,D10962.pdf.out.html.txt,3601 West 76th Street Suite 250 A . m E...
1003,D06041.pdf.out.html.txt,OLPA2 SOW . ERITUM PARTNERS 37 Exeter Rd ...
1004,D38018.pdf.out.html.txt,May 15 2017 Denise Silva Managing Editor ...
1005,D41099.pdf.out.html.txt,AMENDMENT SCHEDULE A To Independent Con...
1006,D17903.pdf.out.html.txt,06 30 2010 WED 14 27 FAX ZJ001L 028 542 Amh...


In [142]:
y_sow = df[df.label == 'SOW']['label'].map({'Addendum': 0, 'MSA': 1, 'SOW': 4, 'NDA': 2, 'Others': 3}).values

In [143]:
from snorkel.labeling import LabelingFunction
from snorkel.labeling import LFAnalysis
from snorkel.labeling import PandasLFApplier

sow_keywords = ['this sow', 'this statement of work', 'description of services', 'sow end date', 
                'sow duration']

ABSTAIN = -1
SOW = 4

def keyword_lookup(x, keywords, label):
    if any(word in x.text.lower() for word in keywords):
        return label
    return ABSTAIN


def make_keyword_lf(keywords, label=SOW):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

labl_functions = []
for key in sow_keywords:
    labl_functions.append(make_keyword_lf([key]))

print(labl_functions)

applier = PandasLFApplier(lfs=labl_functions)
L_train = applier.apply(df=df_sow)

#labels
#y = df_sow.label.values

LFAnalysis(L=L_train, lfs=labl_functions).lf_summary(y_sow).sort_values(by='Emp. Acc.')

  from pandas import Panel

100%|██████████| 264/264 [00:00<00:00, 3349.84it/s]

[LabelingFunction keyword_this sow, Preprocessors: [], LabelingFunction keyword_this statement of work, Preprocessors: [], LabelingFunction keyword_sow effective date, Preprocessors: [], LabelingFunction keyword_statement of work effective date, Preprocessors: [], LabelingFunction keyword_purpose of sow, Preprocessors: [], LabelingFunction keyword_description of services, Preprocessors: [], LabelingFunction keyword_sow start date, Preprocessors: [], LabelingFunction keyword_sow end date, Preprocessors: [], LabelingFunction keyword_resource the work, Preprocessors: [], LabelingFunction keyword_sow assumptions, Preprocessors: [], LabelingFunction keyword_signing and returning this sow, Preprocessors: [], LabelingFunction keyword_sow term, Preprocessors: [], LabelingFunction keyword_sow duration, Preprocessors: []]





Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
keyword_sow effective date,2,[],0.0,0.0,0.0,0,0,0.0
keyword_statement of work effective date,3,[],0.0,0.0,0.0,0,0,0.0
keyword_purpose of sow,4,[],0.0,0.0,0.0,0,0,0.0
keyword_sow start date,6,[],0.0,0.0,0.0,0,0,0.0
keyword_resource the work,8,[],0.0,0.0,0.0,0,0,0.0
keyword_sow assumptions,9,[],0.0,0.0,0.0,0,0,0.0
keyword_signing and returning this sow,10,[],0.0,0.0,0.0,0,0,0.0
keyword_sow term,11,[],0.0,0.0,0.0,0,0,0.0
keyword_this sow,0,[4],0.136364,0.068182,0.0,36,0,1.0
keyword_this statement of work,1,[4],0.07197,0.064394,0.0,19,0,1.0


# ***Addendum***

In [11]:
from snorkel.labeling import LabelingFunction
from snorkel.labeling import LFAnalysis
from snorkel.labeling import PandasLFApplier

addendum_keywords = ["this addendum", "this amendment",
                       "addendum is part of", "amendment is part of",
                       "term of this addendum", "term of this amendment",
                       "this amendment is made and entered", "this addendum is made and entered", 
                       "this amendment is entered into", "this addendum is entered into",
                       "this amendment is between", "this addendum is between",
                       "duration of this addendum", "duration of this amendment",
                       "purpose of amendment", "purpose of addendum",
                       "addendum shall become effective", "amendment shall become effective"]

ABSTAIN = -1
ADDENDUM = 0

def keyword_lookup(x, keywords, label):
    if any(word in x.text.lower() for word in keywords):
        return label
    return ABSTAIN


def make_keyword_lf(keywords, label=ADDENDUM):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

labl_functions = []
for key in addendum_keywords:
    labl_functions.append(make_keyword_lf([key]))

match = re.search(r"(?:addendum|amendment) (?:is the (first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|[0-9](st|nd|rd|th))|no.\s?[0-9]|number (one|two|three|four|five|six|seven|eight|nine|ten|[0-9])|#\s?[0-9])", text)

@labeling_function()
def regex_addendum(x):
    return ADDENDUM if re.search(r"(?:addendum|amendment) (?:is the (first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|[0-9](st|nd|rd|th))|no.\s?[0-9]|number (one|two|three|four|five|six|seven|eight|nine|ten|[0-9])|#\s?[0-9])", x.text) else ABSTAIN
    
labl_functions.append(regex_addendum)
print(labl_functions)

applier = PandasLFApplier(lfs=labl_functions)
L_train = applier.apply(df=df)

#labels
#y = df_addendum.label.values

LFAnalysis(L=L_train, lfs=labl_functions).lf_summary(y).sort_values(by='Emp. Acc.')

  from pandas import Panel
 19%|█▊        | 261/1400 [00:00<00:00, 2606.25it/s]

[LabelingFunction keyword_this addendum, Preprocessors: [], LabelingFunction keyword_this amendment, Preprocessors: [], LabelingFunction keyword_addendum is part of, Preprocessors: [], LabelingFunction keyword_amendment is part of, Preprocessors: [], LabelingFunction keyword_term of this addendum, Preprocessors: [], LabelingFunction keyword_term of this amendment, Preprocessors: [], LabelingFunction keyword_this amendment is made and entered, Preprocessors: [], LabelingFunction keyword_this addendum is made and entered, Preprocessors: [], LabelingFunction keyword_this amendment is entered into, Preprocessors: [], LabelingFunction keyword_this addendum is entered into, Preprocessors: [], LabelingFunction keyword_this amendment is between, Preprocessors: [], LabelingFunction keyword_this addendum is between, Preprocessors: [], LabelingFunction keyword_duration of this addendum, Preprocessors: [], LabelingFunction keyword_duration of this amendment, Preprocessors: [], LabelingFunction key

100%|██████████| 1400/1400 [00:00<00:00, 1683.79it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
keyword_amendment is part of,3,[],0.0,0.0,0.0,0,0,0.0
keyword_term of this amendment,5,[],0.0,0.0,0.0,0,0,0.0
keyword_this addendum is made and entered,7,[],0.0,0.0,0.0,0,0,0.0
keyword_amendment shall become effective,17,[],0.0,0.0,0.0,0,0,0.0
keyword_this amendment is between,10,[],0.0,0.0,0.0,0,0,0.0
keyword_this addendum is between,11,[],0.0,0.0,0.0,0,0,0.0
keyword_duration of this amendment,13,[],0.0,0.0,0.0,0,0,0.0
keyword_term of this addendum,4,[0],0.014286,0.014286,0.0,15,5,0.75
keyword_addendum shall become effective,16,[0],0.020714,0.020714,0.0,22,7,0.758621
keyword_addendum is part of,2,[0],0.031429,0.031429,0.0,35,9,0.795455


# NDA

In [12]:
from snorkel.labeling import LabelingFunction
from snorkel.labeling import LFAnalysis
from snorkel.labeling import PandasLFApplier

nda_keywords = ['mutual confidentiality','nondisclosure agreement', 'non disclosure agreement']

ABSTAIN = -1
NDA = 2

def keyword_lookup(x, keywords, label):
    if any(word in x.text.lower() for word in keywords):
        return label
    return ABSTAIN


def make_keyword_lf(keywords, label=NDA):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

labl_functions = []
for key in nda_keywords:
    labl_functions.append(make_keyword_lf([key]))

print(labl_functions)

applier = PandasLFApplier(lfs=labl_functions)
L_train = applier.apply(df=df)

#labels
#y = df_nda.label.values

LFAnalysis(L=L_train, lfs=labl_functions).lf_summary(y).sort_values(by='Emp. Acc.')

  from pandas import Panel
100%|██████████| 1400/1400 [00:00<00:00, 9884.02it/s]

[LabelingFunction keyword_mutual confidentiality, Preprocessors: [], LabelingFunction keyword_nondisclosure agreement, Preprocessors: [], LabelingFunction keyword_non disclosure agreement, Preprocessors: []]





Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
keyword_non disclosure agreement,2,[2],0.102857,0.039286,0.0,68,76,0.472222
keyword_nondisclosure agreement,1,[2],0.005714,0.000714,0.0,4,4,0.5
keyword_mutual confidentiality,0,[2],0.059286,0.038571,0.0,81,2,0.975904


# Others

In [13]:
from snorkel.labeling import LabelingFunction
from snorkel.labeling import LFAnalysis
from snorkel.labeling import PandasLFApplier

ABSTAIN = -1
Others = 3

@labeling_function()
def others_lookup(x):
    if any(word not in x.text.lower() for word in list(set(msa_keywords + sow_keywords + nda_keywords + addendum_keywords))):
        return Others
    return ABSTAIN

lfs=[others_lookup]
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df)

#labels
#y = df_nda.label.values

LFAnalysis(L=L_train, lfs=lfs).lf_summary(y).sort_values(by='Emp. Acc.')

  from pandas import Panel
100%|██████████| 1400/1400 [00:00<00:00, 18083.07it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
others_lookup,0,[3],1.0,0.0,0.0,257,1143,0.183571


### Labeling Functions

In [163]:
from snorkel.labeling import LabelingFunction
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier

ABSTAIN = -1
MSA = 1
SOW = 4
ADDENDUM = 0
NDA = 2
OTHERS = 3

labl_functions = []

def keyword_lookup(x, keywords, label):
    if any(word in x.text.lower() for word in keywords):
        return label
    return ABSTAIN

#===============MSA===================
msa_keywords = ['set forth section', 'meaning set forth', 'including without limitation', 
                'prior written consent', 'without prior written', 'intellectual industrial property', 
                'industrial property right', 'intellectual property right',  'privacy restricted data', 
                'prior written notice', 'force majeure event',  'subject matter hereof', 'confidential information',
                'work product', 'shall deemed', 'disclosing party', 'without limitation', 'indemnified party', 
                'indemnifying party', 'applicable law', 'force majeure', 'trade secret', 'obligation agreement', 
                'intellectual industrial']

def make_keyword_lf_msa(keywords, label=MSA):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label))

for key in msa_keywords:
    labl_functions.append(make_keyword_lf_msa([key]))

@labeling_function()
def regex_agreement(x):
    return MSA if re.search(r"agreement (.+?) between (.+?) and (.+?)", x.text) else ABSTAIN
    
#labl_functions.append(regex_agreement)




#===============SOW===================
sow_keywords = ['statement work','sow','term sow','sow effective date', 'work sow', 'sow shall', 'sow consultant',
                'sow term', 'service sow', 'sow project', 'defined sow','specified sow','outlined sow','msa sow',
                'addendum sow', 'client sow','sow client', 'sow agreement', 'statement work effective', 
                'agreement statement work']

def make_keyword_lf_sow(keywords, label=SOW):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in sow_keywords:
    labl_functions.append(make_keyword_lf_sow([key]))


    
    
#===============ADDENDUM===================
addendum_keywords = ['addendum', 'amendment', 'book production', 'production health', 'agreement amendment', 
                     'agreement addendum', 'addendum number', 'addendum executed', 'amendment agreement', 'sow subsequent', 
                     'subsequent addendum', 'addendum made', 'amendment number', 'amendment date', 'amendment entered', 
                     'amendment made', 'amendment executed', 'amendment term', 'amendment effective date', 
                     'inconsistent contradictory term', 'addendum may executed', 'addendum made entered', 'effective date addendum', 
                     'addendum statement work', 'amendment made entered', 'addendum effective date', 
                     'effective date amendment', 'amend agreement', 'agreement amended', 'agreement hereby amended', 
                     'service agreement amendment']

def make_keyword_lf_addendum(keywords, label=ADDENDUM):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in addendum_keywords:
    labl_functions.append(make_keyword_lf_addendum([key]))

@labeling_function()
def regex_addendum(x):
    return ADDENDUM if re.search(r"(?:addendum|amendment) (?:is the (first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|[0-9](st|nd|rd|th))|no.\s?[0-9]|number (one|two|three|four|five|six|seven|eight|nine|ten|[0-9])|#\s?[0-9])", x.text) else ABSTAIN
    
#labl_functions.append(regex_addendum)



#===============NDA===================
nda_keywords = ['mutual confidentiality', 'affiliated entity', 'agreement negotiation', 'disclosure hereunder', 
                'confidentiality confidential', 'protect confidentiality', 'mutual confidentiality agreement', 
                'disclosure confidential information', 'non disclosure agreement', 'non confidential basis', 
                'confidential information agent', 'confidentiality non disclosure', 'disclosing party prompt', 
                'notice intent terminate']


def make_keyword_lf_nda(keywords, label=NDA):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in nda_keywords:
    labl_functions.append(make_keyword_lf_nda([key]))
    
    

#===============OTHERS===================
@labeling_function()
def others_lookup(x):
    if all(word not in x.text.lower() for word in list(set(msa_keywords + sow_keywords + nda_keywords + addendum_keywords))):
        return OTHERS
    return ABSTAIN

labl_functions.append(others_lookup)

print(labl_functions)

[LabelingFunction keyword_set forth section, Preprocessors: [], LabelingFunction keyword_meaning set forth, Preprocessors: [], LabelingFunction keyword_including without limitation, Preprocessors: [], LabelingFunction keyword_prior written consent, Preprocessors: [], LabelingFunction keyword_without prior written, Preprocessors: [], LabelingFunction keyword_intellectual industrial property, Preprocessors: [], LabelingFunction keyword_industrial property right, Preprocessors: [], LabelingFunction keyword_intellectual property right, Preprocessors: [], LabelingFunction keyword_privacy restricted data, Preprocessors: [], LabelingFunction keyword_prior written notice, Preprocessors: [], LabelingFunction keyword_force majeure event, Preprocessors: [], LabelingFunction keyword_subject matter hereof, Preprocessors: [], LabelingFunction keyword_confidential information, Preprocessors: [], LabelingFunction keyword_work product, Preprocessors: [], LabelingFunction keyword_shall deemed, Preproces

In [156]:
pd.options.display.max_rows = 4000

### Prepare dataset

In [146]:
path = '/home/user/Shyam/DATASET/classified_corpus_text/'

docs = []
filenames = []

for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith('.txt') and file not in df.filename.values:
            with open (os.path.join(root+'/'+file), encoding='utf8') as f:            
                text = f.read()
                text = preprocess(text)                            
                docs.append(text)
                filenames.append(file)

print(len(docs))

15467


In [147]:
df_train = pd.DataFrame(list(zip(filenames, docs)), columns=['filename','text'])
df_train.head()

Unnamed: 0,filename,text
0,D05236.pdf.out.html.txt,dec 05 03 09 49a quim brown ettiott 8645915...
1,D06496.pdf.out.html.txt,11 10 10 wed 04 14 fax 783 545. 38 colours ...
2,D15596.pdf.out.html.txt,date subject wolters kluwer nv zuidpool...
3,D27186.pdf.out.html.txt,loch woltexs kinwer busines statement o...
4,D12738.pdf.out.html.txt,amendment two to the outsource services agr...


In [150]:
#Split labelled data into valid and test sets
import numpy as np
msk = np.random.rand(len(df)) < 0.8

df_valid = df[msk]
df_test = df[~msk]

In [151]:
print(len(df_valid))
print(len(df_test))

1149
251


In [152]:
y_valid = df_valid.label.map({'Addendum': 0, 'MSA': 1, 'SOW': 4, 'NDA': 2, 'Others': 3})
y_valid = np.array(y_valid)
np.bincount(y_valid)

array([222, 391, 105, 218, 213])

In [153]:
y_test = df_test.label.map({'Addendum': 0, 'MSA': 1, 'SOW': 4, 'NDA': 2, 'Others': 3})
y_test = np.array(y_test)
np.bincount(y_test)

array([56, 76, 29, 39, 51])

In [154]:
df_valid.drop('label', axis=1, inplace=True)
df_test.drop('label', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


### Apply Labeling Functions to dataset

In [164]:
#Apply the label functions to the train and valid sets
applier = PandasLFApplier(lfs=labl_functions)
L_train = applier.apply(df=df_train)
L_valid = applier.apply(df=df_valid)

  from pandas import Panel

  0%|          | 0/15467 [00:00<?, ?it/s][A
  0%|          | 39/15467 [00:00<00:40, 384.24it/s][A
  0%|          | 77/15467 [00:00<00:41, 374.28it/s][A
  1%|          | 116/15467 [00:00<00:41, 372.02it/s][A
  1%|          | 158/15467 [00:00<00:39, 383.32it/s][A
  1%|▏         | 194/15467 [00:00<00:40, 375.79it/s][A
  1%|▏         | 229/15467 [00:00<00:41, 367.65it/s][A
  2%|▏         | 262/15467 [00:00<00:44, 344.39it/s][A
  2%|▏         | 304/15467 [00:00<00:42, 353.26it/s][A
  2%|▏         | 344/15467 [00:00<00:41, 365.69it/s][A
  2%|▏         | 380/15467 [00:01<00:45, 332.96it/s][A
  3%|▎         | 417/15467 [00:01<00:43, 343.22it/s][A
  3%|▎         | 455/15467 [00:01<00:42, 352.55it/s][A
  3%|▎         | 491/15467 [00:01<00:44, 335.60it/s][A
  3%|▎         | 525/15467 [00:01<00:44, 334.12it/s][A
  4%|▎         | 564/15467 [00:01<00:42, 348.85it/s][A
  4%|▍         | 600/15467 [00:01<00:42, 346.51it/s][A
  4%|▍         | 635/15467 [00:01

 58%|█████▊    | 8896/15467 [00:30<00:31, 210.21it/s][A
 58%|█████▊    | 8918/15467 [00:31<00:31, 206.99it/s][A
 58%|█████▊    | 8939/15467 [00:31<00:31, 205.86it/s][A
 58%|█████▊    | 8964/15467 [00:31<00:30, 213.18it/s][A
 58%|█████▊    | 8986/15467 [00:31<00:31, 203.86it/s][A
 58%|█████▊    | 9007/15467 [00:31<00:31, 205.28it/s][A
 58%|█████▊    | 9028/15467 [00:31<00:31, 205.40it/s][A
 59%|█████▊    | 9049/15467 [00:31<00:31, 203.45it/s][A
 59%|█████▊    | 9071/15467 [00:31<00:30, 206.75it/s][A
 59%|█████▉    | 9092/15467 [00:31<00:32, 195.56it/s][A
 59%|█████▉    | 9122/15467 [00:32<00:29, 214.67it/s][A
 59%|█████▉    | 9148/15467 [00:32<00:27, 226.45it/s][A
 59%|█████▉    | 9172/15467 [00:32<00:28, 219.63it/s][A
 59%|█████▉    | 9195/15467 [00:32<00:28, 218.27it/s][A
 60%|█████▉    | 9222/15467 [00:32<00:27, 224.19it/s][A
 60%|█████▉    | 9255/15467 [00:32<00:25, 247.07it/s][A
 60%|██████    | 9281/15467 [00:32<00:25, 245.26it/s][A
 60%|██████    | 9311/15467 [00

<li><b>Polarity</b>: The set of unique labels this LF outputs (excluding abstains) <br>
<li><b>Coverage</b>: The fraction of the dataset the LF labels<br>
<li><b>Overlaps</b>: The fraction of the dataset where this LF and at least one other LF label<br>
<li><b>Conflicts</b>: The fraction of the dataset where this LF and at least one other LF label and disagree<br>
<li><b>Correct</b>: The number of data points this LF labels correctly (if gold labels are provided)<br>
<li><b>Incorrect</b>: The number of data points this LF labels incorrectly (if gold labels are provided)<br>
<li><b>Empirical Accuracy</b>: The empirical accuracy of this LF (if gold labels are provided)<br>

In [1]:
#Check the performance of label functions
#LFAnalysis(L=L_train, lfs=labl_functions).lf_summary().sort_values(by='Coverage')
LFAnalysis(L=L_valid, lfs=labl_functions).lf_summary(y_valid).sort_values(by='Emp. Acc.')

NameError: name 'LFAnalysis' is not defined

In [51]:
L_train

array([[3, 3, 3, ..., 3, 3, 3],
       [3, 3, 3, ..., 3, 3, 3],
       [3, 3, 3, ..., 3, 3, 3],
       ...,
       [3, 1, 3, ..., 2, 3, 2],
       [3, 1, 3, ..., 2, 3, 2],
       [3, 1, 3, ..., 2, 3, 2]])

### Majority Label Voter

In [159]:
from snorkel.labeling import MajorityLabelVoter

majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)

IndexError: index 3 is out of bounds for axis 0 with size 2

# Label Model

In [166]:
label_model = LabelModel(cardinality=5, verbose=True)
label_model.fit(L_train, n_epochs=500, lr=0.001, log_freq=50, seed=123)

In [167]:
label_model_acc = label_model.score(L=L_valid, Y=y_valid)["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")



Label Model Accuracy:     65.7%


In [67]:
probs_train = label_model.predict_proba(L_train)

In [68]:
#filter unlabelled data points
from snorkel.labeling import filter_unlabeled_dataframe

df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)

In [72]:
probs_train_filtered

array([[9.99960711e-01, 5.38259622e-08, 1.48756887e-08, 8.32677404e-08,
        3.91373277e-05],
       [1.91601912e-05, 9.89331344e-01, 4.60149692e-05, 4.28054425e-05,
        1.05606757e-02],
       [2.37080628e-01, 2.37636745e-01, 2.87086446e-02, 3.87869451e-02,
        4.57787038e-01],
       ...,
       [6.62925075e-15, 9.99943659e-01, 5.63410096e-05, 4.24924936e-14,
        3.63681216e-18],
       [6.62925075e-15, 9.99943659e-01, 5.63410096e-05, 4.24924936e-14,
        3.63681216e-18],
       [6.62925075e-15, 9.99943659e-01, 5.63410096e-05, 4.24924936e-14,
        3.63681216e-18]])

### Transformation functions

In [73]:
from snorkel.preprocess.nlp import SpacyPreprocessor

spacy = SpacyPreprocessor(text_field="text", doc_field="doc", memoize=True)

In [75]:
import names
from snorkel.augmentation import transformation_function

# Pregenerate some random person names to replace existing ones with
# for the transformation strategies below
replacement_names = [names.get_full_name() for _ in range(50)]


# Replace a random named entity with a different entity of the same type.
@transformation_function(pre=[spacy])
def change_person(x):
    person_names = [ent.text for ent in x.doc.ents if ent.label_ == "ORG"]
    # If there is at least one person name, replace a random one. Else return None.
    if person_names:
        name_to_replace = np.random.choice(person_names)
        replacement_name = np.random.choice(replacement_names)
        x.text = x.text.replace(name_to_replace, replacement_name)
        return x


# Swap two adjectives at random.
@transformation_function(pre=[spacy])
def swap_adjectives(x):
    adjective_idxs = [i for i, token in enumerate(x.doc) if token.pos_ == "ADJ"]
    # Check that there are at least two adjectives to swap.
    if len(adjective_idxs) >= 2:
        idx1, idx2 = sorted(np.random.choice(adjective_idxs, 2, replace=False))
        # Swap tokens in positions idx1 and idx2.
        x.text = " ".join(
            [
                x.doc[:idx1].text,
                x.doc[idx2].text,
                x.doc[1 + idx1 : idx2].text,
                x.doc[idx1].text,
                x.doc[1 + idx2 :].text,
            ]
        )
        return x

In [97]:
import nltk
from nltk.corpus import wordnet as wn

nltk.download("wordnet")


def get_synonym(word, pos=None):
    """Get synonym for word given its part-of-speech (pos)."""
    synsets = wn.synsets(word, pos=pos)
    # Return None if wordnet has no synsets (synonym sets) for this word and pos.
    if synsets:
        words = [lemma.name() for lemma in synsets[0].lemmas()]
        if words[0].lower() != word.lower():  # Skip if synonym is same as word.
            # Multi word synonyms in wordnet use '_' as a separator e.g. reckon_with. Replace it with space.
            return words[0].replace("_", " ")


def replace_token(spacy_doc, idx, replacement):
    """Replace token in position idx with replacement."""
    return " ".join([spacy_doc[:idx].text, replacement, spacy_doc[1 + idx :].text])


@transformation_function(pre=[spacy])
def replace_verb_with_synonym(x):
    # Get indices of verb tokens in sentence.
    verb_idxs = [i for i, token in enumerate(x.doc) if token.pos_ == "VERB"]
    if verb_idxs:
        # Pick random verb idx to replace.
        #idx = np.random.choice(verb_idxs)
        for idx in verb_idxs:
            synonym = get_synonym(x.doc[idx].text, pos="v")
            # If there's a valid verb synonym, replace it. Otherwise, return None.
            if synonym:
                x.text = replace_token(x.doc, idx, synonym)
        return x


@transformation_function(pre=[spacy])
def replace_noun_with_synonym(x):
    # Get indices of noun tokens in sentence.
    noun_idxs = [i for i, token in enumerate(x.doc) if token.pos_ == "NOUN"]
    if noun_idxs:
        # Pick random noun idx to replace.
        #idx = np.random.choice(noun_idxs)
        for idx in noun_idxs:
            synonym = get_synonym(x.doc[idx].text, pos="n")
            # If there's a valid noun synonym, replace it. Otherwise, return None.
            if synonym:
                x.text = replace_token(x.doc, idx, synonym)
        return x


@transformation_function(pre=[spacy])
def replace_adjective_with_synonym(x):
    # Get indices of adjective tokens in sentence.
    adjective_idxs = [i for i, token in enumerate(x.doc) if token.pos_ == "ADJ"]
    if adjective_idxs:
        # Pick random adjective idx to replace.
        #idx = np.random.choice(adjective_idxs)
        for idx in adjective_idxs:
            synonym = get_synonym(x.doc[idx].text, pos="a")
            # If there's a valid adjective synonym, replace it. Otherwise, return None.
            if synonym:
                x.text = replace_token(x.doc, idx, synonym)
        return x

[nltk_data] Downloading package wordnet to /home/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [98]:
tfs = [
    change_person,
    swap_adjectives,
    replace_verb_with_synonym,
    replace_noun_with_synonym,
    replace_adjective_with_synonym,
]

In [106]:
from utils import preview_tfs

df_transformed = preview_tfs(df_train, tfs)

In [107]:
df_transformed['Original Text'][3]

'    as   aquatic management agreement   this agreement  proposal  19453 dated 6 21 2005  is made betwean aquagenix and customer    kings highway industrial park   7305 commercial circle   kings highway commercial park   ft. pierce  fl 34954  772  342 1935   both customer and aquagenix agree to the following terms and conditions    i  general conditions    aquagenix will provide aquatic management services on behalf of the customer in accordance with   the terms and conditions of this agreement at the following aquatic site s     2 retention pond and canals tocated in ft. pierce  fl.   2. contract term    the term of this agreement shall be 1 year s  or as otherwise provided by contract addendum.   3. contract services    customer agrees to pay aquagenix the following amounts during the term of this agreement for thase   specific water management services.   algae and aquatic plant contro  included   border grass and brush control to water s edge included   water testing  see addendum 

In [108]:
df_transformed['Transformed Text'][3]

'    as   aquatic management agreement   this agreement  proposal  19453 dated 6 21 2005  is made betwean aquagenix and customer    kings highway industrial park   7305 commercial circle   kings highway commercial park   ft. pierce  fl 34954  772  342 1935   both customer and aquagenix agree to the following terms and conditions    i  general conditions    aquagenix will provide aquatic management services on behalf of the customer in accordance with   the terms and conditions of this agreement at the following aquatic site s     2 retention pond and canals tocated in ft. pierce  fl.   2. contract term    the term of this agreement shall be 1 year s  or as otherwise provided by contract addendum.   3. contract services    customer agrees to pay aquagenix the following amounts during the term of this agreement for thase   specific water management services.   algae and aquatic plant contro  included   border grass and brush control to water s edge included   water testing  see addendum 

### Model Training

In [162]:
from utils import featurize_df_tokens, get_keras_lstm, get_keras_early_stopping

X_train = featurize_df_tokens(df_train)
#X_train_augmented = featurize_df_tokens(df_train_augmented)
X_valid = featurize_df_tokens(df_valid)
X_test = featurize_df_tokens(df_test)



X_train,
#Y_train,
X_valid=X_valid,
Y_valid=y_valid,
X_test=X_test,
Y_test=y_test,
num_buckets=30000

# Define a vanilla LSTM model with Keras
lstm_model = get_keras_lstm(num_buckets)
lstm_model.fit(
    X_train,
    #Y_train,
    epochs=25,
    validation_data=(X_valid, Y_valid),
    callbacks=[get_keras_early_stopping(5)],
    verbose=2,
)
#preds_test = lstm_model.predict(X_test)[:, 0] > 0.5
#print((preds_test == Y_test).mean())


#acc_augmented = train_and_test(X_train_augmented, Y_train_augmented)
#acc_original = train_and_test(X_train, Y_train)

(array([[ 4830, 24735,  5392, ...,  5177,  9938, 14648],
        [17671, 18627, 18627, ...,  9237, 18571, 20356],
        [27032, 24379, 12929, ...,  4981, 16644, 21820],
        ...,
        [   28,  3480, 28369, ..., 23453, 26165, 11300],
        [   28,  3480,  4830, ..., 29175, 27595, 16315],
        [ 4112, 15864, 12108, ...,  3570, 23453, 26165]]),)

Train on 15467 samples, validate on 1149 samples
Epoch 1/25
15467/15467 - 6s - loss: 0.0820 - acc: 0.9984 - val_loss: -5.1287e+01 - val_acc: 0.3403
Epoch 2/25
15467/15467 - 5s - loss: 2.1218e-06 - acc: 1.0000 - val_loss: -5.8763e+01 - val_acc: 0.3403
Epoch 3/25
15467/15467 - 6s - loss: 7.1212e-07 - acc: 1.0000 - val_loss: -6.4099e+01 - val_acc: 0.3403
Epoch 4/25
15467/15467 - 6s - loss: 3.3950e-07 - acc: 1.0000 - val_loss: -6.7853e+01 - val_acc: 0.3403
Epoch 5/25
15467/15467 - 6s - loss: 1.9863e-07 - acc: 1.0000 - val_loss: -7.0690e+01 - val_acc: 0.3403
Epoch 6/25
Restoring model weights from the end of the best epoch.
15467/15467 - 5s - loss: 1.3119e-07 - acc: 1.0000 - val_loss: -7.2955e+01 - val_acc: 0.3403
Epoch 00006: early stopping


<tensorflow.python.keras.callbacks.History at 0x7fabb5cf02e8>