In [1]:
import os
import re
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english")) 

In [2]:
pd.options.display.max_rows = 4000

# Preprocess

In [3]:
def get_text_start_pos(text):
    pos = 0
    match1 = re.search(r"(addendum|amendment|change request|change order|agreement|sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(by and between|by and among|between|among) (.+?) and (.+?)", text)
    match2 = re.search(r"(addendum|amendment|change request|change order|agreement|sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(effective|dated|entered|executed|made) (.+?) and (.+?)", text)
    match3 = re.search(r"(addendum|amendment|change request|change order|agreement|sow|statement of work|work order|task order)(.+?)(the undersigned)(.+?) and (.+?)", text)
    if match1 and match1.start() < 1000:
        pos = match1.start()
    elif match2 and match2.start() < 1000:
        pos = match2.start()
    elif match3 and match3.start() < 1000:
        pos = match3.start()
    return pos

In [4]:
def preprocess(text):
    #Preprocess                
    text = text.replace('\n',' ').lower()
    
    #Remove non-alpha characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    pos = get_text_start_pos(text)
    text = text[pos:]
    #Remove articles
    #articles = ('a', 'an', 'the')
    #text = ' '.join([t for t in text.split() if t not in articles])

    return text

### Import Labeled Data

In [5]:
df_labeled = pd.read_csv('labeled_data_new1.csv') #without removing stop words 
#df_labeled = pd.read_csv('labeled_data.csv') #with removing stop words
df_labeled.head()
print(df_labeled.shape)

(1397, 3)


In [6]:
df_labeled.label.value_counts()

MSA         463
Addendum    314
Others      250
SOW         236
NDA         134
Name: label, dtype: int64

In [7]:
df_labeled[df_labeled['label'] == 'MSA'].head()

Unnamed: 0,filename,text,label
934,D02541.pdf.out.html.txt,agreement this agreement is made effective the...,MSA
935,D03758.pdf.out.html.txt,agreement book single author agreement made th...,MSA
936,D05984.pdf.out.html.txt,agreement this agreement is entered into as of...,MSA
937,D17795.pdf.out.html.txt,agreement this master consulting services agre...,MSA
938,D00662.pdf.out.html.txt,agreement services gustomer address address la...,MSA


In [8]:
#Split labelled data into test and dev sets
import numpy as np
np.random.seed(2019)
msk = np.random.rand(len(df_labeled)) < 0.8

df_dev = df_labeled[msk]
df_test = df_labeled[~msk]

In [9]:
print(df_test.shape, df_dev.shape)

(277, 3) (1120, 3)


In [10]:
df_dev.label.value_counts()

MSA         366
Addendum    245
Others      196
SOW         194
NDA         119
Name: label, dtype: int64

In [11]:
y_test = df_test.label.map({'Addendum': 0, 'MSA': 1, 'SOW': 4, 'NDA': 2, 'Others': 3})
y_test = np.array(y_test)
np.bincount(y_test)

array([69, 97, 15, 54, 42], dtype=int64)

In [12]:
y_dev = df_dev.label.map({'Addendum': 0, 'MSA': 1, 'SOW': 4, 'NDA': 2, 'Others': 3})
y_dev = np.array(y_dev)
np.bincount(y_dev)

array([245, 366, 119, 196, 194], dtype=int64)

In [13]:
df_test.drop('label', axis=1, inplace=True)
df_dev.drop('label', axis=1, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



### Import Unlabeled Data

In [14]:
df_unlabeled = pd.read_csv('unlabeled_data_new1.csv') #without removing stop words
#df_unlabeled = pd.read_csv('unlabeled_data.csv') #with removing stop words
df_unlabeled.shape

(15472, 2)

In [15]:
df_unlabeled.tail()

Unnamed: 0,filename,text
15467,D16492.pdf.out.html.txt,ert oep ik ie scope of work corporate mobilit...
15468,D19966.pdf.out.html.txt,w f bof hl lai b statement of work for gss in...
15469,D26253.pdf.out.html.txt,statement of work wolters kluwer united states...
15470,D37991.pdf.out.html.txt,statement of work including all exhibits attac...
15471,D01957.pdf.out.html.txt,statement of work no sow services definitions...


In [16]:
#Split unlabelled data into train and valid sets
np.random.seed(2019)
msk = np.random.rand(len(df_unlabeled)) < 0.02

df_train = df_unlabeled[~msk]
df_valid = df_unlabeled[msk]

In [17]:
print(df_train.shape, df_valid.shape)

(15179, 2) (293, 2)


### Labeling Functions

In [18]:
from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.labeling import PandasLFApplier

ABSTAIN = -1
MSA = 1
SOW = 4
ADDENDUM = 0
NDA = 2
OTHERS = 3

labl_functions = []

def keyword_lookup(x, keywords, label):
    if any(word in x.text.lower() for word in keywords):
        return label
    return ABSTAIN

#===============MSA=======================
@labeling_function()
def msa_regex_lookup(x):    
    nonmsa_keywords = ['sow', 'statement of work', 'addendum', 'amendment', 'confidentiality agreement', 'disclosure agreement']
    match1 = re.search(r"(agreement agreement|master agreement|master services agreement|this agreement)\s+(\S+\s+){1,30}(by and between|by and among|between|among)(.+?) and (.+?)", x.text)
    match2 = re.search(r"(agreement agreement|master agreement|master services agreement|this agreement)\s+(\S+\s+){1,30}(effective)(.+?) and (.+?)", x.text)
    match3 = re.search(r"(agreement agreement|master agreement|master services agreement|this agreement)\s+(\S+\s+){1,30}(the undersigned)(.+?) and (.+?)", x.text)
        
    if (match1 and not(any(key in x.text[:match1.end()] for key in nonmsa_keywords))) \
        or (match2 and not(any(key in x.text[:match2.end()] for key in nonmsa_keywords))) \
        or (match3 and not(any(key in x.text[:match3.end()] for key in nonmsa_keywords))):
        return MSA
    return ABSTAIN

labl_functions.append(msa_regex_lookup)

msa_keywords = ['indemnified party', 'indemnifying party', 'force majeure', 'intellectual industrial', 
                'wk service provider', 'intellectual industrial property', 'industrial property right', 
                'privacy restricted data', 'prior written notice', 'force majeure event', 'subject matter hereof']

def make_keyword_lf_msa(keywords, label=MSA):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label))

for key in msa_keywords:
    labl_functions.append(make_keyword_lf_msa([key]))
    
 
#===============ADDENDUM===================
@labeling_function()
def addendum_regex_lookup(x):
    match1 = re.search(r"(addendum|amendment|change request|change order)\s+(\S+\s+){1,30}(by and between|by and among|between) (.+?) and (.+?)", x.text)
    match2 = re.search(r"(addendum|amendment)\s+(\S+\s+){1,30}(schedule a|effective) (.+?) and (.+?)", x.text)
    match3 = re.search(r"(addendum|amendment) (.+?) (the undersigned) (.+?) and (.+?)", x.text)
    
    if (match1 and match1.start() < 1000) or (match2 and match2.start() < 1000) or (match3 and match3.start() < 1000):
        return ADDENDUM
    return ABSTAIN

labl_functions.append(addendum_regex_lookup)

addendum_keywords = ['rom work', 'addendum number', 'addendum part', 'amendment part',
                     'term addendum', 'term amendment', 'addendum made entered',
                     'addendum entered', 'duration addendum', 
                     'purpose addendum', 
                     'addendum executed', 'subsequent addendum', 'amendment number', 
                     'amendment date', 'amendment entered', 'amendment made', 'amendment executed', 
                     'amendment effective date', 
                     'addendum may executed', 'effective date addendum', 
                     'amendment made entered', 
                     'agreement hereby amended', 'service agreement amendment']

def make_keyword_lf_addendum(keywords, label=ADDENDUM):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in addendum_keywords:
    labl_functions.append(make_keyword_lf_addendum([key]))

#===============SOW===================
@labeling_function()
def sow_regex_lookup(x):    
    nonsow_keywords = ['addendum','amendment']
    match1 = re.search(r"(sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(by and between|by and among|executed by|between|entered into)(.+?) and (.+?)", x.text)
    match2 = re.search(r"(sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(effective) (.+?) and (.+?)", x.text)
    match3 = re.search(r"(sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(the undersigned) (.+?) and (.+?)", x.text)
       
    if (match1 and match1.start() < 1000 and not(any(key in x.text[:match1.end()] for key in nonsow_keywords)) \
        or (match2 and match2.start() < 1000 and not(any(key not in x.text[:match2.end()] for key in nonsow_keywords))) \
        or match3 and match3.start() < 1000 and not(any(key not in x.text[:match3.end()] for key in nonsow_keywords))):
        return SOW
    return ABSTAIN

labl_functions.append(sow_regex_lookup)

sow_keywords = ['sow effective date', 'work sow', 'sow shall', 'sow term', 'service sow', 'defined sow', 
                'specified sow', 'outlined sow', 'addendum sow', 'client sow', 'sow agreement', 
                'statement work effective', 'sow end date', 'sow duration']

def make_keyword_lf_sow(keywords, label=SOW):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in sow_keywords:
    labl_functions.append(make_keyword_lf_sow([key]))

#===============NDA===================
@labeling_function()
def nda_regex_lookup(x):
    nda_keywords = ['mutual confidentiality', 'confidentiality agreement', 'disclosure agreement']
    match1 = re.search(r"(disclosure agreement|confidentiality agreement)\s+(\S+\s+){1,30}(by and between|by and among|between|among)(.+?) and (.+?)", x.text)
    
    if match1 and match1.start() < 1000 and any(key in x.text for key in nda_keywords):
        return NDA
    return ABSTAIN

labl_functions.append(nda_regex_lookup)

nda_keywords = ['mutual confidentiality', 'affiliated entity', 'agreement negotiation', 'disclosure hereunder', 
                'mutual confidentiality agreement', 'non confidential basis', 'confidential information agent', 
                'confidentiality non disclosure', 'party certain confidential information',
                'party desire disclose party', 'party wish protect','party furnish']

def make_keyword_lf_nda(keywords, label=NDA):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in nda_keywords:
    labl_functions.append(make_keyword_lf_nda([key]))
    

#===============OTHERS===================
@labeling_function()
def others_lookup(x):
    msa = msa_regex_lookup(x)
    sow = sow_regex_lookup(x)
    addendum = addendum_regex_lookup(x)
    nda = nda_regex_lookup(x)
    
    if msa == ABSTAIN and sow == ABSTAIN and addendum == ABSTAIN and nda == ABSTAIN:
        return OTHERS
    return ABSTAIN    
    
labl_functions.append(others_lookup)

other_keywords = ['sir madam letter', 'letter inform', 'engagement letter', 'service order form',
                  'change request form', 'signature form', 'agreement service order', 'service component order', 
                  'term service order', 'component order']

def make_keyword_lf_others(keywords, label=OTHERS):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in other_keywords:
    labl_functions.append(make_keyword_lf_others([key]))
    
@labeling_function()
def others_keyword_lookup(x):
    if all(word not in x.text for word in list(set(msa_keywords + sow_keywords + nda_keywords + addendum_keywords))):
        return OTHERS
    return ABSTAIN

#labl_functions.append(others_keyword_lookup)

print(len(labl_functions))

74


### Apply Label Functions to Corpus

In [19]:
#Apply the label functions to the train and valid sets
applier = PandasLFApplier(lfs=labl_functions)
L_train = applier.apply(df=df_train)
L_dev = applier.apply(df=df_dev)
L_valid = applier.apply(df=df_valid)
L_unlabelled = applier.apply(df=df_unlabeled)

100%|███████████████████████████████████████████████████████████████████████████| 15179/15179 [02:22<00:00, 106.36it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1120/1120 [00:14<00:00, 78.55it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 293/293 [00:03<00:00, 90.72it/s]
100%|███████████████████████████████████████████████████████████████████████████| 15472/15472 [02:25<00:00, 106.24it/s]


In [20]:
#Check the coverage of label functions on train set

from snorkel.labeling import LFAnalysis
LFAnalysis(L=L_train, lfs=labl_functions).lf_summary().sort_values(by='Coverage')

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
keyword_agreement hereby amended,34,[],0.0,0.0,0.0
keyword_duration addendum,21,[],0.0,0.0,0.0
keyword_addendum may executed,31,[],0.0,0.0,0.0
keyword_amendment made entered,33,[],0.0,0.0,0.0
keyword_specified sow,43,[],0.0,0.0,0.0
keyword_term amendment,18,[],0.0,0.0,0.0
keyword_addendum sow,45,[],0.0,0.0,0.0
keyword_outlined sow,44,[],0.0,0.0,0.0
keyword_party desire disclose party,61,[],0.0,0.0,0.0
keyword_party wish protect,62,[],0.0,0.0,0.0


In [21]:
#Check the Coverage and Accuracy of label functions on dev set

LFAnalysis(L=L_dev, lfs=labl_functions).lf_summary(y_dev).sort_values(by='Emp. Acc.')

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
keyword_component order,73,[],0.0,0.0,0.0,0,0,0.0
keyword_addendum entered,20,[],0.0,0.0,0.0,0,0,0.0
keyword_duration addendum,21,[],0.0,0.0,0.0,0,0,0.0
keyword_purpose addendum,22,[],0.0,0.0,0.0,0,0,0.0
keyword_addendum sow,45,[],0.0,0.0,0.0,0,0,0.0
keyword_outlined sow,44,[],0.0,0.0,0.0,0,0,0.0
keyword_specified sow,43,[],0.0,0.0,0.0,0,0,0.0
keyword_addendum made entered,19,[],0.0,0.0,0.0,0,0,0.0
keyword_amendment entered,27,[],0.0,0.0,0.0,0,0,0.0
keyword_amendment executed,29,[],0.0,0.0,0.0,0,0,0.0


### Label Model

In [22]:
from snorkel.labeling import LabelModel
label_model = LabelModel(cardinality=5, verbose=True)
#label_model.fit(L_train, n_epochs=500, lr=0.001, log_freq=50, seed=123)
label_model.fit(L_unlabelled, n_epochs=500, lr=0.001, log_freq=50, seed=123) #this is for gridsearchCV where train and valid split not required

In [23]:
label_model_acc = label_model.score(L=L_dev, Y=y_dev)["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")



Label Model Accuracy:     85.9%


In [24]:
probs_train = label_model.predict_proba(L_train)

In [25]:
probs_valid = label_model.predict_proba(L_valid)

In [26]:
probs_full_train = label_model.predict_proba(L_unlabelled)

### Filter unlabelled data if any

In [37]:
from snorkel.labeling import filter_unlabeled_dataframe

"""df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)"""

#this is for gridsearchCV where train and valid split not required
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_unlabeled, y=probs_full_train, L=L_unlabelled
)

In [38]:
print(df_train_filtered.shape, probs_train_filtered.shape)

(9027, 2) (9027, 5)


In [48]:
df_valid = pd.DataFrame(df_valid, columns=['Text'])
df_valid.head()

Unnamed: 0,Text
0,addendum this software maintenance support and...
1,cc h statement of work schedule a financial s...
2,wolters kluwer certification the undersigned ...
3,wolters kluwer westpoint statement of work fo...
4,wolters kluwer contingent staffing request fo...


In [49]:
y_train = np.argmax(probs_train_filtered, axis=1)
y_valid = np.argmax(probs_valid, axis=1)

In [50]:
len(df_train_filtered), y_train.shape, len(df_valid), y_valid.shape

(9027, (9027,), 293, (293,))

In [51]:
df_train_filtered['label'] = y_train
df_valid['label'] = y_valid



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [52]:
df_train_filtered.to_csv('train.csv', index=False)
df_valid.to_csv('test.csv', index=False)