In [1]:
import os
import re
import pandas as pd
import numpy as np

### Load Unlabeled Data

In [3]:
df_unlabeled = pd.read_csv('../csv/unlabeled_data_new2.csv')
df_unlabeled.shape

(15505, 2)

In [4]:
df_unlabeled.head()

Unnamed: 0,filename,text
0,D36051.pdf.out.html.txt,janet ley approval sow mcw dba dmi mobility s...
1,D07271.pdf.out.html.txt,agreement this schedule a this schedule is att...
2,D28723.pdf.out.html.txt,wolters kluwer contingent staffing request fo...
3,D42247.pdf.out.html.txt,agreement received this agreement is entered i...
4,D19377.pdf.out.html.txt,addendum to hosting and services agreement thi...


### Load Labeled Data

In [5]:
df_labeled = pd.read_csv('../csv/labeled_data_relabeled.csv')
df_labeled.shape

(1364, 3)

In [6]:
df_labeled.head()

Unnamed: 0,filename,text,label
0,D00152.pdf.out.html.txt,addendum no to master services agreement this ...,Addendum
1,D16833.pdf.out.html.txt,addendum no to master services agreement this ...,Addendum
2,D08368.pdf.out.html.txt,agreement i to add effective as of january the...,Addendum
3,D38435.pdf.out.html.txt,amendment number one to work order this amendm...,Addendum
4,D00918.pdf.out.html.txt,agreement and or enrollment identified above t...,Addendum


### Regex Functions

In [22]:
def msa_regex_lookup(x):    
    nonmsa_keywords = ['sow', 'statement of work', 'addendum', 'amendment', 'confidentiality agreement', 'disclosure agreement']
    match1 = re.search(r"(agreement agreement|master agreement|master services agreement|this agreement)\s+(\S+\s+){1,30}(by and between|by and among|between|among)(.+?) and (.+?)", x.text)
    match2 = re.search(r"(agreement agreement|master agreement|master services agreement|this agreement)\s+(\S+\s+){1,30}(effective)(.+?) and (.+?)", x.text)
    match3 = re.search(r"(agreement agreement|master agreement|master services agreement|this agreement)\s+(\S+\s+){1,30}(the undersigned)(.+?) and (.+?)", x.text)
        
    if (match1 and not(any(key in x.text[:match1.end()] for key in nonmsa_keywords))) \
        or (match2 and not(any(key in x.text[:match2.end()] for key in nonmsa_keywords))) \
        or (match3 and not(any(key in x.text[:match3.end()] for key in nonmsa_keywords))):
        return 1
    return 0

def addendum_regex_lookup(x):
    match1 = re.search(r"(addendum|amendment|change request|change order)\s+(\S+\s+){1,30}(by and between|by and among|between) (.+?) and (.+?)", x.text)
    match2 = re.search(r"(addendum|amendment)\s+(\S+\s+){1,30}(schedule a|effective) (.+?) and (.+?)", x.text)
    match3 = re.search(r"(addendum|amendment) (.+?) (the undersigned) (.+?) and (.+?)", x.text)
    
    if (match1 and match1.start() < 1000) or (match2 and match2.start() < 1000) or (match3 and match3.start() < 1000):
        return 1
    return 0

def sow_regex_lookup(x):    
    nonsow_keywords = ['addendum','amendment']
    match1 = re.search(r"(sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(by and between|by and among|executed by|between|entered into)(.+?) and (.+?)", x.text)
    match2 = re.search(r"(sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(effective) (.+?) and (.+?)", x.text)
    match3 = re.search(r"(sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(the undersigned) (.+?) and (.+?)", x.text)
       
    if (match1 and match1.start() < 1000 and not(any(key in x.text[:match1.end()] for key in nonsow_keywords)) \
        or (match2 and match2.start() < 1000 and not(any(key not in x.text[:match2.end()] for key in nonsow_keywords))) \
        or match3 and match3.start() < 1000 and not(any(key not in x.text[:match3.end()] for key in nonsow_keywords))):
        return 1
    return 0

def nda_regex_lookup(x):
    nda_keywords = ['mutual confidentiality', 'confidentiality agreement', 'disclosure agreement']
    match1 = re.search(r"(disclosure agreement|confidentiality agreement)\s+(\S+\s+){1,30}(by and between|by and among|between|among)(.+?) and (.+?)", x.text)
    
    if match1 and match1.start() < 1000 and any(key in x.text for key in nda_keywords):
        return 1
    return 0

def others_lookup(x):
    msa = msa_regex_lookup(x)
    sow = sow_regex_lookup(x)
    addendum = addendum_regex_lookup(x)
    nda = nda_regex_lookup(x)
    
    if msa == 0 and sow == 0 and addendum == 0 and nda == 0:
        return 1
    return 0    

### Keywords

In [23]:
msa_keywords = ['indemnified party', 'indemnifying party', 'force majeure', 'industrial property right', 'privacy restricted data', 
                'prior written notice', 'subject matter hereof']

addendum_keywords = ['addendum number', 'addendum date', 'addendum effective date',
                     'term of addendum', 'term of amendment', 'addendum made',
                     'addendum entered', 'duration of the addendum', 'purpose of the addendum', 
                     'subsequent addendum', 'amendment number', 'amendment date', 'amendment entered', 
                     'amendment made', 'amendment executed', 'amendment effective date', 
                     'agreement hereby amended', 'service agreement amendment']

sow_keywords = ['sow effective date', 'work sow', 'sow shall', 'sow term', 'client sow', 
                'sow agreement', 'statement of work effective', 'sow end date', 'sow duration']

nda_keywords = ['mutual confidentiality', 'affiliated entity', 'agreement negotiation', 'disclosure hereunder', 
                'mutual confidentiality agreement', 'non confidential basis', 'confidential information agent', 
                'confidentiality non disclosure', 'party certain confidential information',
                'party furnish']

other_keywords = ['sir madam', 'letter to inform', 'engagement letter', 'service order form',
                  'change request form', 'signature form', 'agreement service order', 'service component order', 
                  'component order', 'editorial service order']

### Create Latent Features

In [24]:
#Unlabelled Dataset

#Initialize empty dataframe
df_latent_train = pd.DataFrame()

for index, row in df_unlabeled.iterrows():    
    msa_keywords_count = 0
    sow_keywords_count = 0
    nda_keywords_count = 0
    addendum_keywords_count = 0
    others_keywords_count = 0
    
    dict_latent = {}
    dict_latent['DocID'] = row["filename"]       
           
    #============MSA=============
        
    #Check for matching MSA keywords
    for key in msa_keywords:
        if key in row['text']:            
            dict_latent[key] = 1             
            msa_keywords_count += 1
        else:
            dict_latent[key] = 0
    
    dict_latent['msa_keywords_count'] = msa_keywords_count
    
    #Call MSA regex lookup function
    dict_latent['msa_regex_lookup'] = msa_regex_lookup(row)
                
    #============SOW=============
            
    #Check for matching SOW keywords
    for key in sow_keywords:
        if key in row['text']:            
            dict_latent[key] = 1
            sow_keywords_count += 1
        else:
            dict_latent[key] = 0
    
    dict_latent['sow_keywords_count'] = sow_keywords_count
    
    #Call SOW regex lookup function
    dict_latent['sow_regex_lookup'] = sow_regex_lookup(row)
                
    #==========ADDENDUM=============
            
    #Check for matching Addnedum keywords
    for key in addendum_keywords:
        if key in row['text']:            
            dict_latent[key] = 1
            addendum_keywords_count += 1
        else:
            dict_latent[key] = 0
    
    dict_latent['addendum_keywords_count'] = addendum_keywords_count
        
    #Call Addendum regex lookup function
    dict_latent['addendum_regex_lookup'] = addendum_regex_lookup(row)        
                
    #============NDA=============
        
    #Check for matching NDA keywords
    for key in nda_keywords:
        if key in row['text']:            
            dict_latent[key] = 1
            nda_keywords_count += 1
        else:
            dict_latent[key] = 0
    
    dict_latent['nda_keywords_count'] = nda_keywords_count
    
    #Call NDA regex lookup function
    dict_latent['nda_regex_lookup'] = nda_regex_lookup(row)
    
    #============Others===========
    
    #Check for matching Others keywords
    for key in other_keywords:
        if key in row['text']:            
            dict_latent[key] = 1
            others_keywords_count += 1
        else:
            dict_latent[key] = 0

    dict_latent['others_keywords_count'] = others_keywords_count
    
    #Call Others regex lookup function
    dict_latent['others_lookup'] = others_lookup(row)                
    #break
    
    #Append dictionary to the DataFrame
    df_latent_train = df_latent_train.append(dict_latent, ignore_index = True)    

#fill NaNs with 0
df_latent_train.fillna(0, inplace=True)

In [25]:
df_latent_train.shape

(15505, 65)

In [26]:
df_latent_train.head()

Unnamed: 0,DocID,addendum date,addendum effective date,addendum entered,addendum made,addendum number,addendum_keywords_count,addendum_regex_lookup,affiliated entity,agreement hereby amended,...,sow shall,sow term,sow_keywords_count,sow_regex_lookup,statement of work effective,subject matter hereof,subsequent addendum,term of addendum,term of amendment,work sow
0,D36051.pdf.out.html.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,D07271.pdf.out.html.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,D28723.pdf.out.html.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,D42247.pdf.out.html.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,D19377.pdf.out.html.txt,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
#Labeled Dataset

#Initialize empty dataframe
df_latent_test = pd.DataFrame()

for index, row in df_labeled.iterrows():    
    msa_keywords_count = 0
    sow_keywords_count = 0
    nda_keywords_count = 0
    addendum_keywords_count = 0
    others_keywords_count = 0
    
    dict_latent = {}
    dict_latent['DocID'] = row["filename"]       
           
    #============MSA=============
        
    #Check for matching MSA keywords
    for key in msa_keywords:
        if key in row['text']:            
            dict_latent[key] = 1             
            msa_keywords_count += 1
        else:
            dict_latent[key] = 0
    
    dict_latent['msa_keywords_count'] = msa_keywords_count
    
    #Call MSA regex lookup function
    dict_latent['msa_regex_lookup'] = msa_regex_lookup(row)
                
    #============SOW=============
            
    #Check for matching SOW keywords
    for key in sow_keywords:
        if key in row['text']:            
            dict_latent[key] = 1
            sow_keywords_count += 1
        else:
            dict_latent[key] = 0
    
    dict_latent['sow_keywords_count'] = sow_keywords_count
    
    #Call SOW regex lookup function
    dict_latent['sow_regex_lookup'] = sow_regex_lookup(row)
                
    #==========ADDENDUM=============
            
    #Check for matching Addnedum keywords
    for key in addendum_keywords:
        if key in row['text']:            
            dict_latent[key] = 1
            addendum_keywords_count += 1
        else:
            dict_latent[key] = 0
    
    dict_latent['addendum_keywords_count'] = addendum_keywords_count
        
    #Call Addendum regex lookup function
    dict_latent['addendum_regex_lookup'] = addendum_regex_lookup(row)        
                
    #============NDA=============
        
    #Check for matching NDA keywords
    for key in nda_keywords:
        if key in row['text']:            
            dict_latent[key] = 1
            nda_keywords_count += 1
        else:
            dict_latent[key] = 0
    
    dict_latent['nda_keywords_count'] = nda_keywords_count
    
    #Call NDA regex lookup function
    dict_latent['nda_regex_lookup'] = nda_regex_lookup(row)
    
    #============Others===========
    
    #Check for matching Others keywords
    for key in other_keywords:
        if key in row['text']:            
            dict_latent[key] = 1
            others_keywords_count += 1
        else:
            dict_latent[key] = 0

    dict_latent['others_keywords_count'] = others_keywords_count
    
    #Call Others regex lookup function
    dict_latent['others_lookup'] = others_lookup(row)                
    #break
    
    #Append dictionary to the DataFrame
    df_latent_test = df_latent_test.append(dict_latent, ignore_index = True)

#fill NaNs with 0
df_latent_test.fillna(0, inplace=True)

In [28]:
print(df_latent_test.shape)

(1364, 65)


In [29]:
df_latent_test.head()

Unnamed: 0,DocID,addendum date,addendum effective date,addendum entered,addendum made,addendum number,addendum_keywords_count,addendum_regex_lookup,affiliated entity,agreement hereby amended,...,sow shall,sow term,sow_keywords_count,sow_regex_lookup,statement of work effective,subject matter hereof,subsequent addendum,term of addendum,term of amendment,work sow
0,D00152.pdf.out.html.txt,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,D16833.pdf.out.html.txt,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,D08368.pdf.out.html.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,D38435.pdf.out.html.txt,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,D00918.pdf.out.html.txt,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
df_latent_train.to_csv('latent_features_train.csv', index=None)
df_latent_test.to_csv('latent_features_test.csv', index=None)