In [85]:
import pandas as pd
import re
import tensorflow as tf

In [87]:
df_train = pd.read_csv('../lstm/datafiles/train.csv')
df_train.head()

Unnamed: 0,filename,text,label
0,D36051.pdf.out.html.txt,janet ley approval sow mcw dba dmi mobility s...,3
1,D07271.pdf.out.html.txt,agreement this schedule a this schedule is att...,1
2,D28723.pdf.out.html.txt,wolters kluwer contingent staffing request fo...,3
3,D42247.pdf.out.html.txt,agreement received this agreement is entered i...,2
4,D19377.pdf.out.html.txt,addendum to hosting and services agreement thi...,0


In [88]:
df_train_filtered = df_train.text.tolist()

In [89]:
df_pred = pd.read_csv('linkage_files.csv')
df_pred.head()

Unnamed: 0,filename,text
0,D00738.txt,statement of work sys...
1,D00862.txt,statement of work m...
2,D03070.txt,cch incorporated 1 master consulting se...
3,D04125.txt,cch incorporated 14 master services agr...
4,D03866.txt,statement of work m...


In [90]:
df_pred.shape

(46, 2)

In [91]:
def get_text_start_pos(text):
    pos = 0
    match1 = re.search(r"(addendum|amendment|change request|change order|agreement|sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(by and between|by and among|between|among) (.+?) and (.+?)", text)
    match2 = re.search(r"(addendum|amendment|change request|change order|agreement|sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(effective|dated|entered|executed|made) (.+?) and (.+?)", text)
    match3 = re.search(r"(addendum|amendment|change request|change order|agreement|sow|statement of work|work order|task order)(.+?)(the undersigned)(.+?) and (.+?)", text)
    if match1 and match1.start() < 1000:
        pos = match1.start()
    elif match2 and match2.start() < 1000:
        pos = match2.start()
    elif match3 and match3.start() < 1000:
        pos = match3.start()
    return pos

In [92]:
def preprocess(text):
    #Preprocess                
    text = text.replace('\n',' ').lower()
    
    #Remove non-alpha characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    pos = get_text_start_pos(text)
    text = text[pos:]
    #Remove articles
    #articles = ('a', 'an', 'the')
    #text = ' '.join([t for t in text.split() if t not in articles])

    return text

In [93]:
df_pred['text'] = df_pred.text.apply(preprocess)

In [94]:
df_pred.loc[1].text

'statement of work mps practice and worksteam dba s cost center between cch incorporated mphasis bfl limited december statement of work mps cch incorporated page of this statement of work sow dated december is part of the master service agreement msa executed between cch incorporated client and mphasis bfl limited consultant and is subject to the terms and conditions of the msa between client and consultant made as of january except to the extent expressly provided otherwise in this sow all the terms of the msa are incorporated by reference into this sow in the event of any inconsistent or contradictory terms between the msa and the sow the terms of the msa will control and supersede such inconsistent or contradictory terms included in the msa any terms used specifically in the sow and not otherwise defined in the msa will be defined in the sow additional terms are defined in this sow project name practice and worksteam dba s statement of work mps cch incorporated page of table of cont

In [95]:
def msa_regex_lookup(x):    
    nonmsa_keywords = ['sow', 'statement of work', 'addendum', 'amendment', 'confidentiality agreement', 'disclosure agreement']
    match1 = re.search(r"(agreement agreement|master agreement|master services agreement|this agreement)\s+(\S+\s+){1,30}(by and between|by and among|between|among)(.+?) and (.+?)", x.text)
    match2 = re.search(r"(agreement agreement|master agreement|master services agreement|this agreement)\s+(\S+\s+){1,30}(effective)(.+?) and (.+?)", x.text)
    match3 = re.search(r"(agreement agreement|master agreement|master services agreement|this agreement)\s+(\S+\s+){1,30}(the undersigned)(.+?) and (.+?)", x.text)
        
    if (match1 and not(any(key in x.text[:match1.end()] for key in nonmsa_keywords))) \
        or (match2 and not(any(key in x.text[:match2.end()] for key in nonmsa_keywords))) \
        or (match3 and not(any(key in x.text[:match3.end()] for key in nonmsa_keywords))):
        return 1
    return 0

def addendum_regex_lookup(x):
    match1 = re.search(r"(addendum|amendment|change request|change order)\s+(\S+\s+){1,30}(by and between|by and among|between) (.+?) and (.+?)", x.text)
    match2 = re.search(r"(addendum|amendment)\s+(\S+\s+){1,30}(schedule a|effective) (.+?) and (.+?)", x.text)
    match3 = re.search(r"(addendum|amendment) (.+?) (the undersigned) (.+?) and (.+?)", x.text)
    
    if (match1 and match1.start() < 1000) or (match2 and match2.start() < 1000) or (match3 and match3.start() < 1000):
        return 1
    return 0

def sow_regex_lookup(x):    
    nonsow_keywords = ['addendum','amendment']
    match1 = re.search(r"(sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(by and between|by and among|executed by|between|entered into)(.+?) and (.+?)", x.text)
    match2 = re.search(r"(sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(effective) (.+?) and (.+?)", x.text)
    match3 = re.search(r"(sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(the undersigned) (.+?) and (.+?)", x.text)
       
    if (match1 and match1.start() < 1000 and not(any(key in x.text[:match1.end()] for key in nonsow_keywords)) \
        or (match2 and match2.start() < 1000 and not(any(key not in x.text[:match2.end()] for key in nonsow_keywords))) \
        or match3 and match3.start() < 1000 and not(any(key not in x.text[:match3.end()] for key in nonsow_keywords))):
        return 1
    return 0

def nda_regex_lookup(x):
    nda_keywords = ['mutual confidentiality', 'confidentiality agreement', 'disclosure agreement']
    match1 = re.search(r"(disclosure agreement|confidentiality agreement)\s+(\S+\s+){1,30}(by and between|by and among|between|among)(.+?) and (.+?)", x.text)
    
    if match1 and match1.start() < 1000 and any(key in x.text for key in nda_keywords):
        return 1
    return 0

def others_lookup(x):
    msa = msa_regex_lookup(x)
    sow = sow_regex_lookup(x)
    addendum = addendum_regex_lookup(x)
    nda = nda_regex_lookup(x)
    
    if msa == 0 and sow == 0 and addendum == 0 and nda == 0:
        return 1
    return 0    

In [96]:
msa_keywords = ['indemnified party', 'indemnifying party', 'force majeure', 'industrial property right', 'privacy restricted data', 
                'prior written notice', 'subject matter hereof']

addendum_keywords = ['addendum number', 'addendum date', 'addendum effective date',
                     'term of addendum', 'term of amendment', 'addendum made',
                     'addendum entered', 'duration of the addendum', 'purpose of the addendum', 
                     'subsequent addendum', 'amendment number', 'amendment date', 'amendment entered', 
                     'amendment made', 'amendment executed', 'amendment effective date', 
                     'agreement hereby amended', 'service agreement amendment']

sow_keywords = ['sow effective date', 'work sow', 'sow shall', 'sow term', 'client sow', 
                'sow agreement', 'statement of work effective', 'sow end date', 'sow duration']

nda_keywords = ['mutual confidentiality', 'affiliated entity', 'agreement negotiation', 'disclosure hereunder', 
                'mutual confidentiality agreement', 'non confidential basis', 'confidential information agent', 
                'confidentiality non disclosure', 'party certain confidential information',
                'party furnish']

other_keywords = ['sir madam', 'letter to inform', 'engagement letter', 'service order form',
                  'change request form', 'signature form', 'agreement service order', 'service component order', 
                  'component order', 'editorial service order']

In [97]:
#Unlabelled Dataset

#Initialize empty dataframe
df_latent_pred = pd.DataFrame()

for index, row in df_pred.iterrows():    
    msa_keywords_count = 0
    sow_keywords_count = 0
    nda_keywords_count = 0
    addendum_keywords_count = 0
    others_keywords_count = 0
    
    dict_latent = {}
    dict_latent['DocID'] = row["filename"]       
           
    #============MSA=============
        
    #Check for matching MSA keywords
    for key in msa_keywords:
        if key in row['text']:            
            dict_latent[key] = 1             
            msa_keywords_count += 1
        else:
            dict_latent[key] = 0
    
    dict_latent['msa_keywords_count'] = msa_keywords_count
    
    #Call MSA regex lookup function
    dict_latent['msa_regex_lookup'] = msa_regex_lookup(row)
                
    #============SOW=============
            
    #Check for matching SOW keywords
    for key in sow_keywords:
        if key in row['text']:            
            dict_latent[key] = 1
            sow_keywords_count += 1
        else:
            dict_latent[key] = 0
    
    dict_latent['sow_keywords_count'] = sow_keywords_count
    
    #Call SOW regex lookup function
    dict_latent['sow_regex_lookup'] = sow_regex_lookup(row)
                
    #==========ADDENDUM=============
            
    #Check for matching Addnedum keywords
    for key in addendum_keywords:
        if key in row['text']:            
            dict_latent[key] = 1
            addendum_keywords_count += 1
        else:
            dict_latent[key] = 0
    
    dict_latent['addendum_keywords_count'] = addendum_keywords_count
        
    #Call Addendum regex lookup function
    dict_latent['addendum_regex_lookup'] = addendum_regex_lookup(row)        
                
    #============NDA=============
        
    #Check for matching NDA keywords
    for key in nda_keywords:
        if key in row['text']:            
            dict_latent[key] = 1
            nda_keywords_count += 1
        else:
            dict_latent[key] = 0
    
    dict_latent['nda_keywords_count'] = nda_keywords_count
    
    #Call NDA regex lookup function
    dict_latent['nda_regex_lookup'] = nda_regex_lookup(row)
    
    #============Others===========
    
    #Check for matching Others keywords
    for key in other_keywords:
        if key in row['text']:            
            dict_latent[key] = 1
            others_keywords_count += 1
        else:
            dict_latent[key] = 0

    dict_latent['others_keywords_count'] = others_keywords_count
    
    #Call Others regex lookup function
    dict_latent['others_lookup'] = others_lookup(row)                
    #break
    
    #Append dictionary to the DataFrame
    df_latent_pred = df_latent_pred.append(dict_latent, ignore_index = True)    

#fill NaNs with 0
df_latent_pred.fillna(0, inplace=True)

In [98]:
df_latent_pred.head()

Unnamed: 0,DocID,addendum date,addendum effective date,addendum entered,addendum made,addendum number,addendum_keywords_count,addendum_regex_lookup,affiliated entity,agreement hereby amended,...,sow shall,sow term,sow_keywords_count,sow_regex_lookup,statement of work effective,subject matter hereof,subsequent addendum,term of addendum,term of amendment,work sow
0,D00738.txt,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,D00862.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,D03070.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,D04125.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,D03866.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [99]:
df_latent_pred = df_latent_pred.drop(['DocID'], axis=1)
df_latent_pred.shape

(46, 64)

In [100]:
"""from sklearn.feature_extraction.text import CountVectorizer

#Count Vectorizer
vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=512)
x_train = vectorizer.fit_transform(df_train_filtered)
x_pred = vectorizer.transform(df_pred.text.tolist())"""

'from sklearn.feature_extraction.text import CountVectorizer\n\n#Count Vectorizer\nvectorizer = CountVectorizer(ngram_range=(1, 3), max_features=512)\nx_train = vectorizer.fit_transform(df_train_filtered)\nx_pred = vectorizer.transform(df_pred.text.tolist())'

In [101]:
import pickle
import numpy as np
countvect_model_pkl = '/home/user/Shyam/Code/Release_6.0/Dev/Snorkel/DCNN/models/count_vectorizer.pkl'
with open(countvect_model_pkl, 'rb') as f:
    countvect_model = pickle.load(f)

x_pred = countvect_model.transform(df_pred.text.tolist())

In [102]:
x_pred.shape

(46, 512)

In [103]:
max_features = 512
latent_features_size = df_latent_pred.shape[1]
word_index = 386003
EMBEDDING_DIM = 128

In [104]:
from keras.engine import Layer, InputSpec
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Embedding, Conv1D, Activation, ZeroPadding1D, Permute, Reshape, Flatten
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import concatenate

class KMaxPooling(Layer):
    """
    K-max pooling layer that extracts the k-highest activations from a sequence (2nd dimension).
    TensorFlow backend.
    """
    def __init__(self, k=1, **kwargs):
        super().__init__(**kwargs)
        self.input_spec = InputSpec(ndim=3)
        self.k = k

    def compute_output_shape(self, input_shape):
        return (input_shape[0], (input_shape[1] * self.k))

    def call(self, inputs):
        
        # swap last two dimensions since top_k will be applied along the last dimension
        #shifted_input = tf.transpose(inputs, [0, 2, 1])
        
        # extract top_k, returns two tensors [values, indices]
        top_k = tf.nn.top_k(inputs, k=self.k, sorted=True, name=None)[0]
        
        # return flattened output
        return top_k

In [105]:
# two kinds of k's and kernel sizes for each operation
def two_conv_dynamic_cnn(k1 = 12, k2 = 8, ksize1 = 5, ksize2 = 5):
    inputs = Input(shape=(max_features,))
    inputs_latent = Input(shape=(latent_features_size,))
    embed = Embedding(word_index, 128, input_length=512)(inputs)
    conv_results = []
    # two feature maps using for loop
    for i in range(2):
        padded = ZeroPadding1D(ksize1 - 1)(embed)
        conv1 = Conv1D(EMBEDDING_DIM, ksize1, activation = 'relu')(padded)
        permuted = Permute((2,1))(conv1)
        kmaxpool1 = KMaxPooling(k1)(permuted)
        kmaxpool1 = Reshape((k1, -1))(kmaxpool1)
        padded = ZeroPadding1D(ksize2 -1)(kmaxpool1)
        conv2 = Conv1D(EMBEDDING_DIM, ksize2, activation = 'relu')(padded)
        permuted = Permute((2,1))(conv2)
        kmaxpool2 = KMaxPooling(k2)(permuted)
        kmaxpool2 = Reshape((k2, -1))(kmaxpool2)
        flattened = Flatten()(kmaxpool2)
        conv_results.append(flattened)
    x = concatenate(conv_results)
    x = concatenate([x, inputs_latent], axis=1)
    x = BatchNormalization()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    outputs = Dense(5, activation='softmax')(x)
    
    model = Model(inputs = [inputs, inputs_latent], outputs = outputs)
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [106]:
two_conv_dynamic_cnn = two_conv_dynamic_cnn()
two_conv_dynamic_cnn.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_5 (InputLayer)             (None, 512)           0                                            
____________________________________________________________________________________________________
embedding_3 (Embedding)          (None, 512, 128)      49408384    input_5[0][0]                    
____________________________________________________________________________________________________
zero_padding1d_9 (ZeroPadding1D) (None, 520, 128)      0           embedding_3[0][0]                
____________________________________________________________________________________________________
zero_padding1d_11 (ZeroPadding1D (None, 520, 128)      0           embedding_3[0][0]                
___________________________________________________________________________________________

In [107]:
two_conv_dynamic_cnn.load_weights('/home/user/Shyam/Code/Release_6.0/Dev/Snorkel/DCNN/models/dcnn-10epochs-90.0-98.97-99.52.hdf5')

In [108]:
probs_test = two_conv_dynamic_cnn.predict([x_pred, df_latent_pred])

KeyError: "None of [Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,\n            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],\n           dtype='int64')] are in the [columns]"

In [65]:
probs_test.shape

(24063, 5)

In [67]:
df_pred['prediction'] = pd.Series(probs_test.argmax(axis=1)).map({0: 'Addendum', 1: 'MSA', 4: 'SOW', 2: 'NDA', 3: 'Others'})

In [68]:
df_pred.head()

Unnamed: 0,filename,text,prediction
0,D10357.txt,mrted service order form this order can be fa...,Others
1,D17119.txt,e book conversion prices usd vendor name exhi...,Others
2,D33712.txt,statement of work ot between cch and blueswitc...,SOW
3,D40414.txt,addendum to agreement re managed services for ...,Addendum
4,D42859.txt,agreement this independent contractor agreemen...,MSA


In [69]:
df_pred.prediction.value_counts()

Others      12589
MSA          5266
Addendum     2411
SOW          2388
NDA          1409
Name: prediction, dtype: int64

In [70]:
df_pred.to_csv('corpus_predictions.csv', index=None)