In [19]:
import os
import re
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english")) 

In [20]:
pd.options.display.max_rows = 4000

### Preprocess

In [6]:
def get_text_start_pos(text):
    pos = 0
    match1 = re.search(r"(addendum|amendment|change request|change order|agreement|sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(by and between|by and among|between|among) (.+?) and (.+?)", text)
    match2 = re.search(r"(addendum|amendment|change request|change order|agreement|sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(effective|dated|entered|executed|made) (.+?) and (.+?)", text)
    match3 = re.search(r"(addendum|amendment|change request|change order|agreement|sow|statement of work|work order|task order)(.+?)(the undersigned)(.+?) and (.+?)", text)
    if match1 and match1.start() < 1000:
        pos = match1.start()
    elif match2 and match2.start() < 1000:
        pos = match2.start()
    elif match3 and match3.start() < 1000:
        pos = match3.start()
    return pos

In [7]:
def preprocess(text):
    #Preprocess                
    text = text.replace('\n',' ').lower()
    
    #Remove non-alpha characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    pos = get_text_start_pos(text)
    text = text[pos:]
    #Remove articles
    #articles = ('a', 'an', 'the')
    #text = ' '.join([t for t in text.split() if t not in articles])

    return text

### Import Labeled Data

In [23]:
df_labeled = pd.read_csv('../datafiles/labeled_data_relabeled.csv') #without removing stop words 
#df_labeled = pd.read_csv('labeled_data.csv') #with removing stop words
df_labeled.head()
print(df_labeled.shape)

(1365, 3)


In [4]:
df_labeled.label.value_counts()

MSA         463
Addendum    314
Others      250
SOW         236
NDA         134
Name: label, dtype: int64

In [24]:
df_labeled[df_labeled['label'] == 'MSA'].head()

Unnamed: 0,filename,text,label
78,D28099.pdf.out.html.txt,addendum to the worldwide services agreement f...,MSA
123,D01983.pdf.out.html.txt,amendment online services and courtlink servic...,MSA
207,D38343.pdf.out.html.txt,addendum this addendum entered into effec ve i...,MSA
296,D01856.pdf.out.html.txt,addendum by signing below this addendum may no...,MSA
322,D05035.pdf.out.html.txt,agreement price is covering a term of year s t...,MSA


In [25]:
#Split labelled data into test and dev sets
import numpy as np
np.random.seed(2019)
msk = np.random.rand(len(df_labeled)) < 0.8

df_dev = df_labeled[msk]
df_test = df_labeled[~msk]

In [26]:
print(df_test.shape, df_dev.shape)

(269, 3) (1096, 3)


In [27]:
df_dev.label.value_counts()

MSA         377
Addendum    214
Others      204
SOW         186
NDA         114
Name: label, dtype: int64

In [28]:
y_test = df_test.label.map({'Addendum': 0, 'MSA': 1, 'SOW': 4, 'NDA': 2, 'Others': 3})
y_test = np.array(y_test)
np.bincount(y_test)

array([64, 91, 19, 55, 40])

In [29]:
y_dev = df_dev.label.map({'Addendum': 0, 'MSA': 1, 'SOW': 4, 'NDA': 2, 'Others': 3})
y_dev = np.array(y_dev)
np.bincount(y_dev)

TypeError: Cannot cast array data from dtype('float64') to dtype('int64') according to the rule 'safe'

In [11]:
df_test.drop('label', axis=1, inplace=True)
df_dev.drop('label', axis=1, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### Import Unlabeled Data

In [12]:
df_unlabeled = pd.read_csv('../datafiles/unlabeled_data_new1.csv') #without removing stop words
#df_unlabeled = pd.read_csv('unlabeled_data.csv') #with removing stop words
df_unlabeled.shape

(15472, 2)

In [13]:
df_unlabeled.tail()

Unnamed: 0,filename,text
15467,D16492.pdf.out.html.txt,ert oep ik ie scope of work corporate mobilit...
15468,D19966.pdf.out.html.txt,w f bof hl lai b statement of work for gss in...
15469,D26253.pdf.out.html.txt,statement of work wolters kluwer united states...
15470,D37991.pdf.out.html.txt,statement of work including all exhibits attac...
15471,D01957.pdf.out.html.txt,statement of work no sow services definitions...


In [14]:
#Split unlabelled data into train and valid sets
np.random.seed(2019)
msk = np.random.rand(len(df_unlabeled)) < 0.02

df_train = df_unlabeled[~msk]
df_valid = df_unlabeled[msk]

In [15]:
print(df_train.shape, df_valid.shape)

(15179, 2) (293, 2)


### Labeling Functions

In [16]:
from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.labeling import PandasLFApplier

ABSTAIN = -1
MSA = 1
SOW = 4
ADDENDUM = 0
NDA = 2
OTHERS = 3

labl_functions = []

def keyword_lookup(x, keywords, label):
    if any(word in x.text.lower() for word in keywords):
        return label
    return ABSTAIN

#===============MSA=======================
@labeling_function()
def msa_regex_lookup(x):    
    nonmsa_keywords = ['sow', 'statement of work', 'addendum', 'amendment', 'confidentiality agreement', 'disclosure agreement']
    match1 = re.search(r"(agreement agreement|master agreement|master services agreement|this agreement)\s+(\S+\s+){1,30}(by and between|by and among|between|among)(.+?) and (.+?)", x.text)
    match2 = re.search(r"(agreement agreement|master agreement|master services agreement|this agreement)\s+(\S+\s+){1,30}(effective)(.+?) and (.+?)", x.text)
    match3 = re.search(r"(agreement agreement|master agreement|master services agreement|this agreement)\s+(\S+\s+){1,30}(the undersigned)(.+?) and (.+?)", x.text)
        
    if (match1 and not(any(key in x.text[:match1.end()] for key in nonmsa_keywords))) \
        or (match2 and not(any(key in x.text[:match2.end()] for key in nonmsa_keywords))) \
        or (match3 and not(any(key in x.text[:match3.end()] for key in nonmsa_keywords))):
        return MSA
    return ABSTAIN

labl_functions.append(msa_regex_lookup)

msa_keywords = ['indemnified party', 'indemnifying party', 'force majeure', 'intellectual industrial', 
                'wk service provider', 'intellectual industrial property', 'industrial property right', 
                'privacy restricted data', 'prior written notice', 'force majeure event', 'subject matter hereof']

def make_keyword_lf_msa(keywords, label=MSA):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label))

for key in msa_keywords:
    labl_functions.append(make_keyword_lf_msa([key]))
    
 
#===============ADDENDUM===================
@labeling_function()
def addendum_regex_lookup(x):
    match1 = re.search(r"(addendum|amendment|change request|change order)\s+(\S+\s+){1,30}(by and between|by and among|between) (.+?) and (.+?)", x.text)
    match2 = re.search(r"(addendum|amendment)\s+(\S+\s+){1,30}(schedule a|effective) (.+?) and (.+?)", x.text)
    match3 = re.search(r"(addendum|amendment) (.+?) (the undersigned) (.+?) and (.+?)", x.text)
    
    if (match1 and match1.start() < 1000) or (match2 and match2.start() < 1000) or (match3 and match3.start() < 1000):
        return ADDENDUM
    return ABSTAIN

labl_functions.append(addendum_regex_lookup)

addendum_keywords = ['rom work', 'addendum number', 'addendum part', 'amendment part',
                     'term addendum', 'term amendment', 'addendum made entered',
                     'addendum entered', 'duration addendum', 
                     'purpose addendum', 
                     'addendum executed', 'subsequent addendum', 'amendment number', 
                     'amendment date', 'amendment entered', 'amendment made', 'amendment executed', 
                     'amendment effective date', 
                     'addendum may executed', 'effective date addendum', 
                     'amendment made entered', 
                     'agreement hereby amended', 'service agreement amendment']

def make_keyword_lf_addendum(keywords, label=ADDENDUM):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in addendum_keywords:
    labl_functions.append(make_keyword_lf_addendum([key]))

#===============SOW===================
@labeling_function()
def sow_regex_lookup(x):    
    nonsow_keywords = ['addendum','amendment']
    match1 = re.search(r"(sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(by and between|by and among|executed by|between|entered into)(.+?) and (.+?)", x.text)
    match2 = re.search(r"(sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(effective) (.+?) and (.+?)", x.text)
    match3 = re.search(r"(sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(the undersigned) (.+?) and (.+?)", x.text)
       
    if (match1 and match1.start() < 1000 and not(any(key in x.text[:match1.end()] for key in nonsow_keywords)) \
        or (match2 and match2.start() < 1000 and not(any(key not in x.text[:match2.end()] for key in nonsow_keywords))) \
        or match3 and match3.start() < 1000 and not(any(key not in x.text[:match3.end()] for key in nonsow_keywords))):
        return SOW
    return ABSTAIN

labl_functions.append(sow_regex_lookup)

sow_keywords = ['sow effective date', 'work sow', 'sow shall', 'sow term', 'service sow', 'defined sow', 
                'specified sow', 'outlined sow', 'addendum sow', 'client sow', 'sow agreement', 
                'statement work effective', 'sow end date', 'sow duration']

def make_keyword_lf_sow(keywords, label=SOW):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in sow_keywords:
    labl_functions.append(make_keyword_lf_sow([key]))

#===============NDA===================
@labeling_function()
def nda_regex_lookup(x):
    nda_keywords = ['mutual confidentiality', 'confidentiality agreement', 'disclosure agreement']
    match1 = re.search(r"(disclosure agreement|confidentiality agreement)\s+(\S+\s+){1,30}(by and between|by and among|between|among)(.+?) and (.+?)", x.text)
    
    if match1 and match1.start() < 1000 and any(key in x.text for key in nda_keywords):
        return NDA
    return ABSTAIN

labl_functions.append(nda_regex_lookup)

nda_keywords = ['mutual confidentiality', 'affiliated entity', 'agreement negotiation', 'disclosure hereunder', 
                'mutual confidentiality agreement', 'non confidential basis', 'confidential information agent', 
                'confidentiality non disclosure', 'party certain confidential information',
                'party desire disclose party', 'party wish protect','party furnish']

def make_keyword_lf_nda(keywords, label=NDA):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in nda_keywords:
    labl_functions.append(make_keyword_lf_nda([key]))
    

#===============OTHERS===================
@labeling_function()
def others_lookup(x):
    msa = msa_regex_lookup(x)
    sow = sow_regex_lookup(x)
    addendum = addendum_regex_lookup(x)
    nda = nda_regex_lookup(x)
    
    if msa == ABSTAIN and sow == ABSTAIN and addendum == ABSTAIN and nda == ABSTAIN:
        return OTHERS
    return ABSTAIN    
    
labl_functions.append(others_lookup)

other_keywords = ['sir madam letter', 'letter inform', 'engagement letter', 'service order form',
                  'change request form', 'signature form', 'agreement service order', 'service component order', 
                  'term service order', 'component order']

def make_keyword_lf_others(keywords, label=OTHERS):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in other_keywords:
    labl_functions.append(make_keyword_lf_others([key]))
    
@labeling_function()
def others_keyword_lookup(x):
    if all(word not in x.text for word in list(set(msa_keywords + sow_keywords + nda_keywords + addendum_keywords))):
        return OTHERS
    return ABSTAIN

#labl_functions.append(others_keyword_lookup)

print(len(labl_functions))

75


### Apply Label Functions to Corpus

In [17]:
#Apply the label functions to the train and valid sets
applier = PandasLFApplier(lfs=labl_functions)
#L_train = applier.apply(df=df_train)
L_dev = applier.apply(df=df_dev)
#L_valid = applier.apply(df=df_valid)
#L_unlabelled = applier.apply(df=df_unlabeled)

100%|██████████| 1120/1120 [00:13<00:00, 82.15it/s]


In [18]:
L_dev

array([[-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       ...,
       [ 1,  1,  1, ..., -1, -1, -1],
       [ 1,  1,  1, ..., -1, -1, -1],
       [ 1,  1, -1, ..., -1, -1, -1]])

In [21]:
#Check the coverage of label functions on train set

from snorkel.labeling import LFAnalysis
LFAnalysis(L=L_train, lfs=labl_functions).lf_summary().sort_values(by='Coverage')

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
keyword_party wish protect,62,[],0.0,0.0,0.0
keyword_agreement hereby amended,34,[],0.0,0.0,0.0
keyword_addendum made entered,19,[],0.0,0.0,0.0
keyword_amendment made entered,33,[],0.0,0.0,0.0
keyword_party desire disclose party,61,[],0.0,0.0,0.0
keyword_addendum may executed,31,[],0.0,0.0,0.0
keyword_sir madam letter,65,[],0.0,0.0,0.0
keyword_letter inform,66,[],0.0,0.0,0.0
keyword_duration addendum,21,[],0.0,0.0,0.0
keyword_specified sow,43,[],0.0,0.0,0.0


In [22]:
#Check the Coverage and Accuracy of label functions on dev set

LFAnalysis(L=L_dev, lfs=labl_functions).lf_summary(y_dev).sort_values(by='Emp. Acc.')

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
keyword_component order,74,[],0.0,0.0,0.0,0,0,0.0
keyword_addendum entered,20,[],0.0,0.0,0.0,0,0,0.0
keyword_duration addendum,21,[],0.0,0.0,0.0,0,0,0.0
keyword_purpose addendum,22,[],0.0,0.0,0.0,0,0,0.0
keyword_statement work effective,48,[],0.0,0.0,0.0,0,0,0.0
keyword_addendum sow,45,[],0.0,0.0,0.0,0,0,0.0
keyword_outlined sow,44,[],0.0,0.0,0.0,0,0,0.0
keyword_amendment entered,27,[],0.0,0.0,0.0,0,0,0.0
keyword_specified sow,43,[],0.0,0.0,0.0,0,0,0.0
keyword_amendment executed,29,[],0.0,0.0,0.0,0,0,0.0


### Label Model

In [23]:
from snorkel.labeling import LabelModel
label_model = LabelModel(cardinality=5, verbose=True)
#label_model.fit(L_train, n_epochs=500, lr=0.001, log_freq=50, seed=123)
label_model.fit(L_unlabelled, n_epochs=500, lr=0.001, log_freq=50, seed=123) #this is for gridsearchCV where train and valid split not required

In [24]:
label_model_acc = label_model.score(L=L_dev, Y=y_dev)["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

W1126 10:12:42.424433 140103858988864 label_model.py:501] Metrics calculated over data points with non-abstain labels only


Label Model Accuracy:     83.2%


In [25]:
probs_train = label_model.predict_proba(L_train)
probs_train

array([[1.29169205e-01, 1.13923146e-01, 1.45825600e-01, 4.70655391e-01,
        1.40426658e-01],
       [5.80389208e-02, 7.38499149e-01, 1.36270172e-01, 1.08979427e-05,
        6.71808610e-02],
       [1.29169205e-01, 1.13923146e-01, 1.45825600e-01, 4.70655391e-01,
        1.40426658e-01],
       ...,
       [3.42098974e-21, 9.99999999e-01, 1.90871293e-25, 2.12926821e-19,
        1.34091042e-09],
       [1.30362784e-02, 1.35354423e-02, 5.98373991e-03, 1.52538422e-06,
        9.67443014e-01],
       [7.10104235e-03, 1.44825894e-02, 3.46110212e-03, 5.07593464e-03,
        9.69879331e-01]])

In [26]:
probs_valid = label_model.predict_proba(L_valid)
probs_valid

array([[4.17405758e-01, 4.62244464e-01, 3.48237479e-02, 1.14811527e-05,
        8.55145486e-02],
       [1.29169205e-01, 1.13923146e-01, 1.45825600e-01, 4.70655391e-01,
        1.40426658e-01],
       [1.29169205e-01, 1.13923146e-01, 1.45825600e-01, 4.70655391e-01,
        1.40426658e-01],
       ...,
       [9.76592663e-01, 9.84246506e-04, 8.95563490e-03, 1.06297994e-09,
        1.34674547e-02],
       [1.29169205e-01, 1.13923146e-01, 1.45825600e-01, 4.70655391e-01,
        1.40426658e-01],
       [1.60941190e-03, 1.74891545e-03, 4.40019780e-04, 7.22597434e-08,
        9.96201581e-01]])

In [27]:
#this is for gridsearchCV where train and valid split not required
probs_full_train = label_model.predict_proba(L_unlabelled)
probs_full_train

array([[1.29169205e-01, 1.13923146e-01, 1.45825600e-01, 4.70655391e-01,
        1.40426658e-01],
       [5.80389208e-02, 7.38499149e-01, 1.36270172e-01, 1.08979427e-05,
        6.71808610e-02],
       [1.29169205e-01, 1.13923146e-01, 1.45825600e-01, 4.70655391e-01,
        1.40426658e-01],
       ...,
       [3.42098974e-21, 9.99999999e-01, 1.90871293e-25, 2.12926821e-19,
        1.34091042e-09],
       [1.30362784e-02, 1.35354423e-02, 5.98373991e-03, 1.52538422e-06,
        9.67443014e-01],
       [7.10104235e-03, 1.44825894e-02, 3.46110212e-03, 5.07593464e-03,
        9.69879331e-01]])

### Filter unlaabeled data if any

In [28]:
from snorkel.labeling import filter_unlabeled_dataframe

"""df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)"""

#this is for gridsearchCV where train and valid split not required
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_unlabeled, y=probs_full_train, L=L_unlabelled
)

In [29]:
print(df_train_filtered.shape, probs_train_filtered.shape)

(15472, 2) (15472, 5)


In [30]:
df_train_filtered = df_train_filtered.text.tolist()
y_train = np.argmax(probs_train_filtered, axis=1)
df_valid = df_valid.text.tolist()
y_valid = np.argmax(probs_valid, axis=1)

In [31]:
len(df_train_filtered), y_train.shape, len(df_valid), y_valid.shape

(15472, (15472,), 293, (293,))

### Training

In [32]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_features = 512 # cut texts after this number of words

# prepare tokenizer
t = Tokenizer(num_words=10000)
t.fit_on_texts(df_train_filtered)
t.fit_on_texts(df_valid)
post_seq_train = t.texts_to_sequences(df_train_filtered)
post_seq_valid = t.texts_to_sequences(df_valid)
X_train = pad_sequences(post_seq_train, maxlen=max_features, padding='post')
X_valid = pad_sequences(post_seq_valid, maxlen=max_features, padding='post')

Using TensorFlow backend.


In [34]:
print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)
#print(X_train.shape, y_train.shape)

(15472, 512) (15472,) (293, 512) (293,)


In [35]:
word_index = t.word_index
len(word_index)

401227

In [58]:
#Import Embeddings and create embedding dict
embeddings_index = {}
f = open('glove.6B/glove.6B.300d.txt', encoding="utf8")
#f = open('../w2v_embedding/cms_word2vec_embedding_300.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [59]:
from keras.layers import Embedding
from keras.initializers import Constant

#Create embedding layer
EMBEDDING_DIM = 300
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_features,
                            trainable=False)

In [78]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=100)
X_train = vectorizer.fit_transform(df_train_filtered)

X_dev = vectorizer.transform(df_dev.text.tolist())
X_valid = vectorizer.transform(df_valid)
X_test = vectorizer.transform(df_labeled.text.tolist())

In [89]:
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Embedding, LSTM, Bidirectional, Activation
from keras.layers.normalization import BatchNormalization

batch_size = 32

model = Sequential()
model.add(Embedding(len(word_index), 128, input_length=100))
#model.add(Embedding(len(word_index), 128, input_length=max_features, weights=[embedding_matrix], trainable=False))
model.add(Bidirectional(LSTM(256)))
model.add(BatchNormalization())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(5, activation='softmax'))

"""
inputs = Input(shape=(max_features,), dtype='int32')
embed = embedding_layer(inputs)
x = Bidirectional(LSTM(256))(embed)
x = BatchNormalization()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(5, activation='softmax')(x)
model = Model(inputs = inputs, outputs = x)"""

model.summary()

"\ninputs = Input(shape=(max_features,), dtype='int32')\nembed = embedding_layer(inputs)\nx = Bidirectional(LSTM(256))(embed)\nx = BatchNormalization()(x)\nx = Dense(128, activation='relu')(x)\nx = Dropout(0.2)(x)\nx = Dense(5, activation='softmax')(x)\nmodel = Model(inputs = inputs, outputs = x)"

In [90]:
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
lr = 3e-4
epochs = 25
#decay=lr/epochs
adam = Adam(lr=lr)

model.compile(adam, 'categorical_crossentropy', metrics=['accuracy'])

In [91]:
model_chkpoint = ModelCheckpoint(filepath='best_model.hdf5', save_best_only=True, save_weights_only=True)

In [92]:
reduce_lr = ReduceLROnPlateau(monitor='val_acc', 
                                patience=3, 
                                verbose=1, 
                                factor=0.2, 
                                min_lr=1e-7)

In [93]:
import pandas as pd
T_train = pd.get_dummies(y_train)
T_valid = pd.get_dummies(y_valid)

In [94]:
model.fit(X_train, T_train,
          batch_size=batch_size,
          epochs=epochs,
          shuffle=True, 
          validation_split=0.05,
          callbacks=[reduce_lr, model_chkpoint])

Train on 14698 samples, validate on 774 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25

Epoch 00010: ReduceLROnPlateau reducing learning rate to 6.000000284984708e-05.
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25

Epoch 00018: ReduceLROnPlateau reducing learning rate to 1.2000000424450263e-05.
Epoch 19/25
Epoch 20/25
Epoch 21/25

Epoch 00021: ReduceLROnPlateau reducing learning rate to 2.4000000848900527e-06.
Epoch 22/25
Epoch 23/25
Epoch 24/25

Epoch 00024: ReduceLROnPlateau reducing learning rate to 4.800000169780105e-07.
Epoch 25/25


<keras.callbacks.History at 0x7f69e9eed550>

In [None]:
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Embedding, LSTM, Bidirectional
from utils import get_keras_early_stopping
from keras.optimizers import Adam

def create_model(lr=0.001):
    # create model
    model = Model()
    inputs = Input(shape=(max_features,), dtype='int32')
    embed = embedding_layer(inputs)
    x = Bidirectional(LSTM(64))(embed)
    x = Dropout(0.5)(x)
    x = Dense(5, activation='softmax')(x)
    model = Model(inputs = inputs, outputs = x)
    
    adam = Adam(lr=lr)
    
    # Compile model
    model.compile(adam, 'categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
model = KerasClassifier(build_fn=create_model, batch_size=32, epochs=15, verbose=0)

In [None]:
from sklearn.model_selection import GridSearchCV

# define the grid search parameters
#batch_size = [10, 20, 40, 60]
#epochs = [10, 25, 50]
lr = [0.001, 0.01, 0.1]
param_grid = dict(lr=lr)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, cv=5,  verbose=10)
grid_result = grid.fit(X_train, y_train)

In [None]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

### Predictions

In [95]:
model.load_weights('best_model.hdf5')
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 100, 128)          51357056  
_________________________________________________________________
bidirectional_7 (Bidirection (None, 512)               788480    
_________________________________________________________________
batch_normalization_7 (Batch (None, 512)               2048      
_________________________________________________________________
dense_13 (Dense)             (None, 128)               65664     
_________________________________________________________________
dropout_7 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 5)                 645       
Total params: 52,213,893
Trainable params: 52,212,869
Non-trainable params: 1,024
______________________________________

In [96]:
"""text_pred = df_labeled.text.tolist()
t.fit_on_texts(text_pred)
post_seq_test = t.texts_to_sequences(text_pred)
x_test = pad_sequences(post_seq_test, maxlen=max_features, padding='post')
probs_test = model.predict(x_test)"""
probs_test = model.predict(X_test)

In [97]:
labels_test = df_labeled.label.map({'Addendum': 0, 'MSA': 1, 'SOW': 4, 'NDA': 2, 'Others': 3})
len(labels_test)

1397

In [98]:
from snorkel.analysis import metric_score
test_acc = metric_score(golds=labels_test, preds=probs_test.argmax(axis=1), metric="accuracy")
print(f"Test Accuracy: {test_acc * 100:.1f}%")

Test Accuracy: 77.8%


### Save Model

In [None]:
from keras.callbacks.callbacks import ModelCheckpoint
filepath = 'models/lstm.hdf5'
checkpointer = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)

### Save Model Probabilities and Labels

In [None]:
train_pred = df_unlabeled.text.tolist()
t.fit_on_texts(train_pred)
post_seq_test = t.texts_to_sequences(train_pred)
x_train = pad_sequences(post_seq_test, maxlen=max_features, padding='post')
probs_train = model.predict(x_train)

In [None]:
len(probs_train)

In [None]:
labels_train = label_model.predict(L_unlabelled)
labels_train

In [None]:
print(len(labels_train), len(labels_test))
print(len(probs_train), len(probs_test))

In [None]:
probs_latent_train = list(zip(probs_train.tolist(), labels_train.tolist()))
probs_latent_test = list(zip(probs_test.tolist(), labels_test.tolist()))
probs_latent_test

In [None]:
df_probs_train = pd.DataFrame(probs_latent_train, columns=['probabilities', 'label'])
df_probs_test = pd.DataFrame(probs_latent_test, columns=['probabilities', 'label'])

In [None]:
df_probs_test.head()

In [None]:
df_probs_train.to_csv('probabilities_train.csv', index = None)
df_probs_test.to_csv('probabilities_test.csv', index = None)