In [1]:
import os
import re
import pandas as pd
import numpy as np
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
pd.options.display.max_rows = 4000

### Preprocess

In [3]:
def get_text_start_pos(text):
    pos = 0
    match1 = re.search(r"(addendum|amendment|change request|change order|agreement|sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(by and between|by and among|between|among) (.+?) and (.+?)", text)
    match2 = re.search(r"(addendum|amendment|change request|change order|agreement|sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(effective|dated|entered|executed|made) (.+?) and (.+?)", text)
    match3 = re.search(r"(addendum|amendment|change request|change order|agreement|sow|statement of work|work order|task order)(.+?)(the undersigned)(.+?) and (.+?)", text)
    if match1 and match1.start() < 1000:
        pos = match1.start()
    elif match2 and match2.start() < 1000:
        pos = match2.start()
    elif match3 and match3.start() < 1000:
        pos = match3.start()
    return pos

In [4]:
def preprocess(text):
    #Preprocess                
    text = text.replace('\n',' ').lower()
    
    #Remove non-alpha characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    pos = get_text_start_pos(text)
    text = text[pos:]
    #Remove articles
    #articles = ('a', 'an', 'the')
    #text = ' '.join([t for t in text.split() if t not in articles])

    return text

### Import Labeled Data

In [5]:
df_labeled = pd.read_csv('../datafiles/labeled_data_new1.csv') #without removing stop words 
#df_labeled = pd.read_csv('labeled_data.csv') #with removing stop words
df_labeled.head()
print(df_labeled.shape)

(1397, 3)


In [6]:
df_labeled.label.value_counts()

MSA         463
Addendum    314
Others      250
SOW         236
NDA         134
Name: label, dtype: int64

In [7]:
#Split labelled data into test and dev sets
import numpy as np
msk = np.random.rand(len(df_labeled)) < 0.8

df_dev = df_labeled[msk]
df_test = df_labeled[~msk]

In [8]:
print(df_test.shape, df_dev.shape)

(285, 3) (1112, 3)


In [9]:
df_dev.label.value_counts()

MSA         363
Addendum    253
Others      196
SOW         190
NDA         110
Name: label, dtype: int64

In [10]:
y_test = df_test.label.map({'Addendum': 0, 'MSA': 1, 'SOW': 4, 'NDA': 2, 'Others': 3})
y_test = np.array(y_test)
np.bincount(y_test)

array([ 61, 100,  24,  54,  46])

In [11]:
y_dev = df_dev.label.map({'Addendum': 0, 'MSA': 1, 'SOW': 4, 'NDA': 2, 'Others': 3})
y_dev = np.array(y_dev)
np.bincount(y_dev)

array([253, 363, 110, 196, 190])

In [12]:
df_test.drop('label', axis=1, inplace=True)
df_dev.drop('label', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


### Import Unlabeled Data

In [13]:
df_unlabeled = pd.read_csv('../datafiles/unlabeled_data_new1.csv') #without removing stop words
#df_unlabeled = pd.read_csv('unlabeled_data.csv') #with removing stop words
df_unlabeled.shape

(15472, 2)

In [14]:
df_unlabeled.tail()

Unnamed: 0,filename,text
15467,D16492.pdf.out.html.txt,ert oep ik ie scope of work corporate mobilit...
15468,D19966.pdf.out.html.txt,w f bof hl lai b statement of work for gss in...
15469,D26253.pdf.out.html.txt,statement of work wolters kluwer united states...
15470,D37991.pdf.out.html.txt,statement of work including all exhibits attac...
15471,D01957.pdf.out.html.txt,statement of work no sow services definitions...


In [15]:
#Split unlabelled data into train and valid sets
import numpy as np
msk = np.random.rand(len(df_unlabeled)) < 0.02

df_train = df_unlabeled[~msk]
df_valid = df_unlabeled[msk]

In [16]:
print(df_train.shape, df_valid.shape)

(15164, 2) (308, 2)


### Labeling Functions

In [17]:
from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.labeling import PandasLFApplier

ABSTAIN = -1
MSA = 1
SOW = 4
ADDENDUM = 0
NDA = 2
OTHERS = 3

labl_functions = []

def keyword_lookup(x, keywords, label):
    if any(word in x.text.lower() for word in keywords):
        return label
    return ABSTAIN

#===============MSA=======================
@labeling_function()
def msa_regex_lookup(x):    
    nonmsa_keywords = ['sow', 'statement of work', 'addendum', 'amendment', 'confidentiality agreement', 'disclosure agreement']
    match1 = re.search(r"(agreement agreement|master agreement|master services agreement|this agreement)\s+(\S+\s+){1,30}(by and between|by and among|between|among)(.+?) and (.+?)", x.text)
    match2 = re.search(r"(agreement agreement|master agreement|master services agreement|this agreement)\s+(\S+\s+){1,30}(effective)(.+?) and (.+?)", x.text)
    match3 = re.search(r"(agreement agreement|master agreement|master services agreement|this agreement)\s+(\S+\s+){1,30}(the undersigned)(.+?) and (.+?)", x.text)
        
    if (match1 and not(any(key in x.text[:match1.end()] for key in nonmsa_keywords))) \
        or (match2 and not(any(key in x.text[:match2.end()] for key in nonmsa_keywords))) \
        or (match3 and not(any(key in x.text[:match3.end()] for key in nonmsa_keywords))):
        return MSA
    return ABSTAIN

labl_functions.append(msa_regex_lookup)

msa_keywords = ['indemnified party', 'indemnifying party', 'force majeure', 'intellectual industrial', 
                'wk service provider', 'intellectual industrial property', 'industrial property right', 
                'privacy restricted data', 'prior written notice', 'force majeure event', 'subject matter hereof']

def make_keyword_lf_msa(keywords, label=MSA):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label))

for key in msa_keywords:
    labl_functions.append(make_keyword_lf_msa([key]))
    
 
#===============ADDENDUM===================
@labeling_function()
def addendum_regex_lookup(x):
    match1 = re.search(r"(addendum|amendment|change request|change order)\s+(\S+\s+){1,30}(by and between|by and among|between) (.+?) and (.+?)", x.text)
    match2 = re.search(r"(addendum|amendment)\s+(\S+\s+){1,30}(schedule a|effective) (.+?) and (.+?)", x.text)
    match3 = re.search(r"(addendum|amendment) (.+?) (the undersigned) (.+?) and (.+?)", x.text)
    
    if (match1 and match1.start() < 1000) or (match2 and match2.start() < 1000) or (match3 and match3.start() < 1000):
        return ADDENDUM
    return ABSTAIN

labl_functions.append(addendum_regex_lookup)

addendum_keywords = ['rom work', 'addendum number', 'addendum part', 'amendment part',
                     'term addendum', 'term amendment', 'addendum made entered',
                     'addendum entered', 'duration addendum', 
                     'purpose addendum', 
                     'addendum executed', 'subsequent addendum', 'amendment number', 
                     'amendment date', 'amendment entered', 'amendment made', 'amendment executed', 
                     'amendment effective date', 
                     'addendum may executed', 'effective date addendum', 
                     'amendment made entered', 
                     'agreement hereby amended', 'service agreement amendment']

def make_keyword_lf_addendum(keywords, label=ADDENDUM):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in addendum_keywords:
    labl_functions.append(make_keyword_lf_addendum([key]))

#===============SOW===================
@labeling_function()
def sow_regex_lookup(x):    
    nonsow_keywords = ['addendum','amendment']
    match1 = re.search(r"(sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(by and between|by and among|executed by|between|entered into)(.+?) and (.+?)", x.text)
    match2 = re.search(r"(sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(effective) (.+?) and (.+?)", x.text)
    match3 = re.search(r"(sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(the undersigned) (.+?) and (.+?)", x.text)
       
    if (match1 and match1.start() < 1000 and not(any(key in x.text[:match1.end()] for key in nonsow_keywords)) \
        or (match2 and match2.start() < 1000 and not(any(key not in x.text[:match2.end()] for key in nonsow_keywords))) \
        or match3 and match3.start() < 1000 and not(any(key not in x.text[:match3.end()] for key in nonsow_keywords))):
        return SOW
    return ABSTAIN

labl_functions.append(sow_regex_lookup)

sow_keywords = ['sow effective date', 'work sow', 'sow shall', 'sow term', 'service sow', 'defined sow', 
                'specified sow', 'outlined sow', 'addendum sow', 'client sow', 'sow agreement', 
                'statement work effective', 'sow end date', 'sow duration']

def make_keyword_lf_sow(keywords, label=SOW):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in sow_keywords:
    labl_functions.append(make_keyword_lf_sow([key]))

#===============NDA===================
@labeling_function()
def nda_regex_lookup(x):
    nda_keywords = ['mutual confidentiality', 'confidentiality agreement', 'disclosure agreement']
    match1 = re.search(r"(disclosure agreement|confidentiality agreement)\s+(\S+\s+){1,30}(by and between|by and among|between|among)(.+?) and (.+?)", x.text)
    
    if match1 and match1.start() < 1000 and any(key in x.text for key in nda_keywords):
        return NDA
    return ABSTAIN

labl_functions.append(nda_regex_lookup)

nda_keywords = ['mutual confidentiality', 'affiliated entity', 'agreement negotiation', 'disclosure hereunder', 
                'mutual confidentiality agreement', 'non confidential basis', 'confidential information agent', 
                'confidentiality non disclosure', 'party certain confidential information',
                'party desire disclose party', 'party wish protect','party furnish']

def make_keyword_lf_nda(keywords, label=NDA):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in nda_keywords:
    labl_functions.append(make_keyword_lf_nda([key]))
    

#===============OTHERS===================
@labeling_function()
def others_lookup(x):
    msa = msa_regex_lookup(x)
    sow = sow_regex_lookup(x)
    addendum = addendum_regex_lookup(x)
    nda = nda_regex_lookup(x)
    
    if msa == ABSTAIN and sow == ABSTAIN and addendum == ABSTAIN and nda == ABSTAIN:
        return OTHERS
    return ABSTAIN    
    
#labl_functions.append(others_lookup)

other_keywords = ['sir madam letter', 'letter inform', 'engagement letter', 'service order form',
                  'change request form', 'signature form', 'agreement service order', 'service component order', 
                  'term service order', 'component order']

def make_keyword_lf_others(keywords, label=OTHERS):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in other_keywords:
    labl_functions.append(make_keyword_lf_others([key]))
    
@labeling_function()
def others_keyword_lookup(x):
    if all(word not in x.text for word in list(set(msa_keywords + sow_keywords + nda_keywords + addendum_keywords))):
        return OTHERS
    return ABSTAIN

#labl_functions.append(others_keyword_lookup)

print(len(labl_functions))

74


### Apply Label Functions to Corpus

In [18]:
#Apply the label functions to the train and valid sets
applier = PandasLFApplier(lfs=labl_functions)
L_train = applier.apply(df=df_train)
L_dev = applier.apply(df=df_dev)
L_valid = applier.apply(df=df_valid)
L_unlabelled = applier.apply(df=df_unlabeled)

100%|██████████| 15164/15164 [00:43<00:00, 352.45it/s]
100%|██████████| 1112/1112 [00:04<00:00, 259.91it/s]
100%|██████████| 308/308 [00:00<00:00, 406.27it/s]
100%|██████████| 15472/15472 [00:44<00:00, 351.00it/s]


In [19]:
#Check the coverage of label functions on train set

from snorkel.labeling import LFAnalysis
LFAnalysis(L=L_train, lfs=labl_functions).lf_summary().sort_values(by='Coverage')

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
keyword_agreement hereby amended,34,[],0.0,0.0,0.0
keyword_addendum made entered,19,[],0.0,0.0,0.0
keyword_duration addendum,21,[],0.0,0.0,0.0
keyword_addendum may executed,31,[],0.0,0.0,0.0
keyword_amendment made entered,33,[],0.0,0.0,0.0
keyword_specified sow,43,[],0.0,0.0,0.0
keyword_addendum sow,45,[],0.0,0.0,0.0
keyword_outlined sow,44,[],0.0,0.0,0.0
keyword_party desire disclose party,61,[],0.0,0.0,0.0
keyword_party wish protect,62,[],0.0,0.0,0.0


In [20]:
#Check the Coverage and Accuracy of label functions on dev set

LFAnalysis(L=L_dev, lfs=labl_functions).lf_summary(y_dev).sort_values(by='Emp. Acc.')

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
keyword_component order,73,[],0.0,0.0,0.0,0,0,0.0
keyword_addendum entered,20,[],0.0,0.0,0.0,0,0,0.0
keyword_duration addendum,21,[],0.0,0.0,0.0,0,0,0.0
keyword_purpose addendum,22,[],0.0,0.0,0.0,0,0,0.0
keyword_addendum sow,45,[],0.0,0.0,0.0,0,0,0.0
keyword_outlined sow,44,[],0.0,0.0,0.0,0,0,0.0
keyword_specified sow,43,[],0.0,0.0,0.0,0,0,0.0
keyword_addendum made entered,19,[],0.0,0.0,0.0,0,0,0.0
keyword_defined sow,42,[],0.0,0.0,0.0,0,0,0.0
keyword_amendment made,28,[],0.0,0.0,0.0,0,0,0.0


### Label Model

In [24]:
from snorkel.labeling import LabelModel
label_model = LabelModel(cardinality=5, verbose=True)
#label_model.fit(L_train, n_epochs=500, lr=0.001, log_freq=50, seed=123)
label_model.fit(L_unlabelled, n_epochs=500, lr=0.001, log_freq=50, seed=123) #this is for gridsearchCV where train and valid split not required

In [23]:
label_model_acc = label_model.score(L=L_dev, Y=y_dev)["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

W1122 20:14:20.051949 139883909470016 label_model.py:501] Metrics calculated over data points with non-abstain labels only


Label Model Accuracy:     85.8%


In [25]:
probs_train = label_model.predict_proba(L_train)
probs_train

array([[2.00000000e-01, 2.00000000e-01, 2.00000000e-01, 2.00000000e-01,
        2.00000000e-01],
       [5.70167626e-02, 6.66162159e-01, 1.31370499e-01, 7.77928138e-02,
        6.76577652e-02],
       [2.00000000e-01, 2.00000000e-01, 2.00000000e-01, 2.00000000e-01,
        2.00000000e-01],
       ...,
       [4.12828122e-17, 9.99999994e-01, 1.02300347e-18, 8.91102701e-14,
        5.57330811e-09],
       [1.43905562e-02, 1.32814383e-02, 7.81295201e-03, 1.08649834e-02,
        9.53650070e-01],
       [9.01627765e-03, 1.79591228e-02, 4.73964305e-03, 6.88588797e-03,
        9.61399069e-01]])

In [26]:
probs_valid = label_model.predict_proba(L_valid)
probs_valid

array([[4.03753288e-05, 5.59768801e-03, 9.93979113e-01, 2.94126975e-04,
        8.86966786e-05],
       [2.00000000e-01, 2.00000000e-01, 2.00000000e-01, 2.00000000e-01,
        2.00000000e-01],
       [7.61536245e-04, 8.40472678e-02, 9.09991800e-01, 3.81547115e-03,
        1.38392484e-03],
       ...,
       [9.58499051e-01, 8.16754134e-04, 1.09649684e-02, 1.43933202e-02,
        1.53259061e-02],
       [1.11509370e-04, 6.22984135e-06, 3.79053377e-05, 7.72716881e-05,
        9.99767084e-01],
       [2.00000000e-01, 2.00000000e-01, 2.00000000e-01, 2.00000000e-01,
        2.00000000e-01]])

In [27]:
#this is for gridsearchCV where train and valid split not required
probs_full_train = label_model.predict_proba(L_unlabelled)
probs_full_train

array([[2.00000000e-01, 2.00000000e-01, 2.00000000e-01, 2.00000000e-01,
        2.00000000e-01],
       [5.70167626e-02, 6.66162159e-01, 1.31370499e-01, 7.77928138e-02,
        6.76577652e-02],
       [2.00000000e-01, 2.00000000e-01, 2.00000000e-01, 2.00000000e-01,
        2.00000000e-01],
       ...,
       [4.12828122e-17, 9.99999994e-01, 1.02300347e-18, 8.91102701e-14,
        5.57330811e-09],
       [1.43905562e-02, 1.32814383e-02, 7.81295201e-03, 1.08649834e-02,
        9.53650070e-01],
       [9.01627765e-03, 1.79591228e-02, 4.73964305e-03, 6.88588797e-03,
        9.61399069e-01]])

### Filter unlaabeled data if any

In [28]:
from snorkel.labeling import filter_unlabeled_dataframe

"""df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)"""

#this is for gridsearchCV where train and valid split not required
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_unlabeled, y=probs_full_train, L=L_unlabelled
)

In [29]:
print(df_train_filtered.shape, probs_train_filtered.shape)

(9027, 2) (9027, 5)


In [38]:
df_train_filtered = df_train_filtered.text.tolist()
y_train = probs_train_filtered
#df_valid = df_valid.text.tolist()
#y_valid = probs_valid

### Training

In [39]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_features = 150 # cut texts after this number of words

# prepare tokenizer
t = Tokenizer(num_words=max_features)
t.fit_on_texts(df_train_filtered)
#t.fit_on_texts(df_valid)
post_seq_train = t.texts_to_sequences(df_train_filtered)
#post_seq_valid = t.texts_to_sequences(df_valid)
X_train = pad_sequences(post_seq_train, maxlen=max_features, padding='post')
#X_valid = pad_sequences(post_seq_valid, maxlen=max_features, padding='post')

In [40]:
#print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)
print(X_train.shape, y_train.shape)

(9027, 150) (9027, 5)


In [41]:
word_index = t.word_index
len(word_index)

285742

In [42]:
#Import Embeddings and create embedding dict
embeddings_index = {}
f = open('../w2v_embedding/cms_word2vec_embedding_300.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [43]:
from keras.layers import Embedding
from keras.initializers import Constant

#Create embedding layer
EMBEDDING_DIM = 300
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_features,
                            trainable=False)

W1122 20:17:07.695820 139883909470016 deprecation_wrapper.py:119] From /home/inno/innovation/Shyam/venv_shyam/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:66: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.



from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=100)
X_train = vectorizer.fit_transform(df_train_filtered)

X_dev = vectorizer.transform(df_dev.text.tolist())
X_valid = vectorizer.transform(df_valid)
X_test = vectorizer.transform(df_test.text.tolist())

In [92]:
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Embedding, LSTM, Bidirectional
from utils import get_keras_early_stopping

batch_size = 32

inputs = Input(shape=(max_features,), dtype='int32')
embed = embedding_layer(inputs)
               
#model = Sequential()
#model.add(Embedding(max_features, 128, input_length=48)(embed))
#model.add(Bidirectional(LSTM(64)))
#model.add(Dropout(0.5))
#model.add(Dense(5, activation='softmax'))

x = Bidirectional(LSTM(64))(embed)
x = Dropout(0.5)(x)
x = Dense(5, activation='softmax')(x)
model = Model(inputs = inputs, outputs = x)

In [93]:
from keras.optimizers import Adam
lr = 0.01
epochs = 10
#decay=lr/epochs
adam = Adam(lr=lr)

model.compile(adam, 'categorical_crossentropy', metrics=['accuracy'])

In [94]:
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=[X_valid, y_valid],
          callbacks=[get_keras_early_stopping()])

Train on 8871 samples, validate on 288 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc6b6b08e10>

In [56]:
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Embedding, LSTM, Bidirectional
from utils import get_keras_early_stopping
from keras.optimizers import Adam

def create_model(lr=0.001):
    # create model
    model = Model()
    inputs = Input(shape=(max_features,), dtype='int32')
    embed = embedding_layer(inputs)
    x = Bidirectional(LSTM(64))(embed)
    x = Dropout(0.5)(x)
    x = Dense(5, activation='softmax')(x)
    model = Model(inputs = inputs, outputs = x)
    
    adam = Adam(lr=lr)
    
    # Compile model
    model.compile(adam, 'categorical_crossentropy', metrics=['accuracy'])
    return model

In [57]:
from keras.wrappers.scikit_learn import KerasClassifier
model = KerasClassifier(build_fn=create_model, batch_size=32, epochs=15, verbose=0)

In [58]:
from sklearn.model_selection import GridSearchCV

# define the grid search parameters
#batch_size = [10, 20, 40, 60]
#epochs = [10, 25, 50]
lr = [0.001, 0.01, 0.1]
param_grid = dict(lr=lr)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, cv=5,  verbose=10)
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] lr=0.001 ........................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
W1122 20:21:36.819225 139883909470016 deprecation_wrapper.py:119] From /home/inno/innovation/Shyam/venv_shyam/lib/python3.6/site-packages/keras/optimizers.py:793: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W1122 20:21:36.823359 139883909470016 deprecation_wrapper.py:119] From /home/inno/innovation/Shyam/venv_shyam/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3576: The name tf.log is deprecated. Please use tf.math.log instead.

W1122 20:21:36.894008 139883909470016 deprecation.py:323] From /home/inno/innovation/Shyam/venv_shyam/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


KeyboardInterrupt: 

In [None]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

### Predictions

In [141]:
text_pred = df_labeled.text.tolist()
t.fit_on_texts(text_pred)
post_seq_test = t.texts_to_sequences(text_pred)
x_test = pad_sequences(post_seq_test, maxlen=max_features, padding='post')
probs_test = gird.predict(x_test)

In [145]:
labels_test = df_labeled.label.map({'Addendum': 0, 'MSA': 1, 'SOW': 4, 'NDA': 2, 'Others': 3})
len(labels_test)

1397

In [146]:
from snorkel.analysis import metric_score
test_acc = metric_score(golds=labels_test, preds=probs_test.argmax(axis=1), metric="accuracy")
print(f"Test Accuracy: {test_acc * 100:.1f}%")

Test Accuracy: 45.8%


### Save Model

In [156]:
from keras.callbacks.callbacks import ModelCheckpoint
filepath = 'models/lstm.hdf5'
checkpointer = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)

### Save Model Probabilities and Labels

In [143]:
train_pred = df_unlabeled.text.tolist()
t.fit_on_texts(train_pred)
post_seq_test = t.texts_to_sequences(train_pred)
x_train = pad_sequences(post_seq_test, maxlen=max_features, padding='post')
probs_train = model.predict(x_train)

In [144]:
len(probs_train)

15472

In [147]:
labels_train = label_model.predict(L_unlabelled)
labels_train

array([3, 1, 3, ..., 1, 4, 4])

In [148]:
print(len(labels_train), len(labels_test))
print(len(probs_train), len(probs_test))

15472 1397
15472 1397


In [149]:
probs_latent_train = list(zip(probs_train.tolist(), labels_train.tolist()))
probs_latent_test = list(zip(probs_test.tolist(), labels_test.tolist()))
probs_latent_test

[([0.2624940872192383,
   0.27328163385391235,
   0.08967146277427673,
   0.10267908126115799,
   0.27187368273735046],
  0),
 ([0.03332483023405075,
   0.058378417044878006,
   0.010397524572908878,
   0.016984988003969193,
   0.8809142112731934],
  0),
 ([0.08453257381916046,
   0.27450767159461975,
   0.11299096047878265,
   0.2956554889678955,
   0.2323133945465088],
  0),
 ([0.060679882764816284,
   0.22661525011062622,
   0.11968036741018295,
   0.2584843039512634,
   0.33454015851020813],
  0),
 ([0.18283280730247498,
   0.09910684078931808,
   0.08421888947486877,
   0.23788650333881378,
   0.3959549367427826],
  0),
 ([0.14197483658790588,
   0.40013015270233154,
   0.06504497677087784,
   0.21624170243740082,
   0.17660833895206451],
  0),
 ([0.13794468343257904,
   0.23213954269886017,
   0.15442854166030884,
   0.23267491161823273,
   0.24281232059001923],
  0),
 ([0.11946573108434677,
   0.46066513657569885,
   0.07901173084974289,
   0.21892142295837402,
   0.121935918927

In [153]:
df_probs_train = pd.DataFrame(probs_latent_train, columns=['probabilities', 'label'])
df_probs_test = pd.DataFrame(probs_latent_test, columns=['probabilities', 'label'])

In [154]:
df_probs_test.head()

Unnamed: 0,probabilities,label
0,"[0.2624940872192383, 0.27328163385391235, 0.08...",0
1,"[0.03332483023405075, 0.058378417044878006, 0....",0
2,"[0.08453257381916046, 0.27450767159461975, 0.1...",0
3,"[0.060679882764816284, 0.22661525011062622, 0....",0
4,"[0.18283280730247498, 0.09910684078931808, 0.0...",0


In [155]:
df_probs_train.to_csv('probabilities_train.csv', index = None)
df_probs_test.to_csv('probabilities_test.csv', index = None)