In [4]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
!pip install --quiet snorkel
!pip install --quiet tqdm==4.36.1

Collecting snorkel
[?25l  Downloading https://files.pythonhosted.org/packages/ff/f4/a0e419793075b737f1e86a1642d676dfdb17859887542afa06915136c231/snorkel-0.9.3-py3-none-any.whl (139kB)
[K     |██▍                             | 10kB 20.4MB/s eta 0:00:01[K     |████▊                           | 20kB 4.3MB/s eta 0:00:01[K     |███████                         | 30kB 6.1MB/s eta 0:00:01[K     |█████████▍                      | 40kB 7.8MB/s eta 0:00:01[K     |███████████▊                    | 51kB 5.0MB/s eta 0:00:01[K     |██████████████                  | 61kB 5.2MB/s eta 0:00:01[K     |████████████████▍               | 71kB 5.9MB/s eta 0:00:01[K     |██████████████████▉             | 81kB 6.6MB/s eta 0:00:01[K     |█████████████████████▏          | 92kB 7.1MB/s eta 0:00:01[K     |███████████████████████▌        | 102kB 6.6MB/s eta 0:00:01[K     |█████████████████████████▉      | 112kB 6.6MB/s eta 0:00:01[K     |████████████████████████████▏   | 122kB 6.6MB/s eta 0:

Collecting tqdm==4.36.1
[?25l  Downloading https://files.pythonhosted.org/packages/e1/c1/bc1dba38b48f4ae3c4428aea669c5e27bd5a7642a74c8348451e0bd8ff86/tqdm-4.36.1-py2.py3-none-any.whl (52kB)
[K     |██████▏                         | 10kB 24.7MB/s eta 0:00:01[K     |████████████▍                   | 20kB 4.2MB/s eta 0:00:01[K     |██████████████████▋             | 30kB 6.0MB/s eta 0:00:01[K     |████████████████████████▉       | 40kB 7.6MB/s eta 0:00:01[K     |███████████████████████████████ | 51kB 4.9MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 3.5MB/s 
[?25hInstalling collected packages: tqdm
  Found existing installation: tqdm 4.39.0
    Uninstalling tqdm-4.39.0:
      Successfully uninstalled tqdm-4.39.0
Successfully installed tqdm-4.36.1


In [1]:
import os
import re
import pandas as pd
import numpy as np
import tensorflow as tf

In [0]:
pd.options.display.max_rows = 4000

### Preprocess

In [0]:
def get_text_start_pos(text):
    pos = 0
    match1 = re.search(r"(addendum|amendment|change request|change order|agreement|sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(by and between|by and among|between|among) (.+?) and (.+?)", text)
    match2 = re.search(r"(addendum|amendment|change request|change order|agreement|sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(effective|dated|entered|executed|made) (.+?) and (.+?)", text)
    match3 = re.search(r"(addendum|amendment|change request|change order|agreement|sow|statement of work|work order|task order)(.+?)(the undersigned)(.+?) and (.+?)", text)
    if match1 and match1.start() < 1000:
        pos = match1.start()
    elif match2 and match2.start() < 1000:
        pos = match2.start()
    elif match3 and match3.start() < 1000:
        pos = match3.start()
    return pos

In [0]:
def preprocess(text):
    #Preprocess                
    text = text.replace('\n',' ').lower()
    
    #Remove non-alpha characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    pos = get_text_start_pos(text)
    text = text[pos:]
    #Remove articles
    #articles = ('a', 'an', 'the')
    #text = ' '.join([t for t in text.split() if t not in articles])

    return text

### Import Labeled Data

In [17]:
!unzip '/content/drive/My Drive/labeled_data_new1.zip'
!unzip '/content/drive/My Drive/unlabeled_data_new1.zip'
!unzip '/content/drive/My Drive/cms_word2vec_embedding_300.zip'

Archive:  /content/drive/My Drive/cms_word2vec_embedding_300.zip
  inflating: cms_word2vec_embedding_300.txt  


In [18]:
df_labeled = pd.read_csv('labeled_data_new1.csv') #without removing stop words 
#df_labeled = pd.read_csv('labeled_data.csv') #with removing stop words
df_labeled.head()
print(df_labeled.shape)

(1397, 3)


In [19]:
df_labeled.label.value_counts()

MSA         463
Addendum    314
Others      250
SOW         236
NDA         134
Name: label, dtype: int64

In [0]:
#Split labelled data into test and dev sets
import numpy as np
msk = np.random.rand(len(df_labeled)) < 0.8

df_dev = df_labeled[msk]
df_test = df_labeled[~msk]

In [21]:
print(df_test.shape, df_dev.shape)

(260, 3) (1137, 3)


In [22]:
df_dev.label.value_counts()

MSA         374
Addendum    255
Others      202
SOW         195
NDA         111
Name: label, dtype: int64

In [23]:
y_test = df_test.label.map({'Addendum': 0, 'MSA': 1, 'SOW': 4, 'NDA': 2, 'Others': 3})
y_test = np.array(y_test)
np.bincount(y_test)

array([59, 89, 23, 48, 41])

In [24]:
y_dev = df_dev.label.map({'Addendum': 0, 'MSA': 1, 'SOW': 4, 'NDA': 2, 'Others': 3})
y_dev = np.array(y_dev)
np.bincount(y_dev)

array([255, 374, 111, 202, 195])

In [25]:
df_test.drop('label', axis=1, inplace=True)
df_dev.drop('label', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


### Import Unlabeled Data

In [26]:
df_unlabeled = pd.read_csv('unlabeled_data_new1.csv') #without removing stop words
#df_unlabeled = pd.read_csv('unlabeled_data.csv') #with removing stop words
df_unlabeled.shape

(15472, 2)

In [27]:
df_unlabeled.tail()

Unnamed: 0,filename,text
15467,D16492.pdf.out.html.txt,ert oep ik ie scope of work corporate mobilit...
15468,D19966.pdf.out.html.txt,w f bof hl lai b statement of work for gss in...
15469,D26253.pdf.out.html.txt,statement of work wolters kluwer united states...
15470,D37991.pdf.out.html.txt,statement of work including all exhibits attac...
15471,D01957.pdf.out.html.txt,statement of work no sow services definitions...


In [0]:
#Split unlabelled data into train and valid sets
import numpy as np
msk = np.random.rand(len(df_unlabeled)) < 0.02

df_train = df_unlabeled[~msk]
df_valid = df_unlabeled[msk]

In [29]:
print(df_train.shape, df_valid.shape)

(15171, 2) (301, 2)


### Labeling Functions

In [30]:
from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.labeling import PandasLFApplier

ABSTAIN = -1
MSA = 1
SOW = 4
ADDENDUM = 0
NDA = 2
OTHERS = 3

labl_functions = []

def keyword_lookup(x, keywords, label):
    if any(word in x.text.lower() for word in keywords):
        return label
    return ABSTAIN

#===============MSA=======================
@labeling_function()
def msa_regex_lookup(x):    
    nonmsa_keywords = ['sow', 'statement of work', 'addendum', 'amendment', 'confidentiality agreement', 'disclosure agreement']
    match1 = re.search(r"(agreement agreement|master agreement|master services agreement|this agreement)\s+(\S+\s+){1,30}(by and between|by and among|between|among)(.+?) and (.+?)", x.text)
    match2 = re.search(r"(agreement agreement|master agreement|master services agreement|this agreement)\s+(\S+\s+){1,30}(effective)(.+?) and (.+?)", x.text)
    match3 = re.search(r"(agreement agreement|master agreement|master services agreement|this agreement)\s+(\S+\s+){1,30}(the undersigned)(.+?) and (.+?)", x.text)
        
    if (match1 and not(any(key in x.text[:match1.end()] for key in nonmsa_keywords))) \
        or (match2 and not(any(key in x.text[:match2.end()] for key in nonmsa_keywords))) \
        or (match3 and not(any(key in x.text[:match3.end()] for key in nonmsa_keywords))):
        return MSA
    return ABSTAIN

labl_functions.append(msa_regex_lookup)

msa_keywords = ['indemnified party', 'indemnifying party', 'force majeure', 'intellectual industrial', 
                'wk service provider', 'intellectual industrial property', 'industrial property right', 
                'privacy restricted data', 'prior written notice', 'force majeure event', 'subject matter hereof']

def make_keyword_lf_msa(keywords, label=MSA):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label))

for key in msa_keywords:
    labl_functions.append(make_keyword_lf_msa([key]))
    
 
#===============ADDENDUM===================
@labeling_function()
def addendum_regex_lookup(x):
    match1 = re.search(r"(addendum|amendment|change request|change order)\s+(\S+\s+){1,30}(by and between|by and among|between) (.+?) and (.+?)", x.text)
    match2 = re.search(r"(addendum|amendment)\s+(\S+\s+){1,30}(schedule a|effective) (.+?) and (.+?)", x.text)
    match3 = re.search(r"(addendum|amendment) (.+?) (the undersigned) (.+?) and (.+?)", x.text)
    
    if (match1 and match1.start() < 1000) or (match2 and match2.start() < 1000) or (match3 and match3.start() < 1000):
        return ADDENDUM
    return ABSTAIN

labl_functions.append(addendum_regex_lookup)

addendum_keywords = ['addendum number', 'addendum part', 'amendment part', 'term addendum', 'term amendment', 'addendum made entered',
                     'addendum entered', 'duration of addendum', 'purpose of addendum', 'addendum executed', 'subsequent addendum', 'amendment number', 
                     'amendment date', 'amendment entered', 'amendment made', 'amendment executed',  'amendment effective date', 
                     'addendum may executed', 'effective date addendum', 'agreement hereby amended', 'service agreement amendment']

def make_keyword_lf_addendum(keywords, label=ADDENDUM):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in addendum_keywords:
    labl_functions.append(make_keyword_lf_addendum([key]))

#===============SOW===================
@labeling_function()
def sow_regex_lookup(x):    
    nonsow_keywords = ['addendum','amendment']
    match1 = re.search(r"(sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(by and between|by and among|executed by|between|entered into)(.+?) and (.+?)", x.text)
    match2 = re.search(r"(sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(effective) (.+?) and (.+?)", x.text)
    match3 = re.search(r"(sow|statement of work|work order|task order)\s+(\S+\s+){1,30}(the undersigned) (.+?) and (.+?)", x.text)
       
    if (match1 and match1.start() < 1000 and not(any(key in x.text[:match1.end()] for key in nonsow_keywords)) \
        or (match2 and match2.start() < 1000 and not(any(key not in x.text[:match2.end()] for key in nonsow_keywords))) \
        or match3 and match3.start() < 1000 and not(any(key not in x.text[:match3.end()] for key in nonsow_keywords))):
        return SOW
    return ABSTAIN

labl_functions.append(sow_regex_lookup)

sow_keywords = ['sow effective date', 'work sow', 'sow shall', 'sow term', 'service sow', 'defined sow', 
                'specified sow', 'outlined sow', 'addendum sow', 'client sow', 'sow agreement', 
                'statement work effective', 'sow end date', 'sow duration']

def make_keyword_lf_sow(keywords, label=SOW):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in sow_keywords:
    labl_functions.append(make_keyword_lf_sow([key]))

#===============NDA===================
@labeling_function()
def nda_regex_lookup(x):
    nda_keywords = ['mutual confidentiality', 'confidentiality agreement', 'disclosure agreement']
    match1 = re.search(r"(disclosure agreement|confidentiality agreement)\s+(\S+\s+){1,30}(by and between|by and among|between|among)(.+?) and (.+?)", x.text)
    
    if match1 and match1.start() < 1000 and any(key in x.text for key in nda_keywords):
        return NDA
    return ABSTAIN

labl_functions.append(nda_regex_lookup)

nda_keywords = ['mutual confidentiality', 'affiliated entity', 'agreement negotiation', 'disclosure hereunder', 
                'mutual confidentiality agreement', 'non confidential basis', 'confidential information agent', 
                'confidentiality non disclosure', 'party certain confidential information',
                'party desire disclose party', 'party wish protect','party furnish']

def make_keyword_lf_nda(keywords, label=NDA):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in nda_keywords:
    labl_functions.append(make_keyword_lf_nda([key]))
    

#===============OTHERS===================
@labeling_function()
def others_lookup(x):
    msa = msa_regex_lookup(x)
    sow = sow_regex_lookup(x)
    addendum = addendum_regex_lookup(x)
    nda = nda_regex_lookup(x)
    
    if msa == ABSTAIN and sow == ABSTAIN and addendum == ABSTAIN and nda == ABSTAIN:
        return OTHERS
    return ABSTAIN    
    
labl_functions.append(others_lookup)

other_keywords = ['sir madam letter', 'letter inform', 'engagement letter', 'service order form',
                  'change request form', 'signature form', 'agreement service order', 'service component order', 
                  'term service order', 'component order']

def make_keyword_lf_others(keywords, label=OTHERS):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )

for key in other_keywords:
    labl_functions.append(make_keyword_lf_others([key]))
    
@labeling_function()
def others_keyword_lookup(x):
    if all(word not in x.text for word in list(set(msa_keywords + sow_keywords + nda_keywords + addendum_keywords))):
        return OTHERS
    return ABSTAIN

labl_functions.append(others_keyword_lookup)

print(len(labl_functions))

74


### Apply Label Functions to Corpus

In [31]:
#Apply the label functions to the train and valid sets
applier = PandasLFApplier(lfs=labl_functions)
L_train = applier.apply(df=df_train)
L_dev = applier.apply(df=df_dev)
L_valid = applier.apply(df=df_valid)
#L_unlabelled = applier.apply(df=df_unlabeled)

100%|██████████| 15171/15171 [02:09<00:00, 117.05it/s]
100%|██████████| 1137/1137 [00:12<00:00, 94.37it/s]
100%|██████████| 301/301 [00:02<00:00, 130.29it/s]


In [0]:
#Check the coverage of label functions on train set

from snorkel.labeling import LFAnalysis
LFAnalysis(L=L_train, lfs=labl_functions).lf_summary().sort_values(by='Coverage')

In [0]:
#Check the Coverage and Accuracy of label functions on dev set

LFAnalysis(L=L_dev, lfs=labl_functions).lf_summary(y_dev).sort_values(by='Emp. Acc.')

### Label Model

In [0]:
from snorkel.labeling import LabelModel
label_model = LabelModel(cardinality=5, verbose=True)
label_model.fit(L_train, n_epochs=500, lr=0.001, log_freq=50, seed=123)
#label_model.fit(L_unlabelled, n_epochs=500, lr=0.001, log_freq=50, seed=123) #this is for gridsearchCV where train and valid split not required

In [33]:
label_model_acc = label_model.score(L=L_dev, Y=y_dev)["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")



Label Model Accuracy:     83.6%


In [34]:
probs_train = label_model.predict_proba(L_train)
probs_train

array([[6.78470509e-02, 4.89630438e-02, 6.98495708e-02, 7.45707181e-01,
        6.76331539e-02],
       [8.44067062e-02, 6.76091216e-01, 1.58283656e-01, 3.69052465e-05,
        8.11815165e-02],
       [6.78470509e-02, 4.89630438e-02, 6.98495708e-02, 7.45707181e-01,
        6.76331539e-02],
       ...,
       [3.78049358e-27, 9.99999998e-01, 3.86702493e-21, 9.61056660e-44,
        1.81553261e-09],
       [1.22401894e-02, 1.53024005e-02, 6.40558863e-03, 1.27805935e-09,
        9.66051820e-01],
       [5.25931912e-03, 1.75839629e-02, 3.84083561e-03, 4.34887761e-08,
        9.73315839e-01]])

In [35]:
probs_valid = label_model.predict_proba(L_valid)
probs_valid

array([[6.78470509e-02, 4.89630438e-02, 6.98495708e-02, 7.45707181e-01,
        6.76331539e-02],
       [8.44067062e-02, 6.76091216e-01, 1.58283656e-01, 3.69052465e-05,
        8.11815165e-02],
       [1.27685866e-04, 9.98903056e-01, 4.48491718e-04, 6.92798289e-15,
        5.20766443e-04],
       ...,
       [7.50650196e-05, 2.94037627e-05, 2.56862790e-05, 1.07155980e-17,
        9.99869845e-01],
       [6.78470509e-02, 4.89630438e-02, 6.98495708e-02, 7.45707181e-01,
        6.76331539e-02],
       [1.13068237e-01, 8.75640832e-02, 6.52106213e-02, 6.38024269e-05,
        7.34093256e-01]])

In [0]:
#this is for gridsearchCV where train and valid split not required
#probs_full_train = label_model.predict_proba(L_unlabelled)
#probs_full_train

### Filter unlaabeled data if any

In [36]:
from snorkel.labeling import filter_unlabeled_dataframe

df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)

#this is for gridsearchCV where train and valid split not required
"""df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_unlabeled, y=probs_full_train, L=L_unlabelled
)"""

'df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(\n    X=df_unlabeled, y=probs_full_train, L=L_unlabelled\n)'

In [37]:
print(df_train_filtered.shape, probs_train_filtered.shape)

(15171, 2) (15171, 5)


In [0]:
df_train_filtered = df_train_filtered.text.tolist()
y_train = probs_train_filtered
df_valid = df_valid.text.tolist()
y_valid = probs_valid

### Training

In [39]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_features = 150 # cut texts after this number of words

# prepare tokenizer
t = Tokenizer(num_words=20000)
t.fit_on_texts(df_train_filtered)
t.fit_on_texts(df_valid)
"""post_seq_train = t.texts_to_sequences(df_train_filtered)
post_seq_valid = t.texts_to_sequences(df_valid)
X_train = pad_sequences(post_seq_train, maxlen=max_features, padding='post')
X_valid = pad_sequences(post_seq_valid, maxlen=max_features, padding='post')"""

Using TensorFlow backend.


"post_seq_train = t.texts_to_sequences(df_train_filtered)\npost_seq_valid = t.texts_to_sequences(df_valid)\nX_train = pad_sequences(post_seq_train, maxlen=max_features, padding='post')\nX_valid = pad_sequences(post_seq_valid, maxlen=max_features, padding='post')"

In [0]:
#print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)
#print(X_train.shape, y_train.shape)

(13941, 150) (13941, 5) (327, 150) (327, 5)


In [40]:
word_index = t.word_index
len(word_index)

401227

In [0]:
#Import Embeddings and create embedding dict
embeddings_index = {}
f = open('cms_word2vec_embedding_300.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [0]:
from keras.layers import Embedding
from keras.initializers import Constant

#Create embedding layer
EMBEDDING_DIM = 300
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
"""embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_features,
                            trainable=False)"""

'embedding_layer = Embedding(num_words,\n                            EMBEDDING_DIM,\n                            embeddings_initializer=Constant(embedding_matrix),\n                            input_length=max_features,\n                            trainable=False)'

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=100)
X_train = vectorizer.fit_transform(df_train_filtered)

X_dev = vectorizer.transform(df_dev.text.tolist())
X_valid = vectorizer.transform(df_valid)
X_test = vectorizer.transform(df_labeled.text.tolist())

In [0]:
def get_keras_early_stopping(patience=10, monitor="val_acc"):
    """Stops training if monitor value doesn't exceed the current max value after patience num of epochs"""
    return tf.keras.callbacks.EarlyStopping(
        monitor=monitor, patience=patience, verbose=1, restore_best_weights=True
    )

In [0]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
#from utils import get_keras_early_stopping

batch_size = 32

               
model = Sequential()
#model.add(Embedding(len(word_index)+1, 300, weights=[embedding_matrix], input_length=max_features, trainable=False))
model.add(Embedding(len(word_index)+1, 300, input_length=100))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.5))
model.add(Dense(5, activation='softmax'))

In [44]:
from keras.optimizers import Adam
lr = 0.001
epochs = 25
#decay=lr/epochs
adam = Adam(lr=lr)

model.compile(adam, 'categorical_crossentropy', metrics=['accuracy'])













In [0]:
from keras.callbacks import ModelCheckpoint
filepath="lstm-best-model.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=-1, save_best_only=True, mode='max')

In [46]:
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=[X_valid, y_valid],
          callbacks=[get_keras_early_stopping(),checkpoint])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where











  num_elements)









Train on 15171 samples, validate on 301 samples
Epoch 1/25






























Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f54d81c0da0>

### GridSearchCV

In [0]:
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Embedding, LSTM, Bidirectional
#from utils import get_keras_early_stopping
from keras.optimizers import Adam

def create_model(lr=0.001):
    # create model
    model = Model()
    inputs = Input(shape=(max_features,), dtype='int32')
    embed = embedding_layer(inputs)
    x = Bidirectional(LSTM(64))(embed)
    x = Dropout(0.5)(x)
    x = Dense(5, activation='softmax')(x)
    model = Model(inputs = inputs, outputs = x)
    
    adam = Adam(lr=lr)
    
    # Compile model
    model.compile(adam, 'categorical_crossentropy', metrics=['accuracy'])
    return model

In [0]:
from keras.wrappers.scikit_learn import KerasClassifier
model = KerasClassifier(build_fn=create_model, batch_size=32, epochs=15, verbose=0)

In [0]:
from sklearn.model_selection import GridSearchCV

# define the grid search parameters
#batch_size = [10, 20, 40, 60]
#epochs = [10, 25, 50]
lr = [0.001, 0.01, 0.1]
param_grid = dict(lr=lr)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, cv=5,  verbose=10)
grid_result = grid.fit(X_train, y_train)

In [0]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

### Predictions

In [0]:
# load weights
model.load_weights("lstm-best-model.hdf5")

In [0]:
"""text_pred = df_labeled.text.tolist()
t.fit_on_texts(text_pred)
post_seq_test = t.texts_to_sequences(text_pred)
x_test = pad_sequences(post_seq_test, maxlen=max_features, padding='post')"""
probs_test = model.predict(X_test)

In [73]:
labels_test = df_labeled.label.map({'Addendum': 0, 'MSA': 1, 'SOW': 4, 'NDA': 2, 'Others': 3})
len(labels_test)

1397

In [74]:
from snorkel.analysis import metric_score
test_acc = metric_score(golds=labels_test, preds=probs_test.argmax(axis=1), metric="accuracy")
print(f"Test Accuracy: {test_acc * 100:.1f}%")

Test Accuracy: 78.5%


In [0]:
!cp lstm-best-model.hdf5 '/content/drive/My Drive/lstm-79.6-90.3-95.1.hdf5'

### Save Model Probabilities and Labels

In [0]:
train_pred = df_unlabeled.text.tolist()
t.fit_on_texts(train_pred)
post_seq_test = t.texts_to_sequences(train_pred)
x_train = pad_sequences(post_seq_test, maxlen=max_features, padding='post')
probs_train = model.predict(x_train)

In [0]:
x_train = vectorizer.transform(df_unlabeled.text.tolist())
probs_train = model.predict(x_train)

In [57]:
len(probs_train)

15472

In [58]:
L_unlabelled = applier.apply(df=df_unlabeled)

100%|██████████| 15472/15472 [02:14<00:00, 115.26it/s]


In [59]:
labels_train = label_model.predict(L_unlabelled)
labels_train

array([3, 1, 3, ..., 1, 4, 4])

In [60]:
labels_test = df_labeled.label.map({'Addendum': 0, 'MSA': 1, 'SOW': 4, 'NDA': 2, 'Others': 3})
len(labels_test)

1397

In [75]:
print(len(labels_train), len(labels_test))
print(len(probs_train), len(probs_test))

15472 1397
15472 1397


In [0]:
probs_latent_train = list(zip(probs_train.tolist(), labels_train.tolist()))
probs_latent_test = list(zip(probs_test.tolist(), labels_test.tolist()))
#probs_latent_test

In [0]:
df_probs_train = pd.DataFrame(probs_latent_train, columns=['probabilities', 'label'])
df_probs_test = pd.DataFrame(probs_latent_test, columns=['probabilities', 'label'])

In [78]:
df_probs_test.head()

Unnamed: 0,probabilities,label
0,"[0.888213038444519, 0.011493748985230923, 0.03...",0
1,"[0.8944848775863647, 0.016249872744083405, 0.0...",0
2,"[0.29428383708000183, 0.06875303387641907, 0.0...",0
3,"[0.9346663355827332, 0.004241131711751223, 0.0...",0
4,"[0.10538450628519058, 0.10119219869375229, 0.0...",0


In [0]:
df_probs_train.to_csv('/content/drive/My Drive/lstm_79.6_probabilities_train.csv', index = None)
df_probs_test.to_csv('/content/drive/My Drive/lstm_79.6_probabilities_test.csv', index = None)