# Keras Word Embeddings

In [2]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported code take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [1]:
from keras.models import load_model
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Embedding, Masking
from keras.optimizers import Adam
from keras.utils import Sequence
from keras.preprocessing.text import Tokenizer

from sklearn.utils import shuffle

from IPython.display import HTML

from itertools import chain
from keras.utils import plot_model
import numpy as np
import pandas as pd
import random
import json
import re

# Seed value for the global random number generators used in this notebook.
RANDOM_STATE = 50
# NOTE(review): not referenced in the cells visible here; presumably the share
# of sequences assigned to the training split -- confirm against the helper
# that builds the split.
TRAIN_FRACTION = 0.7

# Apply the seed once, up front, so that a Restart & Run All starts every
# draw from Python's and NumPy's global generators from the same state.
# NOTE(review): standalone Keras is understood to derive its initializer and
# dropout seeds from NumPy's global generator; full determinism may still need
# backend-level seeding -- confirm for the installed versions.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

Using TensorFlow backend.


In [4]:
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import HTML

# Show the value of every top-level expression in a cell, not just the last
# one; later cells rely on this to display several results at once.
InteractiveShell.ast_node_interactivity = 'all'

import warnings
# Suppress RuntimeWarning and UserWarning messages for the rest of the session.
warnings.filterwarnings('ignore', category = RuntimeWarning)
warnings.filterwarnings('ignore', category = UserWarning)

import pandas as pd
import numpy as np
from utils import get_data, get_embeddings, find_closest

# Fetch Training Data

* Uses the abstracts of patents returned by a patent search for "neural network"
* More than 3,000 patents in total


In [5]:
# Load the raw patent search results into a DataFrame and preview the first
# five rows.
data = pd.read_csv(filepath_or_buffer='../data/neural_network_patent_query.csv')
data.head(n=5)

Unnamed: 0,patent_abstract,patent_date,patent_number,patent_title
0,""" A """"Barometer"""" Neuron enhances stability in...",1996-07-09,5535303,"""""""Barometer"""" neuron for a neural network"""
1,""" This invention is a novel high-speed neural ...",1993-10-19,5255349,"""Electronic neural network for solving """"trave..."
2,An optical information processor for use as a ...,1995-01-17,5383042,3 layer liquid crystal neural network with out...
3,A method and system for intelligent control of...,2001-01-02,6169981,3-brain architecture for an intelligent decisi...
4,A method and system for intelligent control of...,2003-06-17,6581048,3-brain architecture for an intelligent decisi...


In [6]:
# Turn the patent abstracts into training data using feature sequences of
# length 50; the helper also returns the word <-> integer lookups and the
# integer-encoded sequences.
training_dict, word_idx, idx_word, sequences = get_data(
    '../data/neural_network_patent_query.csv',
    training_len=50,
)

There are 16192 unique words.
There are 318563 sequences.


* Sequences of text are represented as integers
    * `word_idx` maps words to integers
    * `idx_word` maps integers to words
* Features are integer sequences of length 50
* Label is next word in sequence
* Labels are one-hot encoded

In [7]:
# Peek at the first two feature rows and their label rows. Both lines are bare
# top-level expressions, so both are displayed only when
# InteractiveShell.ast_node_interactivity is set to 'all'.
training_dict['X_train'][:2]
training_dict['y_train'][:2]

array([[  117,     7,   141,   277,     4,    18,    81,   110,    10,
          219,    29,     1,   952,  2453,    19,     5,     6,     1,
          117,    10,   182,  2166,    21,     1,    81,   178,     4,
           13,   117,   894,    14,  6163,     7,   302,     1,     9,
            8,    29,    33,    23,    74,   428,     7,   692,     1,
           81,   183,     4,    13,   117],
       [    6,    41,     2,    87,     3,  1340,    79,     7,     1,
          409,   543,    22,   484,     6,     2,  2113,   728,    24,
            1,   178,     3,     1,  1820,    55,    14, 13942,  7240,
          244,     5,    14, 13943,  7240,   244,     5,     2,  2113,
         7240,   244,     5,     2,    38,  9292,   244,     2,    49,
         9292,   244,    14,    22, 13944]])

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)

In [8]:
# Translate the first two training examples back into words to sanity-check
# the integer encoding: the feature row is decoded token by token, the label
# via the position of the largest entry in its label row.
for row, encoded in enumerate(training_dict['X_train'][:2]):
    decoded = ' '.join(idx_word[token] for token in encoded)
    print('Features: ' + decoded + '\n')

    target_word = idx_word[np.argmax(training_dict['y_train'][row])]
    print('Label: ' + target_word + '\n')
    

Features: user to provide samples . A recognition operation is performed on the user's handwritten input , and the user is not satisfied with the recognition result . The user selects an option to train the neural network on one or more characters to improve the recognition results . The user

Label: is

Features: and includes a number of amplifiers corresponding to the N bit output sum and a carry generation from the result of the adding process an augend input-synapse group , an addend input-synapse group , a carry input-synapse group , a first bias-synapse group a second bias-synapse group an output feedback-synapse

Label: group



# Make Recurrent Neural Network

* Embedding dimension = 100
* 64 LSTM cells in one layer
    * Dropout and recurrent dropout for regularization
* Fully connected layer with 64 units on top of the LSTM
    * 'relu' activation
* Dropout (rate 0.5) for regularization
* Output layer produces a probability for every word in the vocabulary
    * 'softmax' activation
* Adam optimizer with defaults
* Categorical cross entropy loss
* Monitor accuracy

In [9]:
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Embedding, Masking, Bidirectional
from keras.optimizers import Adam

from keras.utils import plot_model

In [10]:
# Declare the whole layer stack in one expression; layers are added in the
# order listed.
model = Sequential([
    # Trainable 100-dimensional embedding, learned from scratch (no initial
    # weights supplied).
    Embedding(
        input_dim=len(word_idx) + 1,
        output_dim=100,
        weights=None,
        trainable=True),
    # Single recurrent layer that returns only its final output, with input
    # and recurrent dropout.
    LSTM(64, return_sequences=False, dropout=0.1, recurrent_dropout=0.1),
    # Fully connected layer followed by dropout for regularization.
    Dense(64, activation='relu'),
    Dropout(0.5),
    # One softmax output unit per embedding index.
    Dense(len(word_idx) + 1, activation='softmax'),
])

# Configure training, then print the layer-by-layer summary.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         1619200   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16192)             1052480   
Total params: 2,718,080
Trainable params: 2,718,080
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Fit for five epochs in batches of 2048, scoring the validation arrays after
# each epoch; the object returned by fit() is kept in `h`.
h = model.fit(
    x=training_dict['X_train'],
    y=training_dict['y_train'],
    batch_size=2048,
    epochs=5,
    verbose=1,
    validation_data=(training_dict['X_valid'], training_dict['y_valid']),
)

Train on 222994 samples, validate on 95569 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
# Report loss and accuracy on the training arrays, then on the validation
# arrays. The evaluate() calls are left as bare top-level expressions so the
# notebook displays their return values.
print('Model Performance: Log Loss and Accuracy on training data')
model.evaluate(
    x=training_dict['X_train'],
    y=training_dict['y_train'],
    batch_size=2048,
)

print('\nModel Performance: Log Loss and Accuracy on validation data')
model.evaluate(
    x=training_dict['X_valid'],
    y=training_dict['y_valid'],
    batch_size=2048,
)

Model Performance: Log Loss and Accuracy on training data


[5.924870105172796, 0.09993093985607225]


Model Performance: Log Loss and Accuracy on validation data


[6.064511908455439, 0.10173801127474089]

# Inspect Embeddings

As a final piece of model inspection, we can look at the embeddings and find the words closest to a query word in the embedding space. This gives us an idea of what the network has learned.

In [13]:
# Pull the embeddings out of the trained model via the project helper and
# display their shape (rows x embedding dimension).
embeddings = get_embeddings(model)
embeddings.shape

(16192, 100)

Each word in the vocabulary is now represented as a 100-dimensional vector. This could be reduced to 2 or 3 dimensions for visualization. It can also be used to find the closest word to a query word.

In [14]:
# List the words whose embeddings have the highest cosine similarity to the
# embedding of 'network'.
find_closest('network', embeddings, word_idx, idx_word)

Query: network

Word: network         Cosine Similarity: 1.0
Word: neural          Cosine Similarity: 0.785099983215332
Word: based           Cosine Similarity: 0.758899986743927
Word: so              Cosine Similarity: 0.7515000104904175
Word: corresponding   Cosine Similarity: 0.7384999990463257
Word: associated      Cosine Similarity: 0.7196999788284302
Word: signal          Cosine Similarity: 0.715499997138977
Word: order           Cosine Similarity: 0.704200029373169
Word: value           Cosine Similarity: 0.7010999917984009
Word: networks        Cosine Similarity: 0.7002999782562256


In [15]:
# Same nearest-neighbour lookup for a second query word, 'web'.
find_closest('web', embeddings, word_idx, idx_word)

Query: web

Word: web             Cosine Similarity: 1.0
Word: .sup            Cosine Similarity: 0.8828999996185303
Word: drug            Cosine Similarity: 0.8813999891281128
Word: living          Cosine Similarity: 0.8738999962806702
Word: &#8220          Cosine Similarity: 0.8738999962806702
Word: neurotrophic    Cosine Similarity: 0.8737000226974487
Word: Fuzzy           Cosine Similarity: 0.8734999895095825
Word: remains         Cosine Similarity: 0.8729000091552734
Word: dopamine        Cosine Similarity: 0.8718000054359436
Word: simultaneous    Cosine Similarity: 0.8708999752998352
