In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import warnings
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import HTML

# Echo the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'  # display full outputs in Jupyter
# Silence RuntimeWarning messages for the rest of the session
warnings.filterwarnings('ignore', category=RuntimeWarning)

Rather than training our own word embeddings, a very expensive operation, we can use word embeddings that were trained on a large corpus of words. The hope is that these embeddings will generalize from the training corpus to our needs.

This code downloads pre-trained GloVe word embeddings (we use the 100-dimensional vectors) if you don't already have them. A number of other pre-trained word embeddings are available from [Stanford online](https://nlp.stanford.edu/data/).

In [4]:
import os
import sys
import numpy as np

In [5]:
from keras.utils import get_file
from os.path import realpath, dirname, exists, join
# The project root is the parent of the directory the notebook runs in;
# data files live in its `data` subdirectory.
ROOT_DIR = dirname(realpath('.'))
DATA_DIR = join(ROOT_DIR, 'data')

# Put the parent directory on the import path (only once) so that modules
# located there can be imported from this notebook.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [6]:
# Download word embeddings from the Stanford NLP lab
import zipfile

glove_vectors = join(DATA_DIR, 'glove.6B.zip')

# Guard on the extracted file the next cell actually reads, not on the archive:
# get_file stores its download in its own cache directory, so checking for the
# zip inside DATA_DIR would never notice a completed download.
if not os.path.exists(join(DATA_DIR, 'glove.6B.100d.txt')):
    if not os.path.exists(glove_vectors):
        glove_vectors = get_file('glove.6B.zip',
                                 'http://nlp.stanford.edu/data/glove.6B.zip')
    os.makedirs(DATA_DIR, exist_ok=True)
    # Extract into DATA_DIR (not the current working directory) with the
    # standard library, so no external `unzip` binary or shell quoting is needed.
    with zipfile.ZipFile(glove_vectors) as archive:
        archive.extractall(DATA_DIR)

256

In [7]:
# Load in unzipped files
glove_vectors = join(DATA_DIR, 'glove.6B.100d.txt')

# Read every field as a string so the word column and the numeric columns fit
# in one array; comments=None stops '#' from being treated as a comment marker.
glove = np.loadtxt(glove_vectors, dtype='str', comments=None)
glove.shape

(400000, 101)

In [8]:
type(glove)

numpy.ndarray

In [9]:
# Separate into words and vectors: the first column holds the word, the
# remaining columns hold its embedding values (converted from str to float)
words = glove[:, 0]
vectors = glove[:, 1:].astype('float')

In [10]:
# delete glove to save memory
del glove

In [11]:
vectors[1000]
words[1000]
vectors.shape

array([-0.21843  ,  0.022696 , -0.062105 , -0.25557  , -0.2222   ,
        0.75584  , -0.58643  , -0.3236   ,  0.0036797, -0.52816  ,
       -0.18682  ,  0.16995  ,  0.38306  ,  0.26499  , -0.081493 ,
       -0.85389  ,  0.078729 ,  0.55321  , -0.94035  , -0.046033 ,
        0.25873  , -0.51662  ,  0.17764  , -0.54664  , -0.64107  ,
       -0.71131  , -0.66956  , -0.16875  ,  0.25056  , -0.073421 ,
        0.742    ,  0.21894  , -0.60056  , -0.66511  ,  0.87591  ,
       -0.43214  , -0.16481  ,  0.15383  , -0.4014   , -0.17786  ,
       -0.57662  ,  0.038627 , -0.1438   , -0.21172  ,  0.023644 ,
       -0.38741  , -0.091636 ,  0.80288  , -0.56324  , -0.7643   ,
       -0.15529  ,  0.40837  ,  0.023216 ,  1.6483   , -0.36147  ,
       -1.8609   ,  0.40398  , -0.41986  ,  1.5969   ,  0.2239   ,
       -0.26619  ,  1.3771   , -0.43608  ,  0.1363   ,  0.62087  ,
        0.33013  ,  0.90322  ,  0.22929  , -0.072946 , -0.16841  ,
       -0.13554  ,  0.0075493, -0.2734   , -0.25576  ,  0.0611

'themselves'

(400000, 100)

Now we keep only the embeddings for words that appear in our vocabulary. Words that are in our vocabulary but have no pre-trained embedding will be represented as all 0s; we can address that by training our own embeddings.

In [12]:
# Build a word -> vector lookup table by pairing the parallel
# `words` and `vectors` arrays element by element
word_lookup = dict(zip(words, vectors))
word_lookup['the']

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

In [13]:
# Load our own corpus (patent abstracts) from which words and tokens will be created.
import pandas as pd
data = pd.read_csv('../data/neural_network_patent_query.csv', parse_dates=['patent_date'])
data.head()

# Extract the abstracts column as a plain Python list
original_abstracts = list(data['patent_abstract'])
len(original_abstracts)

Unnamed: 0,patent_abstract,patent_date,patent_number,patent_title
0,""" A """"Barometer"""" Neuron enhances stability in...",1996-07-09,5535303,"""""""Barometer"""" neuron for a neural network"""
1,""" This invention is a novel high-speed neural ...",1993-10-19,5255349,"""Electronic neural network for solving """"trave..."
2,An optical information processor for use as a ...,1995-01-17,5383042,3 layer liquid crystal neural network with out...
3,A method and system for intelligent control of...,2001-01-02,6169981,3-brain architecture for an intelligent decisi...
4,A method and system for intelligent control of...,2003-06-17,6581048,3-brain architecture for an intelligent decisi...


3522

In [14]:
from keras.preprocessing.text import Tokenizer
from utils import make_sequences, format_sequence

In [15]:
# NOTE(review): repeats the identical assignment made in the cell that loads the CSV
original_abstracts = list(data['patent_abstract'])

In [16]:
# Run format_sequence (imported from utils) over every abstract
abstracts = list(map(format_sequence, original_abstracts))

In [17]:
abstracts[1]

'" This invention is a novel high-speed neural network based processor for solving the ""traveling salesman"" and other global optimization problems . It comprises a novel hybrid architecture employing a binary synaptic array whose embodiment incorporates the fixed rules of the problem , such as the number of cities to be visited . The array is prompted by analog voltages representing variables such as distances . The processor incorporates two interconnected feedback networks , each of which solves part of the problem independently and simultaneously , yet which exchange information dynamically . "'

In [18]:
# make_sequences (imported from utils) returns eight values; unpack them by name
(word_idx, idx_word, num_words, word_counts,
 new_texts, new_sequences, features, labels) = make_sequences(abstracts)

There are 11695 unique words.
There are 293001 sequences.


In [24]:
# Rebuild the word -> vector lookup table from the parallel `words` / `vectors` arrays
word_lookup = dict(zip(words, vectors))

In [19]:
# One row per vocabulary index, one column per embedding dimension. Rows stay
# all-zero for any index whose word has no pre-trained vector.
embedding_matrix = np.zeros((num_words, vectors.shape[1]))

not_found = 0

# Place each vector at the index stored in word_idx instead of at its
# enumeration position + 1: later lookups index the matrix with word_idx[word],
# so the rows must line up with those values whatever the dict's iteration order.
for word, i in word_idx.items():
    # Look up the word embedding
    vector = word_lookup.get(word)

    # Record in matrix
    if vector is None:
        not_found += 1
    else:
        embedding_matrix[i, :] = vector

print(f'There were {not_found} words without pre-trained embeddings.')

There were 1224 words without pre-trained embeddings.


In [32]:
import gc
# NOTE(review): the collector is normally enabled by default, so this call is likely redundant — confirm
gc.enable()
# Drop the notebook's reference to the full GloVe matrix now that embedding_matrix is built
del vectors
# NOTE(review): word_lookup still holds rows taken from `vectors`; if those rows are
# views of the original array, its memory may not actually be released — confirm
gc.collect()

0

Each word is represented by 100 numbers, although a number of words have no pre-trained embedding. We can find the closest words to a given word in embedding space using cosine similarity. This requires first normalizing the vectors to have a magnitude of 1, so that the dot product of two vectors equals their cosine similarity.

In [33]:
# Normalize every row to unit length so that a dot product between rows is
# their cosine similarity.
row_norms = np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
# All-zero rows (no pre-trained embedding) have norm 0; divide them by 1 so
# they stay zero instead of producing NaN through a divide-by-zero.
row_norms[row_norms == 0] = 1
# nan_to_num is kept as a safety net for any non-finite values already present.
embedding_matrix = np.nan_to_num(embedding_matrix / row_norms)

In [54]:
def find_closest(query, embedding_matrix, word_idx, idx_word, n=10):
    """Print the `n` words whose embeddings are most similar to `query`.

    Similarity is the dot product between the query's row and every row of
    `embedding_matrix`. That equals cosine similarity only when the rows have
    already been normalized to unit length.

    Args:
        query: word to look up.
        embedding_matrix: 2-D array whose rows are indexed by the integer
            values stored in `word_idx`.
        word_idx: mapping from word to row index.
        idx_word: mapping from row index back to word.
        n: maximum number of words to print (the query itself is normally
            the first one).

    Returns:
        None. The results, or a message explaining why there are none,
        are printed.
    """
    idx = word_idx.get(query)
    # Handle case where query is not in vocab
    if idx is None:
        print(f'{query} not found in vocab.')
        return

    vec = embedding_matrix[idx]
    # Handle case where word doesn't have an embedding
    if np.all(vec == 0):
        print(f'{query} has no pre-trained embedding.')
        return

    # Similarity between the query vector and every row
    dists = np.dot(embedding_matrix, vec)

    # Rank rows from most to least similar. Rows with no entry in idx_word
    # (for example an unused row 0) are skipped so that asking for more
    # neighbours than there are similar words cannot raise a KeyError.
    idxs = [i for i in np.argsort(dists)[::-1] if i in idx_word][:n]

    print(f'Query: {query}\n')
    # Print out each word and its similarity to the query
    for i in idxs:
        print(f'Word: {idx_word[i]:15} Cosine Similarity: {round(dists[i], 4)}')

In [42]:
# Walk through the steps of find_closest by hand for a single query word
idx = word_idx.get('invention', None)
vec = embedding_matrix[idx]
# Dot product of the query row with every row of the matrix
dists = np.dot(embedding_matrix, vec)

# Row indices ordered from the smallest to the largest dot product
np.argsort(dists)

array([10463, 11531, 11042, ...,   428,  8083,    71])

In [46]:
# Reverse the ordering so the largest values come first, then keep the top 10
idxs = np.argsort(dists)[::-1][:10]

In [50]:
# Similarity values for the selected indices, in the same order
sorted_dists = dists[idxs]

In [52]:
# Map each selected index back to its word
closest = [idx_word[i] for i in idxs]

In [53]:
# Print each word (padded to 15 characters) next to its similarity rounded to 4 places
for word, dist in zip(closest, sorted_dists):
    print(f'Word: {word:15} Cosine Similarity: {round(dist, 4)}')

Word: invention       Cosine Similarity: 1.0
Word: invented        Cosine Similarity: 0.7513
Word: technique       Cosine Similarity: 0.6008
Word: concept         Cosine Similarity: 0.5966
Word: introduction    Cosine Similarity: 0.5679
Word: method          Cosine Similarity: 0.559
Word: design          Cosine Similarity: 0.5575
Word: patent          Cosine Similarity: 0.555
Word: experiment      Cosine Similarity: 0.5463
Word: modern          Cosine Similarity: 0.5449


In [55]:
find_closest('design', embedding_matrix, word_idx, idx_word)

Query: design

Word: design          Cosine Similarity: 1.0
Word: designs         Cosine Similarity: 0.8617
Word: designed        Cosine Similarity: 0.7899
Word: architecture    Cosine Similarity: 0.7741
Word: model           Cosine Similarity: 0.735
Word: architectural   Cosine Similarity: 0.7183
Word: concept         Cosine Similarity: 0.7056
Word: developed       Cosine Similarity: 0.6946
Word: models          Cosine Similarity: 0.6872
Word: structure       Cosine Similarity: 0.6872


In [56]:
find_closest('dnn', embedding_matrix, word_idx, idx_word)

dnn has no pre-trained embedding.
