In [1]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import pandas as pd
import numpy as np

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Load the patent dataset; the relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv('patents.csv')
# Keep only the abstract text column; this Series is the corpus that the
# tokenizer is fitted on further down.
abstracts = data['patent_abstract']
# Bare last expression so the notebook echoes a truncated preview.
abstracts

0       " A ""Barometer"" Neuron enhances stability in...
1       " This invention is a novel high-speed neural ...
2       An optical information processor for use as a ...
3       A method and system for intelligent control of...
4       A method and system for intelligent control of...
                              ...                        
4537    A neural network is disclosed in which communi...
4538    [Object] An object is to provide an apparatus ...
4539    An XML-based symbolic computer language, inter...
4540    A convolution engine, such as a convolution ne...
4541    This disclosure relates to improved sketch-bas...
Name: patent_abstract, Length: 4542, dtype: object

In [3]:
# Peek at the first 300 characters of the abstract at index 100.
abstracts[100][:300]

'Neural signal amplifiers include an operational amplifier and a feedback network coupled between an output and an input thereof. The feedback network includes a tunnel field effect transistor (“TFET”) pseudo resistor that exhibits bi-directional conductivity. A drain region of the TFET may be electr'

In [4]:
from keras.preprocessing.text import Tokenizer

# Word-level tokenizer with no cap on vocabulary size. Only the characters
# listed in `filters` are stripped, so punctuation that is not listed
# (e.g. '.', ',', '(', ')', '-') stays attached to the neighbouring word.
tokenizer = Tokenizer(
    num_words=None,
    filters='"#$%&*+/:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
)

# Fitting builds the vocabulary, giving every distinct word its own integer
# index; each abstract is then rewritten as the list of its word indices.
tokenizer.fit_on_texts(abstracts)
sequences = tokenizer.texts_to_sequences(abstracts)
sequences[100][:15]

[6, 35, 1689, 95, 11, 716, 1018, 4, 2, 303, 10, 292, 58, 11, 19]

In [5]:
# Reverse lookup produced by the fitted tokenizer: integer index -> word
idx_word = tokenizer.index_word

# Sanity check: decode the first 40 tokens of abstract 100 back into text
' '.join(idx_word[token_id] for token_id in sequences[100][:40])

'neural signal amplifiers include an operational amplifier and a feedback network coupled between an output and an input thereof. the feedback network includes a tunnel field effect transistor (“tfet”) pseudo resistor that exhibits bi-directional conductivity. a drain region of the'

## Give the network a sequence of words and train it to predict the next word.

That is, we give the network 50 consecutive words and train it to predict the 51st.

## Creating features and labels

We use the first 50 words as features with the 51st as the label, then words 2–51 as features with the 52nd as the label, and so on, sliding the window forward one word at a time.

In [6]:
training_length = 50

features, labels = [], []

for seq in sequences:
    # Every position from index `training_length` to the end of the current
    # sequence is a label; the `training_length` tokens right before it are
    # its features. Sequences that are too short yield no examples.
    for label_pos in range(training_length, len(seq)):
        features.append(seq[label_pos - training_length:label_pos])
        labels.append(seq[label_pos])

features = np.array(features)

# 364,883 examples, each a window of 50 token indices
# (i.e. every example has 50 timesteps)
features.shape

(364883, 50)

# Training and Validation data

In [7]:
from sklearn.utils import shuffle


def _one_hot_labels(label_indices, num_words):
    """Return an int8 matrix with a single 1 per row, at the label's column."""
    # int8 keeps the (n_examples x num_words) matrix as small as possible
    encoded = np.zeros((len(label_indices), num_words), dtype=np.int8)
    # Explicit integer dtype so an empty split still gives a valid index array
    columns = np.asarray(label_indices, dtype=np.intp)
    # One vectorised assignment instead of a Python loop over every example
    encoded[np.arange(len(columns)), columns] = 1
    return encoded


def create_train_valid(features,
                       labels,
                       num_words,
                       train_fraction=0.7):
    """Create training and validation features and labels.

    The examples are shuffled with a fixed seed, split at
    ``int(train_fraction * len(labels))`` and the labels are one-hot encoded.

    Parameters
    ----------
    features : array-like
        One row of token indices per example.
    labels : sequence of int
        Word index of the target word for each example; every value must be
        a valid column index for a row of width ``num_words``.
    num_words : int
        Width of each one-hot label row.
    train_fraction : float, default 0.7
        Share of the shuffled examples assigned to the training split.

    Returns
    -------
    X_train, X_valid : numpy.ndarray
        Feature rows of the training and validation splits.
    y_train, y_valid : numpy.ndarray of int8
        One-hot labels with shape ``(n_examples, num_words)``.
    """
    import gc

    # Randomly shuffle features and labels together; the fixed seed keeps the
    # split repeatable between runs
    features, labels = shuffle(features, labels, random_state=50)

    # Decide on number of samples for training
    train_end = int(train_fraction * len(labels))

    # Convert each split to an array exactly once; a second np.array() call
    # would only make another full copy of an already large array
    X_train = np.array(features[:train_end])
    X_valid = np.array(features[train_end:])

    # One hot encoding of labels
    y_train = _one_hot_labels(labels[:train_end], num_words)
    y_valid = _one_hot_labels(labels[train_end:], num_words)

    # Memory management: drop this function's references to the shuffled
    # copies before returning. The caller's own objects are not affected.
    gc.enable()
    del features, labels
    gc.collect()

    return X_train, X_valid, y_train, y_valid

In [9]:
# Width of the one-hot label rows. The +1 presumably accounts for Keras word
# indices starting at 1 (index 0 unused) — TODO confirm.
num_words = len(idx_word) + 1

X_train, X_valid, y_train, y_valid = create_train_valid(
    features, labels, num_words)
# NOTE: only the last expression of a cell is echoed, so X_train.shape is
# evaluated but not shown; the displayed output is y_train.shape.
X_train.shape
y_train.shape

(255418, 22026)

We do want to be careful about using up too much memory. One-hot encoding the labels creates massive NumPy arrays (one `num_words`-wide row per example), so the function takes care to drop its unused intermediate objects once the arrays are built.

In [10]:
import sys

def check_sizes(gb_min=1):
    """Print every module-level object larger than ``gb_min`` gigabytes.

    Sizes are taken from ``sys.getsizeof`` and reported in units of 1e9 bytes.

    Parameters
    ----------
    gb_min : float, default 1
        Only objects strictly larger than this many GB are printed.
    """
    # Read the objects straight from a snapshot of the namespace. Looking the
    # names up with eval() resolves this function's locals first, so a global
    # called `x`, `size` or `gb_min` would be measured incorrectly.
    for name, obj in list(globals().items()):
        size = sys.getsizeof(obj) / 1e9
        if size > gb_min:
            print(f'Object: {name:10}\tSize: {size} GB.')


# Report every global object whose reported size exceeds 1 GB
check_sizes(gb_min=1)

Object: y_train   	Size: 5.62583698 GB.
Object: y_valid   	Size: 2.411076202 GB.
