In [1]:
#!pip install tensorflow
#!pip install transformers

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import word_tokenize

from transformers import BertTokenizer, TFBertModel, BertConfig

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

import matplotlib.pyplot as plt

First I will define a function to checkout some of the corpus statistics.

In [3]:
def corpus_stats(corpus):
  print('Corpus Stats:')
  print('Number of Documents: ' + str(len(corpus.fileids())))
  print('Number of Paragraphs ' + str(len(corpus.paras())))
  print('Number of sentences: ' + str(len(corpus.sents())))
  print('Number of words: ' + str(len(corpus.words())))
  print("Vocabulary: " + str(len(set(w.lower() for w in corpus.words()))))
  print("Avg chars per word: " + str(round(len(corpus.raw())/len(corpus.words()),1)))
  print("Avg words per sentence: " + str(round(len(corpus.words())/len(corpus.sents()),1)))

Load the sample cover letter .txt files via NLTK's PlaintextCorpusReader.

In [4]:
path = './cover_letter_samples'
doc_pattern = r'.*\.txt'
corpus = PlaintextCorpusReader(path, doc_pattern)

In [5]:
corpus_stats(corpus)

Corpus Stats:
Number of Documents: 51
Number of Paragraphs 246
Number of sentences: 625
Number of words: 14942
Vocabulary: 2564
Avg chars per word: 5.6
Avg words per sentence: 23.9


In [6]:
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]

Remove "\n" and "\r" characters

In [7]:
docs = [doc.replace('\n', '') for doc in docs]
docs = [doc.replace('\r', '') for doc in docs]
docs = [doc.replace(')', '') for doc in docs]
docs = [doc.replace('(', '') for doc in docs]

Create a list of individual words for each document

In [8]:
tokenized = [word_tokenize(doc) for doc in docs]

lens = [len(token) for token in tokenized]

Make a single list of all documents

In [9]:
tokens_list = []
for token in tokenized:
    tokens_list.extend(token)

# Encoding

Now we will encode the text to numeric vectors using BERT encoder because it is pre-trained and can understand the meaning of words.

In [10]:
tz = BertTokenizer.from_pretrained("bert-base-cased")

In [11]:
encoded = tz.encode_plus(
    text=tokens_list,  # the text to be encoded
    add_special_tokens=True,  # Add [CLS] and [SEP]
    max_length = len(tokens_list),  # maximum length of a document
    truncation = True,
    padding = 'max_length',  # Add [PAD]s
    return_attention_mask = True,  # Generate the attention mask
    return_tensors = 'tf',  # ask the function to return TensorFlow tensors
)
input_ids = encoded['input_ids']
attn_mask = encoded['attention_mask']

The BERT encoder outputs a list of lists, so I will consolidate them into a single list

In [12]:
input_ids_list = []
for input_id in input_ids:
    input_ids_list.extend(input_id)

The BERT encoder also outputs tensors, I need to convert them to numpy arrays.

In [13]:
input_ids_int = []

for tensor in input_ids_list:
    input_ids_int.append(tensor.numpy())

Now that the words are numeric vectors, I will need to be able to decode the model's output. I will also need to be able to encode a test sample.

In [14]:
id_to_word = {input_ids_int[i]: tokens_list[i] for i in range(len(input_ids_int))}
word_to_id = {tokens_list[i]: input_ids_int[i] for i in range(len(tokens_list))}

We are modeling sequence-to-sequence, I will create sequences and the word that immediately follows that sequence to use as "labels."

In [15]:
seq_len = 10

In [16]:
X = []
y = []
for i in range(0, len(input_ids_list) - seq_len, 1):
    in_seq = input_ids_list[i:i+seq_len]
    out_seq = input_ids_list[i + seq_len]
    X.append(in_seq)
    y.append(out_seq)

Reshape the input to (*time steps*, *batch size*, *something else*) 

In [17]:
X_array = np.reshape(X, (len(X), seq_len, 1))

One-hot encode the labels

In [18]:
y_array = np_utils.to_categorical(y)

In [19]:
y_array.shape

(14084, 27934)

# Modeling

In [21]:
model = Sequential()
model.add(LSTM(256, input_shape=(X_array.shape[1], X_array.shape[2]), return_sequences=True))
model.add(Dense(256))
model.add(LSTM(128, return_sequences = True))
model.add(Dense(128))
model.add(LSTM(256))
model.add(Dense(y_array.shape[1], activation='softmax'))

In [22]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

I'm using ModelCheckpoint to save the model weights, so I don't have to retrain the model every time I restart the notebook.

In [23]:
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [None]:
history = model.fit(X_array, y_array, validation_split = 0.2, epochs=20, batch_size=256, callbacks=desired_callbacks)

Epoch 1/20

Epoch 00001: loss did not improve from 5.78636
Epoch 2/20

Epoch 00002: loss improved from 5.78636 to 5.75127, saving model to model_weights_saved.hdf5
Epoch 3/20

Epoch 00003: loss improved from 5.75127 to 5.72917, saving model to model_weights_saved.hdf5
Epoch 4/20

Epoch 00004: loss did not improve from 5.72917
Epoch 5/20

Epoch 00005: loss improved from 5.72917 to 5.72456, saving model to model_weights_saved.hdf5
Epoch 6/20

Epoch 00006: loss did not improve from 5.72456
Epoch 7/20

Epoch 00007: loss improved from 5.72456 to 5.71908, saving model to model_weights_saved.hdf5
Epoch 8/20

Epoch 00008: loss improved from 5.71908 to 5.71825, saving model to model_weights_saved.hdf5
Epoch 9/20

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper right')
plt.show()

In [26]:
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

Now to test the model. I input the beginning of a cover letter. The loop deletes the first token in the sample, predicts on a sequence of the specified length, append the predicted word to the end of the sample. The first word is dropped and the model predicts on the next sequence. 

In [44]:
sample = "I am excited to be applying for your company because I"
sample_split = word_tokenize(sample)
sample_split.insert(0, 0)

for i in range(5):
    sample_split = sample_split[1:]
    print(sample_split)
    sample_ids = [word_to_id[word] for word in sample_split]
    print(sample_ids)
    sample_array = tf.convert_to_tensor(sample_ids)
    sample_array = [list(sample_array)]
    sample_array = np.reshape(sample_array, (1, seq_len, 1))
    prediction = model.predict(sample_array)
    print(prediction)
    # the below line is not correct. For some reason only predicts 'analyses'
    #pred_index = np.where(prediction == prediction.max())
    #print(pred_index)
    #ind_tup = list(zip(pred_index[0], pred_index[1]))[0]
    #print(ind_tup)
   
    #new_word = tokens_list[pred_index]
    #print(new_word)
    #sample_split.append(new_word)
    #sample = sample + ' ' + new_word

print(sample)

['I', 'am', 'excited', 'to', 'be', 'applying', 'for', 'your', 'company', 'because', 'I']
[119, 100, 1821, 100, 1106, 1196, 1128, 1111, 1240, 1700, 119]


ValueError: cannot reshape array of size 11 into shape (1,10,1)

The output is English text, but it does not make sense. The model simply predicts the word that appears immediately before it. This could probably be solved by more training data. Neural nets require large training sets, and my corpus only contains 51 cover letter samples.