In [1]:
#!pip install tensorflow
#!pip install transformers

In [2]:
import numpy as np
import pandas as pd
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import word_tokenize

from transformers import BertTokenizer, TFBertModel, BertConfig

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [3]:
def corpus_stats(corpus):
  print('Corpus Stats:')
  print('Number of Documents: ' + str(len(corpus.fileids())))
  print('Number of Paragraphs ' + str(len(corpus.paras())))
  print('Number of sentences: ' + str(len(corpus.sents())))
  print('Number of words: ' + str(len(corpus.words())))
  print("Vocabulary: " + str(len(set(w.lower() for w in corpus.words()))))
  print("Avg chars per word: " + str(round(len(corpus.raw())/len(corpus.words()),1)))
  print("Avg words per sentence: " + str(round(len(corpus.words())/len(corpus.sents()),1)))

In [4]:
path = './cover_letter_samples'
doc_pattern = r'.*\.txt'
corpus = PlaintextCorpusReader(path, doc_pattern)

In [5]:
corpus_stats(corpus)

Corpus Stats:
Number of Documents: 32
Number of Paragraphs 169
Number of sentences: 429
Number of words: 11042
Vocabulary: 2298
Avg chars per word: 5.5
Avg words per sentence: 25.7


In [6]:
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]
docs[0]

"I would like to introduce myself as an applicant for the Data Scientist position at River Tech, a prestigious and reputable name in innovative technology. I am confident in my ability to perform as a Data Scientist at River Tech due to my extensive education and work experience.\r\n\r\nDuring my work experience at Crane & Jenkins, I had an extensive range of responsibilities including selecting features, optimizing classifiers, mining data, expanding the company's data by incorporating third-party sources, improving data collection techniques, processing data, and doing ad-hoc analyses. As a Data Scientist, I was required to have excellent communication skills, understanding of algorithms, excellence in the MatLab tool kit, proficiency in GGplot, knowledge of SQL, and excellence in applied statistics. During my eight-year tenure at Crane & Jenkins, I applied these skills daily and performed exceptionally at the company.\r\n\r\nMy abilities as a Data Scientist are rooted in a sturdy ed

In [7]:
docs = [doc.replace('\r\n\r\n', '') for doc in docs]

In [8]:
docs[0]

"I would like to introduce myself as an applicant for the Data Scientist position at River Tech, a prestigious and reputable name in innovative technology. I am confident in my ability to perform as a Data Scientist at River Tech due to my extensive education and work experience.During my work experience at Crane & Jenkins, I had an extensive range of responsibilities including selecting features, optimizing classifiers, mining data, expanding the company's data by incorporating third-party sources, improving data collection techniques, processing data, and doing ad-hoc analyses. As a Data Scientist, I was required to have excellent communication skills, understanding of algorithms, excellence in the MatLab tool kit, proficiency in GGplot, knowledge of SQL, and excellence in applied statistics. During my eight-year tenure at Crane & Jenkins, I applied these skills daily and performed exceptionally at the company.My abilities as a Data Scientist are rooted in a sturdy education in mathe

In [9]:
tokenized = [word_tokenize(doc) for doc in docs]

lens = [len(token) for token in tokenized]

In [10]:
tokens_list = []
for token in tokenized:
    tokens_list.extend(token)

In [11]:
tz = BertTokenizer.from_pretrained("bert-base-cased")

In [12]:
encoded = tz.encode_plus(
    text=tokens_list,  # the text to be encoded
    add_special_tokens=True,  # Add [CLS] and [SEP]
    max_length = len(tokens_list),  # maximum length of a document
    truncation = True,
    padding = 'max_length',  # Add [PAD]s
    return_attention_mask = True,  # Generate the attention mask
    return_tensors = 'tf',  # ask the function to return TensorFlow tensors
)
input_ids = encoded['input_ids']
attn_mask = encoded['attention_mask']

In [13]:
input_ids_list = []
for input_id in input_ids:
    input_ids_list.extend(input_id)

In [14]:
input_ids_int = []

for tensor in input_ids_list:
    input_ids_int.append(tensor.numpy())

In [28]:
# For use decoding output
id_to_word = {input_ids_int[i]: tokens_list[i] for i in range(len(input_ids_int))}
word_to_id = {tokens_list[i]: input_ids_int[i] for i in range(len(tokens_list))}

In [17]:
X = []
y = []
for i in range(0, len(input_ids_list) - 100, 1):
    in_seq = input_ids_list[i:i+100]
    out_seq = input_ids_list[i + 100]
    X.append(in_seq)
    y.append(out_seq)

In [18]:
X_array = np.reshape(X, (len(X), 100, 1))

In [19]:
X_val = X_array[0]
X_train = X_array[1:]

In [33]:
X_train.shape

(10458, 100, 1)

In [35]:
X_test = np.reshape(X_val, (1, 100, 1))

In [36]:
X_test.shape

(1, 100, 1)

In [20]:
y_array = np_utils.to_categorical(y)

In [21]:
y_train = y_array[1:]
y_test = y_array[0]

In [22]:
model = Sequential()
model.add(LSTM(256, input_shape=(X_array.shape[1], X_array.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y_array.shape[1], activation='softmax'))

In [23]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [24]:
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [25]:
model.fit(X, y, validation_split = 0.2, epochs=4, batch_size=256, callbacks=desired_callbacks)

Epoch 1/4

Epoch 00001: loss improved from inf to 7.89874, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 7.89874 to 6.02796, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 6.02796 to 5.94107, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 5.94107 to 5.90234, saving model to model_weights_saved.hdf5


<tensorflow.python.keras.callbacks.History at 0x20075b67b48>

In [26]:
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [58]:
prediction = model.predict(X_test)
prediction[0].argmax()

100

In [62]:
tokens_list[100]

'Data'

In [None]:
ample = 'I am applying for the position of <job>'