In [108]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from Utilfuns import read_file, words_index, get_total_words, input_seq, get_max_seq_len, print_input_seq, add_padding_pre
from Utilfuns import predict_next_word


## Text Preprocessing

In [109]:
# Source document the next-word model is trained on.
filepath = 'i-485instr.txt'
# filepath = 'sample.txt'

# All helpers below come from the local Utilfuns module (not visible here); the
# notes describe only how each result is used later in this notebook.
text = read_file(filepath)
# `tokens` exposes .word_index (printed below) and is pickled in the final cell.
tokens = words_index(text)
# Used later as the one-hot width and as the Embedding / Dense vocabulary size.
total_words = get_total_words(tokens)
# Printed below as incrementally growing word prefixes.
inputseq = input_seq(text,tokens)
# Used later (minus one) as the Embedding input length.
max_seq_len = get_max_seq_len(inputseq)
# Sliced as a 2-D array in the split cell; presumably left-padded with zeros,
# judging by the helper name and the printed output -- TODO confirm in Utilfuns.
pad_inputseq = add_padding_pre(inputseq)


# Quick sanity dump of every preprocessing stage (first 500 chars / 5 rows only,
# except the vocabulary, which is printed in full).
print('-------File Data----------\n',text[:500]) 
print('-------  Tokens ----------\n',tokens.word_index)  
print('--------InputSeq----------')
print_input_seq(inputseq[:5], tokens)
print('------Max Seq length------\n', max_seq_len)
print('-------PaddedSeq----------')
print_input_seq(pad_inputseq[:5], tokens)


-------File Data----------
 what is the purpose of form i-485?
form i-485, application to register permanent residence or adjust status, is used by a person in the united states to
apply for lawful permanent resident status. throughout these instructions, we will sometimes refer to form i-485 as an
application for adjustment of status or as an adjustment application.
who may file form i-485?
the immigration and nationality act (ina) and certain other federal laws provide many different ways to adjust status to
that of a la
-------  Tokens ----------
--------InputSeq----------
what is
what is the
what is the purpose
what is the purpose of
what is the purpose of form
------Max Seq length------
 29
-------PaddedSeq----------
- - - - - - - - - - - - - - - - - - - - - - - - - - - what is
- - - - - - - - - - - - - - - - - - - - - - - - - - what is the
- - - - - - - - - - - - - - - - - - - - - - - - - what is the purpose
- - - - - - - - - - - - - - - - - - - - - - - - what is the purpose of
-

## Train Test Split

In [16]:
# Each padded row is one training example:
#   x -> every token except the last one (the context)
#   y -> the last token (the word the model must predict)
x = pad_inputseq[:, :-1]
y = pad_inputseq[:, -1]

print('-------PaddedSeq----------')
print(pad_inputseq[0:3])
print('-------X-Y Split----------')
for i in range(3):
    print(f'{x[i]}-[{y[i]}]')

# One-hot encode the targets so they match the softmax output trained with
# categorical_crossentropy; the vector width is the vocabulary size.
y = tf.keras.utils.to_categorical(y, num_classes=total_words)
print('-------X-Y final----------')
for i in range(3):
    print(f'{x[i]}-[{y[i]}]')


# Hold out 20% of the examples for validation. The seed is fixed so that the
# split -- and therefore the reported validation metrics -- are reproducible
# across "Restart Kernel -> Run All"; without it every run shuffles differently.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=SPLIT_SEED
)

-------PaddedSeq----------
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0 142  20]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0 142  20   1]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0 142  20   1 519]]
-------X-Y Split----------
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0 142]-[20]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0 142  20]-[1]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0 142  20   1]-[519]
-------X-Y final----------
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0 142]-[[0. 0. 0. ... 0. 0. 0.]]
[  0   0   0   0   0   0   0   0   0   0   0   0   0  

## Training LSTM

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [18]:
# Next-word classifier: token embedding -> two stacked LSTMs (dropout between
# them) -> softmax over the whole vocabulary.
model = Sequential([
    # Inputs are the padded sequences without their last token, hence the -1.
    Embedding(total_words, 100, input_length=max_seq_len-1),
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(150, return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    # One probability per vocabulary entry.
    Dense(total_words, activation='softmax'),
])

# Targets are one-hot vectors, so categorical cross-entropy is the matching loss.
model.compile(
    optimizer='adam',
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 28, 100)           204600    
                                                                 
 lstm_4 (LSTM)               (None, 28, 150)           150600    
                                                                 
 dropout_2 (Dropout)         (None, 28, 150)           0         
                                                                 
 lstm_5 (LSTM)               (None, 100)               100400    
                                                                 
 dense_2 (Dense)             (None, 2046)              206646    
                                                                 
Total params: 662246 (2.53 MB)
Trainable params: 662246 (2.53 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Fit the model for 50 epochs, evaluating on the held-out split after each epoch.
# The returned History object is kept in `history` for later inspection.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=50,
    verbose=1,
)

## Predict next 5 words

In [126]:
# Seed phrase that the model extends one word at a time.
input_text = 'USCIS will not accept'
# How many words to append to the seed phrase.
NUM_WORDS_TO_PREDICT = 5

# The model consumes sequences of length input_shape[1]; the helper is handed
# that length plus one, as in the original cell. The value does not change
# between iterations, so it is computed once, outside the loop.
max_sequence_len = model.input_shape[1]+1

for _ in range(NUM_WORDS_TO_PREDICT):
    next_word = predict_next_word(model, tokens, input_text, max_sequence_len)
    if not next_word:
        # NOTE(review): predict_next_word lives in Utilfuns (not visible here).
        # If it can return None/empty when no vocabulary word matches, the
        # string concatenation below would fail -- stop generating instead.
        break
    input_text += ' '+next_word
    print(input_text)

USCIS will not accept a
USCIS will not accept a stamped
USCIS will not accept a stamped or
USCIS will not accept a stamped or typewritten
USCIS will not accept a stamped or typewritten name


In [127]:
# Persist the trained model. model.save() stores the whole model, not only the
# weights; the .h5 suffix selects the HDF5 format.
model.save("next_word_LSTM.h5")

# Persist the `tokens` object as well, so the same word <-> index mapping can
# be restored together with the model.
import pickle
with open('tokens.pickle', 'wb') as token_file:
    pickle.dump(tokens, token_file, protocol=pickle.HIGHEST_PROTOCOL)

  saving_api.save_model(
