In [1]:
def read_file(file_path):
    try:
        with open(file_path) as file:
            return file.read()
    except FileNotFoundError as err:
        print('Check file path!')

In [2]:
file_path = '/home/viper/Downloads/UPDATED_NLP_COURSE/TextFiles/moby_dick_four_chapters.txt'

In [3]:
import spacy

2023-01-19 01:24:47.310769: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-19 01:24:47.685771: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-19 01:24:47.685803: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-19 01:24:49.489993: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [4]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])

In [5]:
nlp.max_length = 1200000

In [6]:
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n']

In [7]:
text = read_file(file_path)

In [8]:
tokens = separate_punc(text)



In [9]:
"""
Given first 25 words --> network to predict 26th word.
"""

'\nGiven first 25 words --> network to predict 26th word.\n'

In [10]:
train_len = 25 + 1
text_sequences = []

for i in range(train_len, len(tokens)):
    text_sequence = tokens[i - train_len : i]
    text_sequences.append(text_sequence)

In [11]:
from keras.preprocessing.text import Tokenizer

In [12]:
tokenizer = Tokenizer()

In [13]:
tokenizer.fit_on_texts(text_sequences)

In [23]:
sequences = tokenizer.texts_to_sequences(text_sequences)
# text converted to sequence of numbers or indices of text
# each of the number in sequence is id of a word

In [24]:
word_index_dict = tokenizer.index_word

In [25]:
for id in sequences[0]:
    print(f"{id} --> {tokenizer.index_word[id]}")

956 --> call
14 --> me
263 --> ishmael
51 --> some
261 --> years
408 --> ago
87 --> never
219 --> mind
129 --> how
111 --> long
954 --> precisely
260 --> having
50 --> little
43 --> or
38 --> no
314 --> money
7 --> in
23 --> my
546 --> purse
3 --> and
150 --> nothing
259 --> particular
6 --> to
2713 --> interest
14 --> me
24 --> on


In [27]:
word_count = tokenizer.word_counts

In [28]:
vocabulary_size = len(tokenizer.word_counts)

In [29]:
vocabulary_size

2718

In [30]:
# Converting id sequences to matrix

In [31]:
import numpy as np

In [32]:
sequences = np.array(sequences)

In [34]:
from keras.utils import to_categorical

In [35]:
X = sequences[:,:-1]
y = sequences[:, -1]

In [36]:
y = to_categorical(y, num_classes=vocabulary_size + 1) 
#  num_classes=vocabulary_size + 1 --> because Keras needs one more class to hold 0. 

In [37]:
y.shape

(11312, 2719)

In [38]:
seq_len = X.shape[1]

In [45]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [114]:
def create_model(vocabulary_size: int, seq_len: int):
    
    model = Sequential()
    model.add(Embedding(input_dim=vocabulary_size, output_dim=seq_len, input_length=seq_len))
    model.add(LSTM(seq_len*2, return_sequences=True))
    model.add(LSTM(seq_len*2))
    
    model.add(Dense(50, activation='relu'))
    
    model.add(Dense(vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    model.summary()
    return model

In [115]:
model = create_model(vocabulary_size+1, seq_len)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 25, 25)            67975     
                                                                 
 lstm_2 (LSTM)               (None, 25, 50)            15200     
                                                                 
 lstm_3 (LSTM)               (None, 50)                20200     
                                                                 
 dense_2 (Dense)             (None, 50)                2550      
                                                                 
 dense_3 (Dense)             (None, 2719)              138669    
                                                                 
Total params: 244,594
Trainable params: 244,594
Non-trainable params: 0
_________________________________________________________________


In [48]:
from pickle import dump, load

In [116]:
model.fit(X, y, batch_size=128, epochs=20, verbose=0)

<keras.callbacks.History at 0x7f4f0858b070>

In [117]:
model.save('mobi_dick_lstm_20epoch.h5')

In [52]:
dump(tokenizer, open('mobi_dick_tokenizer', 'wb'))

In [54]:
from keras.utils import pad_sequences

In [102]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    
    output_text = []
    input_text = seed_text
    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        
        pred_word_index = model.predict(pad_encoded, verbose=0)
        
        pred_word_index = np.argmax(pred_word_index,axis=1)[0]

        pre_word = tokenizer.index_word[pred_word_index]

        input_text = ' ' + pre_word
        output_text.append(pre_word)

    return ' '.join(output_text)

In [134]:
import random
random.seed(101)
random_pick = random.randint(0, len(text_sequences))

In [135]:
random_seed_text = text_sequences[random_pick]

In [136]:
seed_text = ' '.join(random_seed_text)
seed_text

"thought i to myself the man 's a human being just as i am he has just as much reason to fear me as i have"

In [121]:
generate_text(model, tokenizer, seq_len, seed_text, 25)

'a little harpooneer the little harpooneer the little harpooneer the little harpooneer the little harpooneer the little harpooneer the little harpooneer the little harpooneer the'

In [122]:
from keras.models import load_model

In [126]:
import keras.preprocessing

In [131]:
model = load_model('/home/viper/Downloads/UPDATED_NLP_COURSE/06-Deep-Learning/epochBIG.h5')

In [132]:
tokenizer = load(open('/home/viper/Downloads/UPDATED_NLP_COURSE/06-Deep-Learning/epochBIG', 'rb'))

In [137]:
generate_text(model, tokenizer, seq_len, seed_text, 25)

'to him me whatever whatever whatever whatever whatever whatever whatever whatever whatever whatever whatever whatever whatever whatever whatever whatever whatever whatever whatever whatever whatever whatever'