In [30]:
import spacy
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding,Bidirectional,Dropout
from tensorflow.keras.utils import to_categorical
from random import randint
from pickle import load
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
def read_file(p, encoding=None):
    """Return the entire contents of the text file at path ``p``.

    Args:
        p: Path of the file to read.
        encoding: Optional text encoding used to decode the file (for
            example ``'utf-8'``). ``None`` keeps the previous behaviour of
            using the platform's default encoding, which differs between
            operating systems.

    Returns:
        The file contents as a single string.
    """
    with open(p, 'r', encoding=encoding) as f:
        return f.read()

Disabling the pipeline components we do not need (parser, tagger, NER) makes the computations faster.

In [3]:
# Load the small English spaCy pipeline with the parser, tagger and NER
# components disabled; `tokenize` only reads each token's text.
nlp = spacy.load('en_core_web_sm',disable=['parser', 'tagger','ner'])
# Raise the maximum text length accepted by `nlp` in a single call.
# NOTE(review): 1198623 is presumably sized to the source text — confirm.
nlp.max_length = 1198623

In [4]:
# Show which pipeline components are still active after the `disable=` filter.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x2223d378460>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x2223d50dec0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x2223d5116c0>)]

In [5]:
def tokenize(txt):
    """Split ``txt`` into lower-cased tokens using the spaCy pipeline ``nlp``.

    A token is discarded when its text occurs as a substring of the
    punctuation/whitespace string below (``in`` on a ``str`` is a substring
    test), so short runs such as a double hyphen or consecutive newlines are
    dropped along with single punctuation characters.
    """
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '
    kept = []
    for token in nlp(txt):
        raw = token.text
        if raw in unwanted:
            continue
        kept.append(raw.lower())
    return kept

In [6]:
# Read the corpus, tokenize it, and report how many tokens survived filtering.
d = read_file('moby_dick.txt')
tokens = tokenize(d)
print(len(tokens))

11338




In [7]:
# Peek at the first 100 tokens to sanity-check lower-casing and punctuation removal.
print(tokens[:100])

['call', 'me', 'ishmael', 'some', 'years', 'ago', 'never', 'mind', 'how', 'long', 'precisely', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', 'i', 'thought', 'i', 'would', 'sail', 'about', 'a', 'little', 'and', 'see', 'the', 'watery', 'part', 'of', 'the', 'world', 'it', 'is', 'a', 'way', 'i', 'have', 'of', 'driving', 'off', 'the', 'spleen', 'and', 'regulating', 'the', 'circulation', 'whenever', 'i', 'find', 'myself', 'growing', 'grim', 'about', 'the', 'mouth', 'whenever', 'it', 'is', 'a', 'damp', 'drizzly', 'november', 'in', 'my', 'soul', 'whenever', 'i', 'find', 'myself', 'involuntarily', 'pausing', 'before', 'coffin', 'warehouses', 'and', 'bringing', 'up', 'the', 'rear', 'of', 'every', 'funeral', 'i', 'meet', 'and', 'especially', 'whenever', 'my']


In [8]:
# Build sliding windows of `train_len` consecutive tokens; later cells use the
# first 25 tokens of each window as input and the last one as the target.
train_len = 25+1
text_seq = []
# NOTE(review): the range stops at len(tokens)-1, so the window that ends on the
# final token is never produced — confirm whether that is intended.
for i in range(train_len,len(tokens)):
    seq = tokens[i-train_len:i]
    text_seq.append(seq)

In [9]:
# First window, joined into a readable string.
' '.join(text_seq[0])

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [10]:
# Second window: the same text shifted forward by one token.
' '.join(text_seq[1])

'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore'

In [11]:
# Fit a Keras Tokenizer on the token windows, then map every window to a list
# of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_seq)
sequences = tokenizer.texts_to_sequences(text_seq)

In [12]:
# Integer ids of the first window.
sequences[0]

[956,
 14,
 263,
 51,
 261,
 408,
 87,
 219,
 129,
 111,
 954,
 260,
 50,
 43,
 38,
 314,
 7,
 23,
 546,
 3,
 150,
 259,
 6,
 2713,
 14,
 24]

In [13]:
# Print the first entries of the tokenizer's vocabulary.
# NOTE: `i` is the 0-based enumeration position, not the tokenizer's id for the
# word — e.g. 'and' is listed at position 2 but is encoded as 3 in `sequences[0]`.
for i,k in enumerate(tokenizer.word_index.keys()):
    print(i,':',k)
    if i == 150:  # stop after 151 entries
        break

0 : the
1 : a
2 : and
3 : of
4 : i
5 : to
6 : in
7 : it
8 : that
9 : he
10 : his
11 : was
12 : but
13 : me
14 : with
15 : as
16 : at
17 : this
18 : you
19 : is
20 : all
21 : for
22 : my
23 : on
24 : be
25 : 's
26 : not
27 : from
28 : there
29 : one
30 : up
31 : what
32 : him
33 : so
34 : bed
35 : now
36 : about
37 : no
38 : into
39 : by
40 : were
41 : out
42 : or
43 : harpooneer
44 : had
45 : then
46 : have
47 : an
48 : upon
49 : little
50 : some
51 : old
52 : like
53 : if
54 : they
55 : would
56 : do
57 : over
58 : landlord
59 : thought
60 : room
61 : when
62 : could
63 : n't
64 : night
65 : here
66 : head
67 : such
68 : which
69 : man
70 : did
71 : sea
72 : time
73 : other
74 : very
75 : go
76 : these
77 : more
78 : though
79 : first
80 : sort
81 : said
82 : last
83 : down
84 : most
85 : been
86 : never
87 : your
88 : them
89 : must
90 : tell
91 : much
92 : good
93 : see
94 : off
95 : myself
96 : are
97 : yet
98 : sleep
99 : who
100 : seemed
101 : light
102 : way
103 : their
104 : ju

In [14]:
# Number of entries in the id -> word mapping.
len(tokenizer.index_word)

2718

In [15]:
# Decode the first encoded window back to words to check the id -> word mapping.
for i in sequences[0]:
    print(i,':',tokenizer.index_word[i])

956 : call
14 : me
263 : ishmael
51 : some
261 : years
408 : ago
87 : never
219 : mind
129 : how
111 : long
954 : precisely
260 : having
50 : little
43 : or
38 : no
314 : money
7 : in
23 : my
546 : purse
3 : and
150 : nothing
259 : particular
6 : to
2713 : interest
14 : me
24 : on


In [16]:
# Vocabulary size: the number of distinct words the tokenizer counted.
vocab_sz = len(tokenizer.word_counts)
vocab_sz

2718

In [17]:
# Convert the list of id sequences to a NumPy array.
# NOTE: this rebinds `sequences` from a list to an ndarray.
sequences = np.array(sequences)
sequences

array([[ 956,   14,  263, ..., 2713,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2712, ...,   53,    2, 2718],
       [ 166, 2712,    3, ...,    2, 2718,   26]])

In [18]:
# (number of windows, tokens per window)
sequences.shape

(11312, 26)

In [34]:
def create_model(vocabulary_size, seq_len, embedding_dim=25, lstm_units=150,
                 dense_units=300, dropout_rate=0.0):
    """Build, compile and summarise a stacked bidirectional-LSTM word classifier.

    The network is: Embedding -> BiLSTM (returning sequences) -> BiLSTM ->
    Dense(relu) -> Dense(softmax over ``vocabulary_size`` classes).

    Args:
        vocabulary_size: Number of embedding rows and of output classes.
        seq_len: Length of each input sequence of word ids.
        embedding_dim: Size of each embedding vector.
        lstm_units: Units per direction in each LSTM layer.
        dense_units: Width of the hidden dense layer.
        dropout_rate: When greater than 0, a ``Dropout`` layer with this rate
            is inserted after each bidirectional LSTM. The default of 0 adds
            no dropout layers.

    Returns:
        The compiled Keras ``Sequential`` model. Its summary is printed as a
        side effect.
    """
    model = Sequential()
    model.add(Embedding(vocabulary_size, embedding_dim, input_length=seq_len))
    # The first recurrent layer must emit the full sequence so that the second
    # recurrent layer receives one vector per time step.
    model.add(Bidirectional(LSTM(lstm_units, return_sequences=True)))
    if dropout_rate > 0:
        model.add(Dropout(dropout_rate))
    model.add(Bidirectional(LSTM(lstm_units)))
    if dropout_rate > 0:
        model.add(Dropout(dropout_rate))
    model.add(Dense(dense_units, activation='relu'))
    model.add(Dense(vocabulary_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [20]:
# Keep every third window, then split each row into all-but-the-last ids
# (inputs `x`) and the final id (target `y`).
# NOTE: this rebinds `sequences`, so re-running the cell subsamples again.
sequences = sequences[::3]
x = sequences[:,:-1]
y = sequences[:,-1]

In [21]:
# (number of training samples, input tokens per sample)
x.shape

(3771, 25)

In [22]:
# Input sequence length.
x.shape[1]

25

In [24]:
# One-hot encode the target ids. Ids reach `vocab_sz` (2718 appears in
# `sequences`), so `vocab_sz+1` classes are needed to cover indices 0..vocab_sz.
# NOTE: `y` is overwritten, so re-running this cell would encode the
# already-encoded array.
y = to_categorical(y,num_classes=vocab_sz+1)

In [25]:
# (number of training samples, number of classes)
y.shape

(3771, 2719)

In [26]:
# Number of input tokens per sample; passed to the model and to text generation.
seq_len = x.shape[1]

In [35]:
# Build the network with one output class per index in 0..vocab_sz, matching the width of `y`.
model = create_model(vocab_sz+1,seq_len)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 25, 25)            67975     
                                                                 
 bidirectional_2 (Bidirectio  (None, 25, 300)          211200    
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 300)              541200    
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 300)               90300     
                                                                 
 dense_3 (Dense)             (None, 2719)              818419    
                                                                 
Total params: 1,729,094
Trainable params: 1,729,094
No

In [31]:
# Training callbacks, both watching the training loss (there is no validation split):
# - stop once the loss has not improved for 3 epochs and restore the best weights;
# - write the model to 'best_model.h5' whenever the loss improves.
# They only take effect when passed to `model.fit` through `callbacks=`.
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True, monitor='loss')

In [37]:
# Train the model. The `early_stopping` and `model_checkpoint` callbacks created
# earlier must be passed explicitly; without `callbacks=` they are never invoked.
model.fit(x, y, batch_size=128, epochs=20, callbacks=[early_stopping, model_checkpoint])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2226d66c0a0>

In [38]:
from pickle import dump,load
# Persist the trained model and the fitted tokenizer so text can be generated
# later without retraining.
model.save('epochBIG.h5')
# A context manager guarantees the pickle file is flushed and closed.
with open('epochBIG', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

In [48]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    """Greedily generate ``num_gen_words`` words that continue ``seed_text``.

    At every step the running text is encoded with ``tokenizer``, padded or
    truncated at the front to ``seq_len`` ids, and fed to ``model``. The most
    probable word is appended to the running text and the process repeats.

    Args:
        model: Object whose ``predict`` returns one row of class scores per input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and ``index_word``.
        seq_len: Number of word ids the model expects per input.
        seed_text: Starting text as a single space-separated string.
        num_gen_words: How many words to generate.

    Returns:
        The generated words joined by single spaces (the seed is not included).
    """
    output_text = []
    input_text = seed_text
    for _ in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])
        # Keep only the most recent `seq_len` ids; shorter inputs are left-padded with 0.
        pad_encoded = pad_sequences(encoded_text, maxlen=seq_len, truncating='pre')
        # verbose=0 avoids printing one progress bar per generated word.
        predictions = model.predict(pad_encoded, verbose=0)
        # Class 0 is the padding index and has no entry in `index_word`, so it is
        # excluded from the argmax instead of raising KeyError when it scores highest.
        predicted_index = int(np.argmax(predictions[0][1:])) + 1
        pred_word = tokenizer.index_word[predicted_index]
        input_text += ' ' + pred_word
        output_text.append(pred_word)
    return ' '.join(output_text)

In [49]:
import random
random.seed(101)
# Pick a random window to use as the seed. `randrange` excludes its upper bound;
# `randint(0, len(text_seq))` includes it and could index one past the end.
random_pick = random.randrange(len(text_seq))
random_seed_text = text_seq[random_pick]
seed_text = ' '.join(random_seed_text)
seed_text

"thought i to myself the man 's a human being just as i am he has just as much reason to fear me as i have"

In [50]:
# Generate 50 words continuing the randomly chosen seed window.
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50)



'i then i should crawl way or have thinks to get to get and so so so so this idea of me and i be should be be be not be not be be not sure it it was this idea of the same way on it it it it'