In [1]:
def load_doc(filename):
    file = open(filename,'r')
    text = file.read()
    file.close()
    return text

In [2]:
in_filename = 'republic.txt'
doc = load_doc(in_filename)
print(doc[:200])


BOOK I. The Republic opens with a truly Greek scene--a festival in
honour of the goddess Bendis which is held in the Piraeus; to this is
added the promise of an equestrian torch-race in the evening. T


### Data Cleaning

In [3]:
import string 
def clean_doc(doc):
    doc = doc.replace('-',' ')
    
    tokens = doc.split()
    tokens = [t for t in tokens if t not in string.punctuation]
    text = [t for t in tokens if t.isalpha()]
    
    text = [t.lower() for t in text]
    return text

In [4]:
tokens = clean_doc(doc)

In [5]:
print(tokens[:100])

['book', 'the', 'republic', 'opens', 'with', 'a', 'truly', 'greek', 'scene', 'a', 'festival', 'in', 'honour', 'of', 'the', 'goddess', 'bendis', 'which', 'is', 'held', 'in', 'the', 'to', 'this', 'is', 'added', 'the', 'promise', 'of', 'an', 'equestrian', 'torch', 'race', 'in', 'the', 'the', 'whole', 'work', 'is', 'supposed', 'to', 'be', 'recited', 'by', 'socrates', 'on', 'the', 'day', 'after', 'the', 'festival', 'to', 'a', 'small', 'consisting', 'of', 'and', 'this', 'we', 'learn', 'from', 'the', 'first', 'words', 'of', 'the', 'when', 'the', 'rhetorical', 'advantage', 'of', 'reciting', 'the', 'dialogue', 'has', 'been', 'the', 'attention', 'is', 'not', 'distracted', 'by', 'any', 'reference', 'to', 'the', 'nor', 'is', 'the', 'reader', 'further', 'reminded', 'of', 'the', 'extraordinary', 'length', 'of', 'the', 'of', 'the']


In [6]:
print('total Tokens',len(tokens))
print('Unique Tokens',len(set(tokens)))

total Tokens 182860
Unique Tokens 8827


In [7]:
seq = tokens[:10]
print(seq)
lines = ' '.join(seq)
print(lines)

['book', 'the', 'republic', 'opens', 'with', 'a', 'truly', 'greek', 'scene', 'a']
book the republic opens with a truly greek scene a


In [8]:
# we can organize the list of tokens into sequences of 50 inputs and 1 output word
length = 50 + 1
seq = list()

for i in range(length,len(tokens)):
    sequence = tokens[i-length:i]
    lines = ' '.join(sequence)
    seq.append(lines)
print('total sequences',len(seq))

total sequences 182809


In [9]:
print(len(seq[0]))

259


In [10]:
# save tokens to file,one dialog per line
def save_doc(lines,filename):
    data = '\n'.join(lines)
    file = open(filename,'w')
    file.write(data)
    file.close()

In [11]:
save_doc(seq,'republic_seq.txt')

In [12]:
print(len(seq[0].split()))

51


### Train language model

We can now train a statistical language model from the prepared data. The model we will train is a neural language model. It has a few unique characteristics:

 It uses a distributed representation for words so that different words with similar meanings
will have a similar representation.

 It learns the representation at the same time as learning the model.

 It learns to predict the probability for the next word using the context of the last 100 words.

Specifically, we will use an Embedding Layer to learn the representation of words, and a
Long Short-Term Memory (LSTM) recurrent neural network to learn to predict words based on
their context. Let’s start by loading our training data.

In [13]:
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding

In [27]:

tokenizer = Tokenizer()
tokenizer.fit_on_texts(seq)
tokens = tokenizer.texts_to_sequences(seq)

In [28]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

8828

In [29]:
sequences = array(tokens)

In [30]:
X,y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y,num_classes=vocab_size)

In [31]:
embedding_index = dict()

f = open('glove.6B.200d.txt',mode='rt',encoding='utf-8')
    

In [32]:
import numpy as np
for line in f:
    values = line.split()
    word = values[0]
    coeffs = np.asarray(values[1:],dtype='float32')
    embedding_index[word] = coeffs
f.close()

In [33]:
print('Loaded word vectors',len(embedding_index))

Loaded word vectors 400000


In [35]:
embedding_matrix = np.zeros((vocab_size,200))

for word,index in tokenizer.word_index.items():
    embedding_vec = embedding_index.get(word)
    if embedding_vec is not None:
        embedding_matrix[index] = embedding_vec
    

In [36]:
X.shape

(182809, 50)

In [37]:
print(X[0])

[ 306    1  379 8827   26    7  359  305 3871    7 1925    6  328    2
    1 3104 8826   11    5  688    6    1    4   30    5 1413    1 1412
    2   43 5335 3103  943    6    1    1  152  274    5  364    4   10
 5334   22  334   50    1  405  137    1]


In [38]:
seq_length = X.shape[1]

In [41]:
def build_model(vocab_size,seq_length):
    model = Sequential()
    model.add(Embedding(vocab_size,200,weights=[embedding_matrix],trainable=False,input_length=seq_length))
    model.add(LSTM(100,return_sequences=True))
    
    model.add(LSTM(100))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(vocab_size, activation='softmax'))
    # compile network
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize defined model
    model.summary()

    return model

In [50]:
model = build_model(vocab_size,seq_length)
model.fit(X,y,batch_size=128,epochs=4)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 200)           1765600   
_________________________________________________________________
lstm_4 (LSTM)                (None, 50, 100)           120400    
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_4 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_5 (Dense)              (None, 8828)              891628    
Total params: 2,868,128
Trainable params: 1,102,528
Non-trainable params: 1,765,600
_________________________________________________________________
Epoch 1/4
Epoch 2/4
 207/1429 [===>..........................] - ETA: 6:17 - loss: 5.5822 - accuracy: 

KeyboardInterrupt: 

In [None]:
seq[0]

In [44]:
from random import randint
seed_text = seq[randint(0,len(seq))]

In [45]:
print(seed_text)

that is and of this kind i spoke as the although in the search after it the soul is compelled to use not ascending to a first because she is unable to rise above the region of but employing the objects of which the shadows below are resemblances in their turn


In [46]:
print(seq_length)

50


In [47]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
def generate_seq(model,tokenizer,seq_length,seed_txt,n_words):
    in_txt = seed_txt
    result = list()
    for _ in range(n_words):
        
        encoded = tokenizer.texts_to_sequences([in_txt])[0]
        
        encoded = pad_sequences([encoded],maxlen=seq_length,truncating='pre')
        
        yhat = model.predict_classes(encoded)
        
        out_word = ''
        for word,index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        in_txt += ' '+out_word
        result.append(out_word)
    return ' '.join(result),in_txt

In [48]:
generated,in_txt = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)



the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and


In [49]:
print(in_txt)

that is and of this kind i spoke as the although in the search after it the soul is compelled to use not ascending to a first because she is unable to rise above the region of but employing the objects of which the shadows below are resemblances in their turn the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and the and
