In [1]:
import pandas as pd
import numpy as np

# Read the Republic Data file - Plato document

    - This file is available at http://www.gutenberg.org/cache/epub/1497/pg1497.txt
    

In [2]:
def read_file(file):
    f = open(file, 'r', encoding='utf-8')
    lines = f.readlines()
    data = '\t'.join([line.strip() for line in lines])
    return data

data = read_file('republic.txt')

# Clean the Read file

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Aditya
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Text cleansing

    - Good article available at https://www.analyticsvidhya.com/blog/2018/02/the-different-methods-deal-text-data-predictive-python/

In [4]:
import re
import string
from nltk.corpus import stopwords
import pandas as pd

def clean_doc(doc):
    
    #replace -- with a space
    doc = doc.replace('-',' ')
    
    #replace -- with a space
    doc = doc.replace('.',' ')
    
    #split into tokens by white space
    tokens = doc.split()
    
    # prepare for char filtering
    re_punc = re.compile('[%s]'%re.escape(string.punctuation))
    
    #remove punctuation from words
    tokens = [re_punc.sub('',w) for w in tokens]
    
    #remove remaining tokens that are not alphanumerics
    tokens = [word for word in tokens if word.isalpha()]
    
    #make lower case
    tokens = [word.lower() for word in tokens]
    
    #remove stop words & words which appear less than 1 times
    stops = set(stopwords.words("english"))
    tokens = [word for word in tokens if not word in stops and len(word)>=3]

    return tokens

tokens = clean_doc(data)
                 
print ('Number of tokens are ', len(tokens))
print ('Number of unique tokens is ', len(set(tokens)))

Number of tokens are  34821
Number of unique tokens is  6788


# Generate Sequences

    - This method takes in bunch of tokens and then converts those tokens into sequences. The idea here is that we create
    sequences of 21 tokens (words) each and then feed each sequence one at a time to the network. each word in the sequence represents a value at a given timesteps. All words in effect become the features

In [5]:
words_per_sequence = 20

def generate_sequences(size, tokens):
    sequences = list()
    size_seq = size + 1
    for i in range (0, len(tokens)-size):
        seq = tokens[i:size_seq+i]
        line = ' '.join(seq)
        sequences.append(line)
        
    print ('Total number of sequenes is ', len(sequences))
    return sequences

sequences = generate_sequences(words_per_sequence, tokens)

Total number of sequenes is  34801


# Save generated sequences in new file

In [6]:
def save_sequences(filename, sequences):
    
    file = open(filename,'w',encoding='utf-8')
    lines = '\n'.join(sequences)
    file.write(lines)
    file.close()
    
save_sequences('cleaned-republic.txt', sequences)

# Read cleaned sequences from file

In [7]:
def read_sequences(file):
    f = open(file, 'r', encoding='utf-8')
    lines = f.read()
    return lines

lines = read_sequences('cleaned-republic.txt')
lines = lines.split('\n')

# Tokenize & Create Sequences of Numbers

In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import numpy as np

def tokenize_documents(lines):
    tokenizer = Tokenizer(num_words=6789)
    tokenizer.fit_on_texts(lines)
    sequences = tokenizer.texts_to_sequences(lines)
    return (tokenizer, sequences)

(tokenizer, sequences) = tokenize_documents(lines)
sequences = pad_sequences(sequences, maxlen=words_per_sequence, padding='pre')

print (sequences[0])

Using TensorFlow backend.


[ 111 6788  435   29 3600  938  279 6787 3598  666 3597  665 1093 6785
 2483  339 2482   55  303  304]


In [None]:
vocab_len = len (tokenizer.word_index)+1
print ('Vocab length is ', vocab_len)
print ('Number of sequences is ', len(sequences))
maxlength = max([len(seq) for seq in sequences])
print ('Max length of all sequences is ', maxlength)

Vocab length is  6789
Number of sequences is  34801
Max length of all sequences is  20


In [None]:
from keras.utils import to_categorical

sequences = np.array(sequences)
X = sequences[:,:words_per_sequence]
y = sequences[:,-1]
y = to_categorical(y, num_classes=vocab_len)

# Configure Callbacks which would be executed during the training phase

In [None]:
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import TensorBoard

def create_callbacks():
    callbacks = [
        #EarlyStopping(monitor='acc', patience=5),
        ModelCheckpoint(filepath='republic-model.h5', monitor='val_loss', save_best_only=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5)
    ]
    return callbacks

# Actual Language Model

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers import Conv1D

def create_model():
    model = Sequential()
    
    #Embedding layer has three attributes
        # input_dim - size of the vocabulary
        # output_dim - number of dimensions in which each word is embedded
        # input_length - length of each word
    model.add(Embedding(input_dim=vocab_len,output_dim=50,input_length=words_per_sequence))
    model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(vocab_len, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

In [None]:
print (X.shape)
print (y.shape)
print (X[0])
print (y[0])

(34801, 20)
(34801, 6789)
[ 111 6788  435   29 3600  938  279 6787 3598  666 3597  665 1093 6785
 2483  339 2482   55  303  304]
[0. 0. 0. ... 0. 0. 0.]


In [None]:
model = create_model()
cbk = create_callbacks()
history = model.fit(X,y, epochs=2000, validation_split=0.1, batch_size=128, callbacks=cbk)

W0914 02:38:09.227744  1572 deprecation_wrapper.py:119] From C:\MachineLearning\anaconda\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0914 02:38:09.245744  1572 deprecation_wrapper.py:119] From C:\MachineLearning\anaconda\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0914 02:38:09.248744  1572 deprecation_wrapper.py:119] From C:\MachineLearning\anaconda\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0914 02:38:09.589744  1572 deprecation_wrapper.py:119] From C:\MachineLearning\anaconda\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0914 02:38:09.608744  1572 deprecation_wrapper.py:119] From C:\Mach

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 50)            339450    
_________________________________________________________________
lstm_1 (LSTM)                (None, 20, 100)           60400     
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 6789)              685689    
Total params: 1,176,039
Trainable params: 1,176,039
Non-trainable params: 0
_________________________________________________________________


W0914 02:38:10.958744  1572 deprecation_wrapper.py:119] From C:\MachineLearning\anaconda\lib\site-packages\keras\backend\tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Train on 31320 samples, validate on 3481 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000

# Visualize the Training Metadata (Accuracy & Loss)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

figure = plt.figure()

ax1 = figure.add_subplot(211)
ax1.plot(history.history['acc'])
ax1.plot(history.history['val_acc'])
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(labels=['Training','Validation'])

ax1 = figure.add_subplot(212)
ax1.plot(history.history['loss'])
ax1.plot(history.history['val_loss'])
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(labels=['Training','Validation'])

plt.show()

# Save the model

In [None]:
from pickle import dump
model.save('languagemodelling-lstm-model.h5')
dump(tokenizer, open('languagemodelling-tokenizer.h5','wb'))

# Load the model

In [None]:
from pickle import load
from keras.models import load_model

lmodel = load_model('languagemodelling-lstm-model.h5')
ltokenizer = load(open('languagemodelling-tokenizer.h5','rb'))

#load clean token file
def read_cleaned_file(file):
    f = open(file,'r', encoding='utf-8')
    lines = f.read()
    return lines

lines = read_cleaned_file('cleaned-republic.txt')
lines = lines.split('\n')

In [None]:
# Randomly select a line

import random
linenumber = random.randint(0,200)

seed = lines[linenumber]
print ('%s'%(seed))

In [None]:
from keras.preprocessing.sequence import pad_sequences

def generate_sequences(model, tokenizer, seed, number):
    result = list()
    
    orig_seed = seed
    
    for _ in range (number):
        
        encoder = tokenizer.texts_to_sequences([seed])[0]
        
        padded = pad_sequences([encoder], truncating='pre', maxlen=words_per_sequence)
        yhat = model.predict_classes(padded)
        
        output = ' '
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                output = word
                break
        print ('%s'%(output))
        seed = seed + ' ' + output
        
        result.append(output)
        
generate_sequences(model, tokenizer, seed, 5)