In [1]:
import pandas as pd
import numpy as np

# Read the Republic Data file - Plato document

    - This file is available at http://www.gutenberg.org/cache/epub/1497/pg1497.txt
    

In [2]:
def read_file(file):
    f = open(file, 'r', encoding='utf-8')
    lines = f.readlines()
    data = '\t'.join([line.strip() for line in lines])
    return data

data = read_file('republic.txt')

# Clean the Read file

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Aditya
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
import re
import string
from nltk.corpus import stopwords
import pandas as pd

def clean_doc(doc):
    
    #replace -- with a space
    doc = doc.replace('--',' ')
    
    #split into tokens by white space
    tokens = doc.split()
    
    # prepare for char filtering
    re_punc = re.compile('[%s]'%re.escape(string.punctuation))
    
    #remove punctuation from words
    tokens = [re_punc.sub('',w) for w in tokens]
    
    #remove remaining tokens that are not alphanumerics
    tokens = [word for word in tokens if word.isalpha()]
    
    #make lower case
    tokens = [word.lower() for word in tokens]
    
    #remove stop words
    stops = set(stopwords.words("english"))
    tokens = [word for word in tokens if not word in stops and len(word)>=3]
    
    return tokens

tokens = clean_doc(data)
                 
print ('Number of tokens is ', len(tokens))
print ('Number of unique tokens is ', len(set(tokens)))

Number of tokens is  90229
Number of unique tokens is  10117


# Generate Sequences

    - This method takes in bunch of tokens and then converts those tokens into sequences. The idea here is that we create
    sequences of 21 tokens (words) each and then feed each sequence one at a time to the network. each word in the sequence represents a value at a given timesteps. All words in effect become the features

In [5]:
words_per_sequence = 20

def generate_sequences(size, tokens):
    sequences = list()
    size_seq = size + 1
    for i in range (0, len(tokens)-size):
        seq = tokens[i:size_seq+i]
        line = ' '.join(seq)
        sequences.append(line)
        
    print ('Total number of sequenes is ', len(sequences))
    return sequences

sequences = generate_sequences(words_per_sequence, tokens)

Total number of sequenes is  90209


# Save generated sequences in new file

In [6]:
def save_sequences(filename, sequences):
    
    file = open(filename,'w',encoding='utf-8')
    lines = '\n'.join(sequences)
    file.write(lines)
    file.close()
    
save_sequences('cleaned-republic.txt', sequences)

# Read cleaned sequences from file

In [7]:
def read_sequences(file):
    f = open(file, 'r', encoding='utf-8')
    lines = f.read()
    return lines

lines = read_sequences('cleaned-republic.txt')
lines = lines.split('\n')

# Tokenize & Create Sequences of Numbers

In [8]:
from keras.preprocessing.text import Tokenizer
import numpy as np

def tokenize_documents(lines):
    tokenizer = Tokenizer(num_words=2000)
    tokenizer.fit_on_texts(lines)
    sequences = tokenizer.texts_to_sequences(lines)
    return (tokenizer, sequences)

(tokenizer, sequences) = tokenize_documents(lines)
sequences = np.array(sequences)

Using TensorFlow backend.


In [9]:
vocab_len = len (tokenizer.word_index)+1
print ('Vocab length is ', vocab_len)
print ('Number of sequences is ', len(sequences))
maxlength = max([len(seq) for seq in sequences])
print ('Max length of all sequences is ', maxlength)

Vocab length is  10118
Number of sequences is  90209
Max length of all sequences is  21


In [10]:
from keras.utils import to_categorical

sequences = np.array(sequences)
X = sequences[:,:words_per_sequence]
y = sequences[:,-1]
y = to_categorical(y, num_classes=vocab_len)

IndexError: too many indices for array

# Actual Language Model

In [None]:
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import TensorBoard

def create_callbacks():
    callbacks = [
        EarlyStopping(monitor='acc', patience=3)
    ]
    return callbacks

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import Flatten
from keras.layers import LSTM

def create_model():
    model = Sequential()
    
    #Embedding layer has three attributes
        # input_dim - size of the vocabulary
        # output_dim - number of dimensions in which each word is embedded
        # input_length - length of each word
    model.add(Embedding(input_dim=vocab_len,output_dim=20,input_length=words_per_sequence))
    model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(vocab_len, activation='sigmoid'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

In [None]:
model = create_model()
cbk = create_callbacks()
history = model.fit(X,y, epochs=100, batch_size=128, callbacks=cbk)

In [None]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())