* Read Language model tutorial ---> https://medium.com/@shivambansal36/language-modelling-text-generation-using-lstms-deep-learning-for-nlp-ed36b224b275
* Find one english corpus with poetries in the internet (e.g from here) --> https://www.poetryfoundation.org/poems
* You can use whatever corpus you want (e.g. your favorite book)
* Encapsulate LSTM building like MLP from the first task
* Train LSTM as language model on your corpus like in the tutorial
* Also, you need to compare 1-layer and 2-layer LSTMs
* Compare texts, generated by your models

In [192]:
from __future__ import print_function

import keras
from keras.layers import Input, Embedding, LSTM, Dense, Dropout
from keras.utils import np_utils
from keras.models import Model
from keras.models import Sequential
from keras.optimizers import adam, adagrad, adadelta
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from keras.regularizers import l2,l1

import matplotlib.pyplot as plt
from sklearn.model_selection import ParameterGrid, train_test_split

import os
import numpy as np
import pandas as pd


os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [158]:
# Read Data
# folder_name = '.'
# filename = os.path.join(folder_name, 'task3_corpus.csv')
filename = 'task3_corpus.csv'
file_type = 'csv'

In [166]:
def read_data(filename, file_type):
    if file_type == 'csv':
        data = pd.read_csv(filename)
        data = data['text']
        
    return data

In [167]:
df = read_data(filename, file_type)

In [168]:
text = '\n'.join([row for row in df])

In [169]:
token = tokenizer.texts_to_sequences(['Hello World'])[0]
for i in range(1, len(token)):
    print(token[:i+1])


[1, 6]


In [198]:
class ModelFormer:
    def __init__(self):
        self.x = []
        self.y = []
        self.tokenizer = Tokenizer()
        self.model = Sequential()
        
    def fit_data(self, text):
        self.original_corpus = text
        self.corpus = self.original_corpus.lower().split('\n')
        self.tokenizer.fit_on_texts(self.corpus)
        self.word_count = len(self.tokenizer.word_index) + 1
        input_sequences = []
        for line in self.corpus:
            tokens = self.tokenizer.texts_to_sequences([line])[0]
            for i in range(1, len(tokens)):
                n_grams_sequence = tokens[:i+1]
                input_sequences.append(n_grams_sequence)
        
        input_sequences = self.pad_input_sequences(input_sequences)
        
        x_data, y_data = input_sequences[:,:-1], input_sequences[:,-1]
        y_data = np_utils.to_categorical(y_data, num_classes=self.word_count)
        
        return x_data, y_data
              
    def pad_input_sequences(self,input_sequences):
        max_sequence_length = max([len(sentence) for sentence in input_sequences])
        input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre'))
        return input_sequences
    
    def compile_model(self, x_data, y_data , layers={ 'LSTM': 150, 'Dropout': 0.2 }, activation='tanh', optimizer='adam', lr=0.01):
        self.x_data = x_data
        self.y_data = y_data
        self.model.add(Embedding(self.word_count, 15, input_length=len(x_data[0]) ))
        for layer in layers:
            if layer == 'LSTM':
                self.model.add(LSTM(layers[layer], activation=activation))
            if layer == 'Dropout':
                self.model.add(Dropout(layers[layer]))
        
        self.model.add(Dense(self.word_count, activation='softmax'))
        if optimizer == 'adam':
            optimizer = adam(lr=lr)
        elif optimizer == 'adadelta':
            optimizer = adadelta(lr=lr)
        self.model.compile(loss='categorical_crossentropy', optimizer=optimizer)
        self.model.summary()
    
    def fit(self,epochs=100, verbose=1):
        self.model.fit(self.x_data, self.y_data, epochs=epochs, verbose=verbose)
        
        

In [199]:
m = ModelFormer()
X, Y= m.fit_data(text)


In [200]:
x_train , x_test, y_train, y_test = train_test_split(X,Y, test_size=0.3)

In [201]:
print(len(x_train), len(y_train), len(x_test), len(y_test))

8847 8847 3792 3792


In [202]:
m.compile_model(x_train, y_train)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 54, 15)            32775     
_________________________________________________________________
lstm_9 (LSTM)                (None, 150)               99600     
_________________________________________________________________
dropout_8 (Dropout)          (None, 150)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 2185)              329935    
Total params: 462,310
Trainable params: 462,310
Non-trainable params: 0
_________________________________________________________________


In [203]:
m.fit()

Epoch 1/100
2016/8847 [=====>........................] - ETA: 1:00 - loss: 6.7370

KeyboardInterrupt: 