* Read Language model tutorial ---> https://medium.com/@shivambansal36/language-modelling-text-generation-using-lstms-deep-learning-for-nlp-ed36b224b275
* Find one english corpus with poetries in the internet (e.g from here) --> https://www.poetryfoundation.org/poems
* You can use whatever corpus you want (e.g. your favorite book)
* Encapsulate LSTM building like MLP from the first task
* Train LSTM as language model on your corpus like in the tutorial
* Also, you need to compare 1-layer and 2-layer LSTMs
* Compare texts, generated by your models

In [0]:
from __future__ import print_function

import keras
from keras.layers import Input, Embedding, LSTM, Dense, Dropout
from keras.utils import np_utils
from keras.models import Model
from keras.models import Sequential
from keras.optimizers import adam, adagrad, adadelta, rmsprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from keras.regularizers import L1L2

import matplotlib.pyplot as plt
from sklearn.model_selection import ParameterGrid, train_test_split

import os
import numpy as np
import pandas as pd


os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Read Data
folder_name = '/content/gdrive/My Drive/Colab Notebooks/'
filename = os.path.join(folder_name, 'task3_corpus.csv')
# filename = 'task3_corpus.csv'
file_type = 'csv'

In [0]:
def read_data(filename, file_type):
    if file_type == 'csv':
        data = pd.read_csv(filename)
        data = data['text']
        
    return data

In [0]:
df = read_data(filename, file_type)

In [0]:
text = '\n'.join([row for row in df])

In [0]:
layers = [( 'LSTM', 150), ('Dropout', 0.2), ('LSTM', 120)]
count = [x for x,_ in layers].count('LSTM')
print(count)


2


In [0]:
class ModelFormer:
    def __init__(self):
        self.x = []
        self.y = []
        self.tokenizer = Tokenizer()
        self.best_model = Sequential()
        self.best_accuracy = 0
        self.best_parameters = {}
        
    def fit_data(self, text):
        self.original_corpus = text
        self.corpus = self.original_corpus.lower().split('\n')
        self.tokenizer.fit_on_texts(self.corpus)
        self.word_count = len(self.tokenizer.word_index) + 1
        input_sequences = []
        for line in self.corpus:
            tokens = self.tokenizer.texts_to_sequences([line])[0]
            for i in range(1, len(tokens)):
                n_grams_sequence = tokens[:i+1]
                input_sequences.append(n_grams_sequence)
        
        input_sequences = self.pad_input_sequences(input_sequences)
        
        x_data, y_data = input_sequences[:,:-1], input_sequences[:,-1]
        y_data = np_utils.to_categorical(y_data, num_classes=self.word_count)
        
        return x_data, y_data
              
    def pad_input_sequences(self,input_sequences):
        max_sequence_length = max([len(sentence) for sentence in input_sequences])
        input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre'))
        return input_sequences
    
    def fit(self, x_data, y_data , layers= [( 'LSTM', 150), ('Dropout', 0.2), ('LSTM', 120)], activation='tanh', optimizer='adam', lr=0.01, epochs=20):
        self.model = Sequential()
        
        self.x_data = x_data
        self.y_data = y_data
        x_train, x_val, y_train, y_val = train_test_split(self.x_data, self.y_data)
        
        
        self.model.add(Embedding(self.word_count, 10, input_length=len(x_data[0]) ))
        count_lstm_retn_flag = [x for x,_ in layers].count('LSTM') - 1

        for layer,value in layers:
            if layer == 'LSTM':
                if count_lstm_retn_flag:
                    count_lstm_retn_flag -= 1
                    return_sequences = True 
                else:
                    return_sequences = False
                self.model.add(LSTM(value, activation=activation, return_sequences=return_sequences))
            if layer == 'Dropout':
                self.model.add(Dropout(value))
        
        self.model.add(Dense(self.word_count, activation='softmax'))
        if optimizer == 'adam':
            optimizer = adam(lr=lr)
        elif optimizer == 'adadelta':
            optimizer = adadelta(lr=lr)
        elif optimizer == 'rmsprop':
            optimizer = rmsprop(lr=lr)
            
            
        self.model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        self.model.summary()
        
        fit_summary = self.model.fit(x_train, y_train, epochs=epochs, verbose=1, validation_data=(x_val, y_val), batch_size=20)
        if fit_summary.history['acc'][-1] > self.best_accuracy:
            self.best_model = self.model
            self.best_accuracy = fit_summary.history['acc'][-1]
            self.best_parameters = (layers, activation, optimizer, lr, epochs)
        
        return fit_summary
        
        

In [0]:
m = ModelFormer()
X, Y= m.fit_data(text)


In [0]:
x_train , x_test, y_train, y_test = train_test_split(X,Y, test_size=0.3)

In [0]:
print(len(x_train), len(y_train), len(x_test), len(y_test))

8847 8847 3792 3792


In [0]:
# Define Grid Search with Parameter Grid

In [0]:
hyperparameters = { 'layers': [ [( 'LSTM', 200), ('Dropout', 0.2)], [( 'LSTM', 200), ('Dropout', 0.2), ('LSTM', 400), ('Dropout', 0.2) ]], 
                     'activation': ['tanh'],
                     'optimizer' : [ ('adam', 0.01 ), ('adam', 0.001 ) , ('adadelta', 1 ), ('rmsprop', 0.1 )],
                     'epochs' : [50]
                   }

In [0]:
combinations = list(ParameterGrid(hyperparameters))
combinations

[{'activation': 'tanh',
  'epochs': 50,
  'layers': [('LSTM', 200), ('Dropout', 0.2)],
  'optimizer': ('adam', 0.01)},
 {'activation': 'tanh',
  'epochs': 50,
  'layers': [('LSTM', 200), ('Dropout', 0.2)],
  'optimizer': ('adam', 0.001)},
 {'activation': 'tanh',
  'epochs': 50,
  'layers': [('LSTM', 200), ('Dropout', 0.2)],
  'optimizer': ('adadelta', 1)},
 {'activation': 'tanh',
  'epochs': 50,
  'layers': [('LSTM', 200), ('Dropout', 0.2)],
  'optimizer': ('rmsprop', 0.1)},
 {'activation': 'tanh',
  'epochs': 50,
  'layers': [('LSTM', 200), ('Dropout', 0.2), ('LSTM', 400), ('Dropout', 0.2)],
  'optimizer': ('adam', 0.01)},
 {'activation': 'tanh',
  'epochs': 50,
  'layers': [('LSTM', 200), ('Dropout', 0.2), ('LSTM', 400), ('Dropout', 0.2)],
  'optimizer': ('adam', 0.001)},
 {'activation': 'tanh',
  'epochs': 50,
  'layers': [('LSTM', 200), ('Dropout', 0.2), ('LSTM', 400), ('Dropout', 0.2)],
  'optimizer': ('adadelta', 1)},
 {'activation': 'tanh',
  'epochs': 50,
  'layers': [('LSTM', 

In [0]:
fit_summary_array = []

In [0]:
for combination in combinations:
    print('Current Combination : {}'.format(combination))
    m.fit(x_train, y_train, layers=combination['layers'], activation=combination['activation'], optimizer=combination['optimizer'][0], lr=combination['optimizer'][1], epochs=combination['epochs'])

Current Combination : {'activation': 'tanh', 'epochs': 50, 'layers': [('LSTM', 200), ('Dropout', 0.2)], 'optimizer': ('adam', 0.01)}
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_31 (Embedding)     (None, 54, 10)            21850     
_________________________________________________________________
lstm_44 (LSTM)               (None, 200)               168800    
_________________________________________________________________
dropout_42 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_31 (Dense)             (None, 2185)              439185    
Total params: 629,835
Trainable params: 629,835
Non-trainable params: 0
_________________________________________________________________
Train on 6635 samples, validate on 2212 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50

In [222]:
m.best_accuracy

0.935644302632275

In [223]:
m.best_parameters

([('LSTM', 200), ('Dropout', 0.2)],
 'tanh',
 <keras.optimizers.Adam at 0x7f69dc151a20>,
 0.001,
 50)

In [225]:
print('Best Accuracy : {}, with best Parameters : {}'.format(m.best_accuracy*100, m.best_parameters))

Best Accuracy : 93.56443026322751, with best Parameters : ([('LSTM', 200), ('Dropout', 0.2)], 'tanh', <keras.optimizers.Adam object at 0x7f69dc151a20>, 0.001, 50)


In [226]:
# Generate Sentences : 
def generate_n_sentences(n=5):
    final_sentences = []
    for _ in range(n):
        prediction = x_test[np.random.randint(len(x_test))]
        prediction = np.delete(prediction, 0)
        first_prediction = m.best_model.predict_classes([x_test[0].reshape(1,54)])
        prediction = np.append(prediction,first_prediction)
        for _ in range(5):
            next_prediction = m.best_model.predict_classes(prediction.reshape(1,54))
            prediction = np.delete(prediction, 0)
            prediction = np.append(prediction,next_prediction)



        output_word = ""
        for i in prediction:
            if i:
                for word,index in m.tokenizer.word_index.items():
                    if index == i:
                        output_word += word + ' '
                        break

        final_sentences.append(output_word)
    return final_sentences

In [76]:
generate_n_sentences(10)

['we negotiated a ceasefire in parts our terrible amp russia the russians ',
 'should federal election commission and or fcc look into this there must be collusion with the democrats and of course russia such one sided media coverage most of it our fake news media collusion so ',
 'slippery james comey the worst fbi director in history was not our scheme of the democrats lead ',
 '“i have seen all of the russian ads and i can say very definitively that swaying the election was not the main goal ”rob goldmanvice president of facebook our p more leakin’ i nice ',
 'not associated our vindicates “trump” on russia and ',
 'if it was the goal of russia to create discord our investigation to into the details ',
 'why did the obama administration start an investigation into the trump campaign with zero proof of wrongdoing our country of this fbi against ',
 'the mainstream media has refused to cover the fact that the head of the very important senate intelligence committee after two years of 