<div style="font-size:40px; font-weight:bold; margin:20px; margin-bottom:100px; text-align: justify;text-shadow: 1px 1px 1px #919191,
        1px 2px 1px #919191,
        1px 3px 1px #919191,
        1px 4px 1px #919191,
        1px 5px 1px #919191,
        1px 6px 1px #919191,
        1px 7px 1px #919191,
        1px 8px 1px #919191,
        1px 9px 1px #919191,
        1px 10px 1px #919191,
    1px 18px 6px rgba(16,16,16,0.4),
    1px 22px 10px rgba(16,16,16,0.2),
    1px 25px 35px rgba(16,16,16,0.2),
    1px 30px 60px rgba(16,16,16,0.4)">Long Short Term Memory Recurrent Networks</div>

<div style="font-style: italic; font-weight: bold; font-size:35px; text-align:center; font-family: Garamond">by Rubén Cañadas Rodríguez</div>

<div style="font-size: 30px; margin: 20px; margin-bottom: 40px; margin-left: 0px; line-height: 40pt">

<div style="font-size: 30px; font-family: Garamond; font-weight: bold; margin: 30px; margin-left: 0px; margin-bottom: 10px; ">Contents</div>
<ol>
<li>Introduction</li>
<li>Recurrent Neural Networks (RNNs)</li>
<li>Long Short Term Memory (LSTM)</li> 
<li>Text generation</li> 
<li>Coding</li> 
</ol>
</div>
<div style="font-size: 30px; font-weight: bold; margin-bottom: 20px; margin-top: 30px"> Introduction </div>
<div style="font-size: 30px; font-weight: bold; margin-bottom: 20px; margin-top: 30px"> Recurrent Neural Networks (RNNs) </div>
<div style="font-size: 30px; font-weight: bold; margin-bottom: 20px; margin-top: 30px"> Long Short Term Memory (LSTM) </div>
<div style="font-size: 30px; font-weight: bold; margin-bottom: 20px; margin-top: 30px"> Text generation </div>
<div style="font-size: 30px; font-weight: bold; margin-bottom: 20px; margin-top: 30px"> Coding </div>

<div style="font-size: 40px; font-weight: bold; text-align: left">Import packages and modules</div>
<br>
<br>


In [None]:
import pandas as pd
import re #regular expressions for treating text
import numpy as np
from nltk import word_tokenize #Natural Language Processing package
from nltk import word_tokenize
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.models import model_from_json
from keras.layers import Input, Activation, Dense, Dropout
from keras.layers import LSTM, Bidirectional

In [None]:
class DataPreparation(object):
    """Load quotes from a CSV file and turn them into one-hot training tensors.

    The CSV file must expose a ``quote`` column. A newline is appended to every
    quote, unwanted characters are replaced by spaces, each quote is cut into
    overlapping character windows and the windows are one-hot encoded so they
    can feed a character-level model.
    """

    def __init__(self, max_lenght, step, csv_path="QUOTE.csv", max_sentences=100):
        """Read the quotes and store the windowing configuration.

        Parameters
        ----------
        max_lenght : int
            Number of characters per window. The misspelled name is kept so
            existing keyword callers keep working.
        step : int
            Distance between the start of two consecutive windows.
        csv_path : str, optional
            Path of the CSV file holding the ``quote`` column.
        max_sentences : int or None, optional
            Upper bound on the number of generated windows (keeps training
            time low). ``None`` disables the limit.
        """
        self._csv = csv_path
        self.__df = pd.read_csv(self._csv)
        self.__quotes = list(self.__df.quote + "\n")  # a newline marks the end of each quote
        self.__chars_to_remove = ['#', '$', '%', '(', ')', '=', ';', ':', '*', '+', '£', '—', '’']
        self.__cleaned_quotes = []
        self._chars = None
        self._char_indices = None
        self._indices_char = None
        self._sentences = []
        self._next_chars = []
        self._max_length = max_lenght
        self._step = step
        self._max_sentences = max_sentences

    def __str__(self):
        """Return the textual representation of the list of raw quotes."""
        return "{}".format(self.__quotes)

    def __len__(self):
        """Return the number of raw quotes."""
        return len(self.__quotes)

    def __getitem__(self, index):
        """Return the raw quote(s) stored at ``index`` (integer or slice)."""
        return self.__quotes[index]

    @property
    def chars_to_remove(self):
        """Characters that are replaced by a space while cleaning the quotes."""
        return self.__chars_to_remove

    @chars_to_remove.setter
    def chars_to_remove(self, list_of_chars):
        # Copy so that later mutation of the caller's list does not leak in.
        self.__chars_to_remove = list(list_of_chars)

    def __remove_unused_chars(self):
        """Rebuild the list of cleaned quotes.

        Every character listed in ``chars_to_remove`` is replaced by a space,
        then every run of two or more whitespace characters is collapsed into
        a single space. Exactly one cleaned quote is stored per raw quote.
        """
        # Compiled once: the pattern does not depend on the quote.
        # NOTE(review): \s also matches "\n", so a quote ending in a removed
        # character loses its trailing newline (" \n" -> " ") - confirm intent.
        whitespace_run = re.compile(r'\s{2,}')
        cleaned = []
        for quote in self.__quotes:
            for char in self.__chars_to_remove:
                quote = quote.replace(char, ' ')
            cleaned.append(re.sub(whitespace_run, ' ', quote))
        # Assign instead of appending so repeated calls do not duplicate quotes.
        self.__cleaned_quotes = cleaned

    def __obtain_char_indices(self):
        """Clean the quotes and build the character <-> index lookup tables."""
        self.__remove_unused_chars()
        text = ' '.join(self.__cleaned_quotes)
        self._chars = sorted(set(text))  # distinct characters, in a deterministic order
        self._char_indices = {c: i for i, c in enumerate(self._chars)}
        self._indices_char = {i: c for i, c in enumerate(self._chars)}

    def _generate_sentences(self):
        """Cut every cleaned quote into windows and record each window's next char.

        The quotes are (re)cleaned and the vocabulary rebuilt first, so this
        method can be called directly and repeatedly. Results are stored in
        ``self._sentences`` and ``self._next_chars``, which always have the
        same length.
        """
        self.__obtain_char_indices()
        sentences = []
        next_chars = []
        for quote in self.__cleaned_quotes:
            for i in range(0, len(quote) - self._max_length, self._step):
                sentences.append(quote[i: i + self._max_length])  # window of max_length chars
                next_chars.append(quote[i + self._max_length])  # char right after the window
            # Extra sample for the tail of the quote.
            # NOTE(review): the target here is the last char of the window
            # itself rather than the char following it - confirm intent.
            sentences.append(quote[-self._max_length:])
            next_chars.append(quote[-1])
        if self._max_sentences is not None:
            # Truncate both lists together so samples and targets stay aligned.
            sentences = sentences[:self._max_sentences]
            next_chars = next_chars[:self._max_sentences]
        self._sentences = sentences
        self._next_chars = next_chars

    def _vectorization(self):
        """One-hot encode the generated windows.

        Returns
        -------
        x : numpy.ndarray of bool, shape (n_sentences, max_length, n_chars)
            ``x[i, t, k]`` is True when char ``t`` of window ``i`` has index ``k``.
            Windows shorter than ``max_length`` leave the trailing rows at False.
        y : numpy.ndarray of bool, shape (n_sentences, n_chars)
            ``y[i, k]`` is True when the target char of window ``i`` has index ``k``.

        Raises
        ------
        ValueError
            If no window has been generated yet.
        """
        if not self._sentences:
            raise ValueError("_generate_sentences method has to be applied before vectorizing!! Otherwise execute generate_and_vectorize ")

        n_sentences = len(self._sentences)
        n_chars = len(self._chars)
        # Builtin bool instead of the np.bool alias, which NumPy 1.24 removed.
        x = np.zeros((n_sentences, self._max_length, n_chars), dtype=bool)
        y = np.zeros((n_sentences, n_chars), dtype=bool)
        for i, sentence in enumerate(self._sentences):
            for t, char in enumerate(sentence):
                x[i, t, self._char_indices[char]] = 1
            y[i, self._char_indices[self._next_chars[i]]] = 1

        return x, y

    def generate_and_vectorize(self):
        """Generate the windows and return their one-hot tensors ``(x, y)``."""
        self._generate_sentences()
        return self._vectorization()


<table style="width:100%; margin: 20px; margin-left:-300px">
  <tr>
    <th>Predictors (X train)</th>
    <th>Labels (Y train)</th>
  </tr>
  <tr>
    <td>they</td>
    <td>are</td>
  </tr>
  <tr>
    <td>they are</td>
    <td>learning</td>
  </tr>
  <tr>
    <td>they are learning</td>
    <td>artificial</td>
  </tr>
      <tr>
    <td>they are learning artificial</td>
    <td>intelligence</td>
  </tr>
</table>

In [None]:
class TrainLSTM(DataPreparation):
    """Character-level network (bidirectional LSTM + LSTM + dense layers).

    Inherits the data preparation from ``DataPreparation`` and adds the
    construction and the fitting of a Keras ``Sequential`` model.
    """

    def __init__(self, epochs=15, batch_size=1000, max_lenght=15, step=6):
        """Store the training hyper-parameters and create an empty model.

        Parameters
        ----------
        epochs : int
            Number of epochs passed to ``fit``.
        batch_size : int
            Batch size passed to ``fit``.
        max_lenght : int
            Window length forwarded to ``DataPreparation`` (spelling kept for
            backward compatibility).
        step : int
            Window stride forwarded to ``DataPreparation``.
        """
        super(TrainLSTM, self).__init__(max_lenght, step)
        self.__epochs = epochs
        self.__batch_size = batch_size
        self.__model = Sequential()

    def __str__(self):
        """Return a short summary of the training hyper-parameters."""
        return "Model parameters: \n batch size: {}\n number of epochs: {}".format(self.__batch_size, self.__epochs)

    @property
    def num_epochs(self):
        """Number of training epochs."""
        return self.__epochs

    @property
    def batch_size(self):
        """Size of the training batches."""
        return self.__batch_size

    def model(self):
        """Build and compile the network from scratch.

        A new ``Sequential`` is created on every call, so calling this method
        twice does not stack the layers twice.

        Raises
        ------
        ValueError
            If the character vocabulary (``self._chars``) is not available yet.
        """
        if self._chars is None:
            raise ValueError("The character vocabulary is not built yet: "
                             "run generate_and_vectorize() (or train()) before model().")

        n_chars = len(self._chars)
        network = Sequential()
        # The input shape belongs to the Bidirectional wrapper (the first layer
        # of the model), not to the LSTM it wraps.
        network.add(Bidirectional(LSTM(256, return_sequences=True),
                                  input_shape=(self._max_length, n_chars), name='bidirectional'))
        network.add(Dropout(0.1, name='dropout_bidirectional_lstm'))
        network.add(LSTM(64, name='lstm'))
        network.add(Dropout(0.1, name='drop_out_lstm'))
        network.add(Dense(15 * n_chars, name='first_dense'))
        network.add(Dropout(0.1, name='drop_out_first_dense'))
        network.add(Dense(5 * n_chars, name='second_dense'))
        network.add(Dropout(0.1, name='drop_out_second_dense'))
        network.add(Dense(n_chars, name='last_dense'))
        network.add(Activation('softmax', name='activation'))
        network.compile(optimizer='adam', loss='categorical_crossentropy')
        self.__model = network

    def train(self):
        """Prepare the data, build the model if needed and fit it.

        Returns
        -------
        The object returned by the Keras ``fit`` call.
        """
        x, y = self.generate_and_vectorize()
        if not self.__model.layers:
            # Build only once so a second train() call keeps the learned weights.
            self.model()
        return self.__model.fit(x, y, batch_size=self.__batch_size, epochs=self.__epochs)
        
