In [2]:
#Importing library
import numpy as np
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.utils import *
from keras.initializers import *
import tensorflow as tf
import time, random
import re
import string

Using TensorFlow backend.


In [3]:
#Hyperparameters
num_samples = 10000  # upper bound on the number of (Text, Summary) pairs that get vectorized
latent_dim = 256     # number of units in the encoder and decoder LSTM layers
batch_size = 64      # mini-batch size passed to model.fit

In [4]:
from string import digits
import pandas as pd
from keras.optimizers import Adam

In [5]:
# Read the raw reviews file and keep an independent copy of the two columns
# used below, so later cleaning never touches the raw `data` frame.
data = pd.read_csv('Reviews.csv')
data_set = data.loc[:, ['Summary', 'Text']].copy()

In [6]:
# Show the first five rows of the selected columns.
data_set.head(n=5)

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,"""Delight"" says it all",This is a confection that has been around a fe...
3,Cough Medicine,If you are looking for the secret ingredient i...
4,Great taffy,Great taffy at a great price. There was a wid...


In [7]:
# Count missing values per column.
data_set.isna().sum()

Summary    27
Text        0
dtype: int64

In [8]:
# Drop every row that has a missing value in any column (in place).
data_set.dropna(axis=0, how='any', inplace=True)

In [9]:
# Remove rows that repeat an earlier (Summary, Text) pair, keeping the first occurrence.
data_set.drop_duplicates(subset=None, keep='first', inplace=True)

In [10]:
# Renumber the rows 0..n-1 after the drops above; the previous index is kept
# as a regular 'index' column.
data_set.reset_index(drop=False, inplace=True)

In [11]:
# Lowercase all characters
data_set['Summary'] = data_set['Summary'].str.lower()
data_set['Text'] = data_set['Text'].str.lower()

In [12]:
# Remove quotes
# Apostrophes are deleted outright so contractions collapse into one word
# (e.g. "don't" -> "dont").
data_set['Summary'] = data_set['Summary'].map(lambda text: text.replace("'", ''))
data_set['Text'] = data_set['Text'].map(lambda text: text.replace("'", ''))

In [13]:
exclude = set(string.punctuation) # Set of all special characters
# Remove all the special characters
# A translate table that deletes every character in `exclude`.
_drop_punctuation = str.maketrans('', '', ''.join(exclude))
data_set['Summary'] = data_set['Summary'].map(lambda text: text.translate(_drop_punctuation))
data_set['Text'] = data_set['Text'].map(lambda text: text.translate(_drop_punctuation))

In [14]:
# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
_devanagari_digits = re.compile("[२३०८१५७९४६]")  # applied to 'Text' only
_space_runs = re.compile(" +")


def _trim_and_squeeze(text):
    """Strip surrounding whitespace, then collapse runs of spaces to one space."""
    return _space_runs.sub(" ", text.strip())


_summary_no_digits = data_set['Summary'].map(lambda text: text.translate(remove_digits))
_text_no_digits = data_set['Text'].map(
    lambda text: _devanagari_digits.sub("", text.translate(remove_digits))
)

# Remove extra spaces
data_set['Summary'] = _summary_no_digits.map(_trim_and_squeeze)
data_set['Text'] = _text_no_digits.map(_trim_and_squeeze)


In [15]:
# Show the first five rows after cleaning.
data_set.head(n=5)

Unnamed: 0,index,Summary,Text
0,0,good quality dog food,i have bought several of the vitality canned d...
1,1,not as advertised,product arrived labeled as jumbo salted peanut...
2,2,delight says it all,this is a confection that has been around a fe...
3,3,cough medicine,if you are looking for the secret ingredient i...
4,4,great taffy,great taffy at a great price there was a wide ...


In [16]:
# Discard the 'index' column (in place).
data_set.drop(columns=['index'], inplace=True)

In [17]:
# Show the last five rows after cleaning.
data_set.tail(n=5)

Unnamed: 0,Summary,Text
394962,will not do without,great for sesame chickenthis is a good if not ...
394963,disappointed,im disappointed with the flavor the chocolate ...
394964,perfect for our maltipoo,these stars are small so you can give of those...
394965,favorite training and reward treat,these are the best treats for training and rew...
394966,great honey,i am very satisfied product is as advertised i...


In [18]:
#Vectorize the data.
# Use at most `num_samples` rows. The bound is len(data_set), not
# len(data_set) - 1: a DataFrame has no trailing empty record, so subtracting
# one would silently drop the last row whenever the frame is smaller than
# `num_samples`.
sample_count = min(num_samples, len(data_set))

# Positional selection (.iloc) does not depend on the index labels being 0..n-1.
input_texts = data_set['Text'].iloc[:sample_count].tolist()

# Wrap each summary in '\t' (start-of-sequence) and '\n' (end-of-sequence);
# the decoder is primed with '\t' and stops when it emits '\n'.
target_texts = ['\t' + summary + '\n'
                for summary in data_set['Summary'].iloc[:sample_count]]

# Character vocabularies, sorted so token ids are deterministic across runs.
input_chars = sorted(set(''.join(input_texts)))
target_chars = sorted(set(''.join(target_texts)))
num_encoder_tokens = len(input_chars)
num_decoder_tokens = len(target_chars)
# default=0 keeps this cell from raising when no rows were selected.
max_encoder_seq_length = max((len(txt) for txt in input_texts), default=0)
max_decoder_seq_length = max((len(txt) for txt in target_texts), default=0)

#Print size
print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 10000
Number of unique input tokens: 34
Number of unique output tokens: 33
Max sequence length for inputs: 9763
Max sequence length for outputs: 127


In [19]:
#Define data for encoder and decoder
# Character -> integer id lookup tables.
input_token_id = {char: idx for idx, char in enumerate(input_chars)}
target_token_id = {char: idx for idx, char in enumerate(target_chars)}

# One-hot tensors of shape (sample, time step, token), zero-padded past the
# end of each text.
# NOTE: with the shapes this notebook prints, (10000, 9763, 34) in float32,
# the encoder tensor alone is roughly 13 GB.
n_pairs = len(input_texts)
encoder_shape = (n_pairs, max_encoder_seq_length, num_encoder_tokens)
decoder_shape = (n_pairs, max_decoder_seq_length, num_decoder_tokens)

encoder_in_data = np.zeros(encoder_shape, dtype='float32')
decoder_in_data = np.zeros(decoder_shape, dtype='float32')
decoder_target_data = np.zeros(decoder_shape, dtype='float32')

for row, (source, summary) in enumerate(zip(input_texts, target_texts)):
    for step, symbol in enumerate(source):
        encoder_in_data[row, step, input_token_id[symbol]] = 1.
    for step, symbol in enumerate(summary):
        token = target_token_id[symbol]
        decoder_in_data[row, step, token] = 1.
        # The target is the decoder input shifted one step earlier, so it
        # omits the leading start character.
        if step > 0:
            decoder_target_data[row, step - 1, token] = 1.

In [20]:
#Define and process the input sequence
# Encoder: consumes one-hot input of shape (time steps, num_encoder_tokens);
# the `None` leaves the number of time steps unspecified.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
# return_state=True makes the layer return its final hidden and cell states
# in addition to its output.
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
#We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

#Using `encoder_states` set up the decoder as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# return_sequences=True yields an output per time step; the returned states
# are ignored here but `decoder_lstm` is reused by the sampling models.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Softmax over the target character vocabulary at every time step.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [21]:
#Final model
# Takes the encoder input and the decoder input, and returns the decoder's
# per-step softmax output.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

In [22]:
#Model Summary
# Print the layers, output shapes, parameter counts and connections of `model`.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 34)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 33)     0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 256), (None, 297984      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 256),  296960      input_2[0][0]                    
                                                                 lstm_1[0][1]               

In [23]:
#Model data Shape
# Report the dimensions of the three one-hot tensors fed to training.
for label, tensor in (
    ("encoder_in_data shape:", encoder_in_data),
    ("decoder_in_data shape:", decoder_in_data),
    ("decoder_target_data shape:", decoder_target_data),
):
    print(label, tensor.shape)

encoder_in_data shape: (10000, 9763, 34)
decoder_in_data shape: (10000, 127, 33)
decoder_target_data shape: (10000, 127, 33)


In [1]:
#Compiling and training the model
# Requires the model-definition and data-preparation cells above to have been
# run in the current kernel.
optimizer = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, decay=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

# The last 20% of the samples are held out for validation.
model.fit(
    x=[encoder_in_data, decoder_in_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=50,
    validation_split=0.2,
)

NameError: name 'model' is not defined

In [None]:
#Define sampling models
# Encoder-only model: maps an input sequence to the encoder LSTM's final
# [hidden, cell] states.
encoder_model = Model(encoder_inputs, encoder_states)

# Inputs through which the previous step's decoder states are fed back in.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the trained decoder layers, but bind the results to inference-specific
# names. The original cell rebound `decoder_outputs`, `state_h` and `state_c`,
# so re-running the training-model cell afterwards would have built the model
# from the inference graph.
inference_outputs, inference_state_h, inference_state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [inference_state_h, inference_state_c]
inference_outputs = decoder_dense(inference_outputs)

# Single-step decoder: (token input, h, c) -> (token probabilities, h, c).
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [inference_outputs] + decoder_states)

In [None]:
# Integer id -> character lookup tables (inverses of the token-id maps).
reverse_input_char_index = {token: char for char, token in input_token_id.items()}
reverse_target_char_index = {token: char for char, token in target_token_id.items()}

#Define Decode Sequence
def decode_sequence(input_seq):
    """Greedily decode one encoded input sequence into a string.

    The encoder states for `input_seq` seed the decoder, which is then run
    one character at a time: each step feeds the previously chosen character
    (initially the start character '\\t') and keeps the highest-scoring
    output character.

    Args:
        input_seq: Encoder input for a single sample (a batch of size 1 is
            assumed), passed straight to `encoder_model.predict`.

    Returns:
        The decoded characters concatenated in order. Decoding stops after
        '\\n' is emitted (it is included in the result) or once the result is
        longer than `max_decoder_seq_length`.
    """
    # Encode the input; the resulting states prime the decoder.
    states = encoder_model.predict(input_seq)

    # Begin with the start-of-sequence character.
    token_index = target_token_id['\t']
    decoded = ''

    while True:
        # One-hot, length-1 sequence holding the previously chosen character.
        step_input = np.zeros((1, 1, num_decoder_tokens))
        step_input[0, 0, token_index] = 1.

        scores, h, c = decoder_model.predict([step_input] + states)

        # Greedy choice: the most probable character at the last time step.
        token_index = np.argmax(scores[0, -1, :])
        symbol = reverse_target_char_index[token_index]
        decoded = decoded + symbol

        # Stop on the end character or when the length cap is exceeded.
        if symbol == '\n' or len(decoded) > max_decoder_seq_length:
            break

        # Carry the decoder states into the next step.
        states = [h, c]

    return decoded

In [None]:
# Decode the first few training samples to inspect the model's output.
# The preview is capped at the number of loaded samples: a hard-coded
# range(20) raises IndexError on `input_texts` when fewer than 20 samples
# were vectorized (e.g. a small `num_samples` for a quick run).
preview_count = min(20, len(input_texts))
for seq_index in range(preview_count):
    # Slice rather than index so the leading batch axis (size 1) is kept.
    input_seq = encoder_in_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)