In [52]:
import pandas as pd
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
import re
from collections import Counter

In [8]:
reviews = pd.read_csv('Reviews.csv')

In [9]:
reviews.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [10]:
reviews.dropna()
reviews = reviews.drop(['Id','ProductId','UserId','ProfileName','HelpfulnessNumerator','HelpfulnessDenominator',
                        'Score','Time'], 1)
reviews = reviews.reset_index(drop=True)

In [42]:
reviews.Text.isnull().sum()

0

In [43]:
reviews = reviews[reviews.Summary.notnull()]

In [175]:
# Analysing some of the reviews
for i in range(712,714):
    print('Reviews #',i)
    print(reviews.Text[i])
    print(reviews.Summary[i])
    print('\n')

Reviews # 712
My husband (who, being Mexican, is very picky about his tortilla chips) and I absolutely love these!  The texture is light and crispy, rather than thick and crunchy. He actually usually prefers a very hearty, cruncy chip (Like El Ranchero), but the flavor of these is so fantastic that we're both thilled with them. The bean, rice and corn base makes them incredibly flavorful, and they have a touch of onion and garlic in addition to that. We go through an embarrassing amount of them.  I never, ever like plain chips, but these I can eat without anything else, although they're particularly amazing with a fresh salsa.  I highly recommend these!
Perfect tortilla chip goodness!


Reviews # 713
<a href="http://www.amazon.com/gp/product/B000GWLUGU">Plocky's Tortilla Chips, Red Beans 'N Rice, 7 Ounce Bag (Pack of 12)</a>  I first tasted these chips while visiting relatives in KY.  They are not available where I live, so I ordered them from Amazon.  WOW!  My friends and family are a

### Preprocessing of Dataset

In [45]:
# Some contraction to expansion
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [66]:
def clean_text(text,remove_stopwords=False):
    
    text = text.lower()
    clean_text = []
    for word in text.split():
        if word in contractions:
            clean_text.append(contractions[word])
        else:
            clean_text.append(word)
    text = " ".join(clean_text)
    
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br', ' ', text)
    text = re.sub(r'/>', ' ', text)
    text = re.sub(r'>', ' ', text)
    text = re.sub(r'<', ' ', text)
    text = re.sub(r'`', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    return text

'<a href="http://www.amazon.com/gp/product/B000GWLUGU">Plocky\'s Tortilla Chips, Red Beans \'N Rice, 7 Ounce Bag (Pack of 12)</a>  I first tasted these chips while visiting relatives in KY.  They are not available where I live, so I ordered them from Amazon.  WOW!  My friends and family are all addicted to them.  The spicy flavor grabs you at the first bite.  Once a bag is open, it is gone!'

In [185]:
clean_text(str(reviews.Text[713]))

TypeError: 'list' object is not callable

In [68]:
clean_summary = [clean_text(summary) for summary in reviews.Summary]
clean_text = [clean_text(text) for text in reviews.Text]

### Build Vocabulary

In [84]:
def build_vocabulary(texts,summarys):
    tokens = []
    for text in texts:
        tokens.extend(text.split())
    for summary in summarys:
        tokens.extend(summary.split())
    return Counter(tokens)

In [85]:
vocab = build_vocabulary(clean_text,clean_summary)

In [86]:
print("Size of Vocabulary:", len(vocab))

Size of Vocabulary: 126931


### Embedding 
#### Using pre-trained Conceptnet Numberbatch's Embeddings (https://github.com/commonsense/conceptnet-numberbatch)

In [76]:
embed_dim = 300
embeddings = {}
with open('embeddings/numberbatch-en-17.06.txt',encoding='utf-8') as em:
    for embed in em:
        em_line = embed.split(' ')
        word = em_line[0]
        embedding = np.array(em_line[1:])
        embeddings[word] = embedding
print('Word embeddings:', len(embeddings))

Word embeddings: 417195


In [79]:
# Count no. of words not in Embeddings
threshold = 20 # Discard the words appearing less then 20 times.
missing_word_count = 0

for word in vocab:
    if vocab[word] >= 20 and word not in embeddings:
        missing_word_count += 1

print("Missing word count : ", missing_word_count)

Missing word count :  3609


In [87]:
# remove words having count less than threshold
new_vocab = {word:count for word,count in vocab.items() if count >= threshold or word in embeddings}

print("Size of New Vocabulary:", len(new_vocab))
print("Percent of actual words used : ",(len(new_vocab)/len(vocab) * 100))

Size of New Vocabulary: 59213
Percent of actual words used :  46.64975459107704


In [88]:
vocab_to_int = {}
codes = ["<UNK>","<PAD>","<EOS>","<GO>"]
for i,code in enumerate(codes):
    vocab_to_int[code] = i

for i,word in enumerate(new_vocab.keys(),4):
    vocab_to_int[word] = i

In [94]:
int_to_vocab = {i:word for word,i in vocab_to_int.items()}

In [101]:
def convert_text_int(texts):
    int_list = []
    for text in texts:
        word_ints = []
        for word in text.split():
            if word in vocab_to_int:
                word_ints.append(vocab_to_int[word])
            else:
                word_ints.append(vocab_to_int["<UNK>"])
        int_list.append(word_ints)
    return int_list

In [103]:
summary_int = convert_text_int(clean_summary)
print(clean_summary[2])
print(summary_int[2])

 delight  says it all
[48751, 55192, 7566, 34070]


In [174]:
print(clean_summary[713])
print(summary_int[713])

these chips are addictive 
[50906, 16491, 4098, 56250]


In [108]:
text_int = convert_text_int(clean_text)

In [110]:
print(clean_text[201])
print(text_int[201])

i love  and use the empty containers for medicine advil in my purse  desk  suitcase  also line with felt to keep earrings  perfect little size to disguise small valuables when traveling and love the mints  tiny and powerful without burning my mouth 
[43923, 9028, 25752, 51565, 37515, 46647, 18540, 40405, 58764, 40248, 2555, 31965, 22875, 49243, 10690, 16755, 51368, 45174, 23204, 7940, 54857, 54746, 27269, 8711, 20535, 7940, 17739, 23437, 17499, 40305, 50181, 25752, 9028, 37515, 16415, 24552, 25752, 31476, 18910, 27745, 31965, 8130]


In [120]:
# Need to use 300 for embedding dimensions to match CN's vectors.
embedding_dim = 300
nb_words = len(vocab_to_int)

# Create matrix with default values of zero
word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for word, i in vocab_to_int.items():
    if word in embeddings:
        word_embedding_matrix[i] = embeddings[word]
    else:
        # If word not in CN, create a random embedding for it
        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings[word] = new_embedding
        word_embedding_matrix[i] = new_embedding

# Check if value matches len(vocab_to_int)
print(len(word_embedding_matrix))

59217


### Model Inputs

In [111]:
def model_inputs():
    input_ = tf.placeholder(dtype=tf.int32,shape=(None,None),name="input")
    target = tf.placeholder(dtype=tf.int32,shape=(None,None),name = "target")
    keep_prob = tf.placeholder(dtype=tf.float32,name="keep_prob")
    learning_rate = tf.placeholder(dtype=tf.float32,name="learning_rate")
    
    #for encoder decoder
    source_sequence_length = tf.placeholder(dtype=tf.int32,shape=(None,),name="source_sequence_length")
    target_sequence_length = tf.placeholder(dtype=tf.int32,shape=(None,),name="target_sequence_length")
    max_target_length = tf.reduce_max(target_sequence_length,name="max_target_length")
    return input_,target,keep_prob,learning_rate,source_sequence_length,target_sequence_length,max_target_length

In [112]:
#Process decoder input
def process_decoder_input(target_data,vocab_to_int,batch_size):
    
    strided_target = tf.strided_slice(target_data,(0,0),(batch_size,-1),(1,1))
    go = tf.fill(value=vocab_to_int["<GO>"],dims=(batch_size,1))
    decoder_input = tf.concat((go,strided_target),axis=1)
    return decoder_input

#### Creating LSTM cells

In [113]:
# Create LSTM cells
def get_lstm(rnn_size,keep_prob=0.7):
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    drop = tf.contrib.rnn.DropoutWrapper(lstm,input_keep_prob=keep_prob)
    return drop

### Encoding Layer

In [114]:
def encoding_layer(embeded_rnn_input,rnn_size,keep_prob,num_layers,batch_size,source_sequence_length):
#     forward lstm layer
    cell_fw = tf.contrib.rnn.MultiRNNCell([get_lstm(rnn_size,keepProb) for _ in range(num_layers)])
    cell_fw.zero_state(batch_size)
#     backward lstm layer
    cell_bw = tf.contrib.rnn.MultiRNNCell([get_lstm(rnn_size,keepProb) for _ in range(num_layers)])
    cell_bw.zero_state(batch_size)
    
    output,output_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=cell_fw,cell_bw=cell_bw,inputs=embeded_rnn_input,
                                    sequence_length=source_sequence_length)
    
    output=tf.concat(output,axis=2)
    return output,output_states
    

### Decoding Layer

#### Training Decoder

In [121]:
def training_decoder(embeded_rnn_input,target_sequence_length,decoder_cell,encoder_state,
                     output_layer,max_target_length):
    helper = tf.contrib.seq2seq.TrainingHelper(embeded_rnn_input,target_sequence_length)
    decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell,helper,initial_state=encoder_state,
                                              output_layer=output_layer)
    final_outputs, final_state = tf.contrib.seq2seq.dynamic_decode(decoder,impute_finished=True,
                                                     maximum_iterations=max_target_length)
    return final_outputs

#### Inference Decoder

In [124]:
def inference_decoder(embed_rnn_input,embedding_rnn_input,target_sequence_length,decoder_cell,encoder_state,
                     output_layer,max_target_length,batch_size):
    
    start_tokens = tf.tile(tf.constant(dtype=tf.int32,value=[vocab_to_int["<GO>"]]),multiples=[batch_size],name="start_tokens")
    
    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding_rnn_input,
                                                      start_tokens=start_tokens,
                                                      end_token=vocab_to_int["<EOS>"])
    decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell,helper,initial_state=encoder_state,
                                              output_layer=output_layer)
    final_output, final_state = tf.contrib.seq2seq.dynamic_decode(decoder,impute_finished=True,
                                                     maximum_iterations=max_target_length)
    return final_output

#### Building Decoder

In [125]:
def decoding_layer(target_inputs,target_sequence_length,encoder_state,max_target_length,batch_size,num_layers,
                   rnn_size):
    
    vocab_len = len(vocab_to_int)
    lstm_cell = tf.contrib.rnn.MultiRNNCell([get_lstm(rnn_size) for _ in range(num_layers)])
    output_layer = tf.layers.dense(vocab_len,kernel_initializer=tf.truncated_normal_initializer(stddev=0.1))

    embedding = word_embedding_matrix
    embed = tf.nn.embedding_lookup(embedding,target_inputs)
    
    with tf.variable_scope("decoding"):
        train_decode_output = training_decoder(embed,target_sequence_length,lstm_cell,
                                                                  encoder_state,output_layer,max_target_length)
        
    with tf.variable_scope("decoding",reuse=True):
        infer_decode_output = inference_decoder(embedding,target_sequence_length,lstm_cell,encoder_state,
                                                output_layer,max_target_length,batch_size)
    
    
    return train_decode_output,infer_decode_output

### Seq2Seq Modeling

In [127]:
def seq2seq_model(source_input,target_input,rnn_size,keep_prob,num_layers,batch_size,source_sequence_length,
                  target_sequence_length,max_target_length):
    
    embedding = word_embedding_matrix
    input_embed = tf.nn.embedding_lookup(embedding,source_input) 
    
    encoder_output,encoder_states = encoding_layer(input_embed,rnn_size,keep_prob,num_layers,batch_size,
                                                   source_sequence_length)
    
    output_embed = tf.nn.embedding_lookup(embedding,target_input)
    
    decoder_train_output, decoder_infer_output = decoding_layer(target_input, target_sequence_length, 
                                                                encoder_state, max_target_length, batch_size, 
                                                                num_layers, rnn_size)
    return decoder_train_output, decoder_infer_output

### Batching

In [131]:
# Padding batches
def pad_sentence_batch(sentence_batch):
    max_length = max([len(sent) for sent in sentence_batch])
    print(max_length)
    padded_sentences = []
    for sent in sentence_batch:
        sent_len = len(sent)
        if len(sent) < max_length:
            padded_sentences.append(sent + [vocab_to_int["<PAD>"] for _ in range(max_length - sent_len)])
        else:
            padded_sentences.append(sent)
    return padded_sentences

In [132]:
sent= [[43923, 9028, 25752, 51565, 37515, 46647, 18540],
      [43923, 9028, 25752, 51565, 37515, 46647, 18540, 9028, 25752, 51565, 37515]]

print(pad_sentence_batch(sent))

11
[[43923, 9028, 25752, 51565, 37515, 46647, 18540, 1, 1, 1, 1], [43923, 9028, 25752, 51565, 37515, 46647, 18540, 9028, 25752, 51565, 37515]]


In [151]:
# Creating Batches
from operator import itemgetter

#sort the text from smallest to longest text
text_len_sorted= [(i,len(text)) for i,text in enumerate(text_int)]



In [156]:
len(text_int[494051])

853

In [163]:
sorted_text_len = sorted(text_len_sorted,key= lambda x:x[1])

In [164]:
sorted_text_int = [text_int[i] for i,j in sorted_text_len]

In [173]:
clean_summary[713]

'these chips are addictive '

In [169]:
sorted_text_len

[(713, 0),
 (835, 0),
 (1059, 0),
 (1345, 0),
 (2032, 0),
 (2304, 0),
 (2508, 0),
 (2663, 0),
 (2680, 0),
 (2781, 0),
 (2888, 0),
 (3888, 0),
 (4466, 0),
 (4627, 0),
 (4696, 0),
 (5011, 0),
 (5031, 0),
 (5615, 0),
 (5845, 0),
 (6339, 0),
 (7025, 0),
 (7164, 0),
 (7336, 0),
 (7461, 0),
 (7507, 0),
 (10093, 0),
 (10494, 0),
 (10534, 0),
 (10762, 0),
 (11060, 0),
 (12216, 0),
 (12258, 0),
 (12461, 0),
 (13200, 0),
 (14660, 0),
 (14868, 0),
 (14919, 0),
 (15118, 0),
 (15691, 0),
 (15955, 0),
 (16418, 0),
 (16775, 0),
 (16852, 0),
 (16918, 0),
 (17341, 0),
 (18375, 0),
 (18525, 0),
 (20882, 0),
 (21012, 0),
 (21280, 0),
 (21470, 0),
 (21865, 0),
 (21882, 0),
 (22687, 0),
 (22701, 0),
 (23576, 0),
 (24569, 0),
 (24777, 0),
 (24780, 0),
 (24913, 0),
 (25576, 0),
 (25616, 0),
 (26309, 0),
 (27215, 0),
 (27727, 0),
 (28176, 0),
 (28479, 0),
 (28496, 0),
 (29036, 0),
 (29114, 0),
 (29536, 0),
 (29943, 0),
 (30020, 0),
 (30077, 0),
 (30079, 0),
 (30130, 0),
 (30197, 0),
 (30429, 0),
 (30520, 0),


### Hyperparameter 

In [128]:
# Number of Epochs
epochs = 3
# Batch Size
batch_size = 250
# RNN Size
rnn_size = 100
# Number of Layers
num_layers = 3
# Embedding Size
encoding_embedding_size = 200
decoding_embedding_size = 200
# Learning Rate
learning_rate = 0.01
# Dropout Keep Probability
keep_probability = 0.8
display_step = 5

### Building Graph

In [None]:
save_path = 'checkpoints/dev'

source_int_text = text_int
target_int_text = summary_int

train_graph = tf.Graph()
with train_graph.as_Default():
    
    input_,target,keep_prob,learning_rate,source_sequence_length,target_sequence_length,max_target_length = model_inputs()
    seq2seq_graph = seq2seq_model(input_,target,rnn_size,keep_probability,num_layers,batch_size,
                                  source_sequence_length,max_target_length)
    