In [1]:
import numpy as np
import numpy as np
import re, sys
import itertools
from collections import Counter
import pandas as pd
import pickle

# Loading data from CSV file

In [2]:
# Read the review table from a CSV file in the working directory, using
# pandas' pure-Python parser (engine='python').
df = pd.read_csv("Reviews_summaries.csv",engine='python')
# Show the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0.1,Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,predicted_summary
0,0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,good quality
1,1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,peanuts
2,2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,best almonds ever
3,3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,not what expected
4,4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,great


In [3]:
# Restrict the frame to the four columns used below and discard every row
# whose review text or reference summary is missing.
# Note: 'predicted_summary' is NOT null-filtered here.
df = df[['Score', 'Text', 'Summary', 'predicted_summary']].dropna(
    subset=['Text', 'Summary']
)

# Generating Train and Test Indices

In [5]:
# Seed NumPy's global RNG so the shuffle below is the same on every run.
np.random.seed(0)
# Random ordering of the row positions 0 .. len(df)-1.
shuffle_indices = np.random.permutation(np.arange(len(df)))
# The first 80% of the shuffled positions form the training set,
# the remainder the test set.
train_len = int(shuffle_indices.shape[0] * 0.8)
train_indices, test_indices = np.split(shuffle_indices, [train_len])

In [6]:
# Write the train/test position arrays to .npy files in the working directory.
np.save('train_indices.npy',train_indices)
np.save('test_indices.npy',test_indices)

# Data Cleaning

In [7]:
def clean_str(string):
    """
    Normalise a raw text string for whitespace tokenisation.

    Steps, in order:
      1. Replace every character other than ASCII letters, digits and
         ``( ) , ! ? ' `` ` with a space.
      2. Put a space before the contraction suffixes 's, 've, n't, 're,
         'd and 'll so they become separate tokens.
      3. Surround ``, ! ( ) ?`` with spaces so they become separate tokens.
      4. Collapse runs of whitespace, strip the ends and lower-case.

    Args:
        string: the text to clean (must be a ``str``).

    Returns:
        The cleaned, lower-cased string with tokens separated by single
        spaces.
    """
    # Drop everything outside the kept character set.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Split contraction suffixes off the preceding word.
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    # Isolate punctuation as its own token.
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # The replacement text must NOT contain a backslash: re.sub leaves an
    # unknown escape such as "\(" untouched in the replacement, which would
    # put a literal backslash into the output tokens.
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    # Collapse the extra whitespace introduced above.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


In [8]:
# Rating labels: the 'Score' column extracted as an array.
labels = df['Score'].values

In [9]:
# Raw review bodies.
text_data = df['Text'].values
# Summaries come from the 'predicted_summary' column, not from 'Summary'.
summary_data = df['predicted_summary'].values

In [10]:
def load_text_data(text_data):
    """
    Clean and tokenise a collection of raw text entries.

    Each entry is stripped, passed through ``clean_str`` and split on single
    spaces. Missing entries (``None`` or NaN, which is how pandas represents
    an empty cell) are treated as empty text instead of raising
    ``AttributeError`` on ``.strip()``; they yield the token list ``['']``,
    exactly as an empty string would.

    Args:
        text_data: iterable of strings (possibly containing None/NaN).

    Returns:
        A list with one token list per input entry, in input order.
    """
    x_text = []
    for raw in text_data:
        # NaN is the only value that compares unequal to itself.
        if raw is None or raw != raw:
            raw = ""
        x_text.append(clean_str(raw.strip()).split(" "))
    return x_text

In [11]:
# Clean and tokenise the review bodies and the summaries into lists of tokens.
x_text_review = load_text_data(text_data)
x_gen_summary = load_text_data(summary_data)

In [12]:
def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pad every sentence to the length of the longest one.

    Args:
        sentences: sequence of token lists.
        padding_word: token appended to shorter sentences.

    Returns:
        A new list of new token lists, all of equal length. The input lists
        are not modified. An empty ``sentences`` yields an empty list
        (previously this raised ``ValueError`` from ``max()``).
    """
    # max() has nothing to work with on empty input, so answer directly.
    if len(sentences) == 0:
        return []
    sequence_length = max(len(sentence) for sentence in sentences)
    return [
        sentence + [padding_word] * (sequence_length - len(sentence))
        for sentence in sentences
    ]


In [13]:
# Pad reviews and summaries separately: each collection is padded to the
# length of its own longest sentence.
x_text_review = pad_sentences(x_text_review)
x_gen_summary = pad_sentences(x_gen_summary)

# Creating vocabularies for the text reviews and the generated summaries

In [14]:
def build_vocab(sentences):
    """
    Create word <-> index lookup tables from tokenised sentences.

    Words are ordered as returned by ``Counter.most_common()`` (most
    frequent first) and numbered from 0 in that order.

    Returns a two-element list ``[vocabulary, vocabulary_inv]`` where
    ``vocabulary`` maps word -> index and ``vocabulary_inv`` is the list
    mapping index -> word.
    """
    # Count every token across all sentences.
    frequencies = Counter(itertools.chain.from_iterable(sentences))
    # index -> word
    vocabulary_inv = [word for word, _ in frequencies.most_common()]
    # word -> index
    vocabulary = dict(zip(vocabulary_inv, range(len(vocabulary_inv))))
    return [vocabulary, vocabulary_inv]

In [15]:
# Build independent vocabularies for reviews and summaries; since the inputs
# are already padded, the padding token is part of each vocabulary.
vocabulary_text_review, vocabulary_inv_text_review = build_vocab(x_text_review)
vocabulary_gen_summary, vocabulary_inv_gen_summary = build_vocab(x_gen_summary)

In [16]:
def build_input_data(sentences, vocabulary):
    """
    Convert tokenised sentences into a NumPy array of vocabulary indices.

    Every word is looked up in ``vocabulary``; a word that is not a key
    raises ``KeyError``.
    """
    rows = []
    for sentence in sentences:
        rows.append([vocabulary[word] for word in sentence])
    return np.array(rows)

In [17]:
# Replace the padded token lists with arrays of vocabulary indices.
# NOTE: this rebinds x_text_review / x_gen_summary from words to indices, so
# re-running this cell on its own would look up indices in a word-keyed
# vocabulary; re-run the tokenising and padding cells first.
x_text_review = build_input_data(x_text_review, vocabulary_text_review)
x_gen_summary = build_input_data(x_gen_summary, vocabulary_gen_summary)

In [18]:
# Write the index-encoded reviews and summaries to .npy files.
np.save('text_review.npy',x_text_review)
np.save('predicted_summary_x.npy',x_gen_summary)

In [19]:
# Write the 'Score' labels to a .npy file.
np.save('ratings.npy',labels)

In [20]:
# Serialise the word -> index dictionaries with pickle (highest protocol).
# Only load these files back from a trusted location: unpickling untrusted
# data can execute arbitrary code.
with open('text_review_vocabulary.pkl', 'wb') as f:
    pickle.dump(vocabulary_text_review, f, pickle.HIGHEST_PROTOCOL)
with open('predicted_summary_vocabulary.pkl', 'wb') as f:
    pickle.dump(vocabulary_gen_summary, f, pickle.HIGHEST_PROTOCOL)

In [21]:
# Write the index -> word lists to .npy files (np.save converts each Python
# list to an array before writing).
np.save('text_review_vocabulary_inv.npy',vocabulary_inv_text_review)
np.save('predicted_summary_vocabulary_inv.npy',vocabulary_inv_gen_summary)