In [35]:
import string
from collections import Counter

import numpy
import pandas as pd
import scipy as np  # NOTE: 'np' is bound to SciPy in this notebook, not NumPy
import tensorflow as tf
from keras.preprocessing import sequence
from tqdm import tqdm_notebook

In [2]:
# Read the review export, then discard every row that has a missing value.
data = pd.read_csv('./Reviews.csv')
data = data.dropna()
print(data.columns)

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')


In [3]:
# Collapse rows that repeat the same (Text, Summary, Score) combination.
data = data.drop_duplicates(subset=['Text', 'Summary', 'Score'])

In [4]:
# Turn the star rating into a binary sentiment label:
#   * reviews with Score == 3 are dropped,
#   * Score > 3 becomes 1, everything else becomes 0.
# `.assign` builds a new frame, so the column is never written onto a filtered
# slice (which is what triggers pandas' SettingWithCopyWarning).
# NOTE: this cell is not idempotent -- once 'Score' holds 0/1, running it again
# would relabel every row as 0.
data = (
    data.loc[data['Score'] != 3]
        .assign(Score=lambda frame: (frame['Score'] > 3).astype('int64'))
)

In [5]:
def preprocess(x):
    """Normalise a piece of text for whitespace tokenisation.

    Every character in ``string.punctuation`` except the apostrophe is
    surrounded by spaces so that it splits off as its own token, runs of
    whitespace are collapsed to single spaces, and the result is lower-cased.
    """
    # Translation table: punctuation code point -> " <char> " (apostrophe excluded).
    spaced = {ord(ch): f' {ch} ' for ch in string.punctuation if ch != "'"}
    tokens = x.translate(spaced).split()
    return ' '.join(tokens).lower()

# Run both free-text columns through preprocess(), element by element.
data['Summary'] = data['Summary'].map(preprocess)
data['Text'] = data['Text'].map(preprocess)

In [6]:
# One document per row: the summary, a single space, then the review text.
X_data = [
    summary + ' ' + text
    for summary, text in zip(data['Summary'].values, data['Text'].values)
]
# Labels taken from the 'Score' column, in the same row order as X_data.
Y_data = [label for label in data['Score'].values]

In [19]:
# Map every whitespace-separated token to the number of times it occurs in
# X_data. Tokens are streamed document by document, so the whole data set is
# never concatenated into one giant string / token list; the resulting counts
# (and their insertion order) are the same as splitting the joined text.
corpus = dict(Counter(token for doc in X_data for token in doc.split()))
print('Number of unique tokens:', len(corpus))

Number of unique tokens: 133571


In [20]:
# 90th percentile of the per-token occurrence counts. It is used as the
# frequency cut-off below which tokens are dropped from the vocabulary.
# `numpy.percentile` is called directly: in this notebook `np` is SciPy, and the
# NumPy aliases in the top-level `scipy` namespace are deprecated.
min_word_count = numpy.percentile(list(corpus.values()), 90)
print('minimum word count:', min_word_count)

minimum word count: 41.0


In [21]:
# Delete every token that occurs fewer than `min_word_count` times. The
# threshold variable is used instead of a hard-coded 41 so the filter stays
# correct if the data or the percentile changes.
words = list(corpus.keys())  # snapshot of the keys: the dict is mutated in the loop
for w in words:
    if corpus[w] < min_word_count:
        del corpus[w]

print('Number of unique tokens:', len(corpus))

Number of unique tokens: 13472


In [22]:
# Number of whitespace-separated tokens in each document.
seq_lens = [len(item.split()) for item in X_data]

# 90th percentile of the document lengths (a float). `numpy.percentile` is
# called directly because `np` is SciPy in this notebook and its NumPy aliases
# are deprecated.
suitable_seq_len = numpy.percentile(seq_lens, 90)
print('Suitable sequence lenght:', suitable_seq_len)

Suitable sequence lenght: 199.0


In [25]:
# Assign an integer id to every token kept in `corpus`, numbering from 1 in the
# dict's iteration order (id 0 is therefore never assigned to a token).
word_ids = {token: token_id for token_id, token in enumerate(corpus, start=1)}

In [32]:
# Encode each document as a list of token ids and collect the matching labels.
# Tokens missing from `word_ids` are encoded as -1; documents that contain no
# tokens at all are skipped together with their label.
# NOTE(review): -1 is not a valid row index for an embedding table -- confirm
# that downstream code handles this out-of-vocabulary marker.
X_data_int = []
Y_data_new = []
for item, y in zip(X_data, Y_data):
    temp = [word_ids.get(word, -1) for word in item.split()]
    if not temp:
        continue
    X_data_int.append(temp)
    Y_data_new.append(y)

In [34]:
# Pad / truncate every id sequence to the (integer) length chosen above.
X_data_int = sequence.pad_sequences(
    sequences=X_data_int,
    maxlen=int(suitable_seq_len),
)

In [37]:
def one_hot_maker(x):
    """One-hot encode integer class labels as a float32 NumPy array.

    The encoding depth is the number of distinct values in ``x``. A label ``k``
    sets column ``k`` to 1; a label outside ``[0, depth)`` yields an all-zero
    row, which is the convention ``tf.one_hot`` follows.

    The encoding is done with NumPy instead of ``tf.Session`` (which only
    exists in TensorFlow 1.x), so no session is opened for this small operation
    and the deprecated ``scipy.unique`` alias (``np`` is SciPy in this
    notebook) is no longer used.

    Args:
        x: Integer labels (list or array-like of any shape).

    Returns:
        ``numpy.ndarray`` of dtype float32 with shape ``x.shape + (depth,)``.

    Raises:
        TypeError: If ``x`` is non-empty and does not have an integer dtype,
            e.g. when an already one-hot encoded float array is passed in.
    """
    labels = numpy.asarray(x)
    if labels.size and labels.dtype.kind not in 'iu':
        raise TypeError(
            'one_hot_maker expects integer labels, got dtype %s' % labels.dtype
        )
    depth = len(numpy.unique(labels))
    # Compare every label against 0..depth-1; out-of-range labels match nothing.
    return (labels[..., None] == numpy.arange(depth)).astype(numpy.float32)
    
# Replace the list of integer labels with its one-hot encoded array.
# NOTE: the name is rebound, so re-running this cell would feed the already
# encoded array back into one_hot_maker.
Y_data_new = one_hot_maker(Y_data_new)

In [38]:
# Display the encoded label stored at index 1 as a quick sanity check.
Y_data_new[1]

array([1., 0.], dtype=float32)