In [34]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import re
import string
import math
import pickle

In [2]:
df = pd.read_csv('../data/Reviews.csv')

### Dropping duplicates:
***

In [3]:
imp_cols = set(df.columns)-{'Id','ProductId'}
df = df.drop_duplicates(subset=imp_cols)
print ('Dimension after eliminating duplicates',df.shape)

Dimension after eliminating duplicates (396309, 10)


### Discarding 3 star reviews:
***

In [4]:
df = df.query('Score != 3')

In [5]:
df.shape

(366402, 10)

### Sorting using time-stamp:
***

In [6]:
df = df.sort_values(by='Time')

### Refining class labels:
***

In [7]:
Y = df.Score.tolist()

In [8]:
for i in range(len(Y)):
    if(Y[i]>3):
        Y[i]=1
    else:
        Y[i]=0

In [9]:
df['Score'] = np.array(Y)

### Refining text of Reviews:
***

In [10]:
X = df.Text.tolist()

In [11]:
# Removing HTML tags
X = [re.sub('<[^>]*>', '',i.lower()) for i in X]

In [12]:
X[1]

"i can remember seeing the show when it aired on television years ago, when i was a child.  my sister later bought me the lp (which i have to this day,  i'm thirty something).i used this series of books &amp; songs when i did my  student teaching for preschoolers &amp; turned the whole school on to it.  i am now purchasing it on cd, along with the books for my children 5 &amp;  2.  the tradition lives on!"

In [13]:
# Counting number of sentences
# Here we are just going to split the text with a '.' delimeter 
# and not worry much as to how the reviewers has written the review.
count_sen = []
for i in range(len(X)):
    tmp = X[i].split('.')
    count_sen.append(len(tmp))

In [14]:
df['num_of_sen'] = np.array(count_sen);
del count_sen

In [15]:
# Now we need to find the number of words in the text of the review
count_word = []
for i in range(len(X)):
    tmp = X[i].split()
    count_word.append(len(tmp))

In [16]:
df['num_of_word'] = np.array(count_word)
del count_word

In [17]:
# Removing punctuation marks
intab = string.punctuation
outtab = "                                "
trantab = str.maketrans(intab, outtab)

In [18]:
X = [text.translate(trantab)for text in X]

In [19]:
for i in range(len(X)):
    X[i] = X[i].split()
    X[i] = ' '.join(X[i])

In [20]:
X[1]

'i can remember seeing the show when it aired on television years ago when i was a child my sister later bought me the lp which i have to this day i m thirty something i used this series of books amp songs when i did my student teaching for preschoolers amp turned the whole school on to it i am now purchasing it on cd along with the books for my children 5 amp 2 the tradition lives on'

In [21]:
df['Text'] = X

### Cleaning summary of reviews:
***

In [22]:
X = df.Summary.tolist()

In [23]:
# There were some pure numbers in summary. Had to convert them to str
for i in range(len(X)):
    X[i] = str(X[i])

In [24]:
# Removing punctuation marks
X = [text.translate(trantab)for text in X]

In [25]:
for i in range(len(X)):
    X[i] = X[i].split()
    X[i] = ' '.join(X[i])

In [26]:
df['Summary'] = X

### Dropping unecessary columns
***

In [30]:
df.columns

Index(['HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time',
       'Summary', 'Text', 'num_of_sen', 'num_of_word'],
      dtype='object')

In [28]:
col = ['Id','ProductId','UserId','ProfileName']
df = df.drop(col,axis=1)

### Saving the dataframe:
***

In [35]:
with open('../data/clean-data.pkl','wb') as fp:
    pickle.dump(df,fp)
fp.close()