# Cleaning and pre-processing the data

    Loading the data

In [1]:
import warnings;
warnings.filterwarnings('ignore');

In [2]:
import pandas as pd
df = pd.read_csv('../data/Reviews.csv')
df.shape

(568454, 10)

In [3]:
df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

## Dropping duplicates

In [4]:
imp_cols = set(df.columns)-{'Id','ProductId'}
df = df.drop_duplicates(subset=imp_cols)
print ('Dimension after eliminating duplicates',df.shape)

Dimension after eliminating duplicates (396309, 10)


***
    1. Neglecting 3 star reviews 
    2. Sorting by time-stamp
    3. Extracting Reviews and Summary and concatenating them
    4. Defining <3 score as negative and >3 as positive

In [5]:
df = df.query('Score != 3')

In [6]:
df.shape

(366402, 10)

In [7]:
df = df.sort_values(by='Time')

In [8]:
tmp1 = df.Text.tolist()
tmp2 = df.Summary.tolist()

### Class labels:

In [9]:
Y = df.Score.tolist()

In [10]:
for i in range(len(Y)):
    if(Y[i]>3):
        Y[i]=1
    else:
        Y[i]=0

### Datapoints:

In [11]:
X = [str(tmp1[i])+' '+str(tmp2[i]) for i in range(len(tmp1))]

### Removing HTML tags

In [12]:
import re
X = [re.sub('<[^>]*>', '',i.lower()) for i in X]

### Removing Punctuation marks

In [13]:
import string
intab = string.punctuation
outtab = "                                "
trantab = str.maketrans(intab, outtab)

In [14]:
X = [text.translate(trantab)for text in X]

In [15]:
print (X[1])

i can remember seeing the show when it aired on television years ago  when i was a child   my sister later bought me the lp  which i have to this day   i m thirty something  i used this series of books  amp  songs when i did my  student teaching for preschoolers  amp  turned the whole school on to it   i am now purchasing it on cd  along with the books for my children 5  amp   2   the tradition lives on  this whole series is great way to spend time with your child


### Saving without stemming

In [16]:
import pickle
with open('../data/clean-data-nostem.pkl','wb') as fp:
    pickle.dump((X,Y),fp)
fp.close()

### Saving as a csv for using in other kernels

In [17]:
data = pd.DataFrame({'X':X,'Y':Y})

In [18]:
data.to_csv('../data/clean-data-nostem.csv')

### Tokenizing

In [19]:
for i in range(len(X)):
    #print (i,end='\t')
    X[i] = str.split(X[i])

### Stemming

In [20]:
from nltk import PorterStemmer
stemmer = PorterStemmer()

In [21]:
for i in range(len(X)):
    X[i] = [stemmer.stem(j) for j in X[i]]

### Join after stem

In [22]:
X = [' '.join(i) for i in X]

### Saving the data

In [23]:
import pickle
with open('../data/clean-data.pkl','wb') as fp:
    pickle.dump((X,Y),fp)
fp.close()