In [1]:
import pandas as pd
reviews = pd.read_csv('reviews.csv')

In [2]:
reviews.sample()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
123252,123253,B001E5E2ZO,A3CFSULXSCEC84,boudmaster,1,1,5,1344729600,Best there is!,I have been using the chocolate syrup for my c...


In [3]:
reviews.shape

(568454, 10)

# Data Preprocessing

## 1. Filter out some (Score, Summary, Text) features from dataset

In [4]:
df = reviews.filter(['Score','Summary','Text'], axis = 1)
df.iloc[:5]

Unnamed: 0,Score,Summary,Text
0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,4,"""Delight"" says it all",This is a confection that has been around a fe...
3,2,Cough Medicine,If you are looking for the secret ingredient i...
4,5,Great taffy,Great taffy at a great price. There was a wid...


## 2.   Splitting the data into train and test

In [5]:
# finding the number of rows in our dataset
rows = df.shape[0]
rows

568454

In [6]:
train_size = int(rows * 0.70)
test_size = rows - train_size

In [7]:
rows == train_size + test_size

True

In [8]:
train_data = pd.DataFrame(columns=df.columns)
test_data  = pd.DataFrame(columns=df.columns)

In [9]:
print("Train Data : {}".format(train_data))
print("\nTest Data: {}".format(test_data))
print("\ntest_size : {} train_size : {}".format(test_size, train_size))

Train Data : Empty DataFrame
Columns: [Score, Summary, Text]
Index: []

Test Data: Empty DataFrame
Columns: [Score, Summary, Text]
Index: []

test_size : 170537 train_size : 397917


In [10]:
import random

# indices list contains all indices from 0 to rows..
indices =list(range(rows))
random.shuffle(indices)

# get random indices from shuffled indices array for train and test data..
trnind = indices[:train_size]
tstind = indices[train_size:rows]

print(trnind[:10])
print(tstind[:10])

[151120, 269769, 231256, 302130, 63156, 502378, 64509, 164090, 415966, 136633]
[396476, 190002, 427258, 457189, 135698, 172317, 93630, 288461, 76673, 17169]


In [11]:
# get the training data with that trnindices array
train_data = df.iloc[trnind]
train_data.shape

(397917, 3)

In [12]:
# get the test data with that tstindices array
test_data = df.iloc[tstind]
test_data.shape

(170537, 3)

In [13]:
train_data.iloc[:3]

Unnamed: 0,Score,Summary,Text
151120,5,Great Product Outrageous Price,I have bred and sold dogs for over 45 years. ...
269769,4,It's really salty...,I bought this product as a gift for a friend w...
231256,5,Great tasting drink,I have checked many places and this product ca...


In [14]:
test_data.iloc[:3]

Unnamed: 0,Score,Summary,Text
396476,5,"Fast Service, Right Product, GREAT OLIVES!",I ordered same product from another shop earli...
190002,5,Best Popcorn Ever!,This is the best popping corn ever. It pops up...
427258,4,Timothy's Great Coffee,"This is a great tasting, strong coffee that we..."


## 3. Remove all special characters and html tags

In [44]:
import re
def preProcessString(text):
    # remove all html tags
    text = re.sub('<.*?>', ' ', text)
    
    # remove all special characters
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    
    # converting all text into small letters
    text = text.lower()
    
    return text

In [58]:
sample_df = test_data.iloc[:10]
sample_df

Unnamed: 0,Score,Summary,Text
396476,5,"Fast Service, Right Product, GREAT OLIVES!",I ordered same product from another shop earli...
190002,5,Best Popcorn Ever!,This is the best popping corn ever. It pops up...
427258,4,Timothy's Great Coffee,"This is a great tasting, strong coffee that we..."
457189,4,I LOVE THESE CRACKERS!!,"Now that I got the caps out of my system, yes,..."
135698,5,Clif never lets me down,This crunch granola bar is on par with every o...
172317,4,Good but NOT low in calories!,I was looking for a new kind of snack and saw ...
93630,5,YUMMY!,Yummy! Enough Said. These little cookies are s...
288461,5,"Yum, Ginger Snap tastes like gingerbread!",The Ginger Snap Larabars are great as are many...
76673,5,Bold Taste,This is my husbands favorite. We have tried a...
17169,5,"RiceSelect Royal Blend, Whole Grain Texmati Br...","This blend of rice, wheat berries, and rye ber..."


In [61]:
sample_df['Summary'] = sample_df['Summary'].apply(preProcessString)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [62]:
sample_df

Unnamed: 0,Score,Summary,Text
396476,5,fast service right product great olives,I ordered same product from another shop earli...
190002,5,best popcorn ever,This is the best popping corn ever. It pops up...
427258,4,timothy s great coffee,"This is a great tasting, strong coffee that we..."
457189,4,i love these crackers,"Now that I got the caps out of my system, yes,..."
135698,5,clif never lets me down,This crunch granola bar is on par with every o...
172317,4,good but not low in calories,I was looking for a new kind of snack and saw ...
93630,5,yummy,Yummy! Enough Said. These little cookies are s...
288461,5,yum ginger snap tastes like gingerbread,The Ginger Snap Larabars are great as are many...
76673,5,bold taste,This is my husbands favorite. We have tried a...
17169,5,riceselect royal blend whole grain texmati bro...,"This blend of rice, wheat berries, and rye ber..."
