In [19]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

In [2]:
# Load the raw reviews CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('archive/Reviews.csv')

# 1. Data Preprocessing

## 1.1 Removing Null Entries

In [3]:
# Count the missing values in every column to decide which rows to drop.
df.isna().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

In [4]:
# Keep only the reviews that actually have a Summary (rows with a missing
# Summary are discarded; other columns are not filtered here).
df = df.loc[df['Summary'].notna()]

In [5]:
# Preview the first rows after removing entries with a missing Summary.
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


## 1.2 Keeping just Summary and Text

In [6]:
# Keep only the two free-text columns used in the rest of the notebook.
# .copy() makes the result an independent frame: later cells assign back into
# df['Summary'] / df['Text'], which on a plain selection can raise
# SettingWithCopyWarning and leaves it unclear whether the write "took".
df = df[['Summary', 'Text']].copy()

In [7]:
# Preview the frame now that only Summary and Text remain.
df.head()

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,"""Delight"" says it all",This is a confection that has been around a fe...
3,Cough Medicine,If you are looking for the secret ingredient i...
4,Great taffy,Great taffy at a great price. There was a wid...


### Checking the data types of the columns

In [8]:
# Show the dtype of each remaining column before applying string operations.
df.dtypes

Summary    object
Text       object
dtype: object

## 1.3 Changing to lower case

In [9]:
# Normalise case so that later token comparisons are case-insensitive.
for column in ('Summary', 'Text'):
    df[column] = df[column].str.lower()

## 1.4 Removing Punctuation

In [10]:
# Delete every character that is neither a word character nor whitespace.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave all punctuation in place.
# The raw string keeps \w and \s from being parsed as invalid string escapes.
# NOTE(review): punctuation is removed rather than replaced with a space, so
# words separated only by punctuation get fused together — confirm intended.
df['Summary'] = df['Summary'].str.replace(r'[^\w\s]', '', regex=True)
df['Text'] = df['Text'].str.replace(r'[^\w\s]', '', regex=True)
df

Unnamed: 0,Summary,Text
0,good quality dog food,i have bought several of the vitality canned d...
1,not as advertised,product arrived labeled as jumbo salted peanut...
2,delight says it all,this is a confection that has been around a fe...
3,cough medicine,if you are looking for the secret ingredient i...
4,great taffy,great taffy at a great price there was a wide...
...,...,...
568449,will not do without,great for sesame chickenthis is a good if not ...
568450,disappointed,im disappointed with the flavor the chocolate ...
568451,perfect for our maltipoo,these stars are small so you can give 1015 of ...
568452,favorite training and reward treat,these are the best treats for training and rew...


## 1.5 Removing Stopwords

In [11]:
# Remove English stopwords from both text columns.
stop = stopwords.words('english')
# A set gives constant-time membership tests; scanning the list once per word
# of every review is needlessly slow. `stop` itself is left as a list.
stop_set = set(stop)


def _drop_stopwords(text):
    """Return `text` with every space-separated token found in `stop_set` removed."""
    # split(' ') (not split()) is kept so tokenisation matches the original cell.
    return ' '.join(word for word in text.split(' ') if word not in stop_set)


# NOTE(review): punctuation was stripped earlier, so contractions such as
# "dont" presumably no longer match the apostrophised stopword entries — verify.
df['Summary'] = df['Summary'].apply(_drop_stopwords)
df['Text'] = df['Text'].apply(_drop_stopwords)

In [12]:
# Preview the first rows after stopword removal.
df.head()

Unnamed: 0,Summary,Text
0,good quality dog food,bought several vitality canned dog food produc...
1,advertised,product arrived labeled jumbo salted peanutsth...
2,delight says,confection around centuries light pillowy cit...
3,cough medicine,looking secret ingredient robitussin believe f...
4,great taffy,great taffy great price wide assortment yummy...


## 1.6 Converting Emoji and Emoticons to words

In [26]:
# Converting emojis to words
def convert_emojis(text, emoji_map=None):
    """Replace every emoji in ``text`` with an underscore-joined description.

    Args:
        text: The string to process.
        emoji_map: Optional mapping of emoji -> description. When omitted, the
            ``UNICODE_EMO`` table imported at the top of the notebook is used.

    Returns:
        ``text`` with each key of the mapping replaced by its description,
        after commas and colons are removed from the description and its
        whitespace-separated parts are joined with underscores.
    """
    if emoji_map is None:
        emoji_map = UNICODE_EMO
    for emot, description in emoji_map.items():
        # Skip building the replacement for the many emojis that are absent.
        if emot in text:
            text = text.replace(emot, "_".join(description.replace(",", "").replace(":", "").split()))
    # Return only after the whole table has been tried. Returning inside the
    # loop stopped after the first entry and yielded None for an empty table.
    return text
# Converting emoticons to words
def convert_emoticons(text, emoticon_map=None):
    """Replace every emoticon in ``text`` with an underscore-joined description.

    Args:
        text: The string to process.
        emoticon_map: Optional mapping of emoticon pattern -> description. When
            omitted, the ``EMOTICONS`` table imported at the top of the
            notebook is used.

    Returns:
        ``text`` with each match of a mapping key replaced by its description,
        after commas are removed from the description and its
        whitespace-separated parts are joined with underscores.
    """
    if emoticon_map is None:
        emoticon_map = EMOTICONS
    for emot, description in emoticon_map.items():
        replacement = "_".join(description.replace(",", "").split())
        # Keys are inserted into the pattern unescaped, i.e. they are treated as
        # regular-expression fragments (presumably pre-escaped in the emot
        # table — verify). A callable replacement keeps any backslash in the
        # description from being interpreted as a regex escape.
        text = re.sub(u'(' + emot + ')', lambda _match, _repl=replacement: _repl, text)
    # Return only after the whole table has been tried. Returning inside the
    # loop stopped after the first entry and yielded None for an empty table.
    return text
# Run both conversions on each text column: emojis first, then emoticons.
# NOTE(review): section 1.4 already deleted every character outside [\w\s],
# so emoticons such as ":)" (and most emoji) are likely gone before this step
# runs — consider moving this conversion ahead of punctuation removal.
for column in ('Summary', 'Text'):
    df[column] = df[column].apply(convert_emojis)
    df[column] = df[column].apply(convert_emoticons)

## 1.7 Stemming the words

## 1.8 Spelling Correction

In [None]:
# TODO: export the cleaned frame once preprocessing is complete:
# df.to_csv('new.csv')