In [1]:
import pandas as pd
import numpy as np
from contractions import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import re
from tqdm import tqdm

In [2]:
def readAllDataAndMerge(dataDir='dataset'):
    """Load the three Yelp review CSV files and stack them into one DataFrame.

    The files are read from ``dataDir`` and concatenated in the order
    average, positive, negative; the result gets a fresh 0..n-1 index.

    ``DataFrame.append`` was deprecated in pandas 1.4 and removed in
    pandas 2.0, so the frames are combined with a single ``pd.concat`` call.

    Parameters
    ----------
    dataDir : str, optional
        Directory holding the CSV files. Defaults to ``'dataset'``, the
        location that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        All rows of the three files, in file order, with a new index.
    """
    fileNames = (
        'avg_cleaned_25k_yelp.csv',
        'pos_cleaned_25k_yelp.csv',
        'neg_cleaned_25k_yelp.csv',
    )
    frames = [pd.read_csv('{}/{}'.format(dataDir, fileName)) for fileName in fileNames]
    # ignore_index=True renumbers the rows instead of repeating each file's own index.
    return pd.concat(frames, ignore_index=True)

In [3]:
# Load the three review files and stack them into a single DataFrame.
data = readAllDataAndMerge()

In [4]:
data.head()

Unnamed: 0,stars,date,text,useful
0,3,2015-06-18,Very expensive. All I had was drinks. They are...,0
1,3,2017-08-04,Summary: a very nice congee restaurant that is...,0
2,3,2015-08-13,"Coffee was good, people were nice, but a coffe...",0
3,3,2012-08-06,My husband got the shrimp and I got the crabet...,4
4,3,2015-06-19,I was really looking forward to trying sidecar...,2


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 4 columns):
stars     75000 non-null int64
date      75000 non-null object
text      75000 non-null object
useful    75000 non-null int64
dtypes: int64(2), object(2)
memory usage: 2.3+ MB


In [6]:
def assignPolarityToReviews(stars):
    """Map a star rating to a polarity label.

    Ratings above 3 yield 1, a rating of exactly 3 yields 0, and every
    other value yields -1.
    """
    if stars == 3:
        return 0
    return 1 if stars > 3 else -1

In [7]:
# Derive a polarity label for every review from its star rating.
starsColumn = data['stars']
polarity = starsColumn.map(assignPolarityToReviews)
data['polarity'] = polarity #1: Good | 0: Neutral | -1: Bad

In [8]:
# Shuffle all rows so the three source files are interleaved. The fixed seed
# keeps the row order, and everything derived from it, identical across
# re-runs of the notebook.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data.head(10)

Unnamed: 0,stars,date,text,useful,polarity
0,4,2012-01-11,My sister and I really enjoyed Cartet in April...,0,1
1,1,2016-03-03,Watch their pricing! I wanted to buy 18 of a...,0,-1
2,4,2017-04-09,I like sampling the different teas and checkin...,0,1
3,5,2017-01-19,The crispy chicken sandwich was so good I'm co...,0,1
4,5,2016-06-26,My boyfriend and I have one trip to Vegas annu...,0,1
5,4,2010-10-26,Went here for lunch today with a local who had...,3,1
6,3,2017-01-19,This used to be my favorite raspado place. The...,1,0
7,3,2017-12-10,The tacos are really good. But the seating are...,0,0
8,3,2011-06-12,A friend recommended this restaurant to me for...,1,0
9,1,2013-01-20,They said they would show up between 5:00 pm a...,11,-1


In [16]:
# Stopwords selection
# Reference: https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/WASSA/pdf/WASSA14.pdf

# Start from NLTK's English stop-word list, stored as a set.
stopWords = set(stopwords.words('english'))
print(stopWords)

{'am', 'will', 'mightn', "mustn't", "weren't", 't', 'then', 'after', 'can', "mightn't", 'does', 'just', "needn't", 'is', 'with', 'from', 'this', 'her', 'what', 'but', "isn't", 'to', 'not', 'out', 'yours', 'by', 'll', 've', 'was', 'for', 'again', 'in', 'me', 'wasn', 'themselves', "should've", 'if', 'very', 'under', 'ourselves', 'so', "wouldn't", 'theirs', "she's", 'while', 'further', 'each', 'herself', 'isn', 'have', 'between', 'all', "don't", 'him', 'itself', 'there', 'you', 'did', 'through', 'be', 'most', 'which', 'doesn', 'some', 'off', 'only', 'his', 'too', 'over', 're', 'at', 'shouldn', 'your', 'couldn', 'where', "aren't", 'them', 'i', 'once', 'before', 'no', 'below', 'into', 'y', 'didn', "you've", 'o', 'these', 'than', 'ours', "couldn't", "that'll", 'aren', 'why', 'himself', 'wouldn', 'few', 'd', "you'll", "wasn't", 'weren', 'because', 'those', 'should', "won't", 'won', 'any', 'haven', 'now', 'been', 'had', 'up', 'needn', 'that', 'yourself', 'and', 'are', 'myself', 'my', 'doing', 

In [18]:
def filterStopWords(words):
    """Return the set of entries in ``words`` that do not contain "n't".

    Any iterable of strings is accepted; the result is always a new set.
    """
    return {candidate for candidate in words if "n't" not in candidate}



# Drop every stop word that contains "n't" from the set.
stopWords = filterStopWords(stopWords)
print(stopWords)

{'am', 'will', 'mightn', 't', 'then', 'after', 'can', 'does', 'just', 'is', 'with', 'from', 'this', 'her', 'what', 'but', 'to', 'not', 'out', 'yours', 'by', 'll', 've', 'was', 'for', 'again', 'in', 'me', 'wasn', 'themselves', "should've", 'if', 'very', 'under', 'ourselves', 'so', 'theirs', "she's", 'while', 'further', 'each', 'herself', 'isn', 'have', 'between', 'all', 'him', 'itself', 'there', 'you', 'did', 'through', 'be', 'most', 'which', 'doesn', 'some', 'off', 'only', 'his', 'too', 'over', 're', 'at', 'shouldn', 'your', 'couldn', 'where', 'them', 'i', 'once', 'before', 'no', 'below', 'into', 'y', 'didn', "you've", 'o', 'these', 'than', 'ours', "that'll", 'aren', 'why', 'himself', 'wouldn', 'few', 'd', "you'll", 'weren', 'because', 'those', 'should', 'won', 'any', 'haven', 'now', 'been', 'had', 'up', 'needn', 'that', 'yourself', 'and', 'are', 'myself', 'my', 'doing', 'down', "you're", 'don', 'he', 'it', 'yourselves', 'being', 'mustn', 'were', 'our', 'as', 'ma', 'until', "it's", 'ha

In [19]:

# Words that must survive stop-word filtering: take them out of the
# stop-word set so getNonStopWordsLowerCase does not discard them.
wordsRequiredOfStopWords = ['not', 'nor', 'neither', 'no']
stopWords = stopWords - set(wordsRequiredOfStopWords)

In [20]:
#Improving performance in string manipulation
def expandContractions(text):
    """Replace contractions in ``text`` with their expanded form.

    The text is split on single spaces and each piece is looked up in the
    ``contractions`` mapping. When a mapping value lists several expansions
    separated by '/', the first one is used. Every piece, expanded or not,
    is followed by a single space in the result, so the returned string
    always ends with a space.

    The lookup tries the piece exactly as written and then its lower-cased
    form. textCleaning calls this before the text is lower-cased, so
    without the fallback a capitalised contraction (e.g. a sentence-initial
    "Don't") stays unexpanded whenever the mapping only has lower-case keys.
    NOTE(review): the key casing of ``contractions`` is not visible here;
    the exact-match attempt comes first so mixed-case keys keep working.
    """
    expandedPieces = []
    for word in text.split(' '):
        # Prefer the exact spelling; otherwise try the lower-cased one.
        key = word if word in contractions else word.lower()
        if key in contractions:
            # Always take the first listed alternative.
            expandedPieces.append(contractions[key].split('/')[0])
        else:
            expandedPieces.append(word)
    # One trailing space per piece, the same output format as before.
    return ''.join(piece + ' ' for piece in expandedPieces)

def getNonStopWordsLowerCase(text):
    """Tokenize ``text`` and keep the lower-cased tokens that are not stop words.

    Tokens come from ``word_tokenize``. A token is kept when its lower-cased
    form is longer than one character and is absent from the module-level
    ``stopWords`` set. Each kept token is followed by a single space, so a
    non-empty result ends with a trailing space.
    """
    loweredTokens = (token.lower() for token in word_tokenize(text))
    keptTokens = [
        token for token in loweredTokens
        if len(token) > 1 and token not in stopWords
    ]
    return ''.join(token + ' ' for token in keptTokens)

def textCleaning(text):
    """Run the full cleaning pipeline on one raw review string.

    Steps, in order: drop hyperlinks, strip markup tags, expand
    contractions, discard whitespace-separated words that contain a digit,
    replace every run of non-letter characters with one space, then
    lower-case the tokens and remove stop words. The returned string has no
    leading or trailing whitespace.
    """
    # Hyperlinks: 'http' followed by any run of non-whitespace characters.
    withoutLinks = re.sub(r'http\S+', '', text)

    # Markup tags are discarded; only the text content is kept.
    plainText = BeautifulSoup(withoutLinks, 'html.parser').get_text()

    expandedText = expandContractions(plainText)

    # Any word holding at least one digit is removed entirely.
    # https://stackoverflow.com/questions/18082130/python-regex-to-remove-all-words-which-contains-number
    digitFreeWords = [
        word for word in expandedText.split()
        if not any(ch.isdigit() for ch in word)
    ]
    digitFreeText = ' '.join(digitFreeWords)

    # Everything that is not an ASCII letter becomes a single separator space.
    lettersOnly = re.sub('[^A-Za-z]+', ' ', digitFreeText)

    return getNonStopWordsLowerCase(lettersOnly).strip()


In [21]:

preprocessed_texts = []

# Clean every review in row order, with a progress bar.
for text in tqdm(data['text'].values):
    doneText = textCleaning(text)
    # Very short or empty results are deliberately kept: the list must hold
    # exactly one entry per row of `data` so it can later be combined with
    # the polarity and useful columns.
    preprocessed_texts.append(doneText)


100%|██████████████████████████████████████████████████████████████████████████| 75000/75000 [00:49<00:00, 1502.21it/s]


In [22]:
len(preprocessed_texts)

75000

In [23]:
#Sample prints

def printPreprocessedTexts(idx):
    """Print the raw review, its cleaned version and its polarity for ``idx``.

    Reads the module-level ``data`` and ``preprocessed_texts``; nothing is
    returned.
    """
    originalText = data['text'][idx]
    print("Original Text:\t", originalText, end='\n\n')

    cleanedText = preprocessed_texts[idx]
    print("Preprocessed Text:\t", cleanedText, end='\n\n')

    polarityLabel = data['polarity'][idx]
    print("Polarity of the text: ", polarityLabel)

In [24]:
printPreprocessedTexts(0)

Original Text:	 My sister and I really enjoyed Cartet in April.  If the day is nice and the wait is long, get prepared items from the shop section and picnic.  We waited just long enough for me to buy blueberry chocolate spread from the shop (AMAZING).  Brunch menu is extensive, and we both ordered huge plates with a little bit of everything.  Sister--Brunch de L'Atlantique: Eggs with smoked salmon, half a bagel with all the typical accoutrements, greens, a little sweet potato and salmon cake, and fruit. Me: Brunch de Sante--yogurt with granola and blueberries, poached eggs on greens, and grilled bread, cheddar sticks, and a fig.  And it came with coffee and juice, all for $15.  This is the kind of plate I would dream up for myself at home.

Preprocessed Text:	 sister really enjoyed cartet april day nice wait long get prepared items shop section picnic waited long enough buy blueberry chocolate spread shop amazing brunch menu extensive ordered huge plates little bit everything sister b

In [25]:
printPreprocessedTexts(1)

Original Text:	 Watch their pricing!  I wanted to  buy 18 of an item and they attempted to charge me the price for a dozen and then a higher price for the last half dozen!  The cashier said they had only two buttons on their register.  The "manager" on duty said that higher price for the last half dozen was their pricing policy!  In the spirit of "let the the buyer beware", consider yourself warned!

Preprocessed Text:	 watch pricing wanted buy item attempted charge price dozen higher price last half dozen cashier said two buttons register manager duty said higher price last half dozen pricing policy spirit let buyer beware consider warned

Polarity of the text:  -1


In [26]:
printPreprocessedTexts(2)

Original Text:	 I like sampling the different teas and checking out the new products. I was happy to see the company redesigned one of tumblers. The staff at his location is very knowledgeable about their products.

Preprocessed Text:	 like sampling different teas checking new products happy see company redesigned one tumblers staff location knowledgeable products

Polarity of the text:  1


In [27]:
#Writing to CSV to avoid recomputation

# Combine the cleaned text with the polarity and useful columns of `data`
# into one frame with the columns Review, Polarity and Useful.
to_save = pd.DataFrame({'Review': preprocessed_texts, 'Polarity': data['polarity'], 'Useful':data['useful'] })
to_save.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 3 columns):
Review      75000 non-null object
Polarity    75000 non-null int64
Useful      75000 non-null int64
dtypes: int64(2), object(1)
memory usage: 1.7+ MB


In [28]:
# index=False keeps the row index out of the file, so only the frame's own columns are written.
to_save.to_csv('dataset/preprocessed.csv', index=False)