In [1]:
import pandas as pd
import numpy as np
from contractions import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import re
from tqdm import tqdm

In [2]:
def readAllDataAndMerge(dataDir='dataset'):
    """Load the average, positive and negative review CSVs and stack them.

    Parameters
    ----------
    dataDir : str, optional
        Directory that holds ``avg_cleaned_25k_yelp.csv``,
        ``pos_cleaned_25k_yelp.csv`` and ``neg_cleaned_25k_yelp.csv``.
        Defaults to ``'dataset'``, the location this notebook has always used.

    Returns
    -------
    pandas.DataFrame
        The rows of the three files in the order average, positive,
        negative, renumbered with a fresh 0..n-1 index.
    """
    fileNames = (
        'avg_cleaned_25k_yelp.csv',
        'pos_cleaned_25k_yelp.csv',
        'neg_cleaned_25k_yelp.csv',
    )
    frames = [pd.read_csv(dataDir + '/' + fileName) for fileName in fileNames]

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    # pd.concat is the supported replacement and copies the rows only once.
    return pd.concat(frames, ignore_index=True)

In [3]:
# Read the three review CSVs and combine them into a single DataFrame.
data = readAllDataAndMerge()

In [4]:
# Preview the first rows of the merged reviews.
data.head()

Unnamed: 0,stars,date,text,useful
0,3,2015-06-18,Very expensive. All I had was drinks. They are...,0
1,3,2017-08-04,Summary: a very nice congee restaurant that is...,0
2,3,2015-08-13,"Coffee was good, people were nice, but a coffe...",0
3,3,2012-08-06,My husband got the shrimp and I got the crabet...,4
4,3,2015-06-19,I was really looking forward to trying sidecar...,2


In [5]:
# Check column dtypes and non-null counts of the merged reviews.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 4 columns):
stars     75000 non-null int64
date      75000 non-null object
text      75000 non-null object
useful    75000 non-null int64
dtypes: int64(2), object(2)
memory usage: 2.3+ MB


In [6]:
def assignPolarityToReviews(stars):
    """Map a star rating to a sentiment label.

    Ratings above 3 give 1, a rating of exactly 3 gives 0, and anything
    else gives -1.
    """
    return 1 if stars > 3 else (0 if stars == 3 else -1)

In [7]:
# Derive a sentiment label for every review from its star rating and store
# it in a new 'polarity' column.
starsColumn = data['stars']
polarity = starsColumn.map(assignPolarityToReviews)
data['polarity'] = polarity # 1: Good | 0: Neutral | -1: Bad

In [12]:
# Shuffle the rows so reviews from the three source files are interleaved,
# then renumber the index. The fixed seed makes the row order, and every
# sample printed further down, reproducible across kernel restarts.
RANDOM_SEED = 42
data = data.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
data.head(10)

Unnamed: 0,stars,date,text,useful,polarity
0,5,2014-11-15,Best schnitzel place in Toronto. Friendly serv...,0,1
1,2,2017-03-07,We went for sushi . Service was super slow . S...,1,-1
2,3,2015-06-14,Santi and I came to this iconic pizzeria for d...,12,0
3,2,2014-06-04,portions are large.. price is cheap... thats a...,0,-1
4,3,2014-04-30,Had dinner with my wonderful and handsome husb...,0,0
5,1,2014-10-20,WORST place I have ever been canceled appointm...,5,-1
6,4,2010-12-13,I came here on Saturday night to try and unwin...,2,1
7,3,2015-08-29,This is the only Costco in Henderson so its al...,0,0
8,3,2016-09-03,This place is pretty overhyped for what it act...,0,0
9,3,2012-04-25,Full-size Wal-Mart that is a hair's breadth ac...,2,0


In [13]:
#Stopwords selection
#https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/WASSA/pdf/WASSA14.pdf

# Start from NLTK's English stop-word list, held as a set for fast membership tests.
stopWords = set(stopwords.words('english'))
# Printed for inspection only; a set has no stable order, so the listing can differ between runs.
print(stopWords)

{'doing', 'while', 'ain', 'there', 'during', "don't", 'themselves', 'under', 'than', 'won', "won't", 'up', 'down', 'through', 'do', 'it', 'them', "that'll", 't', 'ourselves', 'now', 'such', 'this', 'nor', 'yours', "mightn't", "wouldn't", 'more', 'yourselves', 'which', "isn't", 'needn', "needn't", 'shan', 'didn', 'her', 'into', 'these', 'those', 'any', 'again', 'ma', 'all', 'at', 'over', "she's", 'same', 's', 'we', 'because', 'too', 'should', 'few', 'mustn', 'our', 'their', "you've", 'wouldn', 'o', "didn't", 'why', "it's", 'couldn', 'did', "hadn't", "haven't", 'theirs', 'between', 'before', 'once', 'its', 'own', 'having', 'with', 'not', 'am', 'an', 'is', 'but', 'had', 'other', 'd', 'wasn', 'mightn', 'are', 'haven', 'that', 'how', 'only', 'of', 'my', 'most', "you're", 'he', 'll', 'who', 'y', 'does', 'for', 'i', 'himself', 'to', 'yourself', 'and', "shan't", 'will', 'further', 'or', 'from', 'a', 'the', 'here', "weren't", 'doesn', 'she', "you'll", 'hasn', 'on', "should've", "wasn't", 'hadn'

In [14]:

# Negation words are taken out of the stop-word set so that they survive
# the stop-word filtering applied during text cleaning.
wordsRequiredOfStopWords = ['not', 'nor', 'neither', 'no', "wasn't"]
stopWords = stopWords - set(wordsRequiredOfStopWords)

In [17]:
#Improving performance in string manipulation
def expandContractions(text):
    """Replace contractions in ``text`` with their expanded form.

    The text is split on single spaces and each piece is looked up in the
    ``contractions`` mapping, first verbatim and then lower-cased. The
    lower-case fallback lets capitalised, sentence-initial contractions
    ("Didn't", "Wasn't") expand too; without it the later cleaning steps
    strip the apostrophe and discard the remains as stop words, losing the
    negation.
    NOTE(review): this presumes the mapping's keys are lower-case; confirm
    against the contractions module. The fallback is harmless otherwise.

    When a mapping value lists several alternatives separated by '/', only
    the first one is used. Pieces with punctuation attached (for example
    "didn't.") are not matched and pass through unchanged.

    Parameters
    ----------
    text : str
        Raw text to expand.

    Returns
    -------
    str
        Every piece (expanded or not) followed by one space, so the result
        always ends with a trailing space.
    """
    expandedPieces = []
    for word in text.split(' '):
        key = word
        if key not in contractions:
            # Retry case-insensitively before giving up on the piece.
            key = word.lower()
        if key in contractions:
            # Always take the first listed expansion.
            expandedPieces.append(contractions[key].split('/')[0])
        else:
            expandedPieces.append(word)
    return ''.join(piece + ' ' for piece in expandedPieces)

def getNonStopWordsLowerCase(text):
    """Tokenize ``text``, lower-case the tokens and drop the unwanted ones.

    A token is kept when it is longer than one character and is not in
    ``stopWords``. The kept tokens are returned as one string, each token
    followed by a single space (so a non-empty result ends with a space).
    """
    loweredTokens = (token.lower() for token in word_tokenize(text))
    keptTokens = [
        token + ' '
        for token in loweredTokens
        if not (token in stopWords or len(token) <= 1)
    ]
    return ''.join(keptTokens)

def textCleaning(text):
    """Run the full cleaning pipeline on one review and return the result.

    Steps, in order: drop hyperlinks, strip markup, expand contractions,
    drop whitespace-separated words containing a digit, replace every run
    of non-letters with a space, then lower-case and remove stop words.
    The returned string has no leading or trailing whitespace.
    """
    # Hyperlinks: 'http' followed by any run of non-whitespace characters.
    withoutLinks = re.sub(r'http\S+', '', text)

    # Markup: keep only the text content.
    plainText = BeautifulSoup(withoutLinks, 'html.parser').get_text()

    # Contractions, e.g. as mapped by the contractions table.
    expandedText = expandContractions(plainText)

    # Words containing numbers are dropped entirely.
    # https://stackoverflow.com/questions/18082130/python-regex-to-remove-all-words-which-contains-number
    digitFreeWords = []
    for word in expandedText.split():
        if any(char.isdigit() for char in word):
            continue
        digitFreeWords.append(word)
    digitFreeText = ' '.join(digitFreeWords)

    # Special characters: anything that is not an ASCII letter becomes a space.
    lettersOnlyText = re.sub('[^A-Za-z]+', ' ', digitFreeText)

    # Lower-case, filter stop words, and trim the trailing separator.
    return getNonStopWordsLowerCase(lettersOnlyText).strip()


In [18]:

# Clean every review. The list stays aligned with `data`: one entry per row,
# in row order. Reviews that come out empty are kept rather than skipped, so
# positions keep matching the DataFrame rows.
preprocessed_texts = [textCleaning(review) for review in tqdm(data['text'].values)]


100%|██████████████████████████████████████████████████████████████████████████| 75000/75000 [00:49<00:00, 1523.06it/s]


In [19]:
# Sanity check: one cleaned text was appended per review.
len(preprocessed_texts)

75000

In [20]:
#Sample prints

def printPreprocessedTexts(idx):
    """Print the raw review, its cleaned form and its polarity for ``idx``.

    ``idx`` is used both to look up the row in ``data`` and as the position
    in ``preprocessed_texts``.
    """
    originalReview = data['text'][idx]
    print("Original Text:\t", originalReview, end='\n\n')

    cleanedReview = preprocessed_texts[idx]
    print("Preprocessed Text:\t", cleanedReview, end='\n\n')

    reviewPolarity = data['polarity'][idx]
    print("Polarity of the text: ", reviewPolarity)

In [21]:
# Sample 0: compare the raw review with its cleaned form.
printPreprocessedTexts(0)

Original Text:	 Best schnitzel place in Toronto. Friendly service- cheap price. They're loosing the location moving out on Dec 31st.

Preprocessed Text:	 best schnitzel place toronto friendly service cheap price loosing location moving dec

Polarity of the text:  1


In [22]:
# Sample 1: compare the raw review with its cleaned form.
printPreprocessedTexts(1)

Original Text:	 We went for sushi . Service was super slow . Sushi didn't taste fresh . And it wasn't even that busy. Looks like a tourist trap for convention people . Definitely not worth the price .

Preprocessed Text:	 went sushi service super slow sushi not taste fresh not even busy looks like tourist trap convention people definitely not worth price

Polarity of the text:  -1


In [23]:
# Sample 2: compare the raw review with its cleaned form.
printPreprocessedTexts(2)

Original Text:	 Santi and I came to this iconic pizzeria for dinner, and as usual, it was a positive experience. The wait was about half an hour for us. The dim lit red brick interior started with a patio dining area and small cozy interior with an open kitchen, brick oven, and display of their signature bread. They specialize in a small number of artisanal pizzas with a high mark up. The noise level was high, as we were seated on the counter overlooking a number of pizzas delivered to the tables. 

The Margherita ($15) came with tomato sauce, fresh mozzarella, and basil. Their pizza crust was excellent: thin, crisp, carried great flavor and a crunch in every bite. Their tomato sauce and mozzarella were equally excellent, but more basil could have helped to boost the flavor. I also suggest interested diners who wish for a savory anchor to order it with prosciutto. 

The Wisebuy ($18) was made with wood-roasted onion, house smoked mozzarella, and fennel sausage. The mozzarella had a nic

In [25]:
# Bundle each cleaned review with its polarity label and 'useful' count so
# the result can be written to CSV and the cleaning need not be recomputed.
to_save = pd.DataFrame(
    {
        'Review': preprocessed_texts,
        'Polarity': data['polarity'],
        'Useful': data['useful'],
    }
)
to_save.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 3 columns):
Review      75000 non-null object
Polarity    75000 non-null int64
Useful      75000 non-null int64
dtypes: int64(2), object(1)
memory usage: 1.7+ MB


In [26]:
# Persist the cleaned reviews; index=False keeps the row index out of the file.
to_save.to_csv('dataset/preprocessed.csv', index=False)