In [1]:
import pandas as pd
import numpy as np
from contractions import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer #BoW

In [2]:
def readAllDataAndMerge(posPath='dataset/pos_cleaned_25k_yelp.csv',
                        negPath='dataset/neg_cleaned_25k_yelp.csv'):
    """Read the positive and negative review CSVs and stack them into one frame.

    Parameters
    ----------
    posPath : str, optional
        Path of the CSV holding the positive reviews.
    negPath : str, optional
        Path of the CSV holding the negative reviews.

    Returns
    -------
    pandas.DataFrame
        Rows of ``posPath`` followed by rows of ``negPath`` with a fresh
        0..n-1 index.
    """
    # Note: 'dataset/avg_cleaned_25k_yelp.csv' is not loaded here; its read
    # was already commented out.
    posReviews = pd.read_csv(posPath)
    negReviews = pd.read_csv(negPath)

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported way to stack frames row-wise.
    return pd.concat([posReviews, negReviews], ignore_index=True)

In [3]:
# Load the positive and negative review files into a single DataFrame.
data = readAllDataAndMerge()

In [4]:
# Preview the first rows of the merged frame.
data.head()

Unnamed: 0,stars,date,text,useful
0,5,2016-10-11,Nice Find!\n\nWe just happened on this restaur...,0
1,4,2015-01-13,So customer service might not be the greatest ...,2
2,4,2014-05-29,Never knew about this coffee shop until they o...,0
3,4,2014-11-13,Came here the other day to get a 60-minute det...,7
4,5,2016-11-10,This place is a bit of an institution and I us...,0


In [5]:
def assignPolarityToReviews(stars):
    """Map a star rating to a binary polarity label.

    Returns 1 when ``stars`` is strictly greater than 3, otherwise 0.
    """
    return 1 if stars > 3 else 0

In [6]:
# Derive a binary sentiment label from the star rating:
# more than 3 stars -> 1, everything else -> 0.
starsColumn = data['stars']
polarity = starsColumn.map(assignPolarityToReviews)
data['polarity'] = polarity #1: Good | 0:Bad

In [7]:
# Shuffle the rows so the positive and negative reviews (which were stacked
# one file after the other) are interleaved. The fixed random_state makes the
# shuffle, and therefore the saved output, identical on every run.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data.head()

Unnamed: 0,stars,date,text,useful,polarity
0,1,2017-04-16,I always go to toms in mesa cause i work out t...,1,0
1,5,2015-04-28,great tacos. in my opinion better than Tacos e...,0,1
2,5,2009-10-12,Hands down my favorite sushi restaurant in Veg...,1,1
3,5,2017-07-02,This was a great find. The menu has a great se...,2,1
4,5,2015-01-18,Their macarons are soo unique compared to othe...,0,1


In [8]:
#Stopwords selection

# Words listed here are excluded from the stop-word set, so the cleaning step
# keeps them in the review text.
wordsRequiredOfStopWords = ['not', 'nor', 'neither' , 'no']

stopWords = {
    word
    for word in stopwords.words('english')
    if word not in wordsRequiredOfStopWords
}

In [9]:

def expandContractions(text, contractionMap=None):
    """Replace contractions in ``text`` with their expanded form.

    Parameters
    ----------
    text : str
        Raw text to process.
    contractionMap : dict, optional
        Mapping from contraction to expansion. An expansion may list several
        alternatives separated by '/'; only the first one is used. Defaults
        to the module-level ``contractions`` mapping.

    Returns
    -------
    str
        ``text`` with every recognised contraction replaced by the expansion
        stored in the mapping. All original whitespace is preserved and no
        trailing space is appended.
    """
    if contractionMap is None:
        contractionMap = contractions

    def expandToken(token):
        # Normalise the typographic apostrophe for the lookup only, so that
        # "don’t" is found under the key "don't".
        key = token.replace('\u2019', "'")
        # Try the token as written first, then lower-cased, so that
        # sentence-initial forms such as "Didn't" are expanded too.
        for candidate in (key, key.lower()):
            if candidate in contractionMap:
                # Always considering 1st form of verb for list
                return contractionMap[candidate].split('/')[0].strip()
        return token

    # The capturing group makes re.split return the whitespace runs as well,
    # so words separated by newlines or tabs are tokenised correctly and the
    # text can be re-assembled without altering its spacing.
    return ''.join(expandToken(token) for token in re.split(r'(\s+)', text))

def getNonStopWordsLowerCase(text):
    """Tokenise ``text`` and return its lower-cased non-stop-word tokens.

    Tokens that are in ``stopWords`` or are a single character long are
    dropped. Each kept token is followed by one space, so a non-empty result
    ends with a trailing space.
    """
    loweredTokens = (token.lower() for token in word_tokenize(text))
    keptTokens = [
        token
        for token in loweredTokens
        if len(token) > 1 and token not in stopWords
    ]
    return ''.join(token + ' ' for token in keptTokens)

def textCleaning(text):
    """Clean one raw review and return it as lower-case, space-separated words.

    Steps, in order: strip hyperlinks, strip HTML tags, expand contractions,
    drop words containing digits, replace non-letter characters with spaces,
    then lower-case the text and remove stop words.
    """
    # Hyperlinks: 'http' followed by any run of non-whitespace characters.
    withoutLinks = re.sub(r'http\S+', '', text)

    # Keep only the visible text of any HTML markup.
    plainText = BeautifulSoup(withoutLinks, 'html.parser').get_text()

    expandedText = expandContractions(plainText)

    # Discard every whitespace-delimited word that contains a digit.
    # https://stackoverflow.com/questions/18082130/python-regex-to-remove-all-words-which-contains-number
    digitFreeWords = [
        word
        for word in expandedText.split()
        if not any(ch.isdigit() for ch in word)
    ]
    digitFreeText = ' '.join(digitFreeWords)

    # Collapse each run of non-letter characters into a single space.
    lettersOnly = re.sub('[^A-Za-z]+', ' ', digitFreeText)

    return getNonStopWordsLowerCase(lettersOnly).strip()


In [15]:

# Clean every review. Reviews that end up empty are kept on purpose so the
# list stays aligned, row for row, with `data` when the two are combined later.
preprocessed_texts = [
    textCleaning(rawText) for rawText in tqdm(data['text'].values)
]


100%|██████████████████████████████████████████████████████████████████████████| 50000/50000 [00:33<00:00, 1493.52it/s]


In [16]:
#Sample prints

def printPreprocessedTexts(idx):
    """Print the raw review, its cleaned version and its polarity for row ``idx``."""
    originalText = data['text'][idx]
    print("Original Text:\t", originalText, end='\n\n')

    cleanedText = preprocessed_texts[idx]
    print("Preprocessed Text:\t", cleanedText, end='\n\n')

    label = data['polarity'][idx]
    print("Polarity of the text: ", label)

In [17]:
# Spot-check: compare raw and cleaned text for row 0 of the shuffled frame.
printPreprocessedTexts(0)

Original Text:	 I always go to toms in mesa cause i work out there and I love it!! Try this location cause i live on the west side and omfg!! They are out of tips,no potatoes, and out of fried pickles?!?!?!?!?  The bathroom stinks and it looks run down in here..also there is a sewage smell.. Never again very disappointed in this location. Never again!! I will continue to go to the mesa location. The lady at the desk didn't even give me my points for 5star!! Wtf!!!

Preprocessed Text:	 always go toms mesa cause work love try location cause live west side omfg tips no potatoes fried pickles bathroom stinks looks run also sewage smell never disappointed location never continue go mesa location lady desk not even give points wtf

Polarity of the text:  0


In [18]:
# Spot-check: compare raw and cleaned text for row 1 of the shuffled frame.
printPreprocessedTexts(1)

Original Text:	 great tacos. in my opinion better than Tacos el Gordo. great customer service friendly staff fast and delicious. i recomend the alambres feed 2-3 people

Preprocessed Text:	 great tacos opinion better tacos el gordo great customer service friendly staff fast delicious recomend alambres feed people

Polarity of the text:  1


In [19]:
#Writing to CSV to avoid recomputation

# Output columns, in order: Review (cleaned text), Polarity, Useful.
to_save = data[['polarity', 'useful']].rename(
    columns={'polarity': 'Polarity', 'useful': 'Useful'}
)
to_save.insert(0, 'Review', preprocessed_texts)
to_save.to_csv('dataset/preprocessed.csv', index=False)