In [14]:
import pandas as pd
import numpy as np
from contractions import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer #BoW

In [2]:
def readAllDataAndMerge(posPath='dataset/pos_cleaned_25k_yelp.csv',
                        negPath='dataset/neg_cleaned_25k_yelp.csv'):
    """Read the two review CSV files and stack them into one DataFrame.

    Parameters
    ----------
    posPath : str, optional
        Path of the first CSV file; its rows come first in the result.
    negPath : str, optional
        Path of the second CSV file; its rows are placed after the first file's.

    Returns
    -------
    pandas.DataFrame
        Rows of ``posPath`` followed by rows of ``negPath``, re-indexed 0..n-1.
    """
    #data = pd.read_csv('dataset/avg_cleaned_25k_yelp.csv')

    posReviews = pd.read_csv(posPath)
    negReviews = pd.read_csv(negPath)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and yields the same row order and a fresh integer index.
    return pd.concat([posReviews, negReviews], ignore_index=True)

In [3]:
# Load the review CSV files into a single DataFrame (see readAllDataAndMerge).
data = readAllDataAndMerge()

In [4]:
# Preview the first rows of the merged frame.
data.head()

Unnamed: 0,stars,date,text,useful
0,5,2016-10-11,Nice Find!\n\nWe just happened on this restaur...,0
1,4,2015-01-13,So customer service might not be the greatest ...,2
2,4,2014-05-29,Never knew about this coffee shop until they o...,0
3,4,2014-11-13,Came here the other day to get a 60-minute det...,7
4,5,2016-11-10,This place is a bit of an institution and I us...,0


In [5]:
def assignPolarityToReviews(stars):
    """Map a star rating to a binary polarity label.

    Returns 1 when ``stars`` is strictly greater than 3, otherwise 0
    (so a rating of exactly 3 is labelled 0).
    """
    return 1 if stars > 3 else 0

In [11]:
# Derive a binary label from the star rating: assignPolarityToReviews returns
# 1 for ratings above 3 and 0 otherwise.
# NOTE(review): this cell's execution count (11) is higher than that of the
# shuffle cell that follows it (9), so the cells were last run out of order;
# confirm the notebook still works with Restart Kernel -> Run All.
starsColumn = data['stars']
polarity = starsColumn.map(assignPolarityToReviews)
data['polarity'] = polarity #1: Good | 0:Bad

In [9]:
# Shuffle all rows (frac=1 samples the whole frame) and rebuild the index.
# A fixed random_state makes the row order, and therefore every index-based
# output further down, reproducible across kernel restarts.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data.head()

Unnamed: 0,stars,date,text,useful,polarity
0,5,2016-09-24,The staff here make the process so easy. I was...,0,1
1,2,2017-11-13,This Fry's is very small and old and unfortuna...,0,0
2,5,2011-03-06,Just came back from this wonderful brunch plac...,2,1
3,2,2013-03-11,For a man that Yelps and relies on other Yelpe...,3,0
4,1,2015-05-25,This is nothing but a greedy money making scam...,0,0


In [10]:
#Stopwords selection

# Words listed here are taken out of the stop-word set, so the stop-word
# filter in getNonStopWordsLowerCase does not discard them.
wordsRequiredOfStopWords = ['not', 'nor', 'neither' , 'no']

stopWords = set(stopwords.words('english')) - set(wordsRequiredOfStopWords)

In [12]:

def expandContractions(text):
    """Replace contractions in ``text`` with their expanded form.

    The text is split on single spaces and each piece is looked up in the
    ``contractions`` mapping. A piece that is not found verbatim is retried
    without trailing punctuation and then in lower case, so that forms such
    as "Don't" or "won't." are expanded as well instead of being broken into
    fragments by the later cleaning steps. Pieces with no match are kept
    unchanged.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    str
        The pieces joined back together, each one (including the last)
        followed by a single space.
    """
    trailingPunctuation = '.,!?;:'
    expandedWords = []
    for word in text.split(' '):
        if word in contractions:
            # Exact hit: identical to a plain dictionary lookup.
            key, suffix = word, ''
        else:
            # Retry without trailing punctuation, then case-insensitively;
            # the stripped punctuation is re-attached after the expansion.
            core = word.rstrip(trailingPunctuation)
            suffix = word[len(core):]
            key = core if core in contractions else core.lower()

        if key in contractions:
            # Entries may list alternatives separated by '/'; always take the first.
            expandedWords.append(contractions[key].split('/')[0] + suffix)
        else:
            expandedWords.append(word)

    return ''.join(word + ' ' for word in expandedWords)

def getNonStopWordsLowerCase(text):
    """Tokenize ``text`` and keep the lower-cased tokens that pass the filter.

    A token is kept when it is longer than one character and is not in the
    module-level ``stopWords`` set. The result is a single string in which
    every kept token is followed by one space (so a non-empty result ends
    with a space); it is empty when nothing is kept.
    """
    loweredTokens = (token.lower() for token in word_tokenize(text))
    keptTokens = [
        token for token in loweredTokens
        if len(token) > 1 and token not in stopWords
    ]
    return ''.join(token + ' ' for token in keptTokens)

def textCleaning(text):
    """Run the full cleaning pipeline on one piece of text.

    Steps, in order: drop hyperlinks, strip markup with BeautifulSoup,
    expand contractions, discard whitespace-separated words containing a
    digit, replace every run of non-letters with a space, then keep only the
    lower-cased non-stop-word tokens. The returned string has no leading or
    trailing whitespace.
    """
    # Hyperlinks: "http" followed by any run of non-whitespace characters.
    withoutLinks = re.sub(r'http\S+', '', text)

    # Keep only the text content of any markup.
    plainText = BeautifulSoup(withoutLinks, 'html.parser').get_text()

    expandedText = expandContractions(plainText)

    # Drop every word that contains at least one digit.
    # https://stackoverflow.com/questions/18082130/python-regex-to-remove-all-words-which-contains-number
    digitFreeWords = filter(
        lambda word: not any(map(str.isdigit, word)),
        expandedText.split(),
    )
    withoutNumbers = ' '.join(digitFreeWords)

    # Anything that is not an ASCII letter becomes a single space.
    lettersOnly = re.sub('[^A-Za-z]+', ' ', withoutNumbers)

    return getNonStopWordsLowerCase(lettersOnly).strip()


In [15]:

# Clean every review. Reviews whose cleaned text is shorter than 2 characters
# are dropped; their positions are tracked so the same rows can be removed
# from `data`, keeping preprocessed_texts[i] aligned with data['text'][i] and
# data['polarity'][i].
preprocessed_texts = []
keptRowPositions = []

for rowPosition, text in enumerate(tqdm(data['text'].values)):
    doneText = textCleaning(text)
    if(len(doneText) < 2):
        continue
    preprocessed_texts.append(doneText)
    keptRowPositions.append(rowPosition)

# Only rebuild the frame when something was actually dropped.
if len(keptRowPositions) != len(data):
    data = data.iloc[keptRowPositions].reset_index(drop=True)

assert len(preprocessed_texts) == len(data), "texts and labels are out of sync"


100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [01:41<00:00, 494.91it/s]


In [24]:
#Sample prints

def printPreprocessedTexts(idx):
    """Print the original text, the cleaned text and the polarity for ``idx``.

    ``idx`` is used both as the index label into ``data`` and as the list
    position into ``preprocessed_texts``.
    """
    originalText = data['text'][idx]
    print("Original Text:\t", originalText, end='\n\n')

    cleanedText = preprocessed_texts[idx]
    print("Preprocessed Text:\t", cleanedText, end='\n\n')

    label = data['polarity'][idx]
    print("Polarity of the text: ", label)

In [25]:
# Spot-check index 0: original text, cleaned text and polarity.
printPreprocessedTexts(0)

Original Text:	 The staff here make the process so easy. I was welcomed with open arms and they are very helpful. They provided me a great service and an awesome rate. I would def recommend them for all insurance needs! Thanks Chester

Preprocessed Text:	 staff make process easy welcomed open arms helpful provided great service awesome rate would def recommend insurance needs thanks chester

Polarity of the text:  1


In [26]:
# Spot-check index 1: original text, cleaned text and polarity.
printPreprocessedTexts(1)

Original Text:	 This Fry's is very small and old and unfortunately, this store located in a not safe neighborhood. I had a food event yesterday, I saw some helpful associates and nice customers but my digital camera was stolen, I use it to take my event photo, I'm pretty upset.

Preprocessed Text:	 fry small old unfortunately store located not safe neighborhood food event yesterday saw helpful associates nice customers digital camera stolen use take event photo pretty upset

Polarity of the text:  0
