In [1]:
import string
import re

import pandas as pd

import contractions

from nltk.stem import WordNetLemmatizer, PorterStemmer, SnowballStemmer, LancasterStemmer
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize 

In [2]:
# Load the Yelp restaurant reviews CSV into a DataFrame and preview the first rows.
# NOTE(review): the path below is absolute and machine-specific; it has to be
# adjusted before the notebook can run on another machine.
yelp_data_5k = pd.read_csv(
    filepath_or_buffer=r"C:\Users\sebas\Documents\School\bachelor-final\Dataset\RESTAURANT_YELP_5K.csv",
)
yelp_data_5k.head()

Unnamed: 0,reviewContent,usefulCount,coolCount,funnyCount,rating,restaurantID
0,"""'Check, Please."" The bartender was unable to ...",18,11,25,1,VZHyAmdFDreQqL0BT-zdoA
1,"""2 stars for disappointing food, one star for ...",14,10,7,3,tFcmrGLZNEymSnijoTPmqw
2,"""A Divine Dialogue"" God: ""Britton. Times up. T...",14,16,23,5,INvIaBFnAvGxzTXFWHzGvA
3,"""A Place To Go When You Have Time"" It was a mi...",13,10,10,5,FySId5SjNhkrtPA5qktdxg
4,"""A Thaiphoon of Flavor"" My First: I never enli...",27,22,28,5,RgeMUiZncTs-VSHQLm0wNg


# Functions for cleaning textual data

## Default cleaning without removing stopwords
* remove the **\xa0** character (**\xa0** is the non-breaking space in Latin-1 (ISO 8859-1), i.e. chr(160))
* remove links/URLs
* lowercase words
* expand contractions (e.g. **we'll** becomes **we will**)
* remove punctuation
* tokenize words
* remove numbers
* remove words consisting of one letter
* remove whitespaces

In [3]:
def default_clean_without_removing_stopwords(review):
    """Normalise a raw review into a space-separated string of cleaned tokens.

    Steps, in order: replace non-breaking spaces with regular spaces, drop
    ``http...`` links, lowercase, expand contractions, split on whitespace,
    strip leading/trailing punctuation from every token, then discard tokens
    that contain a digit or are shorter than two characters. Stopwords are
    kept. Punctuation *inside* a token (e.g. ``callin'...ya``) is left as is,
    because only the token edges are stripped.

    Args:
        review: Raw review text (``str``).

    Returns:
        The surviving tokens joined by single spaces (``""`` if none survive).
    """
    # A non-breaking space separates words just like a regular space; deleting
    # it outright would glue the neighbouring words into one token.
    review = re.sub(r"\xa0", " ", review)
    # remove links
    review = re.sub(r"http\S+", "", review)
    # lowercase words
    review = review.lower()
    # expand contractions
    review = contractions.fix(review)
    # Tokenize on any run of whitespace (spaces, tabs, newlines) so words on
    # consecutive lines are not fused, then strip punctuation from token edges.
    tokens = [word.strip(string.punctuation) for word in review.split()]
    # Drop tokens containing digits and tokens shorter than two characters
    # (the length check also discards tokens emptied by the punctuation strip).
    tokens = [
        word
        for word in tokens
        if len(word) > 1 and not any(character.isdigit() for character in word)
    ]
    return " ".join(tokens)

## Default cleaning with stopwords removal
* remove the **\xa0** character (**\xa0** is the non-breaking space in Latin-1 (ISO 8859-1), i.e. chr(160))
* remove links/URLs
* lowercase words
* expand contractions (e.g. **we'll** becomes **we will**)
* remove punctuation
* tokenize words
* remove numbers
* remove stopwords
* remove whitespaces/nonwords
* remove words consisting of one letter

In [4]:
def default_clean_remove_stopwords(review):
    """Normalise a raw review and drop English stopwords.

    Steps, in order: replace non-breaking spaces with regular spaces, drop
    ``http...`` links, lowercase, expand contractions, split on whitespace,
    strip leading/trailing punctuation from every token, then discard tokens
    that contain a digit, are NLTK English stopwords, or are shorter than two
    characters. Punctuation inside a token is left as is, because only the
    token edges are stripped.

    Args:
        review: Raw review text (``str``).

    Returns:
        The surviving tokens joined by single spaces (``""`` if none survive).
    """
    # A non-breaking space separates words just like a regular space; deleting
    # it outright would glue the neighbouring words into one token.
    review = re.sub(r"\xa0", " ", review)
    # remove links
    review = re.sub(r"http\S+", "", review)
    # lowercase words
    review = review.lower()
    # expand contractions
    review = contractions.fix(review)
    # Tokenize on any run of whitespace (spaces, tabs, newlines) so words on
    # consecutive lines are not fused, then strip punctuation from token edges.
    tokens = [word.strip(string.punctuation) for word in review.split()]
    # A set gives constant-time membership tests instead of scanning a list
    # once per token.
    stopwords_engl = set(stopwords.words("english"))
    # Drop digit-bearing tokens, stopwords and tokens shorter than two
    # characters (which also covers tokens emptied by the punctuation strip).
    tokens = [
        word
        for word in tokens
        if len(word) > 1
        and word not in stopwords_engl
        and not any(character.isdigit() for character in word)
    ]
    return " ".join(tokens)

# Stemming with PorterStemmer
* remove the **\xa0** character (**\xa0** is the non-breaking space in Latin-1 (ISO 8859-1), i.e. chr(160))
* remove links/URLs
* lowercase words
* expand contractions (e.g. **we'll** becomes **we will**)
* remove punctuation
* tokenize words
* remove numbers
* remove stopwords
* remove whitespaces
* stem using PorterStemmer
* remove words consisting of one letter

In [5]:
def stem_clean_remove_stopwords(review):
    """Normalise a raw review, drop English stopwords and Porter-stem the rest.

    Steps, in order: replace non-breaking spaces with regular spaces, drop
    ``http...`` links, lowercase, expand contractions, split on whitespace,
    strip leading/trailing punctuation from every token, discard empty tokens,
    tokens containing a digit and NLTK English stopwords, stem each remaining
    token with ``PorterStemmer``, and finally discard stems shorter than two
    characters. Punctuation inside a token is left as is.

    Args:
        review: Raw review text (``str``).

    Returns:
        The stemmed tokens joined by single spaces (``""`` if none survive).
    """
    # A non-breaking space separates words just like a regular space; deleting
    # it outright would glue the neighbouring words into one token.
    review = re.sub(r"\xa0", " ", review)
    # remove links
    review = re.sub(r"http\S+", "", review)
    # lowercase words
    review = review.lower()
    # expand contractions
    review = contractions.fix(review)
    # Tokenize on any run of whitespace (spaces, tabs, newlines) so words on
    # consecutive lines are not fused, then strip punctuation from token edges.
    tokens = [word.strip(string.punctuation) for word in review.split()]
    # A set gives constant-time membership tests instead of scanning a list
    # once per token.
    stopwords_engl = set(stopwords.words("english"))
    # Drop empty tokens, digit-bearing tokens and stopwords before stemming.
    tokens = [
        word
        for word in tokens
        if word
        and word not in stopwords_engl
        and not any(character.isdigit() for character in word)
    ]
    # stem review
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in tokens]
    # The length filter runs after stemming so it applies to the stems.
    stems = [word for word in stems if len(word) > 1]
    return " ".join(stems)

# Stemming with SnowballStemmer
* remove the **\xa0** character (**\xa0** is the non-breaking space in Latin-1 (ISO 8859-1), i.e. chr(160))
* remove links/URLs
* lowercase words
* expand contractions (e.g. **we'll** becomes **we will**)
* remove punctuation
* tokenize words
* remove numbers
* remove stopwords
* remove whitespaces
* stem using SnowballStemmer
* remove words consisting of one letter

In [6]:
def stem_snowball_clean_remove_stopwords(review):
    """Normalise a raw review, drop English stopwords and Snowball-stem the rest.

    Steps, in order: replace non-breaking spaces with regular spaces, drop
    ``http...`` links, lowercase, expand contractions, split on whitespace,
    strip leading/trailing punctuation from every token, discard empty tokens,
    tokens containing a digit and NLTK English stopwords, stem each remaining
    token with ``SnowballStemmer("english")``, and finally discard stems
    shorter than two characters. Punctuation inside a token is left as is.

    Args:
        review: Raw review text (``str``).

    Returns:
        The stemmed tokens joined by single spaces (``""`` if none survive).
    """
    # A non-breaking space separates words just like a regular space; deleting
    # it outright would glue the neighbouring words into one token.
    review = re.sub(r"\xa0", " ", review)
    # remove links
    review = re.sub(r"http\S+", "", review)
    # lowercase words
    review = review.lower()
    # expand contractions
    review = contractions.fix(review)
    # Tokenize on any run of whitespace (spaces, tabs, newlines) so words on
    # consecutive lines are not fused, then strip punctuation from token edges.
    tokens = [word.strip(string.punctuation) for word in review.split()]
    # A set gives constant-time membership tests instead of scanning a list
    # once per token.
    stopwords_engl = set(stopwords.words("english"))
    # Drop empty tokens, digit-bearing tokens and stopwords before stemming.
    tokens = [
        word
        for word in tokens
        if word
        and word not in stopwords_engl
        and not any(character.isdigit() for character in word)
    ]
    # stem review
    stemmer = SnowballStemmer("english")
    stems = [stemmer.stem(word) for word in tokens]
    # The length filter runs after stemming so it applies to the stems.
    stems = [word for word in stems if len(word) > 1]
    return " ".join(stems)

# Lemmatizing with WordNetLemmatizer
* remove the **\xa0** character (**\xa0** is the non-breaking space in Latin-1 (ISO 8859-1), i.e. chr(160))
* remove links/URLs
* lowercase words
* expand contractions (e.g. **we'll** becomes **we will**)
* remove punctuation
* tokenize words
* remove numbers
* remove stopwords
* lemmatize using WordNetLemmatizer
* remove words consisting of one letter
* remove whitespaces

WordNetLemmatizer takes the word's part of speech as a second argument (it assumes a noun when none is given), so a helper function is needed to map each POS tag to the matching WordNet category.

In [7]:
def get_word_pos(pos_tag):
    """Map a POS tag string to the corresponding WordNet part-of-speech constant.

    The tag is matched by its first letter: ``J`` -> adjective, ``V`` -> verb,
    ``N`` -> noun, ``R`` -> adverb. Any other tag falls back to noun.

    Args:
        pos_tag: Tag string for one token. Inside this function the name
            shadows the imported ``pos_tag`` function.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
        ``wordnet.ADV``.
    """
    prefix_to_wordnet = (
        ("J", wordnet.ADJ),
        ("V", wordnet.VERB),
        ("N", wordnet.NOUN),
        ("R", wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [8]:
def lemmatize_wordnet_clean_remove_stopwords(review):
    """Normalise a raw review, drop English stopwords and lemmatize the rest.

    Steps, in order: replace non-breaking spaces with regular spaces, drop
    ``http...`` links, lowercase, expand contractions, split on whitespace,
    strip leading/trailing punctuation from every token, discard empty tokens,
    tokens containing a digit and NLTK English stopwords, POS-tag the
    remaining tokens, lemmatize each with ``WordNetLemmatizer`` using the
    category returned by ``get_word_pos``, and finally discard lemmas shorter
    than two characters. Punctuation inside a token is left as is.

    Args:
        review: Raw review text (``str``).

    Returns:
        The lemmatized tokens joined by single spaces (``""`` if none survive).
    """
    # A non-breaking space separates words just like a regular space; deleting
    # it outright would glue the neighbouring words into one token.
    review = re.sub(r"\xa0", " ", review)
    # remove links
    review = re.sub(r"http\S+", "", review)
    # lowercase words
    review = review.lower()
    # expand contractions
    review = contractions.fix(review)
    # Tokenize on any run of whitespace (spaces, tabs, newlines) so words on
    # consecutive lines are not fused, then strip punctuation from token edges.
    tokens = [word.strip(string.punctuation) for word in review.split()]
    # A set gives constant-time membership tests instead of scanning a list
    # once per token.
    stopwords_engl = set(stopwords.words("english"))
    # Drop empty tokens, digit-bearing tokens and stopwords before tagging.
    tokens = [
        word
        for word in tokens
        if word
        and word not in stopwords_engl
        and not any(character.isdigit() for character in word)
    ]
    # lemmatize review; one lemmatizer instance is reused for every token
    # instead of constructing a new one per token.
    pos_tags = pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word, get_word_pos(tag)) for word, tag in pos_tags]
    # The length filter runs after lemmatizing so it applies to the lemmas.
    lemmas = [word for word in lemmas if len(word) > 1]
    return " ".join(lemmas)

# Stemming with Lancaster Stemmer
* remove the **\xa0** character (**\xa0** is the non-breaking space in Latin-1 (ISO 8859-1), i.e. chr(160))
* remove links/URLs
* lowercase words
* expand contractions (e.g. **we'll** becomes **we will**)
* remove punctuation
* tokenize words
* remove numbers
* remove stopwords
* stem using Lancaster
* remove words consisting of one letter
* remove whitespaces

In [9]:
def lancaster_clean_remove_stopwords(review):
    """Normalise a raw review, drop English stopwords and Lancaster-stem the rest.

    Steps, in order: replace non-breaking spaces with regular spaces, drop
    ``http...`` links, lowercase, expand contractions, split on whitespace,
    strip leading/trailing punctuation from every token, discard empty tokens,
    tokens containing a digit and NLTK English stopwords, stem each remaining
    token with ``LancasterStemmer``, and finally discard stems shorter than
    two characters. Punctuation inside a token is left as is.

    Args:
        review: Raw review text (``str``).

    Returns:
        The stemmed tokens joined by single spaces (``""`` if none survive).
    """
    # A non-breaking space separates words just like a regular space; deleting
    # it outright would glue the neighbouring words into one token.
    review = re.sub(r"\xa0", " ", review)
    # remove links
    review = re.sub(r"http\S+", "", review)
    # lowercase words
    review = review.lower()
    # expand contractions
    review = contractions.fix(review)
    # Tokenize on any run of whitespace (spaces, tabs, newlines) so words on
    # consecutive lines are not fused, then strip punctuation from token edges.
    tokens = [word.strip(string.punctuation) for word in review.split()]
    # A set gives constant-time membership tests instead of scanning a list
    # once per token.
    stopwords_engl = set(stopwords.words("english"))
    # Drop empty tokens, digit-bearing tokens and stopwords before stemming.
    tokens = [
        word
        for word in tokens
        if word
        and word not in stopwords_engl
        and not any(character.isdigit() for character in word)
    ]
    # stem review
    stemmer = LancasterStemmer()
    stems = [stemmer.stem(word) for word in tokens]
    # The length filter runs after stemming so it applies to the stems.
    stems = [word for word in stems if len(word) > 1]
    return " ".join(stems)

In [10]:
# Run every cleaning variant over the raw review text and store each result
# in its own column; columns are added in the order listed here.
cleaners_by_column = {
    'reviewCleanWithStopwords': default_clean_without_removing_stopwords,
    'reviewCleanNoStopwords': default_clean_remove_stopwords,
    'reviewCleanPorterStemmer': stem_clean_remove_stopwords,
    'reviewCleanSnowballStemmer': stem_snowball_clean_remove_stopwords,
    'reviewCleanLemmatized': lemmatize_wordnet_clean_remove_stopwords,
    'reviewCleanLancaster': lancaster_clean_remove_stopwords,
}
for column_name, cleaner in cleaners_by_column.items():
    yelp_data_5k[column_name] = yelp_data_5k['reviewContent'].apply(cleaner)

In [11]:
# Preview up to the first 15 rows to compare the raw review text with each cleaned variant.
yelp_data_5k.head(15)

Unnamed: 0,reviewContent,usefulCount,coolCount,funnyCount,rating,restaurantID,reviewCleanWithStopwords,reviewCleanNoStopwords,reviewCleanPorterStemmer,reviewCleanSnowballStemmer,reviewCleanLemmatized,reviewCleanLancaster
0,"""'Check, Please."" The bartender was unable to ...",18,11,25,1,VZHyAmdFDreQqL0BT-zdoA,check please the bartender was unable to recom...,check please bartender unable recommend beer t...,check pleas bartend unabl recommend beer tap t...,check pleas bartend unabl recommend beer tap t...,check please bartender unable recommend beer t...,check pleas bartend un recommend beer tap tri ...
1,"""2 stars for disappointing food, one star for ...",14,10,7,3,tFcmrGLZNEymSnijoTPmqw,stars for disappointing food one star for grea...,stars disappointing food one star great servic...,star disappoint food one star great servic rea...,star disappoint food one star great servic rea...,star disappoint food one star great service re...,star disappoint food on star gre serv read iai...
2,"""A Divine Dialogue"" God: ""Britton. Times up. T...",14,16,23,5,INvIaBFnAvGxzTXFWHzGvA,divine dialogue god britton times up the world...,divine dialogue god britton times world going ...,divin dialogu god britton time world go end to...,divin dialogu god britton time world go end to...,divine dialogue god britton time world go end ...,divin dialog god britton tim world going end t...
3,"""A Place To Go When You Have Time"" It was a mi...",13,10,10,5,FySId5SjNhkrtPA5qktdxg,place to go when you have time it was misty br...,place go time misty breezy summer night friend...,place go time misti breezi summer night friend...,place go time misti breezi summer night friend...,place go time misty breezy summer night friend...,plac go tim misty breezy sum night friend jere...
4,"""A Thaiphoon of Flavor"" My First: I never enli...",27,22,28,5,RgeMUiZncTs-VSHQLm0wNg,thaiphoon of flavor my first never enlisted in...,thaiphoon flavor first never enlisted air forc...,thaiphoon flavor first never enlist air forc a...,thaiphoon flavor first never enlist air forc a...,thaiphoon flavor first never enlist air force ...,thaiphoon flav first nev enl air forc allow jo...
5,"""ACME: the Almost Carefree Mealtime Experience...",16,10,12,3,vh0ZN59b_OaA4R6Yxft3Dg,acme the almost carefree mealtime experience w...,acme almost carefree mealtime experience good ...,acm almost carefre mealtim experi good friend ...,acm almost carefre mealtim experi good friend ...,acme almost carefree mealtime experience good ...,acm almost carefr mealtim expery good friend b...
6,"""All you can eat Japanese seafood buffet"" Some...",8,6,7,2,HnexNROT_Si68Uz3r9YHTg,all you can eat japanese seafood buffet some o...,eat japanese seafood buffet terrifying words e...,eat japanes seafood buffet terrifi word englis...,eat japanes seafood buffet terrifi word englis...,eat japanese seafood buffet terrify word engli...,eat japanes seafood buffet terr word engl lang...
7,"""Ant'ny, this here is ya mutha callin'...ya ne...",11,9,16,3,J3mPGK4CDEcC6Ra27qMMXw,ant'ny this here is ya mutha callin'...ya need...,ant'ny ya mutha callin'...ya need stop galliva...,ant'ni ya mutha callin'...ya need stop galliva...,ant'ni ya mutha callin'...ya need stop galliva...,ant'ny ya mutha callin'...ya need stop galliva...,ant'ny ya muth callin'...ya nee stop gallivant...
8,"""Arriba, Arriba ...Andale, Andale!!! ...theez ...",9,7,11,2,vUfLjwHWs7OoRygaNCebgw,arriba arriba andale andale theez eez speedy g...,arriba arriba andale andale theez eez speedy g...,arriba arriba andal andal theez eez speedi gon...,arriba arriba andal andal theez eez speedi gon...,arriba arriba andale andale theez eez speedy g...,arrib arrib and and theez eez speedy gonz pole...
9,"""As Your Broker I Advise You To Invest Elsewhe...",10,10,6,2,kec_7RON_MI7vw9T_DOmIA,as your broker advise you to invest elsewhere ...,broker advise invest elsewhere give credit sea...,broker advis invest elsewher give credit seat ...,broker advis invest elsewher give credit seat ...,broker advise invest elsewhere give credit sea...,brok adv invest elsewh giv credit seat us ev b...


In [12]:
# Save the reviews together with all cleaned variants to the working directory.
# index=False keeps the row index out of the file: with the default, to_csv
# writes it as an extra unnamed column, which comes back as "Unnamed: 0" on
# the next read_csv (the loaded frame already carries one such column).
yelp_data_5k.to_csv("PROCESSED_RESTAURANT_REVIEWS_5k.csv", index=False)