In [1]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
import spacy

In [2]:
#load dataset
df=pd.read_csv('nusadua_data.csv')
df.head()

Unnamed: 0,Name,Country,Date,Content
0,ChrisThomasTravels,"Ossining, NY",Dec 2019,Initially the black sands of Canggu Beach are ...
1,Cherie H,"Roumazieres-Loubert, France",Dec 2019,Unfortunately this gorgeou beach was covered i...
2,Anastasia B,6 contributions,Jul 2019,Canggu Beach was wonderful and exactly what I ...
3,Therese Herlihy,"Dublin, Ireland",Dec 2019,We knew this was not a white sandy beach but t...
4,Mark Jones,"Lake Macquarie, Australia",Sep 2019,Always find you will get a wave at Canggu (and...


In [3]:
#case folding
df['case_folded'] = df['Content'].apply(lambda x: " ".join(x.lower() for x in x.split()))
# df['case_folded'].head()
df.drop(["Name", "Country"], axis=1, inplace=True)
df.head()

Unnamed: 0,Date,Content,case_folded
0,Dec 2019,Initially the black sands of Canggu Beach are ...,initially the black sands of canggu beach are ...
1,Dec 2019,Unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...
2,Jul 2019,Canggu Beach was wonderful and exactly what I ...,canggu beach was wonderful and exactly what i ...
3,Dec 2019,We knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...
4,Sep 2019,Always find you will get a wave at Canggu (and...,always find you will get a wave at canggu (and...


In [4]:
#emoticon/emoji removal
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags 
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
df['no_emot'] = df['case_folded'].apply(lambda x: remove_emoji(x))

In [5]:
#remove punctuation
x = 'everyne walking arounf had no masks on, i seem...'
# re.sub(r'[^\w ]+', "", x)

df['no_punct'] = df['no_emot'].apply(lambda x: re.sub(r'[^\w ]+', "", x))
df.head()

Unnamed: 0,Date,Content,case_folded,no_emot,no_punct
0,Dec 2019,Initially the black sands of Canggu Beach are ...,initially the black sands of canggu beach are ...,initially the black sands of canggu beach are ...,initially the black sands of canggu beach are ...
1,Dec 2019,Unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...
2,Jul 2019,Canggu Beach was wonderful and exactly what I ...,canggu beach was wonderful and exactly what i ...,canggu beach was wonderful and exactly what i ...,canggu beach was wonderful and exactly what i ...
3,Dec 2019,We knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...
4,Sep 2019,Always find you will get a wave at Canggu (and...,always find you will get a wave at canggu (and...,always find you will get a wave at canggu (and...,always find you will get a wave at canggu and ...


In [6]:
#frequent words removal
# text = ' '.join(df['no_punct'])
# text = text.split()

# freq_words = pd.Series(text).value_counts()
# f20 = freq_words[:20]
# f20

# df['no_fr_words'] = df['no_punct'].apply(lambda x: ' '.join([t for t in x.split() if t not in f20]))

cnt = Counter()
for text in df["no_punct"].values:
    for word in text.split():
        cnt[word] += 1
        
# cnt.most_common(10)

FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

df["no_freq_words"] = df["no_punct"].apply(lambda text: remove_freqwords(text))
df.head()

Unnamed: 0,Date,Content,case_folded,no_emot,no_punct,no_freq_words
0,Dec 2019,Initially the black sands of Canggu Beach are ...,initially the black sands of canggu beach are ...,initially the black sands of canggu beach are ...,initially the black sands of canggu beach are ...,initially black sands canggu are quite novel a...
1,Dec 2019,Unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou was covered rubbish...
2,Jul 2019,Canggu Beach was wonderful and exactly what I ...,canggu beach was wonderful and exactly what i ...,canggu beach was wonderful and exactly what i ...,canggu beach was wonderful and exactly what i ...,canggu was wonderful exactly what i wanted my ...
3,Dec 2019,We knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...,we knew this was not white sandy but itself ha...
4,Sep 2019,Always find you will get a wave at Canggu (and...,always find you will get a wave at canggu (and...,always find you will get a wave at canggu (and...,always find you will get a wave at canggu and ...,always find will get wave at canggu surrounds ...


In [7]:
#Stopwords Removal
stop = set(stopwords.words('english'))
# df['no_stopwords'] = df['no_fr_words'].apply(lambda x: ' '.join([t for t in x.split() if t not in stopwords]))

def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

df["no_stopwords"] = df["no_freq_words"].apply(lambda text: remove_stopwords(text))

df.head()

Unnamed: 0,Date,Content,case_folded,no_emot,no_punct,no_freq_words,no_stopwords
0,Dec 2019,Initially the black sands of Canggu Beach are ...,initially the black sands of canggu beach are ...,initially the black sands of canggu beach are ...,initially the black sands of canggu beach are ...,initially black sands canggu are quite novel a...,initially black sands canggu quite novel spot ...
1,Dec 2019,Unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou was covered rubbish...,unfortunately gorgeou covered rubbish little f...
2,Jul 2019,Canggu Beach was wonderful and exactly what I ...,canggu beach was wonderful and exactly what i ...,canggu beach was wonderful and exactly what i ...,canggu beach was wonderful and exactly what i ...,canggu was wonderful exactly what i wanted my ...,canggu wonderful exactly wanted vacation spent...
3,Dec 2019,We knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...,we knew this was not white sandy but itself ha...,knew white sandy lots rubbish noisy lots drunk...
4,Sep 2019,Always find you will get a wave at Canggu (and...,always find you will get a wave at canggu (and...,always find you will get a wave at canggu (and...,always find you will get a wave at canggu and ...,always find will get wave at canggu surrounds ...,always find will wave canggu surrounds without...


In [8]:
#rare words removal
# rare20 = most_common.tail(20)
# df['no_rare_words'] = df['no_stopwords'].apply(lambda x : ' '.join([t for t in x.split() if t not in rare20]))

n_rare_words = 10
RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
def remove_rarewords(text):
    return " ".join([word for word in str(text).split() if word not in RAREWORDS])

df["no_rare_words"] = df["no_stopwords"].apply(lambda text: remove_rarewords(text))
df.head()

Unnamed: 0,Date,Content,case_folded,no_emot,no_punct,no_freq_words,no_stopwords,no_rare_words
0,Dec 2019,Initially the black sands of Canggu Beach are ...,initially the black sands of canggu beach are ...,initially the black sands of canggu beach are ...,initially the black sands of canggu beach are ...,initially black sands canggu are quite novel a...,initially black sands canggu quite novel spot ...,initially black sands canggu quite novel spot ...
1,Dec 2019,Unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou was covered rubbish...,unfortunately gorgeou covered rubbish little f...,unfortunately gorgeou covered rubbish little f...
2,Jul 2019,Canggu Beach was wonderful and exactly what I ...,canggu beach was wonderful and exactly what i ...,canggu beach was wonderful and exactly what i ...,canggu beach was wonderful and exactly what i ...,canggu was wonderful exactly what i wanted my ...,canggu wonderful exactly wanted vacation spent...,canggu wonderful exactly wanted vacation spent...
3,Dec 2019,We knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...,we knew this was not white sandy but itself ha...,knew white sandy lots rubbish noisy lots drunk...,knew white sandy lots rubbish noisy lots drunk...
4,Sep 2019,Always find you will get a wave at Canggu (and...,always find you will get a wave at canggu (and...,always find you will get a wave at canggu (and...,always find you will get a wave at canggu and ...,always find will get wave at canggu surrounds ...,always find will wave canggu surrounds without...,always find will wave canggu surrounds without...


In [9]:
#Stemming
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

df["text_stemmed"] = df["no_rare_words"].apply(lambda text: stem_words(text))
df.head()

Unnamed: 0,Date,Content,case_folded,no_emot,no_punct,no_freq_words,no_stopwords,no_rare_words,text_stemmed
0,Dec 2019,Initially the black sands of Canggu Beach are ...,initially the black sands of canggu beach are ...,initially the black sands of canggu beach are ...,initially the black sands of canggu beach are ...,initially black sands canggu are quite novel a...,initially black sands canggu quite novel spot ...,initially black sands canggu quite novel spot ...,initi black sand canggu quit novel spot becom ...
1,Dec 2019,Unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou was covered rubbish...,unfortunately gorgeou covered rubbish little f...,unfortunately gorgeou covered rubbish little f...,unfortun gorgeou cover rubbish littl food drin...
2,Jul 2019,Canggu Beach was wonderful and exactly what I ...,canggu beach was wonderful and exactly what i ...,canggu beach was wonderful and exactly what i ...,canggu beach was wonderful and exactly what i ...,canggu was wonderful exactly what i wanted my ...,canggu wonderful exactly wanted vacation spent...,canggu wonderful exactly wanted vacation spent...,canggu wonder exactli want vacat spent three w...
3,Dec 2019,We knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...,we knew this was not white sandy but itself ha...,knew white sandy lots rubbish noisy lots drunk...,knew white sandy lots rubbish noisy lots drunk...,knew white sandi lot rubbish noisi lot drunk p...
4,Sep 2019,Always find you will get a wave at Canggu (and...,always find you will get a wave at canggu (and...,always find you will get a wave at canggu (and...,always find you will get a wave at canggu and ...,always find will get wave at canggu surrounds ...,always find will wave canggu surrounds without...,always find will wave canggu surrounds without...,alway find will wave canggu surround without m...


In [10]:
#lemmatization
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

df["review"] = df["no_rare_words"].apply(lambda text: lemmatize_words(text))
df.head()

Unnamed: 0,Date,Content,case_folded,no_emot,no_punct,no_freq_words,no_stopwords,no_rare_words,text_stemmed,review
0,Dec 2019,Initially the black sands of Canggu Beach are ...,initially the black sands of canggu beach are ...,initially the black sands of canggu beach are ...,initially the black sands of canggu beach are ...,initially black sands canggu are quite novel a...,initially black sands canggu quite novel spot ...,initially black sands canggu quite novel spot ...,initi black sand canggu quit novel spot becom ...,initially black sand canggu quite novel spot b...
1,Dec 2019,Unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou beach was covered i...,unfortunately this gorgeou was covered rubbish...,unfortunately gorgeou covered rubbish little f...,unfortunately gorgeou covered rubbish little f...,unfortun gorgeou cover rubbish littl food drin...,unfortunately gorgeou cover rubbish little foo...
2,Jul 2019,Canggu Beach was wonderful and exactly what I ...,canggu beach was wonderful and exactly what i ...,canggu beach was wonderful and exactly what i ...,canggu beach was wonderful and exactly what i ...,canggu was wonderful exactly what i wanted my ...,canggu wonderful exactly wanted vacation spent...,canggu wonderful exactly wanted vacation spent...,canggu wonder exactli want vacat spent three w...,canggu wonderful exactly want vacation spend t...
3,Dec 2019,We knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...,we knew this was not a white sandy beach but t...,we knew this was not white sandy but itself ha...,knew white sandy lots rubbish noisy lots drunk...,knew white sandy lots rubbish noisy lots drunk...,knew white sandi lot rubbish noisi lot drunk p...,know white sandy lot rubbish noisy lot drink p...
4,Sep 2019,Always find you will get a wave at Canggu (and...,always find you will get a wave at canggu (and...,always find you will get a wave at canggu (and...,always find you will get a wave at canggu and ...,always find will get wave at canggu surrounds ...,always find will wave canggu surrounds without...,always find will wave canggu surrounds without...,alway find will wave canggu surround without m...,always find will wave canggu surround without ...


In [11]:
df.drop(["case_folded", "no_emot", "no_punct", "no_freq_words", "no_stopwords", "no_rare_words", "text_stemmed"], axis=1, inplace=True)
df.sample(5)

Unnamed: 0,Date,Content,review
249,Sep 2017,Very dirty/littered but lovely location. Waves...,dirtylittered lovely location wavessurf pretty...
35,Sep 2019,Came to the beach of an evening with friends t...,come even friend drink watch sunset pretty pop...
522,Jun 2015,Canggu's a great place to come for surfing and...,canggus great place come surf hang especially ...
52,Aug 2019,I have always stayed in Legian this time I dec...,always stay legian time decide try different p...
433,Apr 2016,Great beach for a visit. Surf variable to suit...,great visit surf variable suit taste attention...


In [12]:
#data labelling
sid = SentimentIntensityAnalyzer()
# df = pd.read_csv('doublesix_lemmatized.csv', sep='\t')
# df.head()

df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))
df['compound']  = df['scores'].apply(lambda score_dict: score_dict['compound'])
df['sentiment'] = df['compound'].apply(lambda c: 'positive' if c >= 0 else 'negative')

# df.sample(20)

df.to_csv('nusadua_preprocessed.csv', index=False, encoding='utf-8')