In [1]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
import spacy

In [2]:
#load dataset
df=pd.read_csv('seminyak_data.csv')
df.head()

Unnamed: 0,Name,Country,Date,Content
0,MADHURKJAIN,"Vadodara, India",Oct 2019,Beautiful and pristine Bech with clear waters....
1,mmuazhiim,83 contributions,Oct 2019,Beautiful and stunning view of the beach. Clea...
2,Sam Ibrahim,"Sanur, Indonesia",Dec 2019,You arrifve and are immeditely overwhelmed by ...
3,Aslam_Sherif,"London, UK",Aug 2019,Nusa Dua are the best beaches I saw on Bali. N...
4,JodyBonny,"Mendrisio, Switzerland",Dec 2019,Move to the central area. Wave and tides clean...


In [3]:
#case folding
df['case_folded'] = df['Content'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
# df['case_folded'].head()
df.drop(["Name", "Country"], axis=1, inplace=True)
df.head()

Unnamed: 0,Date,Content,case_folded
0,Oct 2019,Beautiful and pristine Bech with clear waters....,beautiful and pristine bech with clear waters....
1,Oct 2019,Beautiful and stunning view of the beach. Clea...,beautiful and stunning view of the beach. clea...
2,Dec 2019,You arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...
3,Aug 2019,Nusa Dua are the best beaches I saw on Bali. N...,nusa dua are the best beaches i saw on bali. n...
4,Dec 2019,Move to the central area. Wave and tides clean...,move to the central area. wave and tides clean...


In [4]:
#emoticon/emoji removal
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags 
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
df['no_emot'] = df['case_folded'].apply(lambda x: remove_emoji(x))

In [5]:
#remove punctuation
x = 'everyne walking arounf had no masks on, i seem...'
# re.sub(r'[^\w ]+', "", x)

df['no_punct'] = df['no_emot'].apply(lambda x: re.sub(r'[^\w ]+', "", x))
df.head()

Unnamed: 0,Date,Content,case_folded,no_emot,no_punct
0,Oct 2019,Beautiful and pristine Bech with clear waters....,beautiful and pristine bech with clear waters....,beautiful and pristine bech with clear waters....,beautiful and pristine bech with clear waters ...
1,Oct 2019,Beautiful and stunning view of the beach. Clea...,beautiful and stunning view of the beach. clea...,beautiful and stunning view of the beach. clea...,beautiful and stunning view of the beach clear...
2,Dec 2019,You arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...
3,Aug 2019,Nusa Dua are the best beaches I saw on Bali. N...,nusa dua are the best beaches i saw on bali. n...,nusa dua are the best beaches i saw on bali. n...,nusa dua are the best beaches i saw on bali nu...
4,Dec 2019,Move to the central area. Wave and tides clean...,move to the central area. wave and tides clean...,move to the central area. wave and tides clean...,move to the central area wave and tides clean ...


In [6]:
#frequent words removal
# text = ' '.join(df['no_punct'])
# text = text.split()

# freq_words = pd.Series(text).value_counts()
# f20 = freq_words[:20]
# f20

# df['no_fr_words'] = df['no_punct'].apply(lambda x: ' '.join([t for t in x.split() if t not in f20]))

cnt = Counter()
for text in df["no_punct"].values:
    for word in text.split():
        cnt[word] += 1
        
# cnt.most_common(10)

FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

df["no_freq_words"] = df["no_punct"].apply(lambda text: remove_freqwords(text))
df.head()

Unnamed: 0,Date,Content,case_folded,no_emot,no_punct,no_freq_words
0,Oct 2019,Beautiful and pristine Bech with clear waters....,beautiful and pristine bech with clear waters....,beautiful and pristine bech with clear waters....,beautiful and pristine bech with clear waters ...,beautiful pristine bech with clear waters enjo...
1,Oct 2019,Beautiful and stunning view of the beach. Clea...,beautiful and stunning view of the beach. clea...,beautiful and stunning view of the beach. clea...,beautiful and stunning view of the beach clear...,beautiful stunning view clear blue sky landscp...
2,Dec 2019,You arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...,arrifve are immeditely overwhelmed by natural ...
3,Aug 2019,Nusa Dua are the best beaches I saw on Bali. N...,nusa dua are the best beaches i saw on bali. n...,nusa dua are the best beaches i saw on bali. n...,nusa dua are the best beaches i saw on bali nu...,nusa dua are best beaches i saw on bali nusa d...
4,Dec 2019,Move to the central area. Wave and tides clean...,move to the central area. wave and tides clean...,move to the central area. wave and tides clean...,move to the central area wave and tides clean ...,move central area wave tides clean away plasti...


In [7]:
#Stopwords Removal
stop = set(stopwords.words('english'))
# df['no_stopwords'] = df['no_fr_words'].apply(lambda x: ' '.join([t for t in x.split() if t not in stopwords]))

def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

df["no_stopwords"] = df["no_freq_words"].apply(lambda text: remove_stopwords(text))

df.head()

Unnamed: 0,Date,Content,case_folded,no_emot,no_punct,no_freq_words,no_stopwords
0,Oct 2019,Beautiful and pristine Bech with clear waters....,beautiful and pristine bech with clear waters....,beautiful and pristine bech with clear waters....,beautiful and pristine bech with clear waters ...,beautiful pristine bech with clear waters enjo...,beautiful pristine bech clear waters enjoyed l...
1,Oct 2019,Beautiful and stunning view of the beach. Clea...,beautiful and stunning view of the beach. clea...,beautiful and stunning view of the beach. clea...,beautiful and stunning view of the beach clear...,beautiful stunning view clear blue sky landscp...,beautiful stunning view clear blue sky landscp...
2,Dec 2019,You arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...,arrifve are immeditely overwhelmed by natural ...,arrifve immeditely overwhelmed natural beautys...
3,Aug 2019,Nusa Dua are the best beaches I saw on Bali. N...,nusa dua are the best beaches i saw on bali. n...,nusa dua are the best beaches i saw on bali. n...,nusa dua are the best beaches i saw on bali nu...,nusa dua are best beaches i saw on bali nusa d...,nusa dua best beaches saw bali nusa dua walkin...
4,Dec 2019,Move to the central area. Wave and tides clean...,move to the central area. wave and tides clean...,move to the central area. wave and tides clean...,move to the central area wave and tides clean ...,move central area wave tides clean away plasti...,move central area wave tides clean away plasti...


In [8]:
#rare words removal
# rare20 = most_common.tail(20)
# df['no_rare_words'] = df['no_stopwords'].apply(lambda x : ' '.join([t for t in x.split() if t not in rare20]))

n_rare_words = 10
RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
def remove_rarewords(text):
    return " ".join([word for word in str(text).split() if word not in RAREWORDS])

df["no_rare_words"] = df["no_stopwords"].apply(lambda text: remove_rarewords(text))
df.head()

Unnamed: 0,Date,Content,case_folded,no_emot,no_punct,no_freq_words,no_stopwords,no_rare_words
0,Oct 2019,Beautiful and pristine Bech with clear waters....,beautiful and pristine bech with clear waters....,beautiful and pristine bech with clear waters....,beautiful and pristine bech with clear waters ...,beautiful pristine bech with clear waters enjo...,beautiful pristine bech clear waters enjoyed l...,beautiful pristine bech clear waters enjoyed l...
1,Oct 2019,Beautiful and stunning view of the beach. Clea...,beautiful and stunning view of the beach. clea...,beautiful and stunning view of the beach. clea...,beautiful and stunning view of the beach clear...,beautiful stunning view clear blue sky landscp...,beautiful stunning view clear blue sky landscp...,beautiful stunning view clear blue sky landscp...
2,Dec 2019,You arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...,arrifve are immeditely overwhelmed by natural ...,arrifve immeditely overwhelmed natural beautys...,arrifve immeditely overwhelmed natural beautys...
3,Aug 2019,Nusa Dua are the best beaches I saw on Bali. N...,nusa dua are the best beaches i saw on bali. n...,nusa dua are the best beaches i saw on bali. n...,nusa dua are the best beaches i saw on bali nu...,nusa dua are best beaches i saw on bali nusa d...,nusa dua best beaches saw bali nusa dua walkin...,nusa dua best beaches saw bali nusa dua walkin...
4,Dec 2019,Move to the central area. Wave and tides clean...,move to the central area. wave and tides clean...,move to the central area. wave and tides clean...,move to the central area wave and tides clean ...,move central area wave tides clean away plasti...,move central area wave tides clean away plasti...,move central area wave tides clean away plasti...


In [9]:
#Stemming
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

df["text_stemmed"] = df["no_rare_words"].apply(lambda text: stem_words(text))
df.head()

Unnamed: 0,Date,Content,case_folded,no_emot,no_punct,no_freq_words,no_stopwords,no_rare_words,text_stemmed
0,Oct 2019,Beautiful and pristine Bech with clear waters....,beautiful and pristine bech with clear waters....,beautiful and pristine bech with clear waters....,beautiful and pristine bech with clear waters ...,beautiful pristine bech with clear waters enjo...,beautiful pristine bech clear waters enjoyed l...,beautiful pristine bech clear waters enjoyed l...,beauti pristin bech clear water enjoy lot wate...
1,Oct 2019,Beautiful and stunning view of the beach. Clea...,beautiful and stunning view of the beach. clea...,beautiful and stunning view of the beach. clea...,beautiful and stunning view of the beach clear...,beautiful stunning view clear blue sky landscp...,beautiful stunning view clear blue sky landscp...,beautiful stunning view clear blue sky landscp...,beauti stun view clear blue sky landscpa place...
2,Dec 2019,You arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...,arrifve are immeditely overwhelmed by natural ...,arrifve immeditely overwhelmed natural beautys...,arrifve immeditely overwhelmed natural beautys...,arrifv immedit overwhelm natur beautyserin sta...
3,Aug 2019,Nusa Dua are the best beaches I saw on Bali. N...,nusa dua are the best beaches i saw on bali. n...,nusa dua are the best beaches i saw on bali. n...,nusa dua are the best beaches i saw on bali nu...,nusa dua are best beaches i saw on bali nusa d...,nusa dua best beaches saw bali nusa dua walkin...,nusa dua best beaches saw bali nusa dua walkin...,nusa dua best beach saw bali nusa dua walk sun...
4,Dec 2019,Move to the central area. Wave and tides clean...,move to the central area. wave and tides clean...,move to the central area. wave and tides clean...,move to the central area wave and tides clean ...,move central area wave tides clean away plasti...,move central area wave tides clean away plasti...,move central area wave tides clean away plasti...,move central area wave tide clean away plastic...


In [10]:
#lemmatization
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

df["review"] = df["no_rare_words"].apply(lambda text: lemmatize_words(text))
df.head()

Unnamed: 0,Date,Content,case_folded,no_emot,no_punct,no_freq_words,no_stopwords,no_rare_words,text_stemmed,review
0,Oct 2019,Beautiful and pristine Bech with clear waters....,beautiful and pristine bech with clear waters....,beautiful and pristine bech with clear waters....,beautiful and pristine bech with clear waters ...,beautiful pristine bech with clear waters enjo...,beautiful pristine bech clear waters enjoyed l...,beautiful pristine bech clear waters enjoyed l...,beauti pristin bech clear water enjoy lot wate...,beautiful pristine bech clear water enjoy lot ...
1,Oct 2019,Beautiful and stunning view of the beach. Clea...,beautiful and stunning view of the beach. clea...,beautiful and stunning view of the beach. clea...,beautiful and stunning view of the beach clear...,beautiful stunning view clear blue sky landscp...,beautiful stunning view clear blue sky landscp...,beautiful stunning view clear blue sky landscp...,beauti stun view clear blue sky landscpa place...,beautiful stun view clear blue sky landscpae p...
2,Dec 2019,You arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...,you arrifve and are immeditely overwhelmed by ...,arrifve are immeditely overwhelmed by natural ...,arrifve immeditely overwhelmed natural beautys...,arrifve immeditely overwhelmed natural beautys...,arrifv immedit overwhelm natur beautyserin sta...,arrifve immeditely overwhelmed natural beautys...
3,Aug 2019,Nusa Dua are the best beaches I saw on Bali. N...,nusa dua are the best beaches i saw on bali. n...,nusa dua are the best beaches i saw on bali. n...,nusa dua are the best beaches i saw on bali nu...,nusa dua are best beaches i saw on bali nusa d...,nusa dua best beaches saw bali nusa dua walkin...,nusa dua best beaches saw bali nusa dua walkin...,nusa dua best beach saw bali nusa dua walk sun...,nusa dua best beach saw bali nusa dua walk sun...
4,Dec 2019,Move to the central area. Wave and tides clean...,move to the central area. wave and tides clean...,move to the central area. wave and tides clean...,move to the central area wave and tides clean ...,move central area wave tides clean away plasti...,move central area wave tides clean away plasti...,move central area wave tides clean away plasti...,move central area wave tide clean away plastic...,move central area wave tide clean away plastic...


In [11]:
df.drop(["case_folded", "no_emot", "no_punct", "no_freq_words", "no_stopwords", "no_rare_words", "text_stemmed"], axis=1, inplace=True)
df.sample(5)

Unnamed: 0,Date,Content,review
102,Sep 2019,Amazing quiet private beach away from the hust...,amaze quiet private away hustle bustle city gr...
1248,Oct 2016,"Clean, unspoilt..one could have a candlelit di...",clean unspoiltone candlelit dinner organise la...
3305,Jun 2013,We took a taxi ride from Legian to here to get...,take taxi ride legian away overcrowd kuta were...
3138,May 2014,The beach was very inviting. It is cleaned ev...,inviting clean every day walk along one day wh...
1175,Jun 2017,"Golden sands, shallow shelving and crystal cle...",golden sand shallow shelve crystal clear water...


In [12]:
#data labelling
sid = SentimentIntensityAnalyzer()
# df = pd.read_csv('doublesix_lemmatized.csv', sep='\t')
# df.head()

df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))
df['compound']  = df['scores'].apply(lambda score_dict: score_dict['compound'])
df['sentiment'] = df['compound'].apply(lambda c: '1' if c >= 0 else '0')

# df.sample(20)

df.to_csv('seminyak_preprocessed.csv', index=False, encoding='utf-8')