In [1]:
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
import spacy

In [2]:
#load dataset
df=pd.read_csv('seminyak_data.csv')
df.head()

Unnamed: 0,Name,Country,Date,Content
0,MADHURKJAIN,"Vadodara, India",Oct 2019,Pristine white sandy beach. Ideal place to rel...
1,ind,"England, UK",Sep 2019,Arrrrh the beach... this is one place we didn’...
2,PrestonGuild,United Kingdom,Jul 2019,Once you go past Legian beach you are on Semin...
3,Aslam_Sherif,"London, UK",Aug 2019,Seminyak beach is nice and clean. I found it s...
4,Sandy Ta,"Singapore, Singapore",May 2019,Purposely chose Seminyak beach of all places i...


In [3]:
#case folding
df['case_folded'] = df['Content'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
# df['case_folded'].head()
df.drop(["Name", "Country"], axis=1, inplace=True)
df.head()

Unnamed: 0,Date,Content,case_folded
0,Oct 2019,Pristine white sandy beach. Ideal place to rel...,pristine white sandy beach. ideal place to rel...
1,Sep 2019,Arrrrh the beach... this is one place we didn’...,arrrrh the beach... this is one place we didn’...
2,Jul 2019,Once you go past Legian beach you are on Semin...,once you go past legian beach you are on semin...
3,Aug 2019,Seminyak beach is nice and clean. I found it s...,seminyak beach is nice and clean. i found it s...
4,May 2019,Purposely chose Seminyak beach of all places i...,purposely chose seminyak beach of all places i...


In [4]:
#emoticon/emoji removal
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags 
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
df['no_emot'] = df['case_folded'].apply(lambda x: remove_emoji(x))

In [5]:
#remove punctuation
x = 'everyne walking arounf had no masks on, i seem...'
# re.sub(r'[^\w ]+', "", x)

df['no_punct'] = df['no_emot'].apply(lambda x: re.sub(r'[^\w ]+', "", x))
df.head()

Unnamed: 0,Date,Content,case_folded,no_emot,no_punct
0,Oct 2019,Pristine white sandy beach. Ideal place to rel...,pristine white sandy beach. ideal place to rel...,pristine white sandy beach. ideal place to rel...,pristine white sandy beach ideal place to rela...
1,Sep 2019,Arrrrh the beach... this is one place we didn’...,arrrrh the beach... this is one place we didn’...,arrrrh the beach... this is one place we didn’...,arrrrh the beach this is one place we didnt li...
2,Jul 2019,Once you go past Legian beach you are on Semin...,once you go past legian beach you are on semin...,once you go past legian beach you are on semin...,once you go past legian beach you are on semin...
3,Aug 2019,Seminyak beach is nice and clean. I found it s...,seminyak beach is nice and clean. i found it s...,seminyak beach is nice and clean. i found it s...,seminyak beach is nice and clean i found it so...
4,May 2019,Purposely chose Seminyak beach of all places i...,purposely chose seminyak beach of all places i...,purposely chose seminyak beach of all places i...,purposely chose seminyak beach of all places i...


In [6]:
#frequent words removal
# text = ' '.join(df['no_punct'])
# text = text.split()

# freq_words = pd.Series(text).value_counts()
# f20 = freq_words[:20]
# f20

# df['no_fr_words'] = df['no_punct'].apply(lambda x: ' '.join([t for t in x.split() if t not in f20]))

cnt = Counter()
for text in df["no_punct"].values:
    for word in text.split():
        cnt[word] += 1
        
# cnt.most_common(10)

FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

df["no_freq_words"] = df["no_punct"].apply(lambda text: remove_freqwords(text))
df.head()

Unnamed: 0,Date,Content,case_folded,no_emot,no_punct,no_freq_words
0,Oct 2019,Pristine white sandy beach. Ideal place to rel...,pristine white sandy beach. ideal place to rel...,pristine white sandy beach. ideal place to rel...,pristine white sandy beach ideal place to rela...,pristine white sandy ideal place relax or swim...
1,Sep 2019,Arrrrh the beach... this is one place we didn’...,arrrrh the beach... this is one place we didn’...,arrrrh the beach... this is one place we didn’...,arrrrh the beach this is one place we didnt li...,arrrrh this one place we didnt list on our age...
2,Jul 2019,Once you go past Legian beach you are on Semin...,once you go past legian beach you are on semin...,once you go past legian beach you are on semin...,once you go past legian beach you are on semin...,once go past legian are on seminyak beachthere...
3,Aug 2019,Seminyak beach is nice and clean. I found it s...,seminyak beach is nice and clean. i found it s...,seminyak beach is nice and clean. i found it s...,seminyak beach is nice and clean i found it so...,seminyak nice clean i found it some parts very...
4,May 2019,Purposely chose Seminyak beach of all places i...,purposely chose seminyak beach of all places i...,purposely chose seminyak beach of all places i...,purposely chose seminyak beach of all places i...,purposely chose seminyak all places bali my tr...


In [7]:
#Stopwords Removal
stop = set(stopwords.words('english'))
# df['no_stopwords'] = df['no_fr_words'].apply(lambda x: ' '.join([t for t in x.split() if t not in stopwords]))

def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

df["no_stopwords"] = df["no_freq_words"].apply(lambda text: remove_stopwords(text))

df.head()

Unnamed: 0,Date,Content,case_folded,no_emot,no_punct,no_freq_words,no_stopwords
0,Oct 2019,Pristine white sandy beach. Ideal place to rel...,pristine white sandy beach. ideal place to rel...,pristine white sandy beach. ideal place to rel...,pristine white sandy beach ideal place to rela...,pristine white sandy ideal place relax or swim...,pristine white sandy ideal place relax swim lo...
1,Sep 2019,Arrrrh the beach... this is one place we didn’...,arrrrh the beach... this is one place we didn’...,arrrrh the beach... this is one place we didn’...,arrrrh the beach this is one place we didnt li...,arrrrh this one place we didnt list on our age...,arrrrh one place didnt list agenda planning tr...
2,Jul 2019,Once you go past Legian beach you are on Semin...,once you go past legian beach you are on semin...,once you go past legian beach you are on semin...,once you go past legian beach you are on semin...,once go past legian are on seminyak beachthere...,go past legian seminyak beachthere high end ho...
3,Aug 2019,Seminyak beach is nice and clean. I found it s...,seminyak beach is nice and clean. i found it s...,seminyak beach is nice and clean. i found it s...,seminyak beach is nice and clean i found it so...,seminyak nice clean i found it some parts very...,seminyak nice clean found parts busy loud part...
4,May 2019,Purposely chose Seminyak beach of all places i...,purposely chose seminyak beach of all places i...,purposely chose seminyak beach of all places i...,purposely chose seminyak beach of all places i...,purposely chose seminyak all places bali my tr...,purposely chose seminyak places bali trip want...


In [8]:
#rare words removal
# rare20 = most_common.tail(20)
# df['no_rare_words'] = df['no_stopwords'].apply(lambda x : ' '.join([t for t in x.split() if t not in rare20]))

n_rare_words = 10
RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
def remove_rarewords(text):
    return " ".join([word for word in str(text).split() if word not in RAREWORDS])

df["no_rare_words"] = df["no_stopwords"].apply(lambda text: remove_rarewords(text))
df.head()

Unnamed: 0,Date,Content,case_folded,no_emot,no_punct,no_freq_words,no_stopwords,no_rare_words
0,Oct 2019,Pristine white sandy beach. Ideal place to rel...,pristine white sandy beach. ideal place to rel...,pristine white sandy beach. ideal place to rel...,pristine white sandy beach ideal place to rela...,pristine white sandy ideal place relax or swim...,pristine white sandy ideal place relax swim lo...,pristine white sandy ideal place relax swim lo...
1,Sep 2019,Arrrrh the beach... this is one place we didn’...,arrrrh the beach... this is one place we didn’...,arrrrh the beach... this is one place we didn’...,arrrrh the beach this is one place we didnt li...,arrrrh this one place we didnt list on our age...,arrrrh one place didnt list agenda planning tr...,arrrrh one place didnt list agenda planning tr...
2,Jul 2019,Once you go past Legian beach you are on Semin...,once you go past legian beach you are on semin...,once you go past legian beach you are on semin...,once you go past legian beach you are on semin...,once go past legian are on seminyak beachthere...,go past legian seminyak beachthere high end ho...,go past legian seminyak beachthere high end ho...
3,Aug 2019,Seminyak beach is nice and clean. I found it s...,seminyak beach is nice and clean. i found it s...,seminyak beach is nice and clean. i found it s...,seminyak beach is nice and clean i found it so...,seminyak nice clean i found it some parts very...,seminyak nice clean found parts busy loud part...,seminyak nice clean found parts busy loud part...
4,May 2019,Purposely chose Seminyak beach of all places i...,purposely chose seminyak beach of all places i...,purposely chose seminyak beach of all places i...,purposely chose seminyak beach of all places i...,purposely chose seminyak all places bali my tr...,purposely chose seminyak places bali trip want...,purposely chose seminyak places bali trip want...


In [9]:
#Stemming
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

df["text_stemmed"] = df["no_rare_words"].apply(lambda text: stem_words(text))
df.head()

Unnamed: 0,Date,Content,case_folded,no_emot,no_punct,no_freq_words,no_stopwords,no_rare_words,text_stemmed
0,Oct 2019,Pristine white sandy beach. Ideal place to rel...,pristine white sandy beach. ideal place to rel...,pristine white sandy beach. ideal place to rel...,pristine white sandy beach ideal place to rela...,pristine white sandy ideal place relax or swim...,pristine white sandy ideal place relax swim lo...,pristine white sandy ideal place relax swim lo...,pristin white sandi ideal place relax swim loc...
1,Sep 2019,Arrrrh the beach... this is one place we didn’...,arrrrh the beach... this is one place we didn’...,arrrrh the beach... this is one place we didn’...,arrrrh the beach this is one place we didnt li...,arrrrh this one place we didnt list on our age...,arrrrh one place didnt list agenda planning tr...,arrrrh one place didnt list agenda planning tr...,arrrrh one place didnt list agenda plan trip p...
2,Jul 2019,Once you go past Legian beach you are on Semin...,once you go past legian beach you are on semin...,once you go past legian beach you are on semin...,once you go past legian beach you are on semin...,once go past legian are on seminyak beachthere...,go past legian seminyak beachthere high end ho...,go past legian seminyak beachthere high end ho...,go past legian seminyak beachther high end hot...
3,Aug 2019,Seminyak beach is nice and clean. I found it s...,seminyak beach is nice and clean. i found it s...,seminyak beach is nice and clean. i found it s...,seminyak beach is nice and clean i found it so...,seminyak nice clean i found it some parts very...,seminyak nice clean found parts busy loud part...,seminyak nice clean found parts busy loud part...,seminyak nice clean found part busi loud part ...
4,May 2019,Purposely chose Seminyak beach of all places i...,purposely chose seminyak beach of all places i...,purposely chose seminyak beach of all places i...,purposely chose seminyak beach of all places i...,purposely chose seminyak all places bali my tr...,purposely chose seminyak places bali trip want...,purposely chose seminyak places bali trip want...,purpos chose seminyak place bali trip want lea...


In [10]:
#lemmatization
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

df["review"] = df["text_stemmed"].apply(lambda text: lemmatize_words(text))
df.head()

Unnamed: 0,Date,Content,case_folded,no_emot,no_punct,no_freq_words,no_stopwords,no_rare_words,text_stemmed,review
0,Oct 2019,Pristine white sandy beach. Ideal place to rel...,pristine white sandy beach. ideal place to rel...,pristine white sandy beach. ideal place to rel...,pristine white sandy beach ideal place to rela...,pristine white sandy ideal place relax or swim...,pristine white sandy ideal place relax swim lo...,pristine white sandy ideal place relax swim lo...,pristin white sandi ideal place relax swim loc...,pristin white sandi ideal place relax swim loc...
1,Sep 2019,Arrrrh the beach... this is one place we didn’...,arrrrh the beach... this is one place we didn’...,arrrrh the beach... this is one place we didn’...,arrrrh the beach this is one place we didnt li...,arrrrh this one place we didnt list on our age...,arrrrh one place didnt list agenda planning tr...,arrrrh one place didnt list agenda planning tr...,arrrrh one place didnt list agenda plan trip p...,arrrrh one place didnt list agenda plan trip p...
2,Jul 2019,Once you go past Legian beach you are on Semin...,once you go past legian beach you are on semin...,once you go past legian beach you are on semin...,once you go past legian beach you are on semin...,once go past legian are on seminyak beachthere...,go past legian seminyak beachthere high end ho...,go past legian seminyak beachthere high end ho...,go past legian seminyak beachther high end hot...,go past legian seminyak beachther high end hot...
3,Aug 2019,Seminyak beach is nice and clean. I found it s...,seminyak beach is nice and clean. i found it s...,seminyak beach is nice and clean. i found it s...,seminyak beach is nice and clean i found it so...,seminyak nice clean i found it some parts very...,seminyak nice clean found parts busy loud part...,seminyak nice clean found parts busy loud part...,seminyak nice clean found part busi loud part ...,seminyak nice clean find part busi loud part q...
4,May 2019,Purposely chose Seminyak beach of all places i...,purposely chose seminyak beach of all places i...,purposely chose seminyak beach of all places i...,purposely chose seminyak beach of all places i...,purposely chose seminyak all places bali my tr...,purposely chose seminyak places bali trip want...,purposely chose seminyak places bali trip want...,purpos chose seminyak place bali trip want lea...,purpos choose seminyak place bali trip want le...


In [11]:
# df.drop(["case_folded", "no_emot", "no_punct", "no_freq_words", "no_stopwords", "no_rare_words", "text_stemmed"], axis=1, inplace=True)
# df.sample(5)

In [13]:
#data labelling
sid = SentimentIntensityAnalyzer()
# df = pd.read_csv('doublesix_lemmatized.csv', sep='\t')
# df.head()

df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))
df['compound']  = df['scores'].apply(lambda score_dict: score_dict['compound'])
df['sentiment'] = df['compound'].apply(lambda c: '1' if c >= 0 else '0')

# df.sample(20)

df.to_csv('seminyak_preprocessed1.csv', index=False, encoding='utf-8')

In [14]:
#TF-IDF
df = pd.read_csv('seminyak_preprocessed.csv')

In [15]:
# df = pd.DataFrame(np.random.randint(0.0,100.0,size=(10,1)),
#               index=range(10,20),
#               columns=['review'],
#               dtype='str')
corpus = df['review'].values.astype('U')

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer(stop_words='english')
vec.fit_transform(corpus)
# print(resp)

<4198x8507 sparse matrix of type '<class 'numpy.float64'>'
	with 82699 stored elements in Compressed Sparse Row format>

In [17]:
vec.get_feature_names()

['000',
 '0000',
 '000idr',
 '000s',
 '0045',
 '0900',
 '10',
 '100',
 '1000',
 '10000',
 '100000',
 '1000000',
 '100000idr',
 '100000ir',
 '100000rp',
 '100k',
 '100kthe',
 '100m',
 '100rupes',
 '1010',
 '1015',
 '1020',
 '104f',
 '10am',
 '10am2pmthe',
 '10k',
 '10km',
 '10kyes',
 '10min',
 '10nz',
 '11',
 '110',
 '1100am',
 '1100pm',
 '110k',
 '112',
 '11am',
 '11pm',
 '12',
 '12nn330pm',
 '12pm',
 '13',
 '14',
 '1400000',
 '14kms',
 '15',
 '150',
 '1500',
 '15000',
 '150000',
 '150000rp',
 '150k',
 '1520',
 '1520mins',
 '15first',
 '15hours',
 '15k',
 '15min',
 '16',
 '1600',
 '16000',
 '16th',
 '17',
 '1700',
 '1715pm',
 '1730',
 '177m',
 '18',
 '180000',
 '1815',
 '18th',
 '19',
 '1900',
 '1900ish',
 '1960s',
 '1974',
 '1976',
 '1980s',
 '1982',
 '1am',
 '1h',
 '1ml',
 '1pm',
 '1st',
 '20',
 '200',
 '2000',
 '20000',
 '200000',
 '200000for',
 '20000idr',
 '20000rph',
 '2002',
 '200300',
 '2004',
 '2005in',
 '2006',
 '2007',
 '2008',
 '2009fast',
 '200k',
 '200khour',
 '2010',
 '2

In [18]:
df.head()

Unnamed: 0,Date,Content,review,scores,compound,sentiment
0,Oct 2019,Pristine white sandy beach. Ideal place to rel...,pristine white sandy ideal place relax swim lo...,"{'neg': 0.0, 'neu': 0.57, 'pos': 0.43, 'compou...",0.8689,1
1,Sep 2019,Arrrrh the beach... this is one place we didn’...,arrrrh one place didnt list agenda plan trip p...,"{'neg': 0.068, 'neu': 0.609, 'pos': 0.323, 'co...",0.9975,1
2,Jul 2019,Once you go past Legian beach you are on Semin...,go past legian seminyak beachthere high end ho...,"{'neg': 0.0, 'neu': 0.679, 'pos': 0.321, 'comp...",0.7927,1
3,Aug 2019,Seminyak beach is nice and clean. I found it s...,seminyak nice clean find part busy loud part q...,"{'neg': 0.0, 'neu': 0.607, 'pos': 0.393, 'comp...",0.8126,1
4,May 2019,Purposely chose Seminyak beach of all places i...,purposely chose seminyak place bali trip want ...,"{'neg': 0.047, 'neu': 0.658, 'pos': 0.295, 'co...",0.9517,1


In [None]:
# df.to_csv('seminyak_weighted.csv', index=False, encoding='utf-8')