# Load Data

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from collections import Counter

# Preprocessing Tools
import re
import string

# NLTK library
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, MWETokenizer
from nltk.stem import LancasterStemmer, PorterStemmer, WordNetLemmatizer

import spacy

# Document-Term Matrix
from sklearn.feature_extraction.text import CountVectorizer

# Topic Modeling
from corextopic import corextopic as ct
from corextopic import vis_topic as vt

In [4]:
# Lift pandas' column-width cap so each review is displayed in full
pd.options.display.max_colwidth = None

# Load the pickled DataFrame that already carries the sentiment label and score.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
df = pd.read_pickle('df_sentiment_score_pkl')
df.head(1)

Unnamed: 0,ID,Rating,Date,Reviewer_Location,Reviews,Branch,Rating_Type,Sentiment_Score
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you'll find Disneyland Hong Kong very similar in the layout when you walk into main street! It has a very familiar feel. One of the rides its a Small World is absolutely fabulous and worth doing. The day we visited was fairly hot and relatively busy but the queues moved fairly well.,HongKong,Positive,0.7069


# Preprocessing

In [5]:
# Text preprocessing steps - remove numbers, capital letters and punctuation

# Raw string: '\w' and '\d' are not valid escapes in a normal string literal
# and trigger a SyntaxWarning on recent Python versions.
_DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')
# Character class built from every character in string.punctuation.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))


def alphanumeric(x):
    """Replace every run of word characters that contains a digit with a space."""
    return _DIGIT_TOKEN_RE.sub(' ', x)


def punc_lower(x):
    """Lower-case the text, then replace each punctuation character with a space."""
    return _PUNCTUATION_RE.sub(' ', x.lower())


# Order matters only for readability here: digits are stripped first, then
# case and punctuation are normalised. The column is overwritten in place.
df['Reviews'] = df.Reviews.map(alphanumeric).map(punc_lower)
df.head(1)

Unnamed: 0,ID,Rating,Date,Reviewer_Location,Reviews,Branch,Rating_Type,Sentiment_Score
0,670772142,4,2019-4,Australia,if you ve ever been to disneyland anywhere you ll find disneyland hong kong very similar in the layout when you walk into main street it has a very familiar feel one of the rides its a small world is absolutely fabulous and worth doing the day we visited was fairly hot and relatively busy but the queues moved fairly well,HongKong,Positive,0.7069


In [6]:
# Compound term extraction: phrases listed here are merged into one token
# (e.g. 'hong', 'kong' -> 'hong_kong') by the multi-word-expression tokenizer.
mwe_word_list = [
    ('small', 'world'),
    ('hong', 'kong'),
    ('space', 'mountain'),
    ('happiest', 'place', 'on', 'earth'),
    ('haunted', 'mansion'),
    ('park', 'hopper'),
    ('fast', 'pass'),
    ('indiana', 'jones'),
    ('pirates', 'of', 'the', 'caribbean'),
    ('mickey', 'mouse'),
    ('peter', 'pan'),
    ('star', 'wars'),
    ('california', 'adventure'),
    ('lion', 'king'),
    ('buzz', 'lightyear'),
    ('roller', 'coaster'),
    ('toy', 'story'),
    ('jungle', 'cruise'),
    ('radiator', 'springs'),
]

mwe_tokenizer = MWETokenizer(mwe_word_list)

In [7]:
# Tokenize each cleaned review: split on whitespace, then let the MWE
# tokenizer merge the known multi-word phrases into single tokens.
def _tokenize_review(text):
    """Return the review's whitespace tokens with known phrases merged."""
    return mwe_tokenizer.tokenize(text.split())


df['mwe_clean'] = df['Reviews'].apply(_tokenize_review)
df['mwe_clean'].head(3)

0                                                                                                                                                                                                                                                                       [if, you, ve, ever, been, to, disneyland, anywhere, you, ll, find, disneyland, hong_kong, very, similar, in, the, layout, when, you, walk, into, main, street, it, has, a, very, familiar, feel, one, of, the, rides, its, a, small_world, is, absolutely, fabulous, and, worth, doing, the, day, we, visited, was, fairly, hot, and, relatively, busy, but, the, queues, moved, fairly, well]
1    [its, been, a, while, since, d, last, time, we, visit, hk, disneyland, yet, this, time, we, only, stay, in, tomorrowland, aka, marvel, land, now, they, have, iron, man, experience, n, d, newly, open, ant, man, n, d, wasp, ironman, great, feature, n, so, exciting, especially, d, whole, scenery, of, hk, hk, central, area, to, kowloon, antman, chang

In [8]:
# Re-assemble every token list into one space-separated string per review;
# these strings are what the vectorizer consumes.
mwe_clean_docs = list(map(' '.join, df['mwe_clean']))
mwe_clean_docs[:3]

['if you ve ever been to disneyland anywhere you ll find disneyland hong_kong very similar in the layout when you walk into main street it has a very familiar feel one of the rides its a small_world is absolutely fabulous and worth doing the day we visited was fairly hot and relatively busy but the queues moved fairly well',
 'its been a while since d last time we visit hk disneyland yet this time we only stay in tomorrowland aka marvel land now they have iron man experience n d newly open ant man n d wasp ironman great feature n so exciting especially d whole scenery of hk hk central area to kowloon antman changed by previous buzz_lightyear more or less d same but i m expecting to have something most however my boys like it space_mountain turns into star_wars this is great for cast members staffs felt bit minus point from before just dun feel like its a disney brand seems more local like ocean park or even worst they got no smiling face but just wanna u to enter n attraction n leave h

# Topic Modeling - CorEx

In [9]:
# Build the document-term matrix from the cleaned reviews.
# A custom token_pattern was tried and is left disabled:
#   token_pattern="\\b[a-z][a-z]+\\b"
vectorizer = CountVectorizer(
    binary=True,
    max_features=2500,
    stop_words='english',
)
doc_word = vectorizer.fit_transform(mwe_clean_docs)

# Vocabulary terms, in the same order as the columns of doc_word
words = list(vectorizer.get_feature_names_out())

In [10]:
# Size of the document-term matrix: (number of reviews, number of vocabulary terms)
doc_word.shape

(37547, 2500)

In [11]:
# Peek at the first 20 vocabulary terms produced by the vectorizer
words[:20]

['ability',
 'able',
 'absolute',
 'absolutely',
 'accept',
 'acceptable',
 'access',
 'accessible',
 'accommodate',
 'accommodating',
 'accommodation',
 'according',
 'accordingly',
 'accurate',
 'act',
 'action',
 'activities',
 'activity',
 'actors',
 'actual']

In [12]:
# Unsupervised CorEx topic model (no anchor words): 10 latent topics, fixed seed
topic_model = ct.Corex(n_hidden=10, words=words, seed=1)

# Fit on the document-term matrix; the vocabulary and raw documents are passed along
topic_model.fit(doc_word, words=words, docs=mwe_clean_docs)


<corextopic.corextopic.Corex at 0x2a34be78b50>

In [13]:
# Topics without anchored words
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # Each entry of a topic is a tuple whose first item is the word.
    # Picking item 0 directly (instead of `topic_words,_,_ = zip(*topic)`)
    # also works for a topic with no words, where the unpacking raises
    # ValueError, and it avoids rebinding `_`, which IPython uses for the
    # last cell output.
    topic_words = tuple(entry[0] for entry in topic)
    print(f'{n}: {", ".join(topic_words)}')

0: people, staff, money, told, think, know, rude, closed, let, said
1: time, use, line, fast, wait, early, fast_pass, passes, make, times
2: did, minutes, got, hour, main, way, hours, tickets, walk, street
3: food, hotel, water, meal, eat, lunch, expensive, breakfast, restaurant, drinks
4: disney, park, parks, paris, florida, magic, studios, queues, having, better
5: just, like, year, old, disneyland, really, going, little, years, trip
6: day, parade, fireworks, rides, night, went, didn, days, end, evening
7: train, manor, grizzly, castle, station, mystic, land, mtr, lion_king, hong_kong
8: characters, meet, mickey, queue, character, good, photo, minnie, children, photos
9: mountain, thunder, ride, space_mountain, indiana_jones, splash, pirates_of_the_caribbean, tours, pirates, small_world


In [17]:
# Create semi-supervised topic model with CorEx with the use of anchor words.
#
# Five anchor groups are supplied for a model with six hidden topics, so one
# topic is left without anchors:
#   1. disneyland, disney, california
#   2. disneyland, disney, paris
#   3. disneyland, disney, hong_kong   (the merged multi-word token, not 'hong kong')
#   4. disney, food
#   5. disney, parade
topic_model2 = ct.Corex(n_hidden= 6,
                       words=words,
                       seed=1)

topic_model2.fit(doc_word,
                words= words,
                docs= mwe_clean_docs,
                anchors=[
                         ['disneyland', 'disney', 'california'],
                         ['disneyland', 'disney', 'paris'],
                         ['disneyland', 'disney', 'hong_kong'],
                         ['disney', 'food'],
                         ['disney', 'parade']],
                anchor_strength=5);

In [18]:
# Topics with anchored words
topics2 = topic_model2.get_topics()

for n, topic in enumerate(topics2):
    # Each entry of a topic is a tuple whose first item is the word.
    # Picking item 0 directly (instead of `topic_words,_,_ = zip(*topic)`)
    # also works for a topic with no words, where the unpacking raises
    # ValueError, and it avoids rebinding `_`, which IPython uses for the
    # last cell output.
    topic_words = tuple(entry[0] for entry in topic)
    print(f'{n}: {", ".join(topic_words)}')

0: disney, california, world, walt, downtown, fan, disneyland, fans, southern, original
1: disney, paris, florida, disneyland, orlando, dreams, rer, comparison, typical, states
2: disney, hong_kong, land, disneyland, smaller, compared, mtr, version, compare, scale
3: food, expensive, disney, drinks, drink, quality, prices, options, bring, overpriced
4: parade, fireworks, disney, night, watch, paint, miss, lion_king, floats, amazing
5: ride, park, day, just, time, people, did, rides, hotel, got


# Preprocessing and Topic Modeling - Custom Stop Words

In [19]:
# Fetch the NLTK stop-word corpus (a no-op when it is already installed)
# and load the English list
nltk.download('stopwords')
NLTK_stop_words_list = stopwords.words('english')

# Extra words to drop in addition to the NLTK list
add_stop_words_list = ['disneyland', 'disney']

# Combined list handed to the vectorizer
final_stop_words_list = [*NLTK_stop_words_list, *add_stop_words_list]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# Rebuild the document-term matrix, this time with the combined stop-word list.
# A custom token_pattern was tried and is left disabled:
#   token_pattern="\\b[a-z][a-z]+\\b"
vectorizer2 = CountVectorizer(
    binary=True,
    max_features=2500,
    stop_words=final_stop_words_list,
)
doc_word2 = vectorizer2.fit_transform(mwe_clean_docs)

# Vocabulary terms, in the same order as the columns of doc_word2
words2 = list(vectorizer2.get_feature_names_out())

In [21]:
# Unsupervised CorEx topic model (no anchor words) on the matrix built with
# the custom stop-word list: 10 latent topics, fixed seed
topic_model3 = ct.Corex(n_hidden=10, words=words2, seed=1)

# Fit on the second document-term matrix; vocabulary and raw documents are passed along
topic_model3.fit(doc_word2, words=words2, docs=mwe_clean_docs)

<corextopic.corextopic.Corex at 0x2a363c5e7c0>

In [22]:
# Topics without anchored words
topics3 = topic_model3.get_topics()
for n, topic in enumerate(topics3):
    # Each entry of a topic is a tuple whose first item is the word.
    # Picking item 0 directly (instead of `topic_words,_,_ = zip(*topic)`)
    # also works for a topic with no words, where the unpacking raises
    # ValueError, and it avoids rebinding `_`, which IPython uses for the
    # last cell output.
    topic_words = tuple(entry[0] for entry in topic)
    print(f'{n}: {", ".join(topic_words)}')

0: people, would, could, money, told, think, know, find, said, let
1: ride, one, day, minutes, got, mountain, hour, thunder, hours, space_mountain
2: food, paris, queues, queue, good, parks, staff, expensive, found, stayed
3: get, take, time, go, early, want, make, water, plan, sure
4: hotel, around, characters, see, lunch, breakfast, eat, meal, restaurant, dinner
5: show, parade, main, castle, fireworks, street, mickey, night, train, lion_king
6: year, back, old, even, days, cast, trip, last, made, going
7: like, us, first, many, experience, magic, things, better, different, visit
8: passes, times, line, use, fast, wait, fast_pass, indiana_jones, tours, star
9: park, rides, also, really, much, well, went, bit, quite, worth


In [23]:
# Create semi-supervised topic model with CorEx with the use of anchor words.
#
# Four anchor groups are supplied for a model with six hidden topics, so two
# topics are left without anchors:
#   1. park, california
#   2. park, paris
#   3. park, hong_kong   (the merged multi-word token, not 'hong kong')
#   4. food              (anchored on its own, without 'park')
topic_model4 = ct.Corex(n_hidden= 6,
                       words=words2,
                       seed=1)

topic_model4.fit(doc_word2,
                words= words2,
                docs= mwe_clean_docs,
                anchors=[
                         ['park', 'california'],
                         ['park', 'paris'],
                         ['park', 'hong_kong'],
                         ['food']],
                anchor_strength=5);

In [57]:
# Topics with anchored words
topics4 = topic_model4.get_topics()

for n, topic in enumerate(topics4):
    # Each entry of a topic is a tuple whose first item is the word.
    # Picking item 0 directly (instead of `topic_words,_,_ = zip(*topic)`)
    # also works for a topic with no words, where the unpacking raises
    # ValueError, and it avoids rebinding `_`, which IPython uses for the
    # last cell output.
    topic_words = tuple(entry[0] for entry in topic)
    print(f'{n}: {", ".join(topic_words)}')

0: park, california, theme, hopper, clean, amusement, ocean, original, history, navigate
1: park, paris, would, people, us, parks, could, hour, staff, two
2: park, hong, kong, smaller, maintained, smallest, subway, attractive, senior, organized
3: food, expensive, drinks, drink, quality, prices, options, overpriced, priced, pricey
4: ride, get, one, mountain, time, day, rides, also, go, space
5: hotel, take, around, show, see, good, characters, queue, parade, want
