# Load Data

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from collections import Counter

# Preprocessing Tools
import re
import string

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import LancasterStemmer, PorterStemmer, WordNetLemmatizer

import spacy

# Document-Term Matrix
from sklearn.feature_extraction.text import CountVectorizer

# Topic Modeling
from corextopic import corextopic as ct
from corextopic import vis_topic as vt

In [2]:
# Set the display constraints to be able to read entire reviews
pd.set_option('display.max_colwidth', None)

# open pickle file with sentiment and score
df = pd.read_pickle('df_sentiment_score_pkl')
df.head(1)

Unnamed: 0,ID,Rating,Date,Reviewer_Location,Reviews,Branch,Sentiment,Score
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you'll find Disneyland Hong Kong very similar in the layout when you walk into main street! It has a very familiar feel. One of the rides its a Small World is absolutely fabulous and worth doing. The day we visited was fairly hot and relatively busy but the queues moved fairly well.,Disneyland_HongKong,Positive,0.7069


# Preprocessing

In [11]:
# Text preprocessing steps - remove numbers, captial letters and punctuation
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

df['Reviews'] = df.Reviews.map(alphanumeric).map(punc_lower)
df.head(1)

Unnamed: 0,ID,Rating,Date,Reviewer_Location,Reviews,Branch,Sentiment,Score
0,670772142,4,2019-4,Australia,if you ve ever been to disneyland anywhere you ll find disneyland hong kong very similar in the layout when you walk into main street it has a very familiar feel one of the rides its a small world is absolutely fabulous and worth doing the day we visited was fairly hot and relatively busy but the queues moved fairly well,Disneyland_HongKong,Positive,0.7069


# Topic Modeling - CorEx

In [12]:
vectorizer = CountVectorizer(max_features=2500,
                             stop_words='english', 
                             token_pattern="\\b[a-z][a-z]+\\b",
                             binary=True)
doc_word = vectorizer.fit_transform(df.Reviews)
words = list(np.asarray(vectorizer.get_feature_names_out()))

In [13]:
doc_word.shape

(37547, 2500)

In [14]:
words[:20]

['ability',
 'able',
 'absolute',
 'absolutely',
 'accept',
 'acceptable',
 'access',
 'accessible',
 'accommodate',
 'accommodating',
 'accommodation',
 'according',
 'accordingly',
 'accurate',
 'act',
 'action',
 'activities',
 'activity',
 'actors',
 'actual']

In [15]:
# Create an unsupervised topic model with CorEx without the use of anchor words
topic_model = ct.Corex(n_hidden= 10,
                       words=words, 
                       seed=1)

topic_model.fit(doc_word,         
                words= words,     
                docs= df.Reviews)


<corextopic.corextopic.Corex at 0x1ca2d72bfa0>

In [16]:
# Topics without anchored words
topics = topic_model.get_topics()
for n,topic in enumerate(topics):
    topic_words,_,_ = zip(*topic)
    print(f'{n}: {", ".join(topic_words)}')

0: people, way, money, told, think, know, said, let, rude, thing
1: food, hotel, paris, queues, queue, good, staff, parks, stayed, expensive
2: day, park, did, got, minutes, rides, hour, went, didn, hours
3: main, king, street, small, train, castle, lion, parade, hong, story
4: disney, just, like, disneyland, really, world, magic, experience, going, year
5: mountain, space, pirates, thunder, jones, ride, indiana, caribbean, peter, star
6: water, bring, snacks, buy, drinks, need, outside, walking, free, sit
7: time, line, wait, make, early, don, want, times, plan, long
8: characters, mickey, meet, breakfast, character, minnie, lunch, mouse, princess, old
9: fast, pass, passes, use, california, adventure, haunted, mansion, lines, hopper


In [17]:
# Create semi-supervised topic model with CorEx with the use of anchor words
"""
The anchors we'll use are:

1. disneyland, disney, california
2. disneyland, paris
3. disneyland, hong kong
4. disney, food

"""
topic_model2 = ct.Corex(n_hidden= 6,
                       words=words,  
                       seed=1)

topic_model2.fit(doc_word,
                words= words,
                docs= df.Reviews,
                anchors=[
                         ['disneyland', 'disney', 'california'],
                         ['disneyland', 'disney', 'paris'],
                         ['disneyland', 'disney', 'hong', 'kong'],
                         ['disney', 'food']],
                anchor_strength=5);

In [18]:
# Topics with anchored words
topics2 = topic_model2.get_topics()

for n,topic in enumerate(topics2):
    topic_words,_,_ = zip(*topic)
    print(f'{n}: {", ".join(topic_words)}')

0: disneyland, california, disney, adventure, world, hopper, southern, anaheim, original, radiator
1: disney, paris, walt, florida, disneyland, downtown, orlando, smaller, fan, euro
2: disneyland, disney, hong, kong, compared, mtr, hk, unique, tokyo, version
3: food, expensive, disney, drinks, drink, quality, prices, options, bring, overpriced
4: ride, park, day, people, just, time, rides, did, fast, minutes
5: hotel, characters, main, good, mickey, parade, really, queue, small, castle


# More preprocessing

In [19]:
#Loading Stopwords into a list
nltk.download('stopwords')
NLTK_stop_words_list = stopwords.words('english')

# Adding new stop words
add_stop_words_list = ['disneyland', 'disney']

final_stop_words_list = (NLTK_stop_words_list + add_stop_words_list)    

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# Vectorize
vectorizer2 = CountVectorizer(max_features=2500,
                             stop_words=final_stop_words_list,
                             token_pattern="\\b[a-z][a-z]+\\b",
                             binary=True)
doc_word2 = vectorizer2.fit_transform(df.Reviews)
words2 = list(np.asarray(vectorizer2.get_feature_names_out()))

In [49]:
# Create an unsupervised topic model with CorEx without the use of anchor words
topic_model3 = ct.Corex(n_hidden= 10,
                       words=words2, 
                       seed=1)

topic_model3.fit(doc_word2,         
                words= words2,     
                docs= df.Reviews)

<corextopic.corextopic.Corex at 0x1bf0a8e6190>

In [50]:
# Topics without anchored words
topics3 = topic_model3.get_topics()
for n,topic in enumerate(topics3):
    topic_words,_,_ = zip(*topic)
    print(f'{n}: {", ".join(topic_words)}')

0: people, would, us, could, money, think, told, another, going, know
1: one, day, park, minutes, got, hour, tickets, hours, way, two
2: get, fast, pass, time, use, line, early, wait, go, passes
3: hotel, paris, queues, staff, queue, parks, stayed, studios, half, children
4: characters, mickey, main, see, meet, street, breakfast, character, lunch, old
5: like, even, first, back, many, years, last, trip, never, made
6: show, parade, rides, fireworks, also, night, really, castle, went, well
7: mountain, space, pirates, jones, ride, thunder, indiana, caribbean, haunted, mansion
8: food, water, take, expensive, around, good, snacks, drinks, eat, bring
9: kong, hong, train, king, lion, small, story, toy, grizzly, mystic


In [56]:
# Create semi-supervised topic model with CorEx with the use of anchor words
"""
The anchors we'll use are:

1. park, california
2. park, paris
3. park, hong kong
4. park, food

"""
topic_model4 = ct.Corex(n_hidden= 6,
                       words=words2,  
                       seed=1)

topic_model4.fit(doc_word2,
                words= words2,
                docs= df.Reviews,
                anchors=[
                         ['park', 'california'],
                         ['park', 'paris'],
                         ['park', 'hong', 'kong'],
                         ['food']],
                anchor_strength=5);

In [57]:
# Topics with anchored words
topics4 = topic_model4.get_topics()

for n,topic in enumerate(topics4):
    topic_words,_,_ = zip(*topic)
    print(f'{n}: {", ".join(topic_words)}')

0: park, california, theme, hopper, clean, amusement, ocean, original, history, navigate
1: park, paris, would, people, us, parks, could, hour, staff, two
2: park, hong, kong, smaller, maintained, smallest, subway, attractive, senior, organized
3: food, expensive, drinks, drink, quality, prices, options, overpriced, priced, pricey
4: ride, get, one, mountain, time, day, rides, also, go, space
5: hotel, take, around, show, see, good, characters, queue, parade, want


# More Preprocessing

## Tokenize

In [9]:
# Use a multi-word tokenizer to link words
from nltk.tokenize import MWETokenizer

mwe_tokenizer = MWETokenizer(['hong', 'kong'])
mwe_tokens = pd.DataFrame(mwe_tokenizer.tokenize(word_tokenize(doc) for doc in corpus))


TypeError: object of type 'generator' has no len()

In [None]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer