In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from collections import Counter

# Preprocessing Tools
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import spacy

# Document-Term Matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Dimensionality Reduction with Topic Modeling
from sklearn.decomposition import TruncatedSVD, NMF

# Topic Modeling
from corextopic import corextopic as ct
from corextopic import vis_topic as vt

In [4]:
# Set the display constraints to be able to read entire reviews
pd.set_option('display.max_colwidth', None)

# open pickle file with sentiment and score
df = pd.read_pickle('df_sentiment_score_pkl')
df.head(3)

Unnamed: 0,ID,Rating,Date,Reviewer_Location,Reviews,Branch,Sentiment,Score
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you'll find Disneyland Hong Kong very similar in the layout when you walk into main street! It has a very familiar feel. One of the rides its a Small World is absolutely fabulous and worth doing. The day we visited was fairly hot and relatively busy but the queues moved fairly well.,Disneyland_HongKong,Positive,0.7069
1,670682799,4,2019-5,Philippines,"Its been a while since d last time we visit HK Disneyland .. Yet, this time we only stay in Tomorrowland .. AKA Marvel land!Now they have Iron Man Experience n d Newly open Ant Man n d Wasp!!Ironman .. Great feature n so Exciting, especially d whole scenery of HK (HK central area to Kowloon)!Antman .. Changed by previous Buzz lightyear! More or less d same, but I'm expecting to have something most!!However, my boys like it!!Space Mountain .. Turns into Star Wars!! This 1 is Great!!!For cast members (staffs) .. Felt bit MINUS point from before!!! Just dun feel like its a Disney brand!! Seems more local like Ocean Park or even worst!!They got no SMILING face, but just wanna u to enter n attraction n leave!!Hello this is supposed to be Happiest Place on Earth brand!! But, just really Dont feel it!!Bakery in Main Street now have more attractive delicacies n Disney theme sweets .. These are Good Points!!Last, they also have Starbucks now inside the theme park!!",Disneyland_HongKong,Positive,0.9853
2,670623270,4,2019-4,United Arab Emirates,"Thanks God it wasn t too hot or too humid when I was visiting the park otherwise it would be a big issue (there is not a lot of shade).I have arrived around 10:30am and left at 6pm. Unfortunately I didn t last until evening parade, but 8.5 hours was too much for me.There is plenty to do and everyone will find something interesting for themselves to enjoy.It wasn t extremely busy and the longest time I had to queue for certain attractions was 45 minutes (which is really not that bad).Although I had an amazing time, I felt a bit underwhelmed with choice of rides and attractions. The park itself is quite small (I was really expecting something grand even the main castle which was closed by the way was quite small).The food options are good, few coffee shops (including Starbucks) and plenty of gift shops. There was no issue with toilets as they are everywhere.All together it was a great day out and I really enjoyed it.",Disneyland_HongKong,Positive,0.992


# spaCy

In [5]:
# Load our chosen language model
nlp = spacy.load('en_core_web_sm')

In [7]:
# Create a column in the dataframe for the parsed spaCy document for the Reviews column
docs = list(nlp.pipe(df.Reviews))
df['spacy_doc'] = docs

In [8]:
df.head(3)

Unnamed: 0,ID,Rating,Date,Reviewer_Location,Reviews,Branch,Sentiment,Score,spacy_doc
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you'll find Disneyland Hong Kong very similar in the layout when you walk into main street! It has a very familiar feel. One of the rides its a Small World is absolutely fabulous and worth doing. The day we visited was fairly hot and relatively busy but the queues moved fairly well.,Disneyland_HongKong,Positive,0.7069,"(If, you, 've, ever, been, to, Disneyland, anywhere, you, 'll, find, Disneyland, Hong, Kong, very, similar, in, the, layout, when, you, walk, into, main, street, !, It, has, a, very, familiar, feel, ., One, of, the, rides, , its, a, Small, World, , is, absolutely, fabulous, and, worth, doing, ., The, day, we, visited, was, fairly, hot, and, relatively, busy, but, the, queues, moved, fairly, well, .)"
1,670682799,4,2019-5,Philippines,"Its been a while since d last time we visit HK Disneyland .. Yet, this time we only stay in Tomorrowland .. AKA Marvel land!Now they have Iron Man Experience n d Newly open Ant Man n d Wasp!!Ironman .. Great feature n so Exciting, especially d whole scenery of HK (HK central area to Kowloon)!Antman .. Changed by previous Buzz lightyear! More or less d same, but I'm expecting to have something most!!However, my boys like it!!Space Mountain .. Turns into Star Wars!! This 1 is Great!!!For cast members (staffs) .. Felt bit MINUS point from before!!! Just dun feel like its a Disney brand!! Seems more local like Ocean Park or even worst!!They got no SMILING face, but just wanna u to enter n attraction n leave!!Hello this is supposed to be Happiest Place on Earth brand!! But, just really Dont feel it!!Bakery in Main Street now have more attractive delicacies n Disney theme sweets .. These are Good Points!!Last, they also have Starbucks now inside the theme park!!",Disneyland_HongKong,Positive,0.9853,"(Its, been, a, while, since, d, last, time, we, visit, HK, Disneyland, .., Yet, ,, this, time, we, only, stay, in, Tomorrowland, .., AKA, Marvel, land!Now, they, have, Iron, Man, Experience, n, d, Newly, open, Ant, Man, n, d, Wasp!!Ironman, .., Great, feature, n, so, Exciting, ,, especially, d, whole, scenery, of, HK, (, HK, central, area, to, Kowloon)!Antman, .., Changed, by, previous, Buzz, lightyear, !, More, or, less, d, same, ,, but, I, 'm, expecting, to, have, something, most!!However, ,, my, boys, like, it!!Space, Mountain, .., Turns, into, Star, Wars, !, !, This, 1, is, Great!!!For, cast, members, (, ...)"
2,670623270,4,2019-4,United Arab Emirates,"Thanks God it wasn t too hot or too humid when I was visiting the park otherwise it would be a big issue (there is not a lot of shade).I have arrived around 10:30am and left at 6pm. Unfortunately I didn t last until evening parade, but 8.5 hours was too much for me.There is plenty to do and everyone will find something interesting for themselves to enjoy.It wasn t extremely busy and the longest time I had to queue for certain attractions was 45 minutes (which is really not that bad).Although I had an amazing time, I felt a bit underwhelmed with choice of rides and attractions. The park itself is quite small (I was really expecting something grand even the main castle which was closed by the way was quite small).The food options are good, few coffee shops (including Starbucks) and plenty of gift shops. There was no issue with toilets as they are everywhere.All together it was a great day out and I really enjoyed it.",Disneyland_HongKong,Positive,0.992,"(Thanks, God, it, wasn, , t, too, hot, or, too, humid, when, I, was, visiting, the, park, , otherwise, it, would, be, a, big, issue, (, there, is, not, a, lot, of, shade).I, have, arrived, around, 10:30am, and, left, at, 6, pm, ., Unfortunately, I, didn, , t, last, until, evening, parade, ,, but, 8.5, hours, was, too, much, for, me, ., There, is, plenty, to, do, and, everyone, will, find, something, interesting, for, themselves, to, enjoy, ., It, wasn, , t, extremely, busy, and, the, longest, time, I, had, to, queue, for, certain, attractions, was, 45, minutes, (, which, ...)"


In [9]:
df[['spacy_doc', 'Sentiment', 'Score']].sample(5)

Unnamed: 0,spacy_doc,Sentiment,Score
35844,"(The, most, magical, place, on, earth, !, I, absolutely, adore, Main, Street, USA, as, it, feels, as, if, you, 've, traveled, back, in, time, ., My, favourite, land, ,, however, ,, is, Fantasyland, as, I, can, let, my, imagination, run, wild, !, Some, fantastic, rides, overall, and, brilliant, restaurants, too, ., The, shops, are, also, great, ., Wonderful, facilities, and, clean, toilets, !)",Positive,0.9741
24244,"(For, those, who, have, been, ,, have, their, own, wonderful, experiences, to, share, ., I, have, visited, this, park, over, 30, times, ,, over, a, 20, year, time, span, and, every, time, is, different, ., There, is, lots, of, walking, so, put, on, comfy, shoes, and, go, !, I, really, do, n't, think, there, is, a, bad, ride, ., There, is, a, ride, for, every, one, of, all, ages, to, endure, ., I, like, the, fact, that, there, are, Fast, Pass, tickets, to, use, if, the, lines, are, outrageously, long, ., These, are, n't, offered, on, all, rides, ,, but, ...)",Positive,0.9487
17793,"(Disneyland, Park, is, a, wonderful, place, to, visit, ..., I, have, been, there, many, times, in, the, last, 10, years, and, always, had, a, good, time, ...., lots, of, cool, rides, not, many, new, ones, over, the, last, few, years, and, Disney, does, make, every, effort, for, you, to, enjoy, your, stay, ..., we, visited, in, July, and, had, a, 5, day, park, hopper, pass, ....., I, did, get, a, better, price, from, Arestravel, , Mousesaver, and, was, happy, with, that, ..., some, of, the, rides, broke, down, during, our, visit, which, was, most, unfortunate, specially, if, you, wait, in, ...)",Positive,0.9281
30345,"(I, have, now, visited, Disneyland, Park, in, Paris, twice, in, the, last, 14, months, ., My, first, visit, was, in, April, 2017, and, the, second, just, last, month, for, my, 21st, on, 16th, May, (, which, was, a, surprise, !, !, ), ., This, Disneyland, Park, is, such, an, amazing, alternative, to, DisneyWorld, in, Orlando, ., There, is, plenty, of, characters, to, go, and, meet, throughout, the, park, and, there, is, always, shows, and, parades, going, on, everyday, ., When, I, was, there, was, a, show, on, throughout, the, day, called, Princesses, Vs, Pirates, ,, it, was, amazing, and, so, ...)",Positive,0.9476
36380,"(Great, great, great, for, all, of, us, not, just, children, ., You, can, spend, all, day, and, never, get, borred, ., Must, see, :, the, sleeping, beauty, castle, ,, snow, white, ,, pinochio, ,, peter, pan, ,, go, on, a, ride, on, dumbo, 's, elephants, or, the, mad, hatter, 's, tea, cups, ;, go, through, alice, 's, labirinth, ,, it, 's, a, small, world, (, really, great, ), ,, go, meet, mickey, mouse, and, see, old, cartoons, with, mickey, ., The, parade, is, not, to, be, missed, and, the, show, at, 22:30, is, great, you, will, love, it, ., In, ...)",Positive,0.9831


## Save to Pickle File

In [13]:
df.to_pickle('df_sentiment_score_spacy_doc_pkl')

In [3]:
# Read in the pickle file
df = pd.read_pickle('df_sentiment_score_spacy_doc_pkl')

## POS by Sentiment

In [15]:
# Filter out the positive reviews
pos_reviews = df[df['Sentiment']=='Positive']

# Extract the adjectives in the positive reviews
pos_adj = [token.text.lower() for doc in pos_reviews.spacy_doc for token in doc if token.pos_=='ADJ']

# Find the 20 most common adjectives in the positive reviews
top_pos_adj = Counter(pos_adj).most_common(20)
top_pos_adj

[('great', 14402),
 ('good', 9898),
 ('more', 6570),
 ('fast', 6234),
 ('many', 6105),
 ('other', 5822),
 ('long', 5737),
 ('worth', 5644),
 ('first', 5556),
 ('amazing', 5535),
 ('old', 5300),
 ('best', 5272),
 ('little', 5116),
 ('expensive', 4645),
 ('most', 4446),
 ('magical', 4295),
 ('small', 4194),
 ('few', 4056),
 ('much', 3687),
 ('sure', 3651)]

In [16]:
# Do the same for the negative reviews
neg_reviews = df[df['Sentiment']=='Negative']

neg_adj = [token.text.lower() for doc in neg_reviews.spacy_doc for token in doc if token.pos_=='ADJ']

# Top 20 common adjectives in negative reviews
top_neg_adj = Counter(neg_adj).most_common(20)
top_neg_adj

[('many', 1314),
 ('other', 1110),
 ('good', 1050),
 ('more', 1016),
 ('long', 962),
 ('fast', 915),
 ('old', 780),
 ('expensive', 723),
 ('few', 715),
 ('small', 693),
 ('first', 647),
 ('most', 641),
 ('same', 624),
 ('great', 617),
 ('better', 611),
 ('last', 542),
 ('little', 521),
 ('rude', 499),
 ('disappointed', 490),
 ('much', 468)]

**It makes sense that for the positive list, the top words are "great" and "good"; and for the negative list, the top words are "many", "other", and "good".  We may need to clean up our documents further to get cleaner insights.**

## Dependency Parsing

In [5]:
from spacy.symbols import amod
from pprint import pprint

In [13]:
# Function to extract adjective modifiers given noun of interest
def get_amods(noun, ser):
    amod_list = []
    for doc in ser:
        for token in doc:
            if (token.text) == noun:
                for child in token.children:
                    if child.dep == amod:
                        amod_list.append(child.text.lower())
    return sorted(amod_list)

# Function to print 20 most common adjective modifiers (along with their frequency) 
# given noun of interest, categorized by sentiment
def amods_by_sentiment(noun):
    print(f"Adjectives describing {str.upper(noun)}:\n")
    
    print("POSITIVE:")
    pos_list = get_amods(noun, pos_reviews.spacy_doc)
    pprint(Counter(pos_list).most_common(20))
    
    print("\nNEGATIVE:")
    neg_list = get_amods(noun, neg_reviews.spacy_doc)
    pprint(Counter(neg_list).most_common(20))

In [14]:
# Find sentiments around "food"
amods_by_sentiment("food")

Adjectives describing FOOD:

POSITIVE:
[('own', 312),
 ('fast', 309),
 ('good', 176),
 ('great', 129),
 ('expensive', 87),
 ('outside', 43),
 ('best', 40),
 ('available', 39),
 ('decent', 31),
 ('vegetarian', 31),
 ('indian', 30),
 ('chinese', 29),
 ('better', 28),
 ('nice', 28),
 ('much', 22),
 ('healthy', 21),
 ('asian', 19),
 ('overpriced', 19),
 ('western', 19),
 ('delicious', 18)]

NEGATIVE:
[('fast', 78),
 ('own', 39),
 ('expensive', 36),
 ('overpriced', 19),
 ('poor', 17),
 ('good', 16),
 ('terrible', 16),
 ('bad', 14),
 ('priced', 12),
 ('awful', 11),
 ('decent', 8),
 ('horrible', 8),
 ('mediocre', 8),
 ('same', 8),
 ('better', 7),
 ('great', 7),
 ('hot', 7),
 ('much', 6),
 ('only', 6),
 ('available', 5)]


In [8]:
# Find sentiments around "park"
amods_by_sentiment("park")

Adjectives describing PARK:

POSITIVE:
[('whole', 310),
 ('other', 274),
 ('main', 217),
 ('great', 202),
 ('original', 191),
 ('entire', 166),
 ('small', 109),
 ('best', 105),
 ('smaller', 99),
 ('nice', 74),
 ('beautiful', 67),
 ('good', 59),
 ('first', 56),
 ('clean', 55),
 ('amazing', 50),
 ('big', 45),
 ('huge', 37),
 ('wonderful', 36),
 ('second', 34),
 ('only', 32)]

NEGATIVE:
[('other', 48),
 ('whole', 45),
 ('main', 37),
 ('entire', 21),
 ('small', 16),
 ('local', 14),
 ('original', 12),
 ('crowded', 10),
 ('first', 10),
 ('great', 9),
 ('worst', 8),
 ('smaller', 7),
 ('better', 6),
 ('full', 6),
 ('new', 6),
 ('second', 6),
 ('actual', 5),
 ('beautiful', 5),
 ('big', 4),
 ('expensive', 4)]


In [9]:
# Find sentiments around "experience"
amods_by_sentiment("experience")

Adjectives describing EXPERIENCE:

POSITIVE:
[('great', 498),
 ('magical', 313),
 ('amazing', 251),
 ('whole', 235),
 ('wonderful', 182),
 ('first', 152),
 ('good', 140),
 ('overall', 117),
 ('best', 94),
 ('different', 80),
 ('fantastic', 72),
 ('fun', 66),
 ('awesome', 57),
 ('nice', 55),
 ('enjoyable', 53),
 ('memorable', 51),
 ('new', 46),
 ('bad', 44),
 ('full', 42),
 ('same', 38)]

NEGATIVE:
[('whole', 52),
 ('magical', 47),
 ('bad', 37),
 ('worst', 28),
 ('better', 27),
 ('great', 26),
 ('disappointing', 22),
 ('good', 21),
 ('terrible', 21),
 ('poor', 20),
 ('first', 19),
 ('same', 14),
 ('overall', 11),
 ('horrible', 9),
 ('awful', 8),
 ('different', 8),
 ('miserable', 8),
 ('pleasant', 8),
 ('similar', 8),
 ('enjoyable', 7)]


In [10]:
# Find sentiments around "price"
amods_by_sentiment("price")

Adjectives describing PRICE:

POSITIVE:
[('same', 72),
 ('reasonable', 42),
 ('full', 34),
 ('high', 27),
 ('good', 16),
 ('great', 12),
 ('discounted', 11),
 ('cheaper', 10),
 ('half', 9),
 ('average', 8),
 ('best', 8),
 ('higher', 8),
 ('normal', 8),
 ('small', 8),
 ('better', 7),
 ('reduced', 7),
 ('regular', 7),
 ('decent', 6),
 ('expensive', 6),
 ('lower', 6)]

NEGATIVE:
[('full', 60),
 ('same', 20),
 ('high', 6),
 ('extortionate', 3),
 ('low', 3),
 ('premium', 3),
 ('ridiculous', 3),
 ('average', 2),
 ('good', 2),
 ('hefty', 2),
 ('normal', 2),
 ('reasonable', 2),
 ('stupid', 2),
 ('usual', 2),
 ('affordable', 1),
 ('astronomic', 1),
 ('big', 1),
 ('cheaper', 1),
 ('cheapest', 1),
 ('correct', 1)]


# More Preprocessing

In [8]:
# Remove stop words (but retain "not" to handle negative sentiment) and punctuation; extract only the lemmatized, lowercase text from spacy_doc
docs_clean = [[w.lemma_.lower() for w in doc if (not w.is_stop and not w.is_punct and not w.like_num) \
             or (w.lemma =='not')] for doc in docs]

# Create a column with the cleaned docs
df['docs_clean'] = docs_clean

In [12]:
df[['Reviews', 'spacy_doc', 'docs_clean', 'Sentiment', 'Score']].head(3)

Unnamed: 0,Reviews,spacy_doc,docs_clean,Sentiment,Score
0,If you've ever been to Disneyland anywhere you'll find Disneyland Hong Kong very similar in the layout when you walk into main street! It has a very familiar feel. One of the rides its a Small World is absolutely fabulous and worth doing. The day we visited was fairly hot and relatively busy but the queues moved fairly well.,"(If, you, 've, ever, been, to, Disneyland, anywhere, you, 'll, find, Disneyland, Hong, Kong, very, similar, in, the, layout, when, you, walk, into, main, street, !, It, has, a, very, familiar, feel, ., One, of, the, rides, , its, a, Small, World, , is, absolutely, fabulous, and, worth, doing, ., The, day, we, visited, was, fairly, hot, and, relatively, busy, but, the, queues, moved, fairly, well, .)","[disneyland, find, disneyland, hong, kong, similar, layout, walk, main, street, familiar, feel, ride, , small, world, , absolutely, fabulous, worth, day, visit, fairly, hot, relatively, busy, queue, move, fairly]",Positive,0.7069
1,"Its been a while since d last time we visit HK Disneyland .. Yet, this time we only stay in Tomorrowland .. AKA Marvel land!Now they have Iron Man Experience n d Newly open Ant Man n d Wasp!!Ironman .. Great feature n so Exciting, especially d whole scenery of HK (HK central area to Kowloon)!Antman .. Changed by previous Buzz lightyear! More or less d same, but I'm expecting to have something most!!However, my boys like it!!Space Mountain .. Turns into Star Wars!! This 1 is Great!!!For cast members (staffs) .. Felt bit MINUS point from before!!! Just dun feel like its a Disney brand!! Seems more local like Ocean Park or even worst!!They got no SMILING face, but just wanna u to enter n attraction n leave!!Hello this is supposed to be Happiest Place on Earth brand!! But, just really Dont feel it!!Bakery in Main Street now have more attractive delicacies n Disney theme sweets .. These are Good Points!!Last, they also have Starbucks now inside the theme park!!","(Its, been, a, while, since, d, last, time, we, visit, HK, Disneyland, .., Yet, ,, this, time, we, only, stay, in, Tomorrowland, .., AKA, Marvel, land!Now, they, have, Iron, Man, Experience, n, d, Newly, open, Ant, Man, n, d, Wasp!!Ironman, .., Great, feature, n, so, Exciting, ,, especially, d, whole, scenery, of, HK, (, HK, central, area, to, Kowloon)!Antman, .., Changed, by, previous, Buzz, lightyear, !, More, or, less, d, same, ,, but, I, 'm, expecting, to, have, something, most!!However, ,, my, boys, like, it!!Space, Mountain, .., Turns, into, Star, Wars, !, !, This, 1, is, Great!!!For, cast, members, (, ...)","[d, time, visit, hk, disneyland, time, stay, tomorrowland, aka, marvel, land!now, iron, man, experience, n, d, newly, open, ant, man, n, d, wasp!!ironman, great, feature, n, exciting, especially, d, scenery, hk, hk, central, area, kowloon)!antman, change, previous, buzz, lightyear, d, expect, most!!however, boy, like, it!!space, mountain, turn, star, wars, great!!!for, cast, member, staff, feel, bit, minus, point, dun, feel, like, disney, brand, local, like, ocean, park, worst!!they, get, smiling, face, wanna, u, enter, n, attraction, n, leave!!hello, suppose, happiest, place, earth, brand, not, feel, it!!bakery, main, street, attractive, delicacy, n, disney, theme, sweet, good, points!!last, starbuck, inside, theme, park]",Positive,0.9853
2,"Thanks God it wasn t too hot or too humid when I was visiting the park otherwise it would be a big issue (there is not a lot of shade).I have arrived around 10:30am and left at 6pm. Unfortunately I didn t last until evening parade, but 8.5 hours was too much for me.There is plenty to do and everyone will find something interesting for themselves to enjoy.It wasn t extremely busy and the longest time I had to queue for certain attractions was 45 minutes (which is really not that bad).Although I had an amazing time, I felt a bit underwhelmed with choice of rides and attractions. The park itself is quite small (I was really expecting something grand even the main castle which was closed by the way was quite small).The food options are good, few coffee shops (including Starbucks) and plenty of gift shops. There was no issue with toilets as they are everywhere.All together it was a great day out and I really enjoyed it.","(Thanks, God, it, wasn, , t, too, hot, or, too, humid, when, I, was, visiting, the, park, , otherwise, it, would, be, a, big, issue, (, there, is, not, a, lot, of, shade).I, have, arrived, around, 10:30am, and, left, at, 6, pm, ., Unfortunately, I, didn, , t, last, until, evening, parade, ,, but, 8.5, hours, was, too, much, for, me, ., There, is, plenty, to, do, and, everyone, will, find, something, interesting, for, themselves, to, enjoy, ., It, wasn, , t, extremely, busy, and, the, longest, time, I, had, to, queue, for, certain, attractions, was, 45, minutes, (, which, ...)","[thank, god, wasn, , t, hot, humid, visit, park, , big, issue, lot, shade).i, arrive, 10:30am, leave, pm, unfortunately, didn, , t, evening, parade, hour, plenty, find, interesting, enjoy, wasn, , t, extremely, busy, long, time, queue, certain, attraction, minute, bad).although, amazing, time, feel, bit, underwhelmed, choice, ride, attraction, park, small, expect, grand, , main, castle, close, way, small).the, food, option, good, coffee, shop, include, starbucks, plenty, gift, shop, issue, toilet, great, day, enjoy]",Positive,0.992


## Save to Pickle File

In [39]:
df.to_pickle('df_sentiment_score_spacy_docs_cleaned_pkl')

In [68]:
# Read in the pickle file
df = pd.read_pickle('df_sentiment_score_spacy_docs_cleaned_pkl')

## POS by Sentiment 2

In [3]:
# Let's take a look again at the list of most common adjectives for both positive and negative reviews

# Filter out the positive/negative reviews
pos_reviews = df[df['Sentiment']=='Positive']
neg_reviews = df[df['Sentiment']=='Negative']

# Extract the adjectives in the positive/negative reviews
pos_adj_2 = [token.text for doc in pos_reviews.docs_clean for token in doc if token.pos_=='ADJ']

neg_adj_2 = [token.text for doc in neg_reviews.docs_clean for token in doc if token.pos_=='ADJ']


# Find the 10 most common adjectives in the positive/negative reviews
top_pos_adj_2 = Counter(pos_adj_2).most_common(10)
top_neg_adj_2 = Counter(neg_adj_2).most_common(10)

AttributeError: 'str' object has no attribute 'pos_'

# Vectorizers

In [69]:
# Merge all tokens together and build Vectorizer with cleaned text data
docs_list_clean = [' '.join(doc) for doc in df.docs_clean]
docs_list_clean[:3]

['disneyland find disneyland hong kong similar layout walk main street familiar feel ride   small world   absolutely fabulous worth day visit fairly hot relatively busy queue move fairly',
 'd time visit hk disneyland time stay tomorrowland aka marvel land!now iron man experience n d newly open ant man n d wasp!!ironman great feature n exciting especially d scenery hk hk central area kowloon)!antman change previous buzz lightyear d expect most!!however boy like it!!space mountain turn star wars great!!!for cast member staff feel bit minus point dun feel like disney brand local like ocean park worst!!they get smiling face wanna u enter n attraction n leave!!hello suppose happiest place earth brand not feel it!!bakery main street attractive delicacy n disney theme sweet good points!!last starbuck inside theme park',
 'thank god wasn    t hot humid visit park    big issue lot shade).i arrive 10:30am leave pm unfortunately didn    t evening parade hour plenty find interesting enjoy wasn   

### CountVectorizer

In [70]:
# Fit a CountVectorizer
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(docs_list_clean)

df_X = pd.DataFrame(X.toarray(), columns=cv.get_feature_names_out())
df_X.head()

Unnamed: 0,00,000,0000,0000hrs,005,00a,00am,00euros,00for,00h,...,zonewise,zoo,zoom,zootopia,zoover,zorb,zorg,zulqairil,zurg,zurich
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TdifVectorizer

In [76]:
# Fit a TdifVectorizer
tfv = TfidfVectorizer(stop_words='english')
X2 = tfv.fit_transform(docs_list_clean)

df_X2 = pd.DataFrame(X2.toarray(), columns=tfv.get_feature_names_out())
df_X2.head()

Unnamed: 0,00,000,0000,0000hrs,005,00a,00am,00euros,00for,00h,...,zonewise,zoo,zoom,zootopia,zoover,zorb,zorg,zulqairil,zurg,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Topic Modeling w CountVectorizer

In [60]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

### NMF

In [71]:
num_topics=4

nmf_topics = NMF(num_topics)
nmf_doc_topic = nmf_topics.fit_transform(X)



In [32]:
nmf_topic_word = pd.DataFrame(nmf_topics.components_.round(3),
                         columns=cv.get_feature_names_out())
nmf_topic_word

Unnamed: 0,00,000,0000,0000hrs,005,00a,00am,00euros,00for,00h,...,zonewise,zoo,zoom,zootopia,zoover,zorb,zorg,zulqairil,zurg,zurich
0,0.071,0.0,0.0,0.006,0.0,0.0,0.008,0.0,0.0,0.003,...,0.0,0.013,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.075,0.0,0.0,0.0,0.0,0.001,0.021,0.0,0.0,0.0,...,0.0,0.004,0.005,0.0,0.0,0.0,0.0,0.0,0.007,0.0
2,0.152,0.002,0.0,0.0,0.001,0.0,0.024,0.01,0.0,0.0,...,0.001,0.007,0.0,0.001,0.001,0.001,0.001,0.0,0.0,0.0
3,0.04,0.0,0.0,0.0,0.001,0.0,0.006,0.0,0.0,0.0,...,0.0,0.011,0.002,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
display_topics(nmf_topics, cv.get_feature_names_out(), 10)


Topic  0
park, disney, visit, hotel, paris, food, small, ticket, attraction, world

Topic  1
ride, line, wait, pass, fast, time, mountain, long, minute, hour

Topic  2
day, time, queue, good, kid, food, child, character, parade, great

Topic  3
disneyland, visit, time, day, place, disney, love, world, year, like


### LSA

In [72]:
lsa_topics = TruncatedSVD(num_topics)
lsa_doc_topic = lsa_topics.fit_transform(X)

In [73]:
lsa_topic_word = pd.DataFrame(lsa_topics.components_.round(3),
                         columns=cv.get_feature_names_out())
lsa_topic_word

Unnamed: 0,00,000,0000,0000hrs,005,00a,00am,00euros,00for,00h,...,zonewise,zoo,zoom,zootopia,zoover,zorb,zorg,zulqairil,zurg,zurich
0,0.007,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,...,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.001,0.0,-0.0,-0.0,0.0,0.0,0.001,-0.0,-0.0,-0.0,...,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0
2,-0.001,-0.0,-0.0,0.0,0.0,0.0,-0.0,-0.001,-0.0,0.0,...,-0.0,0.001,0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0
3,-0.002,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,...,-0.0,-0.001,0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0


In [74]:
display_topics(lsa_topics, cv.get_feature_names_out(), 10)


Topic  0
park, ride, time, day, disney, disneyland, good, wait, queue, visit

Topic  1
ride, line, fast, wait, pass, time, long, mountain, minute, kid

Topic  2
disneyland, park, ride, line, california, world, adventure, visit, mountain, pass

Topic  3
park, ride, disney, queue, minute, mountain, hour, euro, hotel, theme


## Topic Modeling with TdifVectorizer

### NMF

In [35]:
num_topics = 4

nmf_topics = NMF(num_topics)
nmf_doc_topic_tdif = topics.fit_transform(X2) 

In [36]:
nmf_topic_word_tdif = pd.DataFrame(nmf_topics.components_.round(3),
                         columns=tfv.get_feature_names_out())
nmf_topic_word_tdif

Unnamed: 0,00,000,0000,0000hrs,005,00a,00am,00euros,00for,00h,...,zonewise,zoo,zoom,zootopia,zoover,zorb,zorg,zulqairil,zurg,zurich
0,0.053,0.0,0.0,0.0,0.0,0.002,0.017,0.0,0.0,0.0,...,0.0,0.007,0.001,0.0,0.0,0.0,0.0,0.0,0.001,0.0
1,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.014,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.001,...,0.0,0.004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.047,0.001,0.0,0.0,0.0,0.0,0.008,0.001,0.0,0.0,...,0.0,0.003,0.002,0.001,0.0,0.0,0.0,0.0,0.001,0.0


In [38]:
display_topics(nmf_topics, tfv.get_feature_names_out(), 10)


Topic  0
ride, line, pass, wait, fast, time, long, day, mountain, minute

Topic  1
place, love, kid, great, disneyland, fun, year, family, time, happy

Topic  2
disney, park, disneyland, world, visit, small, hong, kong, florida, paris

Topic  3
day, queue, park, food, good, parade, ride, character, hotel, stay


### LSA

In [77]:
lsa_topics = TruncatedSVD(num_topics)
lsa_doc_topic_tdif = lsa_topics.fit_transform(X2)

In [79]:
lsa_topic_word_tdif = pd.DataFrame(lsa_topics.components_.round(3),
                         columns=tfv.get_feature_names_out())
lsa_topic_word_tdif

Unnamed: 0,00,000,0000,0000hrs,005,00a,00am,00euros,00for,00h,...,zonewise,zoo,zoom,zootopia,zoover,zorb,zorg,zulqairil,zurg,zurich
0,0.009,0.0,0.0,0.0,0.0,0.0,0.002,0.0,0.0,0.0,...,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.013,-0.0,0.0,0.0,-0.0,-0.0,-0.004,-0.0,-0.0,-0.0,...,-0.0,-0.001,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0
2,0.005,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0
3,0.002,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,...,-0.0,0.001,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0


In [80]:
display_topics(lsa_topics, tfv.get_feature_names_out(), 10)


Topic  0
park, ride, day, time, disney, disneyland, great, visit, good, place

Topic  1
place, love, disneyland, kid, visit, fun, great, happy, family, earth

Topic  2
disney, park, world, small, paris, florida, hong, kong, visit, hotel

Topic  3
disneyland, line, california, world, pass, adventure, time, love, fast, mountain


# CorEx

In [35]:
vectorizer = CountVectorizer(max_features=2500,
                             stop_words='english', 
                             token_pattern="\\b[a-z][a-z]+\\b",
                             binary=True)
doc_word = vectorizer.fit_transform(df.Reviews)
words = list(np.asarray(vectorizer.get_feature_names_out()))

In [36]:
doc_word.shape

(37547, 2500)

In [37]:
words[:20]

['ability',
 'able',
 'absolute',
 'absolutely',
 'accept',
 'acceptable',
 'access',
 'accessible',
 'accommodate',
 'accommodating',
 'accommodation',
 'according',
 'accordingly',
 'accurate',
 'act',
 'action',
 'activities',
 'activity',
 'actors',
 'actual']

In [38]:
# Create an unsupervised topic model with CorEx without the use of anchor words
topic_model = ct.Corex(n_hidden= 10,
                       words=words, 
                       seed=1)

topic_model.fit(doc_word,         
                words= words,     
                docs= df.Reviews)


<corextopic.corextopic.Corex at 0x1bf0a920a60>

In [39]:
# Topics without anchored words
topics = topic_model.get_topics()
for n,topic in enumerate(topics):
    topic_words,_,_ = zip(*topic)
    print(f'{n}: {", ".join(topic_words)}')

0: people, way, money, told, think, know, said, let, rude, thing
1: food, hotel, paris, queues, queue, good, staff, parks, stayed, expensive
2: day, park, did, got, minutes, rides, hour, went, didn, hours
3: main, king, street, small, train, castle, lion, parade, hong, story
4: disney, just, like, disneyland, really, world, magic, experience, going, year
5: mountain, space, pirates, thunder, jones, ride, indiana, caribbean, peter, star
6: water, bring, snacks, buy, drinks, need, outside, walking, free, sit
7: time, line, wait, make, early, don, want, times, plan, long
8: characters, mickey, meet, breakfast, character, minnie, lunch, mouse, princess, old
9: fast, pass, passes, use, california, adventure, haunted, mansion, lines, hopper


In [40]:
# Create semi-supervised topic model with CorEx with the use of anchor words
"""
The anchors we'll use are:

1. disneyland, disney, california
2. disneyland, paris
3. disneyland, hong kong
4. disney, food

"""
topic_model2 = ct.Corex(n_hidden= 6,
                       words=words,  
                       seed=1)

topic_model2.fit(doc_word,
                words= words,
                docs= df.Reviews,
                anchors=[
                         ['disneyland', 'disney', 'california'],
                         ['disneyland', 'disney', 'paris'],
                         ['disneyland', 'disney', 'hong', 'kong'],
                         ['disney', 'food']],
                anchor_strength=5);

In [42]:
# Topics with anchored words
topics2 = topic_model2.get_topics()

for n,topic in enumerate(topics2):
    topic_words,_,_ = zip(*topic)
    print(f'{n}: {", ".join(topic_words)}')

0: disneyland, california, disney, adventure, world, hopper, southern, anaheim, original, radiator
1: disney, paris, walt, florida, disneyland, downtown, orlando, smaller, fan, euro
2: disneyland, disney, hong, kong, compared, mtr, hk, unique, tokyo, version
3: food, expensive, disney, drinks, drink, quality, prices, options, bring, overpriced
4: ride, park, day, people, just, time, rides, did, fast, minutes
5: hotel, characters, main, good, mickey, parade, really, queue, small, castle


# More preprocessing

In [47]:
#Loading Stopwords into a list
nltk.download('stopwords')
NLTK_stop_words_list = stopwords.words('english')

# Adding new stop words
add_stop_words_list = ['disneyland', 'disney']

final_stop_words_list = (NLTK_stop_words_list + add_stop_words_list)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [48]:
# Vectorize
vectorizer2 = CountVectorizer(max_features=2500,
                             stop_words=final_stop_words_list,
                             token_pattern="\\b[a-z][a-z]+\\b",
                             binary=True)
doc_word2 = vectorizer2.fit_transform(df.Reviews)
words2 = list(np.asarray(vectorizer2.get_feature_names_out()))

In [49]:
# Create an unsupervised topic model with CorEx without the use of anchor words
topic_model3 = ct.Corex(n_hidden= 10,
                       words=words2, 
                       seed=1)

topic_model3.fit(doc_word2,         
                words= words2,     
                docs= df.Reviews)

<corextopic.corextopic.Corex at 0x1bf0a8e6190>

In [50]:
# Topics without anchored words
topics3 = topic_model3.get_topics()
for n,topic in enumerate(topics3):
    topic_words,_,_ = zip(*topic)
    print(f'{n}: {", ".join(topic_words)}')

0: people, would, us, could, money, think, told, another, going, know
1: one, day, park, minutes, got, hour, tickets, hours, way, two
2: get, fast, pass, time, use, line, early, wait, go, passes
3: hotel, paris, queues, staff, queue, parks, stayed, studios, half, children
4: characters, mickey, main, see, meet, street, breakfast, character, lunch, old
5: like, even, first, back, many, years, last, trip, never, made
6: show, parade, rides, fireworks, also, night, really, castle, went, well
7: mountain, space, pirates, jones, ride, thunder, indiana, caribbean, haunted, mansion
8: food, water, take, expensive, around, good, snacks, drinks, eat, bring
9: kong, hong, train, king, lion, small, story, toy, grizzly, mystic


In [56]:
# Create semi-supervised topic model with CorEx with the use of anchor words
"""
The anchors we'll use are:

1. park, california
2. park, paris
3. park, hong kong
4. park, food

"""
topic_model4 = ct.Corex(n_hidden= 6,
                       words=words2,  
                       seed=1)

topic_model4.fit(doc_word2,
                words= words2,
                docs= df.Reviews,
                anchors=[
                         ['park', 'california'],
                         ['park', 'paris'],
                         ['park', 'hong', 'kong'],
                         ['food']],
                anchor_strength=5);

In [57]:
# Topics with anchored words
topics4 = topic_model4.get_topics()

for n,topic in enumerate(topics4):
    topic_words,_,_ = zip(*topic)
    print(f'{n}: {", ".join(topic_words)}')

0: park, california, theme, hopper, clean, amusement, ocean, original, history, navigate
1: park, paris, would, people, us, parks, could, hour, staff, two
2: park, hong, kong, smaller, maintained, smallest, subway, attractive, senior, organized
3: food, expensive, drinks, drink, quality, prices, options, overpriced, priced, pricey
4: ride, get, one, mountain, time, day, rides, also, go, space
5: hotel, take, around, show, see, good, characters, queue, parade, want
