In [17]:
%matplotlib inline

import json
from operator import itemgetter
from collections import defaultdict

from matplotlib import pyplot as plt
import numpy as np
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk import FreqDist,pos_tag
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.naive_bayes import MultinomialNB

In [5]:
import cPickle as pickle
with open('tripadvisor_reviews.pickle','rb') as f:
    ta_reviews = pickle.load(f)

In [7]:
reviews = [ta_review['review'] for ta_review in ta_reviews]

In [8]:
## building the term documnet matrix
vec = CountVectorizer(min_df = 50)
X = vec.fit_transform(reviews)
terms = vec.get_feature_names()
len(terms)

4535

In [9]:
# PMI type measure via matrix multiplication
def getcollocations_matrix(X):
    XX=X.T.dot(X)  ## multiply X with it's transpose to get number docs in which both w1 (row) and w2 (column) occur
    term_freqs = np.asarray(X.sum(axis=0)) ## number of docs in which a word occurs
    pmi = XX.toarray() * 1.0  ## Casting to float, making it an array to use simple operations
    pmi /= term_freqs.T ## dividing by the number of documents in which w1 occurs
    pmi /= term_freqs  ## dividing by the number of documents in which w2 occurs
    
    return pmi  # this is not technically PMI beacuse we are ignoring some normalization factor and not taking the log 
                # but it's sufficient for ranking

In [10]:
pmi_matrix = getcollocations_matrix(X)
pmi_matrix.shape 

(4535, 4535)

In [11]:
def getcollocations(w,PMI_MATRIX=pmi_matrix,TERMS=terms):
    if w not in TERMS:
        return []
    idx = TERMS.index(w)
    col = PMI_MATRIX[:,idx].ravel().tolist()
    return sorted([(TERMS[i],val) for i,val in enumerate(col)],key=itemgetter(1),reverse=True)

In [None]:
getcollocations("good")

In [14]:
tokenizer = TreebankWordTokenizer()

In [19]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jihunkim/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [20]:
##example part of speech (POS) tagging (note that you need to tokenize the sentence first)
pos_tag(tokenizer.tokenize("This was a great day but the time is running out fast"))

[('This', 'DT'),
 ('was', 'VBD'),
 ('a', 'DT'),
 ('great', 'JJ'),
 ('day', 'NN'),
 ('but', 'CC'),
 ('the', 'DT'),
 ('time', 'NN'),
 ('is', 'VBZ'),
 ('running', 'VBG'),
 ('out', 'RP'),
 ('fast', 'RB')]

In [21]:
reviews_pos_tagged=[pos_tag(tokenizer.tokenize(m)) for m in reviews]

In [22]:
reviews_adj_adv_only=[" ".join([w for w,tag in m if tag in ["JJ","RB","RBS","RBJ","JJR","JJS"]])
                      for m in reviews_pos_tagged]

In [23]:
X = vec.fit_transform(reviews_adj_adv_only)
terms = vec.get_feature_names()
len(terms)

1223

In [24]:
pmi_matrix=getcollocations_matrix(X)
pmi_matrix.shape  # n_words by n_words

(1223, 1223)

In [126]:
getcollocations("good", pmi_matrix, terms)[:10]

[(u'good', 9.009290218214751e-05),
 (u'instant', 5.32102194921554e-05),
 (u'indian', 5.277761608165008e-05),
 (u'japanese', 5.117289126835666e-05),
 (u'potential', 5.014954230990001e-05),
 (u'plastic', 4.937865196280142e-05),
 (u'internal', 4.930630228959585e-05),
 (u'korean', 4.844512520927582e-05),
 (u'price', 4.8377733341933824e-05),
 (u'miss', 4.7732696897374704e-05)]

In [27]:
def seed_score(pos_seed,PMI_MATRIX=pmi_matrix,TERMS=terms):
    score=defaultdict(int)
    for seed in pos_seed:
        c=dict(getcollocations(seed,PMI_MATRIX,TERMS))
        for w in c:
            score[w]+=c[w]
    return score

In [125]:
sorted(seed_score(['good',
                   'great', 
                   'best',
                   'magical',
                   'marvelous',
                   'tremendous',
                   'homely',
                   'exemplary',
                   'favourite',
                   'phenomenal',
                   'personalized',
                   'magnificent',
                   'spectacular',
                   'exceptional',
                   'genuine',
                   'exquisite',
                   'heavenly',
                   'geuninely'
                  ], pmi_matrix, terms).items(),key=itemgetter(1),reverse=True)[:50]

[(u'marvelous', 0.0210367013497605),
 (u'tremendous', 0.02077955226810417),
 (u'personalized', 0.020157594703138288),
 (u'exquisite', 0.019939227021001066),
 (u'magical', 0.019579960200982016),
 (u'exemplary', 0.01774741463914602),
 (u'homely', 0.01677426009881342),
 (u'heavenly', 0.016743351792235642),
 (u'magnificent', 0.011709660252548786),
 (u'phenomenal', 0.010780816690441397),
 (u'favourite', 0.009414202254676514),
 (u'genuine', 0.008400317688886699),
 (u'instantly', 0.0032843771166931536),
 (u'spectacular', 0.0030529382678294503),
 (u'potential', 0.002701627642476028),
 (u'exceptional', 0.002677073905188852),
 (u'ish', 0.0021854939453479034),
 (u'proud', 0.002159763099215447),
 (u'iconic', 0.0018670156183468035),
 (u'business', 0.0018206878176718342),
 (u'remarkable', 0.0017279915543682428),
 (u'oldest', 0.001687007275944929),
 (u'welcomed', 0.0015659018041394358),
 (u'enthusiastic', 0.0015557672521075811),
 (u'endless', 0.001505723370697934),
 (u'defiantly', 0.00149568597885386

In [124]:
sorted(seed_score(['bad',
                   'terrible',
                   'wrong',
                   "crap",
                   "long",
                   "boring",
                   'bad',
                   'nasty',
                   'worst',
                   'disgusting',
                   'horrendous',
                   'rude',
                   'filthy',
                   'horrible',
                   'awful',
                   'unhappy',
                   'unhelpful',
                   'uncomfortable',
                   'dirty',
                   'unprofessional',
                   'gross',
                   'unpleasant'
                  ], pmi_matrix, terms).items(),key=itemgetter(1),reverse=True)[:50]

[(u'horrendous', 0.020732534824933833),
 (u'disgusting', 0.01738916286194967),
 (u'unprofessional', 0.017236468186201122),
 (u'unhappy', 0.013750803363176186),
 (u'nasty', 0.013656419202695976),
 (u'gross', 0.012648333294830318),
 (u'unpleasant', 0.010285748967681096),
 (u'filthy', 0.00767809253400677),
 (u'unhelpful', 0.007353780579215872),
 (u'awful', 0.005342462469031775),
 (u'horrible', 0.005014553705255097),
 (u'arrogant', 0.00470493364366358),
 (u'rude', 0.004603055449323188),
 (u'dirty', 0.004157117983028036),
 (u'worst', 0.00400797216616474),
 (u'terrible', 0.003968449610415639),
 (u'uncomfortable', 0.003919883300881516),
 (u'upset', 0.003511660054251222),
 (u'sick', 0.0030546154832528133),
 (u'stale', 0.00289857762561014),
 (u'dangerous', 0.0028978030721459576),
 (u'wrong', 0.0028574976587882645),
 (u'sticky', 0.0028421752753491477),
 (u'unacceptable', 0.002694658827267448),
 (u'angry', 0.002644882813988538),
 (u'cleaner', 0.0024701822939938766),
 (u'dead', 0.00241304629310404

In [98]:
tripadvisor_posscores = seed_score(['good',
                   'great', 
                   'best',
                   'magical',
                   'marvelous',
                   'tremendous',
                   'homely',
                   'exemplary',
                   'favourite',
                   'phenomenal',
                   'personalized',
                   'magnificent',
                   'spectacular',
                   'exceptional',
                   'genuine',
                   'exquisite',
                   'heavenly',
                   'geuninely'
                  ], pmi_matrix, terms)

tripadvisor_negscores = seed_score(['bad',
                   'terrible',
                   'wrong',
                   "crap",
                   "long",
                   "boring",
                   'bad',
                   'nasty',
                   'worst',
                   'disgusting',
                   'horrendous',
                   'rude',
                   'filthy',
                   'horrible',
                   'awful',
                   'unhappy',
                   'unhelpful',
                   'uncomfortable',
                   'dirty',
                   'unprofessional',
                   'gross',
                   'unpleasant'
                  ], pmi_matrix, terms)

In [101]:
tripadvisor_sentscores={}
for w in terms:
    tripadvisor_sentscores[w] = tripadvisor_posscores[w] - tripadvisor_negscores[w]



In [31]:
airbnb_data = pickle.load(open("airbnb_reviews.pickle", 'rb'))
airbnb_reviews = []
for listing_id, reviews in airbnb_data.items():
    for review in reviews: 
        airbnb_reviews.append(review)

In [32]:
airbnb_reviews_pos_tagged=[pos_tag(tokenizer.tokenize(m)) for m in airbnb_reviews]

In [33]:
airbnb_reviews_adj_adv_only=[" ".join([w for w,tag in m if tag in ["JJ","RB","RBS","RBJ","JJR","JJS"]])
                      for m in airbnb_reviews_pos_tagged]

In [34]:
vec2 = CountVectorizer(min_df = 50)
Y = vec2.fit_transform(airbnb_reviews_adj_adv_only)
airbnb_terms = vec2.get_feature_names()
len(airbnb_terms)

1960

In [35]:
airbnb_pmi_matrix=getcollocations_matrix(Y)
airbnb_pmi_matrix.shape  # n_words by n_words

(1960, 1960)

In [123]:
sorted(seed_score([
            'good',
            'great',
            'perfect',
            'cool', 
            'ambient', 
            'joyful',
            'inclusive',
            'sensational',
            'charmingly',
            'chic',
            'magical',
            'magnificent',
            'flawless', 
            'free',
            'homy',
            'caring',
            'sincere',
            'extraordinarily',
            'magical',
            'glorious',
            'classy'
        ], 
                  airbnb_pmi_matrix, 
                  airbnb_terms).items(),key=itemgetter(1),reverse=True)[:50]

[(u'inclusive', 0.020088083337619824),
 (u'ambient', 0.019457241833622112),
 (u'homy', 0.019349037426134044),
 (u'joyful', 0.017346476097376756),
 (u'sensational', 0.015643602171395612),
 (u'charmingly', 0.013626071377282094),
 (u'glorious', 0.010281809897474958),
 (u'extraordinarily', 0.009618611581047218),
 (u'caring', 0.007465870560576507),
 (u'magical', 0.0066750731020315535),
 (u'sincere', 0.006517573992124082),
 (u'classy', 0.005935311912287163),
 (u'flawless', 0.003965406954813205),
 (u'magnificent', 0.0036745094814091064),
 (u'chic', 0.002311219394003473),
 (u'uma', 0.0016471389047722509),
 (u'um', 0.0014853453467177135),
 (u'continental', 0.0008449931914577644),
 (u'uncommon', 0.0008130635496499765),
 (u'ugly', 0.0007755456103367136),
 (u'legendary', 0.0007746936642292856),
 (u'artist', 0.0007126204196082227),
 (u'agreeable', 0.0006795576517786452),
 (u'mad', 0.0006766226910385523),
 (u'festive', 0.0006626268138376387),
 (u'sparkling', 0.0006382897028458716),
 (u'que', 0.00061

In [122]:
sorted(seed_score(['moldy', 
                   'disappointed', 
                   'unhappy' ,
                   'disappointing', 
                   'unacceptable', 
                   'inadequate', 
                   'dirty', 
                   'worst', 
                   'gross', 
                   'filthy', 
                   'horrible', 
                   'nasty', 
                   'unusable', 
                   'misleading', 
                   'miserable', 
                   'terrible', 
                   'unplesant', 
                   'smelly', 
                   'upset', 
                   'poor', 
                   'messy', 
                   'untidy',
                   'weird',
                   'bad',
                   'strange',
                   'rubbish',
                   'ugly'
                  ], 
                  airbnb_pmi_matrix, 
                  airbnb_terms).items(),key=itemgetter(1),reverse=True)[:50]

[(u'inadequate', 0.024437835172164152),
 (u'miserable', 0.02297874799436424),
 (u'ugly', 0.021194345885534795),
 (u'untidy', 0.02065875701309372),
 (u'moldy', 0.020297928061508996),
 (u'unhappy', 0.019475421911002103),
 (u'misleading', 0.016246248352534576),
 (u'rubbish', 0.015135684734098178),
 (u'unusable', 0.013190728937912056),
 (u'unacceptable', 0.012990373218416038),
 (u'upset', 0.011114484229654392),
 (u'nasty', 0.011024909870469021),
 (u'gross', 0.010218803425632906),
 (u'filthy', 0.007805548279542542),
 (u'smelly', 0.007070855420123548),
 (u'disappointing', 0.005742164825257027),
 (u'messy', 0.005431358371905467),
 (u'horrible', 0.004941338511252913),
 (u'worst', 0.004550375420302404),
 (u'terrible', 0.0038920021350270276),
 (u'weird', 0.0033756539096123574),
 (u'strange', 0.003046846240682892),
 (u'poor', 0.003016155357428104),
 (u'rude', 0.0029338739082233733),
 (u'unlocked', 0.0028726158558034223),
 (u'disappointed', 0.0028152361698589117),
 (u'dirty', 0.0025748523493614194

In [109]:
airbnb_posscores = seed_score(['good',
            'great',
            'perfect',
            'cool', 
            'ambient', 
            'joyful',
            'inclusive',
            'sensational',
            'charmingly',
            'chic',
            'magical',
            'magnificent',
            'flawless', 
            'free',
            'homy',
            'caring',
            'sincere',
            'extraordinarily',
            'magical',
            'glorious',
            'classy'
                  ], airbnb_pmi_matrix, 
                  airbnb_terms)

airbnb_negscores = seed_score(['bad',
                   'terrible',
                   'wrong',
                   "crap",
                   "long",
                   "boring",
                   'bad',
                   'nasty',
                   'worst',
                   'horrendous',
                   'rude',
                   'filthy',
                   'horrible',
                   'awful',
                   'unhappy',
                   'unhelpful',
                   'uncomfortable',
                   'dirty',
                   'unprofessional',
                   'gross',
                   'unpleasant'
                  ], airbnb_pmi_matrix, 
                  airbnb_terms)

In [110]:
airbnb_sentscores={}
for w in airbnb_terms:
    airbnb_sentscores[w] = airbnb_posscores[w] - airbnb_negscores[w]

In [127]:
with open('tripadvisor_sentscores.pickle', 'wb') as f:
    pickle.dump(tripadvisor_sentscores, f) 

with open('airbnb_sentscores.pickle', 'wb') as f:
    pickle.dump(airbnb_sentscores, f)