In [92]:
from __future__ import print_function
from scipy.sparse.linalg import svds
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
import numpy as np
import pickle
import matplotlib.pyplot as plt

In [93]:
# Load the scraped review corpora.
# `with` closes each file handle as soon as its pickle has been read, instead
# of leaving the open handle for the garbage collector to clean up.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# produced by this project's own scrapers.
with open("data/tripadvisor_reviews.pickle", 'rb') as fh:
    # Sequence of per-review dicts; the sample printed further down shows the
    # keys 'review', 'review_stars', 'hotel_name' and 'title'.
    ta_data = pickle.load(fh)
with open("data/airbnb_reviews.pickle", 'rb') as fh:
    # Mapping of listing id -> list of review strings (see the sample printed
    # further down).
    airbnb_data = pickle.load(fh)

In [94]:
# Load per-listing metadata for both sites.
# `with` closes each file handle as soon as its pickle has been read, instead
# of leaving the open handle for the garbage collector to clean up.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# produced by this project's own scrapers.
with open("data/tripadvisor_hotel_info.pickle", 'rb') as fh:
    # Mapping of hotel name -> (TripAdvisor URL, rating string), per the
    # sample record printed in the next cell.
    ta_listings = pickle.load(fh)
with open("data/airbnb_listings.pickle", 'rb') as fh:
    # NOTE(review): schema of this object is not shown in the notebook --
    # confirm before relying on it.
    airbnb_listings = pickle.load(fh)

In [95]:
# Peek at one (hotel name, info) record to eyeball the schema.
# next(iter(...)) replaces `.items()[0]`, which only works on Python 2 where
# items() returns a list; on Python 3 a dict view is not subscriptable.
print(next(iter(ta_listings.items())))

# Map each hotel name to a row index (its position in ta_listings iteration
# order). Names are stripped because the raw keys carry trailing whitespace
# (e.g. u'La Semana Hotel ').
ta_listing_to_index = {
    hotel_name.strip(): row
    for row, (hotel_name, _info) in enumerate(ta_listings.items())
}

# Sanity checks: a known hotel resolves, and the mapping size is as expected.
print(ta_listing_to_index['La Semana Hotel'])
print(len(ta_listing_to_index))

(u'La Semana Hotel ', ('https://www.tripadvisor.com/Hotel_Review-g60763-d290982-Reviews-La_Semana_Hotel-New_York_City_New_York.html', '1.5 of 5 bubbles'))
0
78


In [96]:
# Peek at one TripAdvisor review record to eyeball the schema.
print(ta_data[0])

# ta_reviews: review texts in ta_data order.
# ta_adj_mat: (hotels x reviews) indicator matrix; entry [h, r] is 1 when
# review r was written about hotel h.
ta_reviews = []
ta_adj_mat = np.zeros((len(ta_listing_to_index), len(ta_data)))
# Descriptive loop names are used instead of `i`/`j`/`d` so that this cell
# does not rebind the notebook-global `i`, which other cells use as a counter.
for review_col, record in enumerate(ta_data):
    # Hotel names are stripped to match the keys of ta_listing_to_index.
    hotel_row = ta_listing_to_index[record['hotel_name'].strip()]
    ta_reviews.append(record['review'])
    ta_adj_mat[hotel_row, review_col] += 1

{'review': u"This is place is great. Service is excellent, location is ideal, beds are comfy. Not much I would really say I didn't like. There was a woman, Gabriele I believe? who was one of the nicest hotel workers I have ever encountered. Consistently friendly and helpful. I would highly recommend this hotel.", 'review_stars': '5 of 5 bubbles', 'hotel_name': u'Hotel Giraffe by Library Hotel Collection ', 'title': u'Awesome stay'}


In [97]:
# Peek at one (listing id, reviews) record to eyeball the schema.
# next(iter(...)) replaces `.items()[0]`, which only works on Python 2 where
# items() returns a list; on Python 3 a dict view is not subscriptable.
print(next(iter(airbnb_data.items())))

# airbnb_reviews:              flat list of every review text
# airbnb_index_to_listing:     position in airbnb_reviews -> listing id
# airbnb_listing_to_mat_index: listing id -> row index (position of the
#                              listing in airbnb_data iteration order)
airbnb_index_to_listing = {}
airbnb_listing_to_mat_index = {}
airbnb_reviews = []
for ind, (listing_id, reviews) in enumerate(airbnb_data.items()):
    # Assigned once per listing rather than once per review; this also gives
    # listings with no reviews a row mapping, matching the len(airbnb_data)
    # rows allocated for the adjacency matrix.
    airbnb_listing_to_mat_index[listing_id] = ind
    for review in reviews:
        # The next free slot in the flat list is this review's global index.
        airbnb_index_to_listing[len(airbnb_reviews)] = listing_id
        airbnb_reviews.append(review)

# Total number of Airbnb reviews.
count = len(airbnb_reviews)
# `i` is kept as an alias of `count` for any later cell that still reads it.
i = count
print(count)

('9234415', ['Emme did everything he could to make our stay super comfortable We could check in early 3 hours earlier and late check out too at no cost The apartment has 3 cozy rooms and is very close to green line subway station Excellent location to get in a few minutes to any place in Manhattan Many many places to have lunch dinner just around the corner and super safe to walk at any time through the day or night Emme recommended lots of places to visit directions to get anywhere and was very responsive through messaging'])
85000


In [98]:
# (listings x reviews) indicator matrix for Airbnb: entry [l, r] is 1 when
# review r belongs to listing l.
# The column count comes from `count` (the total number of Airbnb reviews)
# rather than the scratch counter `i`, so re-running cells out of order cannot
# change the matrix shape.
airbnb_adj_mat = np.zeros((len(airbnb_data), count))
for review_col, listing_id in airbnb_index_to_listing.items():
    airbnb_adj_mat[airbnb_listing_to_mat_index[listing_id], review_col] += 1
print(airbnb_adj_mat)

[[ 1.  0.  0. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  1.  1.  1.]]


In [99]:
# Combined (listings x reviews) indicator matrix laid out block-diagonally:
# TripAdvisor hotels/reviews occupy the first rows/columns, Airbnb
# listings/reviews follow, offset by the TripAdvisor sizes.
n_ta_hotels = len(ta_listing_to_index)
n_ta_reviews = len(ta_data)
total_adj_mat = np.zeros((n_ta_hotels + len(airbnb_data), n_ta_reviews + count))

# Row index in total_adj_mat -> hotel name (TripAdvisor) or listing id (Airbnb).
total_mat_index_to_listing = {}

# Descriptive loop names are used instead of `i`/`j`/`d` so that this cell
# does not rebind the notebook-global `i`, which other cells use as a counter.
for review_col, record in enumerate(ta_data):
    hotel_name = record['hotel_name'].strip()
    hotel_row = ta_listing_to_index[hotel_name]
    total_adj_mat[hotel_row, review_col] += 1
    total_mat_index_to_listing[hotel_row] = hotel_name

for review_idx, listing_id in airbnb_index_to_listing.items():
    # Shift Airbnb rows/columns past the TripAdvisor block.
    row = n_ta_hotels + airbnb_listing_to_mat_index[listing_id]
    col = n_ta_reviews + review_idx
    total_adj_mat[row, col] += 1
    total_mat_index_to_listing[row] = listing_id

In [56]:
# Both corpora are vectorized with identical settings: English stop words are
# removed, and float min_df/max_df are document-frequency proportions, so terms
# in fewer than 5% or more than 90% of the documents are dropped.
tfidf_params = dict(stop_words='english', min_df=0.05, max_df=0.9)
ta_vectorizer = TfidfVectorizer(**tfidf_params)
airbnb_vectorizer = TfidfVectorizer(**tfidf_params)

In [57]:
# Learn the TripAdvisor vocabulary and IDF weights, then vectorize the reviews
# in the same pass; rows of ta_tfidf follow the order of ta_reviews.
ta_tfidf = ta_vectorizer.fit_transform(ta_reviews)

In [58]:
# Learn the Airbnb vocabulary and IDF weights, then vectorize the reviews in
# the same pass; rows of airbnb_tfidf follow the order of airbnb_reviews.
airbnb_tfidf = airbnb_vectorizer.fit_transform(airbnb_reviews)

In [59]:
# Vocabulary terms for the TripAdvisor vectorizer, ordered by column index.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; newer environments need get_feature_names_out() instead.
ta_tfidf_feature_names = ta_vectorizer.get_feature_names()

In [60]:
# Vocabulary terms for the Airbnb vectorizer, ordered by column index.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; newer environments need get_feature_names_out() instead.
airbnb_tfidf_feature_names = airbnb_vectorizer.get_feature_names()

In [61]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` a ``Topic #k:`` header is printed,
    followed by one line holding the ``n_top_words`` heaviest terms
    (heaviest first, space-separated). A single blank line is printed after
    the last topic.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_``, an iterable of
        per-topic weight vectors that support ``argsort()``.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    n_top_words : int
        Number of terms to show per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort is ascending, so the reversed tail slice yields the indices
        # of the n_top_words largest weights, largest first.
        ranked_columns = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[column] for column in ranked_columns]
        print(" ".join(top_terms))
    print()

In [62]:
# Don't need to run this every time, only run if absolutely necessary
# ta_lda = LatentDirichletAllocation(n_topics=10, max_iter=5,
#                                 learning_method='online',
#                                 learning_offset=10.,
#                                 random_state=0).fit(ta_tfidf)

In [63]:
# Don't need to run this every time, only run if absolutely necessary
# airbnb_lda = LatentDirichletAllocation(n_topics=10, max_iter=5,
#                                 learning_method='online',
#                                 learning_offset=10.,
#                                 random_state=0).fit(airbnb_tfidf)

In [64]:
# Joint TF-IDF model over both sites, using the same settings as the
# per-site vectorizers.
total_vectorizer = TfidfVectorizer(stop_words='english', min_df=0.05, max_df=0.9)
# TripAdvisor reviews come first, then Airbnb reviews -- the same ordering the
# columns of total_adj_mat use (Airbnb columns are offset by len(ta_data)).
total_reviews = ta_reviews + airbnb_reviews
total_tfidf = total_vectorizer.fit_transform(total_reviews)

In [65]:
# Vocabulary terms for the joint vectorizer, ordered by column index.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; newer environments need get_feature_names_out() instead.
total_tfidf_feature_names = total_vectorizer.get_feature_names()

In [66]:
# Don't need to run this every time, only run if absolutely necessary
# total_lda = LatentDirichletAllocation(n_topics=10, max_iter=5,
#                                 learning_method='online',
#                                 learning_offset=10.,
#                                 random_state=0).fit(total_tfidf)

In [67]:
# ta_lda_topic_term_matrix = ta_lda.components_
# ta_lda_doc_topic_matrix = ta_lda.transform(ta_tfidf)

In [68]:
# ta_lda_topic_term_matrix.dump("ta_lda_tt.mat")
# ta_lda_doc_topic_matrix.dump("ta_lda_dt.mat")

In [69]:
# airbnb_lda_topic_term_matrix = airbnb_lda.components_
# airbnb_lda_doc_topic_matrix = airbnb_lda.transform(airbnb_tfidf)

In [70]:
# airbnb_lda_topic_term_matrix.dump("airbnb_lda_tt.mat")
# airbnb_lda_doc_topic_matrix.dump("airbnb_lda_dt.mat")

In [71]:
# total_lda_topic_term_matrix = total_lda.components_
# total_lda_doc_topic_matrix = total_lda.transform(total_tfidf)

In [72]:
# total_lda_topic_term_matrix.dump("total_lda_tt.mat")
# total_lda_doc_topic_matrix.dump("total_lda_dt.mat")

### The matrices dumped by the cells above are un-normalized, so if you re-run those cells, make sure to re-run the normalization cells below as well!

In [114]:
# ta_lda_tt = normalize(ta_lda_tt, axis = 1)
# airbnb_lda_tt = normalize(airbnb_lda_tt, axis = 1)
# total_lda_tt = normalize(total_lda_tt, axis = 1)

# ta_lda_tt.dump("ta_lda_tt.mat")
# airbnb_lda_tt.dump("airbnb_lda_tt.mat")
# total_lda_tt.dump("total_lda_tt.mat")

In [115]:
# ta_lda_ht = normalize(np.matmul(ta_adj_mat, ta_lda_dt), axis=1)
# airbnb_lda_ht = normalize(np.matmul(airbnb_adj_mat, airbnb_lda_dt), axis=1)
# total_lda_ht = normalize(np.matmul(total_adj_mat, total_lda_dt), axis=1)

# ta_lda_ht.dump("ta_lda_ht.mat")
# airbnb_lda_ht.dump("airbnb_lda_ht.mat")
# total_lda_ht.dump("total_lda_ht.mat")