In [2]:
from __future__ import print_function
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
import numpy as np
import pickle
import matplotlib.pyplot as plt

In [3]:
# Load the TripAdvisor and Airbnb review pickles.
# Context managers close the file handles deterministically instead of
# leaving them to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted source.
with open("data/tripadvisor_reviews.pickle", 'rb') as ta_reviews_file:
    ta_data = pickle.load(ta_reviews_file)
with open("data/airbnb_reviews.pickle", 'rb') as airbnb_reviews_file:
    airbnb_data = pickle.load(airbnb_reviews_file)

In [4]:
# Load the TripAdvisor hotel metadata pickle; the `with` block closes the
# file handle as soon as the object has been read.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted source.
with open("data/tripadvisor_hotel_info.pickle", 'rb') as ta_listings_file:
    ta_listings = pickle.load(ta_listings_file)

In [5]:
# Peek at one record to see the available fields, then keep only the
# free-text review body of every record.
print(ta_data[0])
ta_reviews = [record['review'] for record in ta_data]

{'review': u"This is place is great. Service is excellent, location is ideal, beds are comfy. Not much I would really say I didn't like. There was a woman, Gabriele I believe? who was one of the nicest hotel workers I have ever encountered. Consistently friendly and helpful. I would highly recommend this hotel.", 'review_stars': '5 of 5 bubbles', 'hotel_name': u'Hotel Giraffe by Library Hotel Collection ', 'title': u'Awesome stay'}


In [6]:
# Show one (listing_id, reviews) pair. next(iter(...)) works on both a
# Python 2 list of items and a Python 3 dict view, which is not subscriptable.
print(next(iter(airbnb_data.items())))

# Flatten the per-listing review lists into one corpus and remember, for each
# position in that corpus, which listing the review belongs to.
airbnb_index_to_listing = {}
airbnb_reviews = []
for listing_id, reviews in airbnb_data.items():
    for review in reviews:
        # The next free position in the corpus is the review's index.
        airbnb_index_to_listing[len(airbnb_reviews)] = listing_id
        airbnb_reviews.append(review)

('9234415', ['Emme did everything he could to make our stay super comfortable We could check in early 3 hours earlier and late check out too at no cost The apartment has 3 cozy rooms and is very close to green line subway station Excellent location to get in a few minutes to any place in Manhattan Many many places to have lunch dinner just around the corner and super safe to walk at any time through the day or night Emme recommended lots of places to visit directions to get anywhere and was very responsive through messaging'])


In [7]:
# Both corpora use the same TF-IDF settings: English stop words removed, and
# terms kept only if they occur in at least 5% and at most 90% of documents.
tfidf_settings = dict(stop_words='english', min_df=0.05, max_df=0.9)
ta_vectorizer = TfidfVectorizer(**tfidf_settings)
airbnb_vectorizer = TfidfVectorizer(**tfidf_settings)

In [8]:
# Learn the TripAdvisor vocabulary and IDF weights, and build the
# document-term TF-IDF matrix for the TripAdvisor reviews in one pass.
ta_tfidf = ta_vectorizer.fit_transform(ta_reviews)

In [9]:
# Learn the Airbnb vocabulary and IDF weights, and build the document-term
# TF-IDF matrix; row i corresponds to airbnb_reviews[i].
airbnb_tfidf = airbnb_vectorizer.fit_transform(airbnb_reviews)

In [10]:
# Terms retained by the TripAdvisor vectorizer; position i names column i of
# ta_tfidf, which is how print_top_words maps topic weights back to words.
ta_tfidf_feature_names = ta_vectorizer.get_feature_names()
print(ta_tfidf_feature_names)

[u'10', u'able', u'air', u'amazing', u'amenities', u'area', u'arrived', u'ask', u'asked', u'available', u'avenue', u'away', u'bad', u'bar', u'bathroom', u'beautiful', u'bed', u'beds', u'best', u'better', u'big', u'bit', u'block', u'blocks', u'booked', u'breakfast', u'broadway', u'building', u'business', u'called', u'came', u'center', u'central', u'check', u'checked', u'city', u'clean', u'close', u'coffee', u'come', u'comfortable', u'complimentary', u'concierge', u'convenient', u'corner', u'couldn', u'couple', u'day', u'days', u'definitely', u'desk', u'did', u'didn', u'distance', u'don', u'door', u'double', u'early', u'easy', u'eat', u'empire', u'end', u'enjoyed', u'especially', u'evening', u'excellent', u'expect', u'expensive', u'experience', u'extra', u'extremely', u'family', u'fantastic', u'far', u'feel', u'felt', u'fine', u'floor', u'food', u'free', u'friendly', u'given', u'going', u'good', u'got', u'great', u'guests', u'happy', u'having', u'help', u'helpful', u'high', u'highly', u'

In [11]:
# Terms retained by the Airbnb vectorizer; position i names column i of
# airbnb_tfidf, which is how print_top_words maps topic weights back to words.
airbnb_tfidf_feature_names = airbnb_vectorizer.get_feature_names()
print(airbnb_tfidf_feature_names)

[u'10', u'able', u'access', u'accommodating', u'airbnb', u'amazing', u'apartment', u'area', u'arrival', u'arrived', u'available', u'away', u'awesome', u'bars', u'bathroom', u'beautiful', u'bed', u'bedroom', u'best', u'better', u'big', u'bit', u'blocks', u'brooklyn', u'building', u'central', u'check', u'city', u'clean', u'close', u'coffee', u'come', u'comfortable', u'communication', u'convenient', u'couple', u'cozy', u'day', u'days', u'definitely', u'described', u'did', u'didn', u'distance', u'easy', u'enjoyed', u'exactly', u'excellent', u'experience', u'extremely', u'family', u'fantastic', u'feel', u'felt', u'floor', u'food', u'friendly', u'gave', u'good', u'got', u'great', u'help', u'helpful', u'highly', u'home', u'host', u'hosts', u'house', u'just', u'kind', u'kitchen', u'late', u'left', u'like', u'little', u'living', u'located', u'location', u'looking', u'lot', u'lots', u'loved', u'lovely', u'make', u'manhattan', u'meet', u'met', u'metro', u'minute', u'minutes', u'near', u'nearby', 

In [12]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` a ``Topic #<n>:`` header is printed,
    followed by one line with the ``n_top_words`` terms of largest weight in
    descending order, separated by single spaces. One blank line is printed
    after the last topic.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, an iterable of per-topic weight
        arrays that support ``argsort``.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    n_top_words : int
        How many terms to list per topic.
    """
    topic_number = 0
    for weights in model.components_:
        print("Topic #%d:" % topic_number)
        # argsort is ascending; walking backwards from the end yields the
        # indices of the largest weights first.
        strongest = weights.argsort()[:-n_top_words - 1:-1]
        words = []
        for column in strongest:
            words.append(feature_names[column])
        print(" ".join(words))
        topic_number += 1
    print()

In [13]:
# Factorize the TripAdvisor TF-IDF matrix into 10 non-negative topics.
# random_state pins the initialization so the topics are reproducible.
ta_nmf = NMF(
    n_components=10,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
).fit(ta_tfidf)

In [14]:
# Factorize the Airbnb TF-IDF matrix into 10 non-negative topics.
# random_state pins the initialization so the topics are reproducible.
airbnb_nmf = NMF(
    n_components=10,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
).fit(airbnb_tfidf)

In [15]:
# Show the 20 highest-weighted terms of each TripAdvisor NMF topic.
print_top_words(ta_nmf, ta_tfidf_feature_names, 20)

Topic #0:
hotel staff stay nyc friendly helpful stayed comfortable clean definitely recommend perfect rooms wonderful location spacious extremely highly visit loved
Topic #1:
room bed floor bathroom shower small night noise view water did size king double nights street booked large comfortable check
Topic #2:
hotel like just time check night did place desk day didn stay told don got people really said went make
Topic #3:
new york city hotel visit trip stay time staff hotels stayed perfect recommend year staying helpful best return fantastic friendly
Topic #4:
good rooms small hotel location price breakfast clean lobby bit size value quite area really hotels overall wifi little beds
Topic #5:
square times central park walk subway close away blocks walking station distance street restaurants block easy location broadway avenue just
Topic #6:
breakfast empire state building free coffee day tea hotel lovely wine complimentary view evening available walk wifi morning reception recommend
Top

In [16]:
# Show the 20 highest-weighted terms of each Airbnb NMF topic.
print_top_words(airbnb_nmf, airbnb_tfidf_feature_names, 20)

Topic #0:
apartment stay clean recommend comfortable perfect nyc location located needed host definitely building spacious highly helpful wonderful questions exactly stayed
Topic #1:
room bathroom bed kitchen good clean living comfortable small night nice didn bedroom use space did people big bit little
Topic #2:
place stay recommend definitely clean nyc perfect highly looking super needed city amazing stayed comfortable need staying helpful host just
Topic #3:
new york city time visit trip perfect experience come best stay amazing staying days day thank want subway square lot
Topic #4:
really nice good enjoyed lot time close helpful like big didn metro neighbourhood people come gave thanks ny friendly places
Topic #5:
check easy late time night accommodating times didn day super arrived did able communication sure location arrival meet convenient access
Topic #6:
subway walk manhattan away restaurants minutes just park station walking close brooklyn street easy neighborhood good block

In [17]:
# Fit a 10-topic LDA model on the TripAdvisor matrix using the online
# (mini-batch) variational update; random_state makes the run repeatable.
# NOTE(review): the model is fit on TF-IDF weights, whereas LDA is usually
# fit on raw term counts (CountVectorizer is imported but unused) - confirm
# this is intentional.
ta_lda_settings = dict(
    n_topics=10,
    max_iter=5,
    learning_method='online',
    learning_offset=10.,
    random_state=0,
)
ta_lda = LatentDirichletAllocation(**ta_lda_settings).fit(ta_tfidf)

In [18]:
# Fit a 10-topic LDA model on the Airbnb matrix using the online
# (mini-batch) variational update; random_state makes the run repeatable.
# NOTE(review): the model is fit on TF-IDF weights, whereas LDA is usually
# fit on raw term counts (CountVectorizer is imported but unused) - confirm
# this is intentional.
airbnb_lda_settings = dict(
    n_topics=10,
    max_iter=5,
    learning_method='online',
    learning_offset=10.,
    random_state=0,
)
airbnb_lda = LatentDirichletAllocation(**airbnb_lda_settings).fit(airbnb_tfidf)

In [19]:
# Show the 20 highest-weighted terms of each TripAdvisor LDA topic.
print_top_words(ta_lda, ta_tfidf_feature_names, 20)

Topic #0:
hotel coffee wine breakfast room staff great tea free complimentary lobby evening nice helpful friendly stay rooftop bar day lovely
Topic #1:
hotel like room home rooms place stay old don staff just feel great say really know new good time small
Topic #2:
state empire building view breakfast views close near central lovely corner square big problem used wait end amazing shower station
Topic #3:
hotel rooms great hotels value manhattan location clean staff stay room business nyc good small stayed ny price friendly place
Topic #4:
room hotel great stay staff amazing suite views new stayed view upgraded extremely york helpful definitely city friendly clean location
Topic #5:
hotel park central great square walking subway room location distance times blocks restaurants staff stay close rooms good clean walk
Topic #6:
room hotel check told night desk did asked stay said got didn day time just rooms bed called booked arrived
Topic #7:
station hotel penn room square walk great times

In [20]:
# Show the 20 highest-weighted terms of each Airbnb LDA topic.
print_top_words(airbnb_lda, airbnb_tfidf_feature_names, 20)

Topic #0:
room apartment bathroom bedroom kitchen living great small bed nice place floor good stay clean building location comfortable bit really
Topic #1:
york new apartment stay great place city time really neighbourhood perfect visit lovely gave host definitely helpful recommend come trip
Topic #2:
check great apartment place stay didn night meet late clean time nice room did location responsive host got arrived good
Topic #3:
apartment great excellent family access stay easy plenty nearby place space location clean comfortable subway neighborhood lovely restaurants host brooklyn
Topic #4:
apartment place stay great exactly looking extremely recommend clean host location nyc definitely accommodating described highly easy perfect questions comfortable
Topic #5:
make sure way stay apartment great place help feel welcome home host needed comfortable time room clean really did nice
Topic #6:
home place stay great amazing experience like better apartment best feel airbnb time felt host 

In [21]:
# Topic-term weights learned by the TripAdvisor LDA model (one row per topic).
ta_lda_topic_term_matrix = ta_lda.components_
# Topic weights of every TripAdvisor review (one row per review), obtained by
# projecting the same matrix the model was fit on.
ta_lda_doc_topic_matrix = ta_lda.transform(ta_tfidf)

In [37]:
# Persist the TripAdvisor LDA matrices as pickles under data/, the directory
# the later pickle.load cell reads "data/ta_lda_tt.mat" and
# "data/ta_lda_dt.mat" from. Writing to the working directory instead left
# that cell reading missing or stale files on a fresh run.
ta_lda_topic_term_matrix.dump("data/ta_lda_tt.mat")
ta_lda_doc_topic_matrix.dump("data/ta_lda_dt.mat")

In [22]:
# Topic-term weights learned by the Airbnb LDA model (one row per topic).
airbnb_lda_topic_term_matrix = airbnb_lda.components_
# Topic weights of every Airbnb review (one row per review), obtained by
# projecting the same matrix the model was fit on.
airbnb_lda_doc_topic_matrix = airbnb_lda.transform(airbnb_tfidf)

In [38]:
# Persist the Airbnb LDA matrices as pickles under data/, the directory the
# later pickle.load cell reads "data/airbnb_lda_tt.mat" and
# "data/airbnb_lda_dt.mat" from. Writing to the working directory instead
# left that cell reading missing or stale files on a fresh run.
airbnb_lda_topic_term_matrix.dump("data/airbnb_lda_tt.mat")
airbnb_lda_doc_topic_matrix.dump("data/airbnb_lda_dt.mat")

In [39]:
def _load_pickled_matrix(path):
    """Read one pickled matrix from ``path`` and close the file afterwards."""
    # Pickles are binary data: 'rb' is required on Python 3 and on Windows,
    # where text mode would corrupt or reject the stream.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Reload the cached LDA document-topic (dt) and topic-term (tt) matrices.
ta_lda_dt = _load_pickled_matrix("data/ta_lda_dt.mat")
ta_lda_tt = _load_pickled_matrix("data/ta_lda_tt.mat")
airbnb_lda_dt = _load_pickled_matrix("data/airbnb_lda_dt.mat")
airbnb_lda_tt = _load_pickled_matrix("data/airbnb_lda_tt.mat")

In [40]:
# Sanity check that the TripAdvisor shapes line up:
# (docs x terms), (docs x topics), (topics x terms).
for ta_matrix in (ta_tfidf, ta_lda_dt, ta_lda_tt):
    print(ta_matrix.shape)

(46606, 227)
(46606, 10)
(10, 227)


In [41]:
# Sanity check that the Airbnb shapes line up:
# (docs x terms), (docs x topics), (topics x terms).
for airbnb_matrix in (airbnb_tfidf, airbnb_lda_dt, airbnb_lda_tt):
    print(airbnb_matrix.shape)

(85000, 156)
(85000, 10)
(10, 156)


In [42]:
# Sanity check: vectorize an ad-hoc query with the fitted TripAdvisor
# vocabulary. The displayed result is a single sparse row over that
# vocabulary; query tokens that are not in the vocabulary contribute nothing.
ta_vectorizer.transform(['test, bed'])

<1x227 sparse matrix of type '<type 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [43]:
def search_lda(query, vectorizer, dt_mat, tt_mat, top_k = 10, index_to_listing=None):
    """Rank documents against a free-text query in LDA topic space.

    The query is vectorized, projected onto the topics through ``tt_mat`` and
    then scored against every document through ``dt_mat``; that is,
    ``scores = dt_mat . (tt_mat . query_vector)``.

    Parameters
    ----------
    query : str
        Free-text search query.
    vectorizer : object
        Fitted vectorizer whose ``transform([query])`` returns a sparse row
        over the same vocabulary as the columns of ``tt_mat``.
    dt_mat : array-like, shape (n_docs, n_topics)
        Document-topic weights.
    tt_mat : array-like, shape (n_topics, n_terms)
        Topic-term weights.
    top_k : int, optional
        Maximum number of results to return (default 10).
    index_to_listing : mapping, optional
        Maps a document row index to its listing id. Defaults to the
        module-level ``airbnb_index_to_listing`` so existing calls behave
        as before.

    Returns
    -------
    tuple of (list of float, list of int, list of float)
        ``(listings, indices, scores)`` for the best matches, best first.
        Listing ids are returned as floats, as they always have been, so ids
        must be numeric.
    """
    if index_to_listing is None:
        # Historical behaviour: fall back to the notebook-level Airbnb mapping.
        index_to_listing = airbnb_index_to_listing

    query_vec = vectorizer.transform([query]).todense().T
    results = np.dot(dt_mat, np.dot(tt_mat, query_vec))

    # ravel() always yields a 1-D array. np.squeeze collapsed a one-document
    # result to 0-d, which then failed on the [::-1] reversal.
    flat_scores = np.asarray(results).ravel()

    # Sort once and reuse the ordering for both the indices and the scores.
    indices = np.argsort(flat_scores)[::-1][:top_k]
    scores = flat_scores[indices]

    listings = [float(index_to_listing[doc_index]) for doc_index in indices]
    return (listings, indices.tolist(), scores.tolist())

In [45]:
# Retrieve the Airbnb reviews that best match the query in LDA topic space,
# using search_lda's default result count.
listings, indices, scores = search_lda("clean bed", airbnb_vectorizer, airbnb_lda_dt, airbnb_lda_tt)

In [46]:
# Load the Airbnb listing metadata, looked up by listing id when results are
# printed; the `with` block closes the file handle once the object is read.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted source.
with open("data/airbnb_listings.pickle", 'rb') as airbnb_listings_file:
    airbnb_listings = pickle.load(airbnb_listings_file)

In [47]:
# Print a short report for every search hit: the listing's metadata, the
# matching review text and the similarity score.
for listing_number, review_index, similarity in zip(listings, indices, scores):
    # search_lda returns listing ids as floats; the metadata is keyed by the
    # integer id rendered as a string.
    listing_id = str(int(listing_number))
    airbnb_listing_info = airbnb_listings[listing_id]
    report_lines = [
        "Listing ID: " + listing_id,
        "Listing Name: " + airbnb_listing_info['name'],
        "Review : \n" + airbnb_reviews[review_index],
        "Listing URL: " + airbnb_listing_info['listing_url'],
        "Image URL: " + airbnb_listing_info['picture_url'],
        "Score (Similarity): " + str(similarity),
        "**************",
    ]
    for report_line in report_lines:
        print(report_line)

Listing ID: 26785
Listing Name: Park Slope Green Guest House
Review : 
We stayed at the Green Guest House for 5 nights in May 2011 and had a fantastic experience Below are my thoughts on some specific aspects of the apartment that I hope you find useful LOCATION After having lived in New York for 4 years previously my partner and I specifically searched for rental properties in Park Slope for our vacation back to NYC because it is a great walkable neighborhood with plenty of restaurants bars and entertainment options We also find that Park Slope has a small town feel despite being in the largest city in the U S The location of the Green Guest House is on a safe street where we felt very comfortable walking even late in the evening It is about 3 4 blocks a 5 minute walk from the nearest subway station at 4th and 9th where you can catch the F train or the R train to Manhattan 15 minute ride to Downtown Manhattan or the G train to other cool Brooklyn neighborhoods such as Carroll Gardens 