In [329]:
from __future__ import print_function
from scipy import sparse
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
import numpy as np
import pickle
import matplotlib.pyplot as plt

### Preface:
Almost all the data has already been put into pickles, so it's just a matter of loading it. If you want to see how that data was generated, go to the ```lda_preprocessing``` file. Some of it is reproduced below, but that's mostly just index-mapping code.

In [330]:
# Load the pre-built review corpora.
# ta_data is iterated below as a sequence of dicts with 'review' and
# 'hotel_name' keys; airbnb_data maps a listing id to a list of review strings.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
# `with` closes each file handle as soon as the pickle has been read.
with open("data/tripadvisor_reviews.pickle", 'rb') as ta_reviews_file:
    ta_data = pickle.load(ta_reviews_file)
with open("data/airbnb_reviews.pickle", 'rb') as airbnb_reviews_file:
    airbnb_data = pickle.load(airbnb_reviews_file)

In [331]:
# Load listing metadata.
# ta_listings maps hotel name -> (url, rating string); airbnb_listings maps
# listing id -> dict with 'listing_url', 'picture_url' and 'name'.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
# `with` closes each file handle as soon as the pickle has been read.
with open("data/tripadvisor_hotel_info.pickle", 'rb') as ta_listings_file:
    ta_listings = pickle.load(ta_listings_file)
with open("data/airbnb_listings.pickle", 'rb') as airbnb_listings_file:
    airbnb_listings = pickle.load(airbnb_listings_file)

In [332]:
# Peek at one (hotel name, (url, rating)) entry. next(iter(...)) works whether
# items() returns a list (Python 2) or a view (Python 3); .items()[0] does not.
print(next(iter(ta_listings.items())))

(u'La Semana Hotel ', ('https://www.tripadvisor.com/Hotel_Review-g60763-d290982-Reviews-La_Semana_Hotel-New_York_City_New_York.html', '1.5 of 5 bubbles'))


In [333]:
# Peek at one (listing id, info dict) entry. next(iter(...)) works whether
# items() returns a list (Python 2) or a view (Python 3); .items()[0] does not.
print(next(iter(airbnb_listings.items())))

('9261217', {'listing_url': 'https://www.airbnb.com/rooms/9261217', 'picture_url': 'https://a0.muscache.com/im/pictures/346c5ca8-6deb-406c-85cc-84e74cadce4b.jpg?aki_policy=large', 'name': 'Room in luxury high rise condo'})


In [334]:
# Peek at one listing entry; next(iter(...)) is safe on both Python 2 lists
# and Python 3 dict views, unlike .items()[0].
print(next(iter(ta_listings.items())))

# Give every hotel a row index (in dict iteration order) and keep the inverse.
ta_listing_to_index = {name: row for row, name in enumerate(ta_listings)}
ta_index_to_listing = {row: name for name, row in ta_listing_to_index.items()}

print(ta_data[0])

# Hotel-by-review adjacency matrix: entry (i, j) is 1 when review j belongs
# to hotel i. ta_reviews keeps the review texts in the same column order.
ta_reviews = []
ta_adj_mat = np.zeros((len(ta_listing_to_index), len(ta_data)))
for j, d in enumerate(ta_data):
    i = ta_listing_to_index[d['hotel_name']]
    ta_reviews.append(d['review'])
    ta_adj_mat[i, j] += 1

# Identity map from matrix row to listing index, so search_lda can be called
# with the same kind of row -> listing dict as for Airbnb.
ta_mat_index_to_listing = {row: row for row in range(len(ta_listings))}

(u'La Semana Hotel ', ('https://www.tripadvisor.com/Hotel_Review-g60763-d290982-Reviews-La_Semana_Hotel-New_York_City_New_York.html', '1.5 of 5 bubbles'))
{'review': u"This is place is great. Service is excellent, location is ideal, beds are comfy. Not much I would really say I didn't like. There was a woman, Gabriele I believe? who was one of the nicest hotel workers I have ever encountered. Consistently friendly and helpful. I would highly recommend this hotel.", 'review_stars': '5 of 5 bubbles', 'hotel_name': u'Hotel Giraffe by Library Hotel Collection ', 'title': u'Awesome stay'}


In [335]:
# Peek at one (listing id, reviews) entry; next(iter(...)) is safe on both
# Python 2 lists and Python 3 dict views, unlike .items()[0].
print(next(iter(airbnb_data.items())))

airbnb_index_to_listing = {}      # review column -> listing id
airbnb_listing_to_mat_index = {}  # listing id    -> matrix row
airbnb_mat_index_to_listing = {}  # matrix row    -> listing id
airbnb_reviews = []
for (ind, (listing_id, reviews)) in enumerate(airbnb_data.items()):
    # Register the row once per listing rather than once per review. The
    # matrix below has a row for every listing, so a listing with an empty
    # review list must still resolve from its row index.
    airbnb_listing_to_mat_index[listing_id] = ind
    airbnb_mat_index_to_listing[ind] = listing_id
    for review in reviews:
        airbnb_index_to_listing[len(airbnb_reviews)] = listing_id
        airbnb_reviews.append(review)

# Total number of Airbnb reviews (= number of matrix columns).
count = len(airbnb_reviews)
print(count)

# Listing-by-review adjacency matrix: entry (row, j) is 1 when review j
# belongs to the listing stored at that row.
# NOTE(review): this is a dense float array of len(airbnb_data) x count.
airbnb_adj_mat = np.zeros((len(airbnb_data), count))
for (j, l) in airbnb_index_to_listing.items():
    airbnb_adj_mat[airbnb_listing_to_mat_index[l], j] += 1

('9234415', ['Emme did everything he could to make our stay super comfortable We could check in early 3 hours earlier and late check out too at no cost The apartment has 3 cozy rooms and is very close to green line subway station Excellent location to get in a few minutes to any place in Manhattan Many many places to have lunch dinner just around the corner and super safe to walk at any time through the day or night Emme recommended lots of places to visit directions to get anywhere and was very responsive through messaging'])
85000


In [373]:
# Combined listing-by-review adjacency matrix.
# Rows:    [Tripadvisor hotels | Airbnb listings]
# Columns: [Tripadvisor reviews | Airbnb reviews]
n_ta_listings = len(ta_listing_to_index)
n_ta_reviews = len(ta_data)
total_adj_mat = np.zeros((n_ta_listings + len(airbnb_data), n_ta_reviews + count))
total_index_to_listing = {}
total_mat_index_to_listing = {}

# Tripadvisor block (top-left). This must write into total_adj_mat: the
# previous version incremented ta_adj_mat instead, leaving these rows all
# zero and double-counting ta_adj_mat every time the cell ran.
for j, d in enumerate(ta_data):
    i = ta_listing_to_index[d['hotel_name']]
    total_adj_mat[i, j] += 1
    total_mat_index_to_listing[i] = i

# Airbnb block (bottom-right), offset past the Tripadvisor rows and columns.
for (j, l) in airbnb_index_to_listing.items():
    row = n_ta_listings + airbnb_listing_to_mat_index[l]
    col = n_ta_reviews + j
    total_adj_mat[row, col] += 1
    total_mat_index_to_listing[row] = l

In [337]:
# All three vectorizers share one configuration: English stop words removed,
# and only terms appearing in between 5% and 90% of documents are kept.
tfidf_options = dict(stop_words='english', min_df=0.05, max_df=0.9)

# Tripadvisor-only vocabulary.
ta_vectorizer = TfidfVectorizer(**tfidf_options)
ta_tfidf = ta_vectorizer.fit_transform(ta_reviews)
ta_tfidf_feature_names = ta_vectorizer.get_feature_names()

# Airbnb-only vocabulary.
airbnb_vectorizer = TfidfVectorizer(**tfidf_options)
airbnb_tfidf = airbnb_vectorizer.fit_transform(airbnb_reviews)
airbnb_tfidf_feature_names = airbnb_vectorizer.get_feature_names()

# Combined vocabulary: Tripadvisor reviews first, then Airbnb reviews.
total_vectorizer = TfidfVectorizer(**tfidf_options)
total_tfidf = total_vectorizer.fit_transform(ta_reviews + airbnb_reviews)
total_tfidf_feature_names = total_vectorizer.get_feature_names()

In [391]:
def _load_lda_matrix(path):
    """Unpickle one precomputed matrix from `path` and close the file.

    The file is opened in binary mode ('rb'), like the other pickles in this
    notebook; text mode can corrupt pickles on some platforms and fails on
    Python 3.
    NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Naming, judging by the shapes printed below: *_ht is (listings x topics),
# *_dt is (reviews x topics), *_tt is (topics x terms) -- TODO confirm against
# the lda_preprocessing file.
ta_lda_ht = _load_lda_matrix("data/ta_lda_ht.mat")
ta_lda_dt = _load_lda_matrix("data/ta_lda_dt.mat")
ta_lda_tt = _load_lda_matrix("data/ta_lda_tt.mat")
airbnb_lda_ht = _load_lda_matrix("data/airbnb_lda_ht.mat")
airbnb_lda_dt = _load_lda_matrix("data/airbnb_lda_dt.mat")
airbnb_lda_tt = _load_lda_matrix("data/airbnb_lda_tt.mat")
total_lda_ht = _load_lda_matrix("data/total_lda_ht.mat")
total_lda_dt = _load_lda_matrix("data/total_lda_dt.mat")
total_lda_tt = _load_lda_matrix("data/total_lda_tt.mat")

# Print the three shapes of each model, with a blank line between models.
lda_groups = [
    (ta_lda_ht, ta_lda_dt, ta_lda_tt),
    (airbnb_lda_ht, airbnb_lda_dt, airbnb_lda_tt),
    (total_lda_ht, total_lda_dt, total_lda_tt),
]
for position, group in enumerate(lda_groups):
    if position:
        print()
    for matrix in group:
        print(matrix.shape)

(78, 10)
(46606, 10)
(10, 227)

(17720, 10)
(85000, 10)
(10, 156)

(17798, 10)
(131606, 10)
(10, 184)


In [392]:
def print_top_words(mat, feature_names, n_top_words):
    """Print the highest-weighted words of every row of `mat`.

    For each row, prints a "Rank #<row index>:" header followed by up to
    `n_top_words` entries of `feature_names`, ordered by descending weight.
    Words whose weight is not strictly positive are left out. A single blank
    line is printed after the last row.
    """
    for rank, weights in enumerate(mat):
        print("Rank #%d:" % rank)
        # argsort is ascending, so reverse it and keep the leading entries.
        best_first = weights.argsort()[::-1][:n_top_words]
        words = []
        for position in best_first:
            if weights[position] > 0:
                words.append(feature_names[position])
        print(" ".join(words))
    print()

In [393]:
def search_lda(query, vectorizer, ht_mat, tt_mat, mat_to_listing_dict, top_k = 10):
    """Rank listings against a free-text query.

    The query is vectorized with `vectorizer`, multiplied by `tt_mat` and then
    by `ht_mat`, giving one score per row of `ht_mat`. The `top_k`
    highest-scoring rows are returned.

    Parameters:
        query: query string.
        vectorizer: fitted object exposing `transform([text]).todense()`.
        ht_mat: matrix with one row per listing.
        tt_mat: matrix with one column per vectorizer term.
        mat_to_listing_dict: maps a row index of `ht_mat` to a listing id.
            Ids must be numeric (or numeric strings) because they are stored
            in a float array; callers recover them with `int(...)`.
        top_k: maximum number of results.

    Returns:
        (listings, indices, scores): three parallel lists ordered by
        descending score -- listing ids as floats, matrix row indices, scores.

    Raises:
        KeyError: if a top-ranked row index is missing from
            `mat_to_listing_dict`.
    """
    vec = vectorizer.transform([query]).todense().T
    # Flatten explicitly instead of squeezing: squeeze turns a single-row
    # result into a 0-d array, which cannot be reversed or sliced.
    results = np.asarray(np.dot(ht_mat, np.dot(tt_mat, vec))).ravel()

    # Sort once (ascending), reverse for best-first, and read the scores off
    # the same ordering so indices and scores are always aligned.
    indices = np.argsort(results)[::-1][:top_k]
    scores = results[indices]

    listings = np.zeros(indices.shape)
    for i in range(indices.shape[0]):
        listings[i] = mat_to_listing_dict[indices[i]]
    return (listings.tolist(), indices.tolist(), scores.tolist())

In [394]:
# Query the Airbnb-only model; results are consumed by the next cell.
airbnb_query = "quiet neat cozy clean comfortable close"
listings, indices, scores = search_lda(
    airbnb_query,
    airbnb_vectorizer,
    airbnb_lda_ht,
    airbnb_lda_tt,
    airbnb_mat_index_to_listing,
)

In [395]:
# Print one record per Airbnb hit, best score first.
for raw_id, _row, similarity in zip(listings, indices, scores):
    # search_lda returns ids as floats; airbnb_listings is keyed by the
    # integer id rendered as a string.
    listing_id = str(int(raw_id))
    airbnb_listing_info = airbnb_listings[listing_id]
    print("Listing ID: " + listing_id)
    print("Listing Name: " + airbnb_listing_info['name'])
    print("Listing URL: " + airbnb_listing_info['listing_url'])
    print("Image URL: " + airbnb_listing_info['picture_url'])
    print("Score (Similarity): " + str(similarity))
    print("**************")

Listing ID: 8343222
Listing Name: New Apartment, Close to Ferry
Listing URL: https://www.airbnb.com/rooms/8343222
Image URL: https://a0.muscache.com/im/pictures/d5a143d1-233a-447e-8467-b36f934c523b.jpg?aki_policy=large
Score (Similarity): 0.552498986667
**************
Listing ID: 175069
Listing Name: Room Right by Columbia University
Listing URL: https://www.airbnb.com/rooms/175069
Image URL: https://a0.muscache.com/im/pictures/1840727/f58bbb32_original.jpg?aki_policy=large
Score (Similarity): 0.548781373963
**************
Listing ID: 4784728
Listing Name: ARTIST LOFT 1 - INSTANT BOOK NOW :)
Listing URL: https://www.airbnb.com/rooms/4784728
Image URL: https://a0.muscache.com/im/pictures/63652808/6741155c_original.jpg?aki_policy=large
Score (Similarity): 0.545824865766
**************
Listing ID: 10444139
Listing Name: Budget Room in Brooklyn&Breakfast hosts' coop
Listing URL: https://www.airbnb.com/rooms/10444139
Image URL: https://a0.muscache.com/im/pictures/0ac2dbd6-83d4-4dda-b126-705

In [396]:
# Query the Tripadvisor-only model; results are consumed by the next cell.
ta_query = "comfortable"
listings, indices, scores = search_lda(
    ta_query,
    ta_vectorizer,
    ta_lda_ht,
    ta_lda_tt,
    ta_mat_index_to_listing,
)

In [397]:
# Print one record per Tripadvisor hit, best score first.
for (l, ind, score) in zip(listings, indices, scores):
    # search_lda returns the hotel's index as a float; resolve it to the
    # hotel name once, then to its (url, rating) tuple.
    hotel_name = ta_index_to_listing[int(l)]
    ta_listing_info = ta_listings[hotel_name]
    print("Listing Name: " + hotel_name)
    print("Listing URL: " + ta_listing_info[0])
    print("Score (Similarity): " + str(score))
    print("**************")

Listing Name: Cassa Hotel Times Square 
Listing URL: https://www.tripadvisor.com/Hotel_Review-g60763-d7148761-Reviews-Cassa_Hotel_Times_Square-New_York_City_New_York.html
Score (Similarity): 0.25104141804
**************
Listing Name: The Tuscany - A St Giles Signature Hotel 
Listing URL: https://www.tripadvisor.com/Hotel_Review-g60763-d93431-Reviews-The_Tuscany_A_St_Giles_Signature_Hotel-New_York_City_New_York.html
Score (Similarity): 0.248540275734
**************
Listing Name: Hotel Chandler 
Listing URL: https://www.tripadvisor.com/Hotel_Review-g60763-d223024-Reviews-Hotel_Chandler-New_York_City_New_York.html
Score (Similarity): 0.246744821335
**************
Listing Name: Kimpton Ink48 Hotel 
Listing URL: https://www.tripadvisor.com/Hotel_Review-g60763-d1158753-Reviews-Kimpton_Ink48_Hotel-New_York_City_New_York.html
Score (Similarity): 0.246473069371
**************
Listing Name: SIXTY SoHo 
Listing URL: https://www.tripadvisor.com/Hotel_Review-g60763-d93398-Reviews-SIXTY_SoHo-New_Yor

In [400]:
# Query the combined Tripadvisor + Airbnb model.
total_query = "comfortable"
listings, indices, scores = search_lda(
    total_query,
    total_vectorizer,
    total_lda_ht,
    total_lda_tt,
    total_mat_index_to_listing,
)

# Analysis
Looks like Airbnb is dominating, even though everything is normalized... could it be that the sheer number of Airbnb listings (17,720, versus only 78 Tripadvisor hotels) gives Airbnb the edge, simply because there's a higher probability that some Airbnb listing scores highly? I can't find any query where Tripadvisor does better.

I think this is where SVD and related words can come in clutch - if we expand the query using more related words, we have a higher probability of roping in Tripadvisor listings for consideration. 

I still can't figure out how we can search broadly on a topic (i.e. a dimension in the scope of the matrices) - matrix multiplication doesn't quite achieve that; but the downside of that would be that if we search on something like "quiet" we also get things like "noise"...

Of those two alternatives, query expansion (the first one) seems easier to handle, especially if we can get the scale of returned results / figure out how to incorporate related words that also reflect the same polarity.

In [399]:
# Rows of the combined model are laid out as [Tripadvisor | Airbnb], so the
# first Airbnb row index equals the number of Tripadvisor rows.
# NOTE(review): assumes ta_lda_ht has exactly one row per Tripadvisor listing.
threshold = (ta_lda_ht.shape)[0]
for (l, ind, score) in zip(listings, indices, scores):
    listing_id = str(int(l))
    # Use >=, not >: row `threshold` itself is the first Airbnb listing. With
    # a strict comparison it was treated as a Tripadvisor hotel.
    if (ind >= threshold):
        # Airbnb
        airbnb_listing_info = airbnb_listings[listing_id]
        print("Listing ID: Airbnb " + listing_id)
        print("Listing Name (Airbnb): " + airbnb_listing_info['name'])
        print("Listing URL: " + airbnb_listing_info['listing_url'])
        print("Image URL: " + airbnb_listing_info['picture_url'])
    else:
        # Tripadvisor
        hotel_name = ta_index_to_listing[int(l)]
        ta_listing_info = ta_listings[hotel_name]
        print("Listing ID: Tripadvisor " + listing_id)
        print("Listing Name: " + hotel_name)
        print("Listing URL: " + ta_listing_info[0])
    print("Score (Similarity): " + str(score))
    print("**************")

Listing ID: Airbnb 962045
Listing Name (Airbnb): Room & Coffee, 3 blocks to L Train!
Listing URL: https://www.airbnb.com/rooms/962045
Image URL: https://a0.muscache.com/im/pictures/33443716/89d3fff5_original.jpg?aki_policy=large
Score (Similarity): 0.295406313607
**************
Listing ID: Airbnb 240918
Listing Name (Airbnb): W4 Clean and Cozy 1 minute to Metro & Park
Listing URL: https://www.airbnb.com/rooms/240918
Image URL: https://a0.muscache.com/im/pictures/15584434/250e87c7_original.jpg?aki_policy=large
Score (Similarity): 0.293380852123
**************
Listing ID: Airbnb 476919
Listing Name (Airbnb): Upper East Side City Cottage Near Parks Museums
Listing URL: https://www.airbnb.com/rooms/476919
Image URL: https://a0.muscache.com/im/pictures/fb0dff7f-9bd5-43a2-92eb-0a8682dcf654.jpg?aki_policy=large
Score (Similarity): 0.293295082764
**************
Listing ID: Airbnb 1654417
Listing Name (Airbnb): Great Bedroom in Worlds Best 'Hood!
Listing URL: https://www.airbnb.com/rooms/165441