In [4]:
#reference: https://www.datacamp.com/community/tutorials/recommender-systems-python
import pandas as pd
# Read the raw hotel-review table from the project's data folder.
clean_review1 = pd.read_csv(filepath_or_buffer='./data/Hotel_Reviews.csv')

In [5]:
# Keep only the hotel name and the two review columns, collapsing the data to
# one row per hotel: all of a hotel's negative reviews are joined into a single
# comma-separated string, and likewise for its positive reviews.
clean_review1 = (
    clean_review1
    .groupby('Hotel_Name')
    .agg({
        'Negative_Review': ', '.join,
        'Positive_Review': ', '.join,
    })
    .reset_index()
)

In [6]:
# Preview the first 10 hotels rather than rendering the whole frame.
clean_review1.head(10)

Unnamed: 0,Hotel_Name,Negative_Review,Positive_Review
0,11 Cadogan Gardens,Thought the prise of drinks at the bar a litt...,We were particularly impressed by the very wa...
1,1K Hotel,Air conditioning in room didn t work and desp...,Location good close to le Marais and 3e arron...
2,25hours Hotel beim MuseumsQuartier,Breakfast not included and buffet really expe...,Cool vintage style in the middle of the museu...
3,41,"There wasn t a thing that we didn t like , No...",Its central proximity close to all services a...
4,45 Park Lane Dorchester Collection,More kinds of fruit juice will make the mini ...,Everything here are almost perfect the staffs...
5,88 Studios,Maybe more selection of tea coffee hot chocol...,It was a very nice apartment and the customer...
6,9Hotel Republique,The room was very small but maybe reasonable ...,The bus and metro station are located just in...
7,A La Villa Madame,"No Negative, just a better map of surrounding...",The bed was extra comfy the street is really ...
8,ABaC Restaurant Hotel Barcelona GL Monumento,The room looks nice in the pictures with the ...,The room size was bigger than average Barcelo...
9,AC Hotel Barcelona Forum a Marriott Lifestyle ...,no tea and coffee facilities in the room this...,Staff were super friendly and helpful faultle...


In [7]:
# Build a new column, review_text, by merging the positive and negative review
# columns. A single space is inserted between the two parts so that the last
# word of the positive text and the first word of the negative text cannot fuse
# into one bogus token when the text is later tokenized.
clean_review1['review_text'] = (
    clean_review1['Positive_Review'].astype(str)
    + ' '
    + clean_review1['Negative_Review'].astype(str)
)

In [8]:
# Preview the first 10 rows, now including the combined review_text column.
clean_review1.head(10)

Unnamed: 0,Hotel_Name,Negative_Review,Positive_Review,review_text
0,11 Cadogan Gardens,Thought the prise of drinks at the bar a litt...,We were particularly impressed by the very wa...,We were particularly impressed by the very wa...
1,1K Hotel,Air conditioning in room didn t work and desp...,Location good close to le Marais and 3e arron...,Location good close to le Marais and 3e arron...
2,25hours Hotel beim MuseumsQuartier,Breakfast not included and buffet really expe...,Cool vintage style in the middle of the museu...,Cool vintage style in the middle of the museu...
3,41,"There wasn t a thing that we didn t like , No...",Its central proximity close to all services a...,Its central proximity close to all services a...
4,45 Park Lane Dorchester Collection,More kinds of fruit juice will make the mini ...,Everything here are almost perfect the staffs...,Everything here are almost perfect the staffs...
5,88 Studios,Maybe more selection of tea coffee hot chocol...,It was a very nice apartment and the customer...,It was a very nice apartment and the customer...
6,9Hotel Republique,The room was very small but maybe reasonable ...,The bus and metro station are located just in...,The bus and metro station are located just in...
7,A La Villa Madame,"No Negative, just a better map of surrounding...",The bed was extra comfy the street is really ...,The bed was extra comfy the street is really ...
8,ABaC Restaurant Hotel Barcelona GL Monumento,The room looks nice in the pictures with the ...,The room size was bigger than average Barcelo...,The room size was bigger than average Barcelo...
9,AC Hotel Barcelona Forum a Marriott Lifestyle ...,no tea and coffee facilities in the room this...,Staff were super friendly and helpful faultle...,Staff were super friendly and helpful faultle...


In [10]:
# Represent every hotel's review text as a TF-IDF vector so that the hotels can
# be compared with each other.
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn any missing review text into an empty document before vectorizing.
# NOTE(review): review_text is assembled with .astype(str), so a missing value
# would already be the literal text 'nan' and this fill is likely a no-op.
clean_review1['review_text'] = clean_review1['review_text'].fillna('')

# Vectorizer that discards common English stop words such as 'the' and 'a'.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and IDF weights, then encode each hotel's text.
tfidf_matrix = tfidf.fit_transform(clean_review1['review_text'])

# Matrix shape: (number of hotels, number of distinct terms).
tfidf_matrix.shape
# In the recorded run this is (1492, 81002): more than 80,000 different words
# were used to describe the 1492 hotels in the dataset.

(1492, 81002)

In [11]:
# Pairwise hotel-to-hotel similarity, computed with linear_kernel (plain dot
# products). TfidfVectorizer L2-normalizes each row by default, so these dot
# products are cosine similarities.
from sklearn.metrics.pairwise import linear_kernel
# With the second argument omitted, the kernel is taken between the matrix and itself.
cosine_sim = linear_kernel(tfidf_matrix)

In [12]:
# Construct a reverse map from hotel name to its row label in clean_review1.
indices = pd.Series(clean_review1.index, index=clean_review1['Hotel_Name'])
# Series.drop_duplicates() compares the *values* (row labels, which are always
# unique), so it can never remove a repeated hotel name. De-duplicate on the
# index instead, keeping the first row for each name, so that indices[name]
# always yields a single scalar.
indices = indices[~indices.index.duplicated(keep='first')]

In [13]:
# Function that takes in hotel name as input and outputs most similar hotels
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the names of the hotels whose reviews are most similar to ``title``.

    Parameters
    ----------
    title : str
        Hotel name, looked up in the module-level ``indices`` series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; ``cosine_sim[i]`` holds the scores of hotel
        ``i`` against every hotel. Defaults to the matrix that existed when
        this function was defined.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Hotel names from ``clean_review1['Hotel_Name']``, most similar first.
        The queried hotel itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not a known hotel name.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError('top_n must be non-negative, got {!r}'.format(top_n))
    if title not in indices.index:
        raise KeyError('Unknown hotel name: {!r}'.format(title))

    # Row position of the requested hotel in the similarity matrix
    idx = indices[title]

    # Pair every other hotel's position with its similarity score. The queried
    # hotel is excluded explicitly by position: slicing off the first sorted
    # entry is only correct when the hotel's self-similarity is a strict
    # maximum, which fails on ties (e.g. identical or empty review text).
    scored = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Highest similarity first; the sort is stable, so ties keep row order
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Positions of the top_n most similar hotels
    hotel_indices = [position for position, _ in scored[:top_n]]

    return clean_review1['Hotel_Name'].iloc[hotel_indices]

In [15]:
# Example query: the 10 hotels whose review text is most similar to that of
# '11 Cadogan Gardens'.
get_recommendations(title='11 Cadogan Gardens')

330                         Flemings Mayfair
1316                    The Bloomsbury Hotel
1393                   The Royal Horseguards
1323                The Chesterfield Mayfair
1368             The Montague On The Gardens
1319                    The Cavendish London
1175        Radisson Blu Edwardian Hampshire
1384                    The Principal London
1370                The Montcalm Marble Arch
1177    Radisson Blu Edwardian Mercer Street
Name: Hotel_Name, dtype: object

In [16]:
# Second example query: the 10 hotels most similar to 'XO Hotel'.
get_recommendations(title='XO Hotel')

160     Best Western Premier Kapital Op ra
752          Hotel Saint Petersbourg Opera
708                         Hotel Monsieur
416                       H tel Diva Opera
720                 Hotel Op ra Richepanse
1071                       Newhotel Roblin
399                          H tel Bedford
344                    Gardette Park Hotel
422              H tel Exquis by Elegancia
165       Best Western Premier Op ra Li ge
Name: Hotel_Name, dtype: object