In [475]:
from __future__ import annotations
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import plotly.graph_objs as go
import plotly as py
from plotly.offline import iplot
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='space')
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re

In [476]:
df =  pd.read_csv(r"C:\Users\INPRB11\projects\Test_junk\hotels\makemytrip_com-travel_sample.csv", encoding='utf-8')

### Checking for a suitable count of property listings.
#### The recommendation for hotels will be based on the description or the text contained for each hotel. Now, if a customer wants a hotel similar to a certain hotel's description he/she liked, we should recommend a hotel that is in a same city. So, here we subset hotel information pertaining to a specific city for instance, Goa.

In [477]:
df['city'].value_counts()

NewDelhiAndNCR                     1163
Goa                                1122
Mumbai                              543
Jaipur                              534
Bangalore                           512
Hyderabad                           468
Srinagar                            453
New Delhi And NCR                   425
Pune                                392
Kolkata                             369
Coorg                               347
Cochin                              337
Manali                              325
Chennai                             292
Haridwar                            249
Varanasi                            230
Mysore                              226
Kodaikanal                          221
Nainital                            214
Katra                               210
Ahmedabad                           199
Munnar                              195
Chikmagalur                         185
Udaipur                             184
Allepey                             177


In [478]:
goa = df.loc[df['city'] == 'Goa'].reset_index(drop=True)

In [479]:
goa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1122 entries, 0 to 1121
Data columns (total 33 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   area                             931 non-null    object 
 1   city                             1122 non-null   object 
 2   country                          1117 non-null   object 
 3   crawl_date                       662 non-null    object 
 4   highlight_value                  965 non-null    object 
 5   hotel_overview                   1006 non-null   object 
 6   hotel_star_rating                1122 non-null   object 
 7   image_urls                       472 non-null    object 
 8   in_your_room                     897 non-null    object 
 9   is_value_plus                    1122 non-null   object 
 10  latitude                         1121 non-null   float64
 11  longitude                        1121 non-null   float64
 12  mmt_holidayiq_review

#### checking how many unique scores were recorded by mmt

In [480]:
print(goa.mmt_review_score.value_counts())
print(f'Total number of unique review scores recorded by mmt is: {len(goa.mmt_review_score.value_counts())}')

3.9    38
4.2    37
4.0    37
4.4    36
4.1    33
4.3    33
5.0    25
3.8    22
0.0    22
3.6    21
3.5    21
4.5    21
3.7    20
4.6    17
3.4    17
3.3    17
3.2    16
4.8    14
3.1    12
4.9    11
4.7    10
2.7     8
2.4     7
3.0     6
1.9     5
2.3     5
2.5     3
2.8     3
2.9     2
Name: mmt_review_score, dtype: int64
Total number of unique review scores recorded by mmt is: 29


#### Analysing room types for each property type: We see all room types which have 'bhk' can mostly be apartments and we can use this knowledge to preprocess our data

In [481]:
goa.loc[goa['property_type'].isnull()][['property_type', 'room_types']]

Unnamed: 0,property_type,room_types
150,,2 Bhk
187,,Standard Ac Room
824,,2 Bhk
1112,,2 Bhk
1117,,3 Bhk Aparment


In [482]:
#filling nans with xyz
goa.property_type.fillna(value='xyz', inplace=True)

In [483]:
#ensuring there are no null values in property type column
goa.property_type.isnull().sum()

0

In [484]:
"""
prepared a mask to subset only the part in the df where poperty name contains 'apartment' and 
room types contains 'bhk' and property type contains 'xyz'
"""
mask = goa[goa['property_name'].str.lower().str.contains('apartment', na= False) 
    & goa['room_types'].str.lower().str.contains('bhk', na= False)
    & goa['property_type'].str.lower().str.contains('xyz', na= False)]

In [485]:
goa.loc[mask.index, 'property_type'] = goa.loc[mask.index, 'property_type'].apply(lambda x: "Apartment")

In [486]:
goa['hotel_star_rating'].value_counts()

1 star       505
3 star       206
1            138
2 star        99
4 star        53
2             48
3             37
5 star        19
4             10
5              6
Five on 5      1
Name: hotel_star_rating, dtype: int64

#### The hotel star rating has records like 5 as well as 5 stars. So we apply a lambda function to extract only the numeric part. Interestingly we find that the number of hotels in 1 star category is way more than the other categories. Also, there are more 3 star hotels than 2, 4 or 5 stars.

In [487]:
#preprocessing star rating
goa['hotel_star_rating'] = goa['hotel_star_rating'].apply(lambda s: s.split()[0] if len(s)>1 else s)

In [488]:
goa['hotel_star_rating'].value_counts()

1       643
3       243
2       147
4        63
5        25
Five      1
Name: hotel_star_rating, dtype: int64

In [499]:
#hotel star distribution and relationship of score with it
goa['hotel_star_rating'].value_counts().iplot(kind='barh', yTitle='Count', linecolor='black', title='Hotel star rating distribution')

In [489]:
#replacing invalid entries with nan
goa['hotel_star_rating'] = goa['hotel_star_rating'].apply(pd.to_numeric, errors='coerce')

In [490]:
goa['hotel_star_rating'].value_counts()

1.0    643
3.0    243
2.0    147
4.0     63
5.0     25
Name: hotel_star_rating, dtype: int64

#### Checking the number of unique travellers associated with each accomodation, .

In [491]:
def get_customer_stats_per_hotel(df:pd.DataFrame)->dict:
    max_customer = df.groupby(['property_name'])['uniq_id'].agg('count').max()
    min_customer = df.groupby(['property_name'])['uniq_id'].agg('count').min()
    mean_customer = (df.groupby(['property_name'])['uniq_id'].agg('count').mean()).round()
    return {"The maximum no of customers each accomodation is ": max_customer,
            "The minimum no of customers each accomodation is ": min_customer,
            "The average no of customers each accomodation is ": mean_customer}

In [492]:
print(get_customer_stats_per_hotel(goa))

{'The maximum no of customers each accomodation is ': 5, 'The minimum no of customers each accomodation is ': 1, 'The average no of customers each accomodation is ': 1.0}


#### The max number of customers who visited a hotel was 5 and min was 1. Hotel Om shiv and Roasdhouse hostels were among the popular picks. While majority of the hotels was visited by only one customer.

In [493]:
goa['property_name'].value_counts().iplot(kind='barh', yTitle='Count', linecolor='black', title='No. of travlers visited each property')

#### Customer preference of property types: The customers usually preferred hotels over other types of accomodation. This can be due to customer age and other details as well. Usually, hostels are preferred by single and solo travellers with lower budgets while families stick to comfortable hotels and average to higher rentals. This is worth investigating but the data does not provide more details on customers.

In [494]:
goa['property_type'].value_counts().iplot(kind='barh', yTitle='Count', linecolor='black', title='No. of travlers visited each hotel')

In [495]:
goa[['property_type','room_types']].head()

Unnamed: 0,property_type,room_types
0,Hotel,Standard Ac
1,Hotel,Luxury Flat 1 Bhk
2,Hotel,4 Bedroom Pool Villa
3,Apartment,2 Bhk Apartment
4,Hotel,`standard Room


In [496]:
goa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1122 entries, 0 to 1121
Data columns (total 33 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   area                             931 non-null    object 
 1   city                             1122 non-null   object 
 2   country                          1117 non-null   object 
 3   crawl_date                       662 non-null    object 
 4   highlight_value                  965 non-null    object 
 5   hotel_overview                   1006 non-null   object 
 6   hotel_star_rating                1121 non-null   float64
 7   image_urls                       472 non-null    object 
 8   in_your_room                     897 non-null    object 
 9   is_value_plus                    1122 non-null   object 
 10  latitude                         1121 non-null   float64
 11  longitude                        1121 non-null   float64
 12  mmt_holidayiq_review

In [497]:
goa[['is_value_plus','property_name', 'hotel_overview', 'mmt_review_score', 'traveller_rating', 'property_address', 'hotel_star_rating']]

Unnamed: 0,is_value_plus,property_name,hotel_overview,mmt_review_score,traveller_rating,property_address,hotel_star_rating
0,yes,Hotel Om Shiv,"Just 2.7 km from Margao Railway Station, Hotel...",4.3,Location:4.3/5 | Hospitality:4.4/5 | Facilitie...,"Rajadhyax Tower, Behind Bank of India, Near Ma...",2.0
1,no,Studio Service Apartments - Arpora,"Offering swimming pool, lawns/gardens and wate...",,,RR ORG (Rievera hermitage double tree by Hilto...,3.0
2,no,Diakon Holidays,Diakon Holidays is situated in one of the most...,0.0,,"Villa Goa Raj , Behind St Joseph School Arpora...",1.0
3,no,TripThrill Lotus Hermitage Apartment,"Nestle in Goa, a land of never ceasing festivi...",,,"Lotus Hermitage Resort, Benaulim Beach, Benaulim",1.0
4,no,JOHNS Guest House,,,,"H/No65/9, Escrivao Vaddo Calangute",1.0
5,no,Stay in a Homestay in Bogmalo,This is a family run Bed & Breakfast set in be...,,,"Silk Cotton Resort61, Sea View Estate,Bogmalo ...",1.0
6,no,Laxmi Guest House,"A budget property, Laxmi Guest House is locate...",,,"Laxmi Guest House, House No 942/B Vithal Das V...",1.0
7,no,Castle House,Located just 5 minutes drive from Calangute an...,4.1,Location:4.2/5 | Hospitality:4.3/5 | Facilitie...,"Lavina Street, Opp Calangute Panchayat, Calang...",1.0
8,no,Flowers Guest House,,,,"8/41 A Nigwaddo Saligao, Bardez",1.0
9,no,Vagator Suites 1B BHK,"Nestled in the tourist hub of India, renowned ...",,,"North Goa Suites, 306/2 Ozran Beach Road, Vaga...",1.0


In [500]:
goa.loc[goa['property_type'] == 'xyz']

Unnamed: 0,area,city,country,crawl_date,highlight_value,hotel_overview,hotel_star_rating,image_urls,in_your_room,is_value_plus,...,property_type,qts,query_time_stamp,room_types,site_review_count,site_review_rating,sitename,state,traveller_rating,uniq_id
187,Anjuna Bardez,Goa,India,2016-09-04,Airport Transfer|Doctor on Call|Fitness Centre...,"Whispering Woods, Goa, ensures that its guests...",2.0,,Room Service|Hot & Cold Running Water|Air cond...,no,...,xyz,2016-09-04 10:11:02 +0000,2016-09-04 10:11:02 +0000,Standard Ac Room,,,makemytrip,Goa,Location:3.5/5 | Hospitality:3.5/5 | Facilitie...,4f461bf1efc74dfca6e62fb81c477aac


#### Let's understand the distribution of star ratings by each type of accomodations. Quite obviously, hotels which we know see most of the bookings have all sorts if star ratings with highest numbers of 5 and 3 stars. We can ignore xyz category as we had treated all the null types to be xyz in our preceeding steps and that's just one record. Interestingly, there's one 5 star villa and one 3 star villa, while there's one cottage with a 3 star rating as well.

In [501]:
goa.groupby(['property_type'])['hotel_star_rating'].value_counts().iplot(kind='barh', yTitle='star ratings', linecolor='black', title='Hotel star rating distribution for each type of property')

#### Let's understand if there is any relationship between hotel's star rating and mmt review score

#### Although the data is highly imbalanced when it comes to different ratings but we can notice, higher rated hotels tend to have a higher mmt score too. The properties rated 4 and 5 have more records with higher mmt rating score, while the proerties rated 1, 2 and 3 have more or less equal number of high and low mmt rating scores.

In [502]:
#colour coding the review scores
colrs = [0 if 0<=s<=1 else 1 if 1<s<=2 else 2 if 2<s<=3 else 3\
    if 3<s<=4 else 4 if 4<s<=5 else -1\
    for s in goa['mmt_review_score'].values]
fig = go.Figure(data=go.Scatter(x=goa.hotel_star_rating,
                                y=goa.mmt_review_score, 
                                mode='markers',
                                marker=dict(
                                size=10,
                                color=colrs),
                                name="mmt review scores"))
fig.update_layout(title={
    'text':"Property's star rating vs MMT review score",
    'y':0.9,
    'x':0.5},
    xaxis_title="Property star ratings",
    yaxis_title="Mmt review scores",showlegend=True)
fig.show()

## Content based recommender

In [507]:
# There are lot of ascii characters which if not ignored will create wrong word frequency
goa['hotel_overview'] = goa['hotel_overview'].apply(lambda x: x.encode("ascii", "ignore") if x is not np.NaN else x)

AttributeError: 'bytes' object has no attribute 'encode'

In [508]:
def get_details(df:pd.DataFrame, property_id:str)->dict:
    return {property_id: [{'name': df.loc[df['property_id']== property_id][['property_name', 'hotel_overview']].values[0][0]},
           {'name': df.loc[df['property_id']== property_id][['property_name', 'hotel_overview']].values[0][1]}]}

In [509]:
get_details(goa, '201409031419274461')

{'201409031419274461': [{'name': 'Diakon Holidays'},
  {'name': b'Diakon Holidays is situated in one of the most loved holiday destination, Goa. Guests can arrive at the property by Goa International Airport and Vasco-Da-Gama Railway Station which are at an accessible distance. Along with very peace full location the property offers an array of facilities like room service, power backup, Wi-Fi access, swimming pool and airport/railway transfer for the convenience of guests. Unwind and relax in splendidly appointed villas and apartments equipped with amenities like private swimming pool with sun beds, garden table and chairs. Baby Cots are also provided on request. Moreover air condition, television, washing machine, attached bathrooms and various essential bathroom toiletries are also available. Furthermore guests can avail the facility of fully furnished kitchen with refrigerator, microwave, gas stove, toaster, electric kettle and crockery where they can cook food for their own. The p

In [510]:
goa['hotel_overview']

0       b'Just 2.7 km from Margao Railway Station, Hot...
1       b'Offering swimming pool, lawns/gardens and wa...
2       b'Diakon Holidays is situated in one of the mo...
3       b'Nestle in Goa, a land of never ceasing festi...
4                                                     NaN
5       b"This is a family run Bed & Breakfast set in ...
6       b'A budget property, Laxmi Guest House is loca...
7       b'Located just 5 minutes drive from Calangute ...
8                                                     NaN
9       b"Nestled in the tourist hub of India, renowne...
10                                                    NaN
11      b"|An ideal getaway offering nature's best in ...
12      b'|Situated in Goa, the party capital of India...
13      b'| Palolem Delights is located amidst lush gr...
14      b'|Nestled in the city of beaches, Goa, Sun N ...
15      b'| Yo Yo Goa The Apartment Hotel is located a...
16      b'|Nestled among the beaches of Goa, Sevas Hut...
17      b'|Cal

In [574]:
# def get_top_n_words(corpus, n=None):
#     vec = CountVectorizer(stop_words='english').fit(corpus)
#     bow = vec.transform(corpus)
#     sum_words = bow.sum(axis=0) 
#     words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
#     words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
#     return words_freq[:n]

# common_words = get_top_n_words(goa['hotel_overview'].dropna(), 20)

In [618]:
# Generalised function to get common words
def get_top_ngram(corpus: str, no_words=20, *ngrams_params):
    if ngrams_params:
        ngrams_args = ','.join(str(x) for x in ngrams_params)
        stop_words = ''
        n = ''
        for x in ngrams_args:
            if isinstance(x, str):
                stop_words += x
            elif isinstance(x, int):
                n += x
                n = int(n)
            else:
                return "please enter valid stopwords or ngrams range"  
        if n:
            if stop_words:
                vec = CountVectorizer(ngram_range=(n, n), stop_words=stop_words).fit(corpus)
            else:
                vec = CountVectorizer(ngram_range=(n, n)).fit(corpus)
        elif stop_words:
            vec = CountVectorizer(stop_words='english').fit(corpus)
            
    else:
        vec = CountVectorizer().fit(corpus)    
    
    bow = vec.transform(corpus)
    sum_words = bow.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:no_words]

In [619]:
top_words = pd.DataFrame(get_top_ngram(goa['hotel_overview'].dropna(), 20))
top_words.columns = ['words', 'count']

In [620]:
# function to plot bar chart of the top 20 words of hote description
def plot_barcharts(df:pd.DataFrame, title: str)->None:
    fig = go.Figure(data=[
    go.Bar(x= df['words'],y=df['count'],text= df['words'], textposition='auto')])
    fig.update_layout(
    title={
    'text':title,
    'y':0.9,
    'x':0.5},
    xaxis_title="top words",
    yaxis_title="word counts",
    )
    fig.show()
    

In [621]:
#plot the top 20 words from the hote description
plot_barcharts(top_words, "Top 20 words in hotel description before removing stop words")

In [622]:
top_words = pd.DataFrame(get_top_ngram(goa['hotel_overview'].dropna(), 20, 'english'))
top_words.columns = ['words', 'count']
plot_barcharts(top_words, "Top 20 words in hotel description after removing stop words")

In [624]:
top_words = pd.DataFrame(get_top_ngram(goa['hotel_overview'].dropna(), 20, 2))
top_words.columns = ['words', 'count']
plot_barcharts(top_words, "Top 20 bi-grams in hotel description before removing stop words")

In [600]:
top_words = pd.DataFrame(get_top_ngram(goa['hotel_overview'].dropna(), 20, 'english', 2))
top_words.columns = ['words', 'count']
plot_barcharts(top_words, "Top 20 bi-grams in hotel description after removing stop words")

In [None]:
top_words = pd.DataFrame(get_top_ngram(goa['hotel_overview'].dropna(), 20, 'english', 2))
top_words.columns = ['words', 'count']
plot_barcharts(top_words, "Top 20 bi-grams in hotel description after removing stop words")

In [321]:
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [None]:
common_words = get_top_n_bigram(goa['hotel_overview'].dropna(), 20)
df3 = pd.DataFrame(common_words, columns = ['desc' , 'count'])
df3.groupby('desc').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams in hotel description before removing stop words')


In [322]:
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_bigram(goa['hotel_overview'].dropna(), 20)
df4 = pd.DataFrame(common_words, columns = ['desc' , 'count'])
df4.groupby('desc').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams in hotel description After removing stop words')


In [243]:
df4.groupby('desc').sum()['count'].sort_values(ascending=False)

desc
living room         1262
walking distance     882
washer dryer         874
dupont circle        767
minute walk          764
size bed             586
columbia heights     566
capitol hill         533
adams morgan         505
queen bed            499
metro station        489
queen size           482
national mall        463
washington dc        458
street parking       458
eastern market       450
union station        447
white house          445
restaurants bars     400
blocks away          394
Name: count, dtype: int64

In [323]:
def get_top_n_trigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_trigram(goa['hotel_overview'].dropna(), 20)
df5 = pd.DataFrame(common_words, columns = ['desc' , 'count'])
df5.groupby('desc').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', linecolor='black', title='Top 20 trigrams in hotel description before removing stop words')


In [324]:
def get_top_n_trigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_trigram(goa['hotel_overview'].dropna(), 20)
df6 = pd.DataFrame(common_words, columns = ['desc' , 'count'])
df6.groupby('desc').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', linecolor='black', title='Top 20 trigrams in hotel description after removing stop words')


In [325]:
goa.columns

Index(['area', 'city', 'country', 'crawl_date', 'highlight_value',
       'hotel_overview', 'hotel_star_rating', 'image_urls', 'in_your_room',
       'is_value_plus', 'latitude', 'longitude', 'mmt_holidayiq_review_count',
       'mmt_location_rating', 'mmt_review_count', 'mmt_review_rating',
       'mmt_review_score', 'mmt_traveller_type_review_count',
       'mmt_tripadvisor_count', 'pageurl', 'property_address', 'property_id',
       'property_name', 'property_type', 'qts', 'query_time_stamp',
       'room_types', 'site_review_count', 'site_review_rating', 'sitename',
       'state', 'traveller_rating', 'uniq_id'],
      dtype='object')

In [326]:
goa['word_count'] = goa['hotel_overview'].apply(lambda x: len(str(x).split()))
desc_lengths = list(goa['word_count'])
print("Number of descriptions:",len(desc_lengths),
      "\nAverage word count", np.average(desc_lengths),
      "\nMinimum word count", min(desc_lengths),
      "\nMaximum word count", max(desc_lengths))

Number of descriptions: 1122 
Average word count 153.92869875222817 
Minimum word count 1 
Maximum word count 789


In [327]:
goa['word_count'].iplot(
    kind='hist',
    bins = 50,
    linecolor='black',
    xTitle='word count',
    yTitle='count',
    title='Word Count Distribution in Hotel Description')

In [342]:
goa['hotel_overview'] = goa['hotel_overview'].astype(str).apply(lambda x: x.replace("b'","")).apply(lambda x: x.replace('b"',""))

In [343]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = str(text.lower()) # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text
    
goa['desc_clean'] = goa['hotel_overview'].dropna().apply(clean_text)

In [344]:
goa

Unnamed: 0,area,city,country,crawl_date,highlight_value,hotel_overview,hotel_star_rating,image_urls,in_your_room,is_value_plus,...,query_time_stamp,room_types,site_review_count,site_review_rating,sitename,state,traveller_rating,uniq_id,word_count,desc_clean
282,South Goa,Goa,India,2016-08-28,Bar|Doctor on Call|Extra Mattress (On Request)...,"Just 2.7 km from Margao Railway Station, Hotel...",2,,Television|Air Conditioning|Attached Bathroom|...,yes,...,2016-08-28 16:13:39 +0000,Standard Ac,,,makemytrip,Goa,Location:4.3/5 | Hospitality:4.4/5 | Facilitie...,afa2b33d49d47b83026b94d54449192c,237,27 km margao railway station hotel om shiv off...
288,,Goa,India,2016-08-28,Airport Transfer|Bicycles on Hire|Coffee Shop|...,"Offering swimming pool, lawns/gardens and wate...",3,,Tea/Coffee Maker|Bathroom Toiletries|DVD Playe...,no,...,2016-08-28 16:13:39 +0000,Luxury Flat 1 Bhk,,,makemytrip,Goa,,4953f7d9c9a00e8246fcd58f0cf7d350,120,offering swimming pool lawns gardens water spo...
296,,Goa,India,2016-08-28,24 hour security|A/C Power Backup Available|Ai...,Diakon Holidays is situated in one of the most...,1,,Telephone|Bathroom Toiletries|Gas Stove|Washin...,no,...,2016-08-28 16:13:39 +0000,4 Bedroom Pool Villa,,,makemytrip,Goa,,2c57c7604510c1a2ab519a84a31cf030,175,diakon holidays situated one loved holiday des...
297,Benaulim,Goa,India,2016-08-28,,"Nestle in Goa, a land of never ceasing festivi...",1,,,no,...,2016-08-28 16:13:39 +0000,2 Bhk Apartment,,,makemytrip,Goa,,248ec7a4718e571cdaa54009140e7fdc,161,nestle goa land never ceasing festivity tripth...
299,,Goa,India,2016-08-28,Free WiFi Internet,,1,,,no,...,2016-08-28 16:13:39 +0000,`standard Room,,,makemytrip,goa,,36c3efe2a8223e1d8fb4797f39583fe1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19852,,Goa,India,2016-08-28,,,1,,,no,...,2016-08-28 15:29:09 +0000,3 Bhk Aparment,,,makemytrip,Goa,,e9cb801446979bbd0bb55a6ac2356ce0,1,
19854,Bardez,Goa,India,2016-08-28,Banquet facilities|Currency Exchange|Front des...,"Within walking distance from Baga Beach, 1.4 k...",3,,Telephone|Television|Attached Bathroom|Hot & C...,no,...,2016-08-28 15:29:09 +0000,Keys Room|Keys Executive Room|Keys Deluxe Room,,,makemytrip,Goa,Location:4.8/5 | Hospitality:4.1/5 | Facilitie...,3c041b97114e692b17f946d2108114e0,220,within walking distance baga beach 14 km calan...
19858,Betalbatim,Goa,India,2016-08-28,Doctor on Call|Front desk|Room Service,"Nestled in Goa, the city known for its natural...",1,,Attached Bathroom|Hot & Cold water,no,...,2016-08-28 15:29:09 +0000,1 Bedroom Luxury Villa,,,makemytrip,Goa,,c0e70946fe3aada615dea796815a596e,186,nestled goa city known natural picturesque bea...
19861,North Goa,Goa,India,2016-08-28,Bar|Coffee Shop|Dining Hall|Doctor on Call|Ext...,"1.9 km from Calangute Beach, 3.1 km from Apror...",4,,Tea/Coffee Maker|Ironing Board|Bathroom Toilet...,no,...,2016-08-28 15:29:09 +0000,Superior Deluxe Non-refundable|Superior Deluxe,,,makemytrip,,Location:4.6/5 | Hospitality:4.2/5 | Facilitie...,f573a77e82a1a93f648dae58ba82ea66,181,19 km calangute beach 31 km aprora market 61 k...


In [345]:
goa['desc_clean']

282      27 km margao railway station hotel om shiv off...
288      offering swimming pool lawns gardens water spo...
296      diakon holidays situated one loved holiday des...
297      nestle goa land never ceasing festivity tripth...
299                                                    nan
                               ...                        
19852                                                  nan
19854    within walking distance baga beach 14 km calan...
19858    nestled goa city known natural picturesque bea...
19861    19 km calangute beach 31 km aprora market 61 k...
19866    resorte marinha dourada situated goa offers ex...
Name: desc_clean, Length: 1122, dtype: object

In [393]:
# goa.set_index('property_name', inplace = True)
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(goa['desc_clean'].dropna())
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

indices = pd.Series(goa.index)

def recommendations(name, cosine_similarities = cosine_similarities):
    
    recommended_hotels = []
    
    # gettin the index of the hotel that matches the name
    idx = indices[indices == name].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar hotels except itself
    top_10_indexes = list(score_series.iloc[1:11].index)
    
    # populating the list with the names of the top 10 matching hotels
    for i in top_10_indexes:
        recommended_hotels.append(list(goa.index)[i])
        
    return recommended_hotels

In [390]:
tf.get_feature_names()

['00',
 '00 pm',
 '00 pm checkout',
 '000',
 '000 square',
 '000 square feet',
 '04',
 '04 14',
 '04 14 30',
 '04th',
 '04th july',
 '04th july 2015',
 '06th',
 '06th june',
 '06th june 2016',
 '10',
 '10 15',
 '10 15 minutes',
 '10 airconditioned',
 '10 airconditioned rooms',
 '10 calangute',
 '10 calangute superior',
 '10 km',
 '10 km baga',
 '10 km basilica',
 '10 km calangute',
 '10 km capital',
 '10 km carambolim',
 '10 km casino',
 '10 km chapora',
 '10 km directorate',
 '10 km dona',
 '10 km hotel',
 '10 km miramar',
 '10 km palolem',
 '10 km pandit',
 '10 km panjim',
 '10 km railway',
 '10 km salim',
 '10 km se',
 '10 km shoppers',
 '10 km thivim',
 '10 km vagator',
 '10 km viceroys',
 '10 min',
 '10 min anjuna',
 '10 min baga',
 '10 mins',
 '10 mins away',
 '10 minute',
 '10 minute drive',
 '10 minute rides',
 '10 minutes',
 '10 minutes anjuna',
 '10 minutes away',
 '10 minutes calangute',
 '10 minutes drive',
 '10 minutes margao',
 '10 minutes walk',
 '10 people',
 '10 people

In [391]:
goa.head()

Unnamed: 0_level_0,area,city,country,crawl_date,highlight_value,hotel_overview,hotel_star_rating,image_urls,in_your_room,is_value_plus,...,query_time_stamp,room_types,site_review_count,site_review_rating,sitename,state,traveller_rating,uniq_id,word_count,desc_clean
property_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Hotel Om Shiv,South Goa,Goa,India,2016-08-28,Bar|Doctor on Call|Extra Mattress (On Request)...,"Just 2.7 km from Margao Railway Station, Hotel...",2,,Television|Air Conditioning|Attached Bathroom|...,yes,...,2016-08-28 16:13:39 +0000,Standard Ac,,,makemytrip,Goa,Location:4.3/5 | Hospitality:4.4/5 | Facilitie...,afa2b33d49d47b83026b94d54449192c,237,27 km margao railway station hotel om shiv off...
Studio Service Apartments - Arpora,,Goa,India,2016-08-28,Airport Transfer|Bicycles on Hire|Coffee Shop|...,"Offering swimming pool, lawns/gardens and wate...",3,,Tea/Coffee Maker|Bathroom Toiletries|DVD Playe...,no,...,2016-08-28 16:13:39 +0000,Luxury Flat 1 Bhk,,,makemytrip,Goa,,4953f7d9c9a00e8246fcd58f0cf7d350,120,offering swimming pool lawns gardens water spo...
Diakon Holidays,,Goa,India,2016-08-28,24 hour security|A/C Power Backup Available|Ai...,Diakon Holidays is situated in one of the most...,1,,Telephone|Bathroom Toiletries|Gas Stove|Washin...,no,...,2016-08-28 16:13:39 +0000,4 Bedroom Pool Villa,,,makemytrip,Goa,,2c57c7604510c1a2ab519a84a31cf030,175,diakon holidays situated one loved holiday des...
TripThrill Lotus Hermitage Apartment,Benaulim,Goa,India,2016-08-28,,"Nestle in Goa, a land of never ceasing festivi...",1,,,no,...,2016-08-28 16:13:39 +0000,2 Bhk Apartment,,,makemytrip,Goa,,248ec7a4718e571cdaa54009140e7fdc,161,nestle goa land never ceasing festivity tripth...
JOHNS Guest House,,Goa,India,2016-08-28,Free WiFi Internet,,1,,,no,...,2016-08-28 16:13:39 +0000,`standard Room,,,makemytrip,goa,,36c3efe2a8223e1d8fb4797f39583fe1,1,


In [394]:
recommendations('Aldeia Santa Rita')

['Aldeia Santa Rita',
 'Aldeia Santa Rita',
 'Summer Ville Resort',
 'Valentines Retreat',
 'Blu Grass',
 'Annapurna Vishram Dhaam',
 'Annapurna Vishram Dhaam',
 'Annapurna Vishram Dhaam',
 'Azzure by Spree Hotels',
 'SeaShell Beach Suites']

## Item-Item Collaborative filtering

In [351]:
goa.head()

Unnamed: 0_level_0,area,city,country,crawl_date,highlight_value,hotel_overview,hotel_star_rating,image_urls,in_your_room,is_value_plus,...,query_time_stamp,room_types,site_review_count,site_review_rating,sitename,state,traveller_rating,uniq_id,word_count,desc_clean
property_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Hotel Om Shiv,South Goa,Goa,India,2016-08-28,Bar|Doctor on Call|Extra Mattress (On Request)...,"Just 2.7 km from Margao Railway Station, Hotel...",2,,Television|Air Conditioning|Attached Bathroom|...,yes,...,2016-08-28 16:13:39 +0000,Standard Ac,,,makemytrip,Goa,Location:4.3/5 | Hospitality:4.4/5 | Facilitie...,afa2b33d49d47b83026b94d54449192c,237,27 km margao railway station hotel om shiv off...
Studio Service Apartments - Arpora,,Goa,India,2016-08-28,Airport Transfer|Bicycles on Hire|Coffee Shop|...,"Offering swimming pool, lawns/gardens and wate...",3,,Tea/Coffee Maker|Bathroom Toiletries|DVD Playe...,no,...,2016-08-28 16:13:39 +0000,Luxury Flat 1 Bhk,,,makemytrip,Goa,,4953f7d9c9a00e8246fcd58f0cf7d350,120,offering swimming pool lawns gardens water spo...
Diakon Holidays,,Goa,India,2016-08-28,24 hour security|A/C Power Backup Available|Ai...,Diakon Holidays is situated in one of the most...,1,,Telephone|Bathroom Toiletries|Gas Stove|Washin...,no,...,2016-08-28 16:13:39 +0000,4 Bedroom Pool Villa,,,makemytrip,Goa,,2c57c7604510c1a2ab519a84a31cf030,175,diakon holidays situated one loved holiday des...
TripThrill Lotus Hermitage Apartment,Benaulim,Goa,India,2016-08-28,,"Nestle in Goa, a land of never ceasing festivi...",1,,,no,...,2016-08-28 16:13:39 +0000,2 Bhk Apartment,,,makemytrip,Goa,,248ec7a4718e571cdaa54009140e7fdc,161,nestle goa land never ceasing festivity tripth...
JOHNS Guest House,,Goa,India,2016-08-28,Free WiFi Internet,,1,,,no,...,2016-08-28 16:13:39 +0000,`standard Room,,,makemytrip,goa,,36c3efe2a8223e1d8fb4797f39583fe1,1,


In [352]:
goa[:1].to_dict()

{'area': {'Hotel Om Shiv': 'South Goa'},
 'city': {'Hotel Om Shiv': 'Goa'},
 'country': {'Hotel Om Shiv': 'India'},
 'crawl_date': {'Hotel Om Shiv': '2016-08-28'},
 'highlight_value': {'Hotel Om Shiv': 'Bar|Doctor on Call|Extra Mattress (On Request)|Indoor Games|Laundry Service|Table Tennis'},
 'hotel_overview': {'Hotel Om Shiv': "Just 2.7 km from Margao Railway Station, Hotel Om Shiv offers easy access to the Colva Beach and has a restaurant and internet cafe. Located in the heart of Margao, close to the famous Colva Beach, Hotel Om Shiv is a popular budget property in this region of Goa. With all the necessary amenities and well-furnished rooms, the hotel promises to provide a comfortable stay to guests. There are three categories of rooms available at the hotel, including Executive Room, Deluxe Room and Suite. All rooms are appointed with basic amenities, including air-conditioner, cable television and attached bathroom with hot and cold water supply. This six storey property is an 

In [395]:
goa = goa.reset_index()
goa.head()

Unnamed: 0,property_name,area,city,country,crawl_date,highlight_value,hotel_overview,hotel_star_rating,image_urls,in_your_room,...,query_time_stamp,room_types,site_review_count,site_review_rating,sitename,state,traveller_rating,uniq_id,word_count,desc_clean
0,Hotel Om Shiv,South Goa,Goa,India,2016-08-28,Bar|Doctor on Call|Extra Mattress (On Request)...,"Just 2.7 km from Margao Railway Station, Hotel...",2,,Television|Air Conditioning|Attached Bathroom|...,...,2016-08-28 16:13:39 +0000,Standard Ac,,,makemytrip,Goa,Location:4.3/5 | Hospitality:4.4/5 | Facilitie...,afa2b33d49d47b83026b94d54449192c,237,27 km margao railway station hotel om shiv off...
1,Studio Service Apartments - Arpora,,Goa,India,2016-08-28,Airport Transfer|Bicycles on Hire|Coffee Shop|...,"Offering swimming pool, lawns/gardens and wate...",3,,Tea/Coffee Maker|Bathroom Toiletries|DVD Playe...,...,2016-08-28 16:13:39 +0000,Luxury Flat 1 Bhk,,,makemytrip,Goa,,4953f7d9c9a00e8246fcd58f0cf7d350,120,offering swimming pool lawns gardens water spo...
2,Diakon Holidays,,Goa,India,2016-08-28,24 hour security|A/C Power Backup Available|Ai...,Diakon Holidays is situated in one of the most...,1,,Telephone|Bathroom Toiletries|Gas Stove|Washin...,...,2016-08-28 16:13:39 +0000,4 Bedroom Pool Villa,,,makemytrip,Goa,,2c57c7604510c1a2ab519a84a31cf030,175,diakon holidays situated one loved holiday des...
3,TripThrill Lotus Hermitage Apartment,Benaulim,Goa,India,2016-08-28,,"Nestle in Goa, a land of never ceasing festivi...",1,,,...,2016-08-28 16:13:39 +0000,2 Bhk Apartment,,,makemytrip,Goa,,248ec7a4718e571cdaa54009140e7fdc,161,nestle goa land never ceasing festivity tripth...
4,JOHNS Guest House,,Goa,India,2016-08-28,Free WiFi Internet,,1,,,...,2016-08-28 16:13:39 +0000,`standard Room,,,makemytrip,goa,,36c3efe2a8223e1d8fb4797f39583fe1,1,


In [396]:
goa.columns

Index(['property_name', 'area', 'city', 'country', 'crawl_date',
       'highlight_value', 'hotel_overview', 'hotel_star_rating', 'image_urls',
       'in_your_room', 'is_value_plus', 'latitude', 'longitude',
       'mmt_holidayiq_review_count', 'mmt_location_rating', 'mmt_review_count',
       'mmt_review_rating', 'mmt_review_score',
       'mmt_traveller_type_review_count', 'mmt_tripadvisor_count', 'pageurl',
       'property_address', 'property_id', 'property_type', 'qts',
       'query_time_stamp', 'room_types', 'site_review_count',
       'site_review_rating', 'sitename', 'state', 'traveller_rating',
       'uniq_id', 'word_count', 'desc_clean'],
      dtype='object')

In [401]:
for i in goa.loc[goa['property_name']== "Aldeia Santa Rita"].mmt_review_score:
    print(i)

3.7
3.7
3.7


In [412]:
goa.loc[goa['property_name']== "Zuariview Guest House"]['mmt_review_score']

765     5.0
1025    5.0
Name: mmt_review_score, dtype: float64

In [357]:
pivot_table = goa.pivot_table(index = ["uniq_id"],columns = ["property_name"],values = "mmt_review_score")
pivot_table.head(50)

property_name,10 Calangute,16 Degrees North,Acron Waterfront Resort and Spa,Aishwarya Guest Inn,Alagoa Resort,Aldeia Santa Rita,Alenea Resort,Alila Diwa,All Seasons Beach Classic Resort,Alor Grande Holiday Resort,...,Wildernest Nature Resort,Willo's Apartments,Willows Elite Tourist Resort,Wind Mist Apartments,Woodstock Village,Yo Yo Goa The Apartment Hotel,Zappia Cove Guest House,Zense Resort,Zuariview Guest House,juSTa Panjim
uniq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000d7055120ae9c0972f0edbf8249a9e,,,,,,,,,,,...,,,,,,,,,,
0024774e6669a52ce651c4a512108717,,,,,,,,,,,...,,,,,,,,,,
01cb934e9edfa37e1b6645d541f3b89c,,,,,,,,,,,...,,,,,,,,,,
02bb40a4a3bd8099c4d472b2bc72a865,,,,,,,,,,,...,,,,,,,,,,
03b3d623a4bb85a3fecaf8d9e6fafa48,,,,,,,,,,,...,,,,,,,,,,
045fb0e6e1c21b2257ded1a0703e6714,,,,,,,,,,,...,,,,,,,,,,
04c89caca12d28681486f7153069474d,,,,,,,,,,,...,,,,,,,,,,
053581d6ea3d7f8b54ea803a788fd3ad,,,,,,,,,,,...,,,,,,,,,,
05b94e009ad9d07dad82a3e312e328f5,,,,,,,,,,,...,,,,,,,,,,
05cad2b3a7942e041b6b814a10bc0dad,,,,,3.1,,,,,,...,,,,,,,,,,


In [364]:
pivot_table.head(60)

property_name,10 Calangute,16 Degrees North,Acron Waterfront Resort and Spa,Aishwarya Guest Inn,Alagoa Resort,Aldeia Santa Rita,Alenea Resort,Alila Diwa,All Seasons Beach Classic Resort,Alor Grande Holiday Resort,...,Wildernest Nature Resort,Willo's Apartments,Willows Elite Tourist Resort,Wind Mist Apartments,Woodstock Village,Yo Yo Goa The Apartment Hotel,Zappia Cove Guest House,Zense Resort,Zuariview Guest House,juSTa Panjim
uniq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000d7055120ae9c0972f0edbf8249a9e,,,,,,,,,,,...,,,,,,,,,,
0024774e6669a52ce651c4a512108717,,,,,,,,,,,...,,,,,,,,,,
01cb934e9edfa37e1b6645d541f3b89c,,,,,,,,,,,...,,,,,,,,,,
02bb40a4a3bd8099c4d472b2bc72a865,,,,,,,,,,,...,,,,,,,,,,
03b3d623a4bb85a3fecaf8d9e6fafa48,,,,,,,,,,,...,,,,,,,,,,
045fb0e6e1c21b2257ded1a0703e6714,,,,,,,,,,,...,,,,,,,,,,
04c89caca12d28681486f7153069474d,,,,,,,,,,,...,,,,,,,,,,
053581d6ea3d7f8b54ea803a788fd3ad,,,,,,,,,,,...,,,,,,,,,,
05b94e009ad9d07dad82a3e312e328f5,,,,,,,,,,,...,,,,,,,,,,
05cad2b3a7942e041b6b814a10bc0dad,,,,,3.1,,,,,,...,,,,,,,,,,


In [358]:
pivot_table.info()

<class 'pandas.core.frame.DataFrame'>
Index: 519 entries, 000d7055120ae9c0972f0edbf8249a9e to ffa404133e44134a2a94f535054515c9
Columns: 359 entries, 10 Calangute to juSTa Panjim
dtypes: float64(359)
memory usage: 1.4+ MB


In [386]:
pivot_table['Aldeia Santa Rita']

3.7    3
Name: Aldeia Santa Rita, dtype: int64

In [383]:
hotel_rated = pivot_table["Alagoa Resort"]
similarity_with_other_hotels = pivot_table.corrwith(hotel_rated)  # find correlation between "Bad Boys (1995)" and other movies
similarity_with_other_hotels = similarity_with_other_hotels.sort_values(ascending=False)
similarity_with_other_hotels.head()

property_name
Alagoa Resort                      1.0
10 Calangute                       NaN
16 Degrees North                   NaN
Acron Waterfront Resort and Spa    NaN
Aishwarya Guest Inn                NaN
dtype: float64

In [292]:
washington.review_scores_value.count()

2779