In [213]:
from __future__ import annotations
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import plotly.graph_objs as go
import plotly as py
from plotly.offline import iplot
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='space')
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [215]:
#Reading the csv file
df =  pd.read_csv(r"makemytrip_com-travel_sample.csv", encoding='utf-8')

### Checking for a suitable count of property listings.
#### The recommendation for hotels will be based on the description or the text contained for each hotel. Now, if a customer wants a hotel similar to a certain hotel's description he/she liked, we should recommend a hotel that is in a same city. So, here we subset hotel information pertaining to a specific city for instance, Goa.

In [396]:
df['city'].value_counts().head(10)

NewDelhiAndNCR       1163
Goa                  1122
Mumbai                543
Jaipur                534
Bangalore             512
Hyderabad             468
Srinagar              453
New Delhi And NCR     425
Pune                  392
Kolkata               369
Name: city, dtype: int64

In [217]:
goa = df.loc[df['city'] == 'Goa'].reset_index(drop=True)

In [218]:
goa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1122 entries, 0 to 1121
Data columns (total 33 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   area                             931 non-null    object 
 1   city                             1122 non-null   object 
 2   country                          1117 non-null   object 
 3   crawl_date                       662 non-null    object 
 4   highlight_value                  965 non-null    object 
 5   hotel_overview                   1006 non-null   object 
 6   hotel_star_rating                1122 non-null   object 
 7   image_urls                       472 non-null    object 
 8   in_your_room                     897 non-null    object 
 9   is_value_plus                    1122 non-null   object 
 10  latitude                         1121 non-null   float64
 11  longitude                        1121 non-null   float64
 12  mmt_holidayiq_review

#### checking how many unique scores were recorded by mmt

In [219]:
print(goa.mmt_review_score.value_counts())
print(f'Total number of unique review scores recorded by mmt is: {len(goa.mmt_review_score.value_counts())}')

3.9    38
4.2    37
4.0    37
4.4    36
4.1    33
4.3    33
5.0    25
3.8    22
0.0    22
3.6    21
3.5    21
4.5    21
3.7    20
4.6    17
3.4    17
3.3    17
3.2    16
4.8    14
3.1    12
4.9    11
4.7    10
2.7     8
2.4     7
3.0     6
1.9     5
2.3     5
2.5     3
2.8     3
2.9     2
Name: mmt_review_score, dtype: int64
Total number of unique review scores recorded by mmt is: 29


#### Analysing room types for each property type: We see all room types which have 'bhk' can mostly be apartments and we can use this knowledge to preprocess our data

In [220]:
goa.loc[goa['property_type'].isnull()][['property_type', 'room_types']]

Unnamed: 0,property_type,room_types
150,,2 Bhk
187,,Standard Ac Room
824,,2 Bhk
1112,,2 Bhk
1117,,3 Bhk Aparment


In [221]:
#filling nans with xyz
goa.property_type.fillna(value='xyz', inplace=True)

In [222]:
#ensuring there are no null values in property type column
goa.property_type.isnull().sum()

0

In [223]:
"""
prepared a mask to subset only the part in the df where poperty name contains 'apartment' and 
room types contains 'bhk' and property type contains 'xyz'
"""
mask = goa[goa['property_name'].str.lower().str.contains('apartment', na= False) 
    & goa['room_types'].str.lower().str.contains('bhk', na= False)
    & goa['property_type'].str.lower().str.contains('xyz', na= False)]

In [224]:
goa.loc[mask.index, 'property_type'] = goa.loc[mask.index, 'property_type'].apply(lambda x: "Apartment")

In [225]:
goa['hotel_star_rating'].value_counts()

1 star       505
3 star       206
1            138
2 star        99
4 star        53
2             48
3             37
5 star        19
4             10
5              6
Five on 5      1
Name: hotel_star_rating, dtype: int64

#### The hotel star rating has records like 5 as well as 5 stars. So we apply a lambda function to extract only the numeric part. Interestingly we find that the number of hotels in 1 star category is way more than the other categories. Also, there are more 3 star hotels than 2, 4 or 5 stars.

In [226]:
#preprocessing star rating
goa['hotel_star_rating'] = goa['hotel_star_rating'].apply(lambda s: s.split()[0] if len(s)>1 else s)

In [227]:
goa['hotel_star_rating'].value_counts()

1       643
3       243
2       147
4        63
5        25
Five      1
Name: hotel_star_rating, dtype: int64

In [228]:
#hotel star distribution and relationship of score with it
goa['hotel_star_rating'].value_counts().iplot(kind='barh', yTitle='Count', linecolor='black', title='Hotel star rating distribution')

In [229]:
#replacing invalid entries with nan
goa['hotel_star_rating'] = goa['hotel_star_rating'].apply(pd.to_numeric, errors='coerce')

In [230]:
goa['hotel_star_rating'].value_counts()

1.0    643
3.0    243
2.0    147
4.0     63
5.0     25
Name: hotel_star_rating, dtype: int64

#### Checking the number of unique travellers associated with each accomodation, .

In [231]:
def get_customer_stats_per_hotel(df:pd.DataFrame)->dict:
    max_customer = df.groupby(['property_name'])['uniq_id'].agg('count').max()
    min_customer = df.groupby(['property_name'])['uniq_id'].agg('count').min()
    mean_customer = (df.groupby(['property_name'])['uniq_id'].agg('count').mean()).round()
    return {"The maximum no of customers each accomodation is ": max_customer,
            "The minimum no of customers each accomodation is ": min_customer,
            "The average no of customers each accomodation is ": mean_customer}

In [232]:
print(get_customer_stats_per_hotel(goa))

{'The maximum no of customers each accomodation is ': 5, 'The minimum no of customers each accomodation is ': 1, 'The average no of customers each accomodation is ': 1.0}


#### The max number of customers who visited a hotel was 5 and min was 1. Hotel Om shiv and Roasdhouse hostels were among the popular picks. While majority of the hotels was visited by only one customer.

In [233]:
goa['property_name'].value_counts().iplot(kind='barh', yTitle='Count', linecolor='black', title='No. of travlers visited each property')

#### Customer preference of property types: The customers usually preferred hotels over other types of accomodation. This can be due to customer age and other details as well. Usually, hostels are preferred by single and solo travellers with lower budgets while families stick to comfortable hotels and average to higher rentals. This is worth investigating but the data does not provide more details on customers.

In [234]:
goa['property_type'].value_counts().iplot(kind='barh', yTitle='Count', linecolor='black', title='No. of travlers visited each hotel')

In [235]:
goa[['property_type','room_types']].head()

Unnamed: 0,property_type,room_types
0,Hotel,Standard Ac
1,Hotel,Luxury Flat 1 Bhk
2,Hotel,4 Bedroom Pool Villa
3,Apartment,2 Bhk Apartment
4,Hotel,`standard Room


In [236]:
goa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1122 entries, 0 to 1121
Data columns (total 33 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   area                             931 non-null    object 
 1   city                             1122 non-null   object 
 2   country                          1117 non-null   object 
 3   crawl_date                       662 non-null    object 
 4   highlight_value                  965 non-null    object 
 5   hotel_overview                   1006 non-null   object 
 6   hotel_star_rating                1121 non-null   float64
 7   image_urls                       472 non-null    object 
 8   in_your_room                     897 non-null    object 
 9   is_value_plus                    1122 non-null   object 
 10  latitude                         1121 non-null   float64
 11  longitude                        1121 non-null   float64
 12  mmt_holidayiq_review

In [238]:
goa.loc[goa['property_type'] == 'xyz']

Unnamed: 0,area,city,country,crawl_date,highlight_value,hotel_overview,hotel_star_rating,image_urls,in_your_room,is_value_plus,...,property_type,qts,query_time_stamp,room_types,site_review_count,site_review_rating,sitename,state,traveller_rating,uniq_id
187,Anjuna Bardez,Goa,India,2016-09-04,Airport Transfer|Doctor on Call|Fitness Centre|Laundry Service|Outdoor Activities|Parking Facility|Railway Station Transfer|Restaurant|Room Service|Sun Beds (pool)|Swimming Pool|Taxi Services|WiFi at a charge,"Whispering Woods, Goa, ensures that its guests enjoy their stay to the fullest. The hotel is designed to render a relaxing refuge to the guests. It offers uncompromising comfort and convenience with necessary amenities and neatly arranged cosy rooms. Guests can visit places like Nyex Beach Club(3 km), Anjuna Beach(3 km), Baga Beach(5 km), Calangute Beach(6 km), St Alex Church(6 km) and many more.ÃƒÆ’Ã¢â‚¬Å¡Ãƒâ€šÃ‚Â ÃƒÆ’Ã¢â‚¬Å¡Ãƒâ€šÃ‚Â The rooms are designed to offer the perfect blend of solace and convenience. Elegantly appointed and equipped with carefully selected amenities, the rooms are perfect haven to retire to after a long day. They give the most relaxed feeling and comfort of home to the patrons. The hotel is located at a distance of 19 km from Thivim Railway Station and 43 km from Goa International Airport.ÃƒÆ’Ã¢â‚¬Å¡Ãƒâ€šÃ‚Â ÃƒÆ’Ã¢â‚¬Å¡Ãƒâ€šÃ‚Â Have a great stay at Whispering Woods, Goa!",2.0,,Room Service|Hot & Cold Running Water|Air conditioning|Refrigerator|TV|Attached Bathroom,no,...,xyz,2016-09-04 10:11:02 +0000,2016-09-04 10:11:02 +0000,Standard Ac Room,,,makemytrip,Goa,Location:3.5/5 | Hospitality:3.5/5 | Facilities:3.7/5 | Cleanliness:3.2/5 | Value for Money:3.5/5 | Food:1.3/5,4f461bf1efc74dfca6e62fb81c477aac


#### Let's understand the distribution of star ratings by each type of accomodations. Quite obviously, hotels which we know see most of the bookings have all sorts if star ratings with highest numbers of 5 and 3 stars. We can ignore xyz category as we had treated all the null types to be xyz in our preceeding steps and that's just one record. Interestingly, there's one 5 star villa and one 3 star villa, while there's one cottage with a 3 star rating as well.

In [239]:
goa.groupby(['property_type'])['hotel_star_rating'].value_counts().iplot(kind='barh', yTitle='star ratings', linecolor='black', title='Hotel star rating distribution for each type of property')

#### Let's understand if there is any relationship between hotel's star rating and mmt review score

#### Although the data is highly imbalanced when it comes to different ratings but we can notice, higher rated hotels tend to have a higher mmt score too. The properties rated 4 and 5 have more records with higher mmt rating score, while the proerties rated 1, 2 and 3 have more or less equal number of high and low mmt rating scores.

In [240]:
#colour coding the review scores
colrs = [0 if 0<=s<=1 else 1 if 1<s<=2 else 2 if 2<s<=3 else 3\
    if 3<s<=4 else 4 if 4<s<=5 else -1\
    for s in goa['mmt_review_score'].values]
fig = go.Figure(data=go.Scatter(x=goa.hotel_star_rating,
                                y=goa.mmt_review_score, 
                                mode='markers',
                                marker=dict(
                                size=10,
                                color=colrs),
                                name="mmt review scores"))
fig.update_layout(title={
    'text':"Property's star rating vs MMT review score",
    'y':0.9,
    'x':0.5},
    xaxis_title="Property star ratings",
    yaxis_title="Mmt review scores",showlegend=True)
fig.show()

## Content based recommender

In [241]:
# There are lot of ascii characters which if not ignored will create wrong word frequency
goa['hotel_overview'] = goa['hotel_overview'].apply(lambda x: x.encode("ascii", "ignore") if x is not np.NaN else x)

In [242]:
# function to return description on giving the id of the concerned property
def get_details(df:pd.DataFrame, property_id:str)->dict:
    return {property_id: [{'name': df.loc[df['property_id']== property_id][['property_name', 'hotel_overview']].values[0][0]},
           {'name': df.loc[df['property_id']== property_id][['property_name', 'hotel_overview']].values[0][1]}]}

In [243]:
# Generalised function to get common words
def get_top_ngram(corpus: str, no_words=20, *ngrams_params):
    if ngrams_params:
        ngrams_args = [x for x in ngrams_params]
        stop_words = ''
        n = ''
        for x in ngrams_args:
            if isinstance(x, str):
                stop_words += x
            elif isinstance(x, int):
                n += str(x)
                n = int(n)
            else:
                return "please enter valid stopwords or ngrams range"  
        if n:
            if stop_words:
                vec = CountVectorizer(ngram_range=(n, n), stop_words=stop_words).fit(corpus)
            else:
                vec = CountVectorizer(ngram_range=(n, n)).fit(corpus)
        elif stop_words:
            vec = CountVectorizer(stop_words='english').fit(corpus)
            
    else:
        vec = CountVectorizer().fit(corpus)    
    
    bow = vec.transform(corpus)
    sum_words = bow.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:no_words]

In [244]:
top_words = pd.DataFrame(get_top_ngram(goa['hotel_overview'].dropna(), 20))
top_words.columns = ['words', 'count']

In [245]:
# function to plot bar chart of the top 20 words of hote description
def plot_barcharts(df:pd.DataFrame, title: str)->None:
    fig = go.Figure(data=[
    go.Bar(x= df['words'],y=df['count'],text= df['words'], textposition='auto')])
    fig.update_layout(
    title={
    'text':title,
    'y':0.9,
    'x':0.5},
    xaxis_title="top words",
    yaxis_title="word counts",
    )
    fig.show()
    

In [246]:
#plot the top 20 words from the hote description
plot_barcharts(top_words, "Top 20 words in hotel description before removing stop words")

In [247]:
top_words = pd.DataFrame(get_top_ngram(goa['hotel_overview'].dropna(), 20, 'english'))
top_words.columns = ['words', 'count']
plot_barcharts(top_words, "Top 20 words in hotel description after removing stop words")

In [248]:
top_words = pd.DataFrame(get_top_ngram(goa['hotel_overview'].dropna(), 20, 2))
top_words.columns = ['words', 'count']
plot_barcharts(top_words, "Top 20 bi-grams in hotel description before removing stop words")

In [249]:
top_words = pd.DataFrame(get_top_ngram(goa['hotel_overview'].dropna(), 20, 'english', 2))
top_words.columns = ['words', 'count']
plot_barcharts(top_words, "Top 20 bi-grams in hotel description after removing stop words")

In [250]:
top_words = pd.DataFrame(get_top_ngram(goa['hotel_overview'].dropna(), 20, 3))
top_words.columns = ['words', 'count']
plot_barcharts(top_words, "Top 20 tri-grams in hotel description before removing stop words")

In [251]:
top_words = pd.DataFrame(get_top_ngram(goa['hotel_overview'].dropna(), 20, 'english', 3))
top_words.columns = ['words', 'count']
plot_barcharts(top_words, "Top 20 tri-grams in hotel description after removing stop words")

In [252]:
goa['word_count'] = goa['hotel_overview'].apply(lambda x: len(str(x).split()))
desc_lengths = list(goa['word_count'])
print("Number of descriptions:",len(desc_lengths),
      "\nAverage word count", np.average(desc_lengths),
      "\nMinimum word count", min(desc_lengths),
      "\nMaximum word count", max(desc_lengths))

Number of descriptions: 1122 
Average word count 153.92869875222817 
Minimum word count 1 
Maximum word count 789


In [253]:
goa['word_count'].iplot(
    kind='hist',
    bins = 50,
    linecolor='black',
    xTitle='word count',
    yTitle='count',
    title='Word Count Distribution in Hotel Description')

In [254]:
goa['hotel_overview'] = goa['hotel_overview'].astype(str).apply(lambda x: x.replace("b'","")).apply(lambda x: x.replace('b"',""))

In [255]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = str(text.lower()) # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text
    
goa['desc_clean'] = goa['hotel_overview'].dropna().apply(clean_text)

In [256]:
"""
making hotel name as index so that the user can enter just the hotel name while,
recommendation logic remains on hotel description
"""
goa.set_index('property_name', inplace = True)

In [387]:

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(goa['desc_clean'].drop_duplicates().dropna())
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

#noting the indices of unique properties with valid inputs
indices = pd.Series(goa['desc_clean'].drop_duplicates().dropna().index)

def recommendations(name, cosine_similarities = cosine_similarities):
    
    recommended_hotels = []
    # gettin the index of the hotel that matches the name
    idx = indices[indices == name].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar hotels except itself
    top_10_indexes = list(score_series.iloc[0:10].index)
    
    # populating the list with the names of the top 10 matching hotels
    for i in top_10_indexes:
        recommended_hotels.append(goa['desc_clean'].drop_duplicates().dropna().index[i])
        
    return recommended_hotels

In [388]:
# Let's check what are the n-grams features that were generated
tf.get_feature_names()

['00',
 '00 pm',
 '00 pm checkout',
 '000',
 '000 square',
 '000 square feet',
 '04',
 '04 14',
 '04 14 30',
 '04th',
 '04th july',
 '04th july 2015',
 '06th',
 '06th june',
 '06th june 2016',
 '10',
 '10 15',
 '10 15 minutes',
 '10 airconditioned',
 '10 airconditioned rooms',
 '10 calangute',
 '10 calangute superior',
 '10 km',
 '10 km baga',
 '10 km basilica',
 '10 km calangute',
 '10 km capital',
 '10 km carambolim',
 '10 km casino',
 '10 km chapora',
 '10 km directorate',
 '10 km dona',
 '10 km hotel',
 '10 km miramar',
 '10 km palolem',
 '10 km pandit',
 '10 km panjim',
 '10 km railway',
 '10 km salim',
 '10 km se',
 '10 km shoppers',
 '10 km thivim',
 '10 km vagator',
 '10 km viceroys',
 '10 min',
 '10 min anjuna',
 '10 min baga',
 '10 mins',
 '10 mins away',
 '10 minute',
 '10 minute drive',
 '10 minute rides',
 '10 minutes',
 '10 minutes anjuna',
 '10 minutes away',
 '10 minutes calangute',
 '10 minutes drive',
 '10 minutes margao',
 '10 minutes walk',
 '10 people',
 '10 people

#### The recommendations of similar properties based on property description is given out on feeding the property name

In [389]:
recommendations('Keys Ronil Resort, Goa')

['Keys Ronil Resort, Goa',
 'Keys Ronil Resort, Goa',
 'Zinhos Beach Resort',
 'Beira Mar Alfran',
 'Graciano Cottage',
 'Palmarinha Resort and Suites',
 'Deltin Suites',
 'Deltin Suites',
 'Calangute Grande',
 'Valentines Retreat']

#### Lets try to see if we find some common words from the description of the target 'Keys Ronil Resort, Goa' and a recommendation- 'Palmarinha Resort and Suites'

In [391]:
top_words = pd.DataFrame(get_top_ngram(goa.loc['Beira Mar Alfran']['desc_clean'].dropna(), 20, 'english'))
top_words.columns = ['words', 'count']
plot_barcharts(top_words, "Top 20 words in hotel description after removing stop words from- Beira Mar Alfran")

In [392]:
top_words = pd.DataFrame(get_top_ngram(goa.loc['Keys Ronil Resort, Goa']['desc_clean'].dropna(), 20, 'english'))
top_words.columns = ['words', 'count']
plot_barcharts(top_words, "Top 20 words in property description after removing stop words from -Keys Ronil Resort, Goa")

#### There are some common words like 'beach', 'rooms', 'goa', etc. There are also other combinations of bigrams and trigrams which are common acroos both the properties.



In [394]:
def plot_similarities(df:pd.DataFrame, target: str)->None:
    idx = indices[indices == target].index[0]
    fig = go.Figure(data=go.Scatter(x=goa['desc_clean'].
                                    drop_duplicates().
                                    dropna().index,
                                    y=cosine_similarities[idx], 
                                    mode='markers',
                                    marker=dict(
                                    size=10,
                                    color=colrs),
                                    name="respective cosine similarities"))
    fig.update_layout(title={
        'text':"Cosine similarities of all properties with "+target,
        'y':0.95,
        'x':0.5},
        xaxis_title="Property Names",
        yaxis_title="cosine similarities",showlegend=True, height=1000)
    fig.show()

In [395]:
plot_similarities(goa, 'Keys Ronil Resort, Goa')

#### We can see from the plot,'Zinhos Beach Resort' is the most similar to the description of 'Keys Ronil Resort, Goa'. Similarly, our other recommendations too stand out in the plot. The highest point is the similarity of the resort with itself, we can just ignore this and focus on the other recommendations. With better data, we might be able to see more similarity. 

### Evaluation

#### In the academic world, we can evaluate our recommender systems with metrics like prediction errors(RMSE/MAE) and Recall/Catalog coverage. In RMSE based approaches, it is assumed that the user has explicitly rated a product. Now the difference between the ratings/scores and whatever score our recommender system has computed for each data point is calculated and squared, then the mean is found out and finally the square root is found out. The resultant is the RMSE. MAE on the other hand is simply the mean of the absolute differences between the observed and predicted values. Lesser the RMSE/MAE, better the model. However, in real life as in this example, the ratings are really sparse hence, we cannot rely solely on these metrics when it comes to putting the models to production.
#### In real life, we can evaluate our models by dividng our users into A/B test groups with the help of hashed userID cookies or tools likje VWO, Optimizely, etc. Then we can track the click through rate(CTR) and the conversion rate(CR) of the recommendations. in order to help the business, we should also look at the return on investment(ROI).


### Conclusion
#### For our data, we are performing a content based recommendation using NLP. Higher the  similarity of descriptions of a property with one another, better the recommendation. We could have worked on some other types of recommender systems like item-item based collaborative filtering, user-user based collaborative filtering, etc. But our data is really sparse in terms of user ratings, usually this kind of data is difficult to find and is not public, also scraping is disabled for most of the aggregator websites, so we stick to content based recommendations using NLP. We have seen from the data, that we are giving the names of the properties which have hoghest cosine similarity values(which are not that high in our case as it depends on the description). Better descriptions can certainly bring better meaning and help in computing the similarities.