In [250]:
import pandas as pd
import numpy as np

# Loading in Data

In [251]:
# Read the full review dump, keep a reproducible 20% random sample of its rows,
# and persist that sample (without the index) for later reuse.
hotel_review = pd.read_csv('Hotel_Reviews.csv')
hotel_review_sample = hotel_review.sample(
    frac=0.2,
    random_state=42,  # fixed seed so the same rows are drawn on every run
)
hotel_review_sample.to_csv("hotel_review_sample.csv", index=False)

In [252]:
# Load the price listing; the first CSV column is used as the row index.
hotel_prices = pd.read_csv(
    'booking_bcn1.csv',
    index_col=0,
)

Checking to see how many hotel names match

In [253]:
# Build the join key for the review data: hotel name with surrounding
# whitespace trimmed and lowercased.
normalized_review_names = hotel_review_sample['Hotel_Name'].str.strip().str.lower()
hotel_review_sample['name_processed'] = normalized_review_names

# The raw name column is no longer needed once the key exists.
drop_cols = ['Hotel_Name']

hotel_review_sample = hotel_review_sample.drop(columns=drop_cols)

In [254]:
# Build the join key for the price data: lowercase the hotel name, split it on
# whitespace and re-join the tokens with single spaces, which removes leading,
# trailing and repeated whitespace.
# The tokens are joined here (instead of leaving a column of Python lists) so
# that `name_processed` is a plain string column as soon as this cell has run;
# any later list-joining step simply finds nothing left to join.
hotel_prices['name_processed'] = (
    hotel_prices['Hotels']
    .str.lower()
    .str.split()
    .str.join(' ')
)

# The raw name column is no longer needed once the key exists.
drop_cols = [
    'Hotels'
]

hotel_prices = hotel_prices.drop(columns=drop_cols)

In [255]:
# Turn any tokenised names (lists of words) into single space-separated
# strings; values that are not lists pass through unchanged.
hotel_prices['name_processed'] = hotel_prices['name_processed'].apply(
    lambda tokens: ' '.join(tokens) if isinstance(tokens, list) else tokens
)

# Hotel names that occur in both the price data and the review sample.
price_names = set(hotel_prices['name_processed'])
review_names = set(hotel_review_sample['name_processed'])
matching_hotels = price_names & review_names

# Count the number of matches
num_matches = len(matching_hotels)

print(f"Number of matching hotel names: {num_matches}")


Number of matching hotel names: 96


Getting rid of columns we don't need

In [256]:
hotel_review_sample.columns

Index(['Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date',
       'Average_Score', 'Reviewer_Nationality', 'Negative_Review',
       'Review_Total_Negative_Word_Counts', 'Total_Number_of_Reviews',
       'Positive_Review', 'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score', 'Tags',
       'days_since_review', 'lat', 'lng', 'name_processed'],
      dtype='object')

In [257]:
# Review columns that are not used in the rest of the analysis.
drop_cols = [
    'Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date',
    'Average_Score', 'Reviewer_Nationality', 'Total_Number_of_Reviews',
    'Tags', 'days_since_review', 'lat', 'lng',
]

hotel_review_sample = hotel_review_sample.drop(drop_cols, axis='columns')

In [258]:
# Attach price information to every review whose normalised hotel name also
# appears in the price data; reviews without a matching name are dropped
# (inner join).
merged_df = pd.merge(
    hotel_review_sample,
    hotel_prices,
    how="inner",
    on="name_processed",
)


In [259]:
# merged_df
merged_df.columns

Index(['Negative_Review', 'Review_Total_Negative_Word_Counts',
       'Positive_Review', 'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score',
       'name_processed', 'Prices', 'Descriptions', 'Full_Descriptions'],
      dtype='object')

In [260]:
print(merged_df.isna().sum())  # See missing values


Negative_Review                               0
Review_Total_Negative_Word_Counts             0
Positive_Review                               0
Review_Total_Positive_Word_Counts             0
Total_Number_of_Reviews_Reviewer_Has_Given    0
Reviewer_Score                                0
name_processed                                0
Prices                                        0
Descriptions                                  0
Full_Descriptions                             0
dtype: int64


In [270]:
# Reorder the columns so the hotel name key comes first.
column_order = [
    'name_processed',
    'Negative_Review',
    'Review_Total_Negative_Word_Counts',
    'Positive_Review',
    'Review_Total_Positive_Word_Counts',
    'Total_Number_of_Reviews_Reviewer_Has_Given',
    'Reviewer_Score',
    'Prices',
    'Descriptions',
    'Full_Descriptions',
]
merged_df = merged_df[column_order]

In [271]:
merged_df

Unnamed: 0,name_processed,Negative_Review,Review_Total_Negative_Word_Counts,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Prices,Descriptions,Full_Descriptions
0,room mate anna,Nothing to dislike,4,Location Culture bars and restaurants Transpo...,13,2,9.6,1418.0,1 double bed,Featuring a seasonal rooftop plunge pool with ...
1,hotel barcelona center,No Negative,0,The hotel was in a great spot near to everyth...,29,7,8.8,1584.0,Multiple bed types,Barcelona Center is 400 metres from Barcelona’...
2,hotel barcelona catedral,Due to the location and it being a very touri...,31,We had such a great experience here The staff...,138,1,10.0,2259.0,1 extra-large double bed,There is a gym and a rooftop chill-out terrace...
3,petit palace boqueria garden,No Negative,0,Free coffee if wanted during breakfast withou...,41,1,7.9,2068.0,Beds: 1 double or 2 singles,The Petit Palace Boqueria Garden is set in a h...
4,hotel ronda lesseps,Semi transparent wall of bathroom It cause un...,14,Easy to reach from and to airport by metro wi...,41,4,7.9,1352.0,1 double bed,The Hotel Ronda Lesseps is a quiet and familia...
...,...,...,...,...,...,...,...,...,...,...
6003,capri by fraser barcelona,There was a faint sewage smell in the bathroo...,24,Lovely studio and apartment hotel with large ...,43,13,9.6,1347.0,Beds: 1 double or 2 singles,This motorcycle-friendly hotel is 3 Metro stop...
6004,petit palace boqueria garden,The streets were very noisy from 3 6am No cof...,18,Great location,4,10,7.9,2068.0,Beds: 1 double or 2 singles,The Petit Palace Boqueria Garden is set in a h...
6005,hotel villa emilia,The room was ready early when we arrived afte...,21,Very nice hotel and extremely helpful and fri...,10,1,10.0,1607.0,Beds: 1 double or 2 singles,This stylish design hotel is located 150 metre...
6006,olivia balmes hotel,No Negative,0,Staff were lovely very useful The shower was ...,29,15,9.2,1904.0,Beds: 1 double or 2 singles,"Offering an outdoor swimming pool, Olivia Balm..."


In [262]:
# Convert price strings to floats by removing the euro sign and the thousands
# separator, trimming whitespace, and casting.
# The conversion is skipped when `Prices` is already numeric: calling the
# `.str` accessor on a numeric column raises AttributeError, so without this
# guard the cell could only ever be executed once per kernel session.
if not pd.api.types.is_numeric_dtype(merged_df['Prices']):
    merged_df['Prices'] = (
        merged_df['Prices']
        .str.replace('€', '', regex=False)  # literal characters, no regex needed
        .str.replace(',', '', regex=False)
        .str.strip()
        .astype(float)
    )

In [264]:
merged_df.columns

Index(['Negative_Review', 'Review_Total_Negative_Word_Counts',
       'Positive_Review', 'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score',
       'name_processed', 'Prices', 'Descriptions', 'Full_Descriptions'],
      dtype='object')

In [272]:
merged_df

Unnamed: 0,name_processed,Negative_Review,Review_Total_Negative_Word_Counts,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Prices,Descriptions,Full_Descriptions
0,room mate anna,Nothing to dislike,4,Location Culture bars and restaurants Transpo...,13,2,9.6,1418.0,1 double bed,Featuring a seasonal rooftop plunge pool with ...
1,hotel barcelona center,No Negative,0,The hotel was in a great spot near to everyth...,29,7,8.8,1584.0,Multiple bed types,Barcelona Center is 400 metres from Barcelona’...
2,hotel barcelona catedral,Due to the location and it being a very touri...,31,We had such a great experience here The staff...,138,1,10.0,2259.0,1 extra-large double bed,There is a gym and a rooftop chill-out terrace...
3,petit palace boqueria garden,No Negative,0,Free coffee if wanted during breakfast withou...,41,1,7.9,2068.0,Beds: 1 double or 2 singles,The Petit Palace Boqueria Garden is set in a h...
4,hotel ronda lesseps,Semi transparent wall of bathroom It cause un...,14,Easy to reach from and to airport by metro wi...,41,4,7.9,1352.0,1 double bed,The Hotel Ronda Lesseps is a quiet and familia...
...,...,...,...,...,...,...,...,...,...,...
6003,capri by fraser barcelona,There was a faint sewage smell in the bathroo...,24,Lovely studio and apartment hotel with large ...,43,13,9.6,1347.0,Beds: 1 double or 2 singles,This motorcycle-friendly hotel is 3 Metro stop...
6004,petit palace boqueria garden,The streets were very noisy from 3 6am No cof...,18,Great location,4,10,7.9,2068.0,Beds: 1 double or 2 singles,The Petit Palace Boqueria Garden is set in a h...
6005,hotel villa emilia,The room was ready early when we arrived afte...,21,Very nice hotel and extremely helpful and fri...,10,1,10.0,1607.0,Beds: 1 double or 2 singles,This stylish design hotel is located 150 metre...
6006,olivia balmes hotel,No Negative,0,Staff were lovely very useful The shower was ...,29,15,9.2,1904.0,Beds: 1 double or 2 singles,"Offering an outdoor swimming pool, Olivia Balm..."


# P(Hotel Recommendation | Sentiment, Reviewer Score, Price)
Converting observations into probabilities and generating conditional probability tables (CPTs)

In [279]:
# Labeling sentiment based on positive/negative word count
def classify_sentiment(row):
    """Label a review by comparing its positive and negative word counts.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            'Review_Total_Positive_Word_Counts' and
            'Review_Total_Negative_Word_Counts'.

    Returns:
        'Positive' when the positive count is larger, 'Negative' when the
        negative count is larger, and 'Neutral' otherwise.
    """
    positive_words = row['Review_Total_Positive_Word_Counts']
    negative_words = row['Review_Total_Negative_Word_Counts']

    if positive_words > negative_words:
        return 'Positive'
    if positive_words < negative_words:
        return 'Negative'
    # Neither count is strictly larger than the other.
    return 'Neutral'

# Label every review row and store the result in a new Sentiment column
sentiment_labels = merged_df.apply(classify_sentiment, axis='columns')
merged_df['Sentiment'] = sentiment_labels


In [280]:
np.mean(merged_df['Reviewer_Score'])

np.float64(8.502496671105193)

In [281]:
# Define recommendation criteria
def recommend_hotel(row):
    """Flag a review as a recommendation of its hotel.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            'Reviewer_Score' and 'Sentiment'.

    Returns:
        1 (recommend) when the reviewer score is at least 8 and the sentiment
        is 'Positive'; 0 (do not recommend) otherwise.
    """
    # `and` short-circuits, so Sentiment is only read for scores of 8 or more.
    is_recommended = row['Reviewer_Score'] >= 8 and row['Sentiment'] == 'Positive'
    return 1 if is_recommended else 0

# Evaluate the recommendation rule on every review row
recommendation_flags = merged_df.apply(recommend_hotel, axis='columns')
merged_df['Hotel_Recommendation'] = recommendation_flags


In [283]:
merged_df['Hotel_Recommendation'].value_counts()


Hotel_Recommendation
0    3182
1    2826
Name: count, dtype: int64

In [301]:
merged_df

Unnamed: 0,name_processed,Negative_Review,Review_Total_Negative_Word_Counts,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Prices,Descriptions,Full_Descriptions,Sentiment,Hotel_Recommendation
0,room mate anna,Nothing to dislike,4,Location Culture bars and restaurants Transpo...,13,2,9.6,1418.0,1 double bed,Featuring a seasonal rooftop plunge pool with ...,Positive,1
1,hotel barcelona center,No Negative,0,The hotel was in a great spot near to everyth...,29,7,8.8,1584.0,Multiple bed types,Barcelona Center is 400 metres from Barcelona’...,Positive,1
2,hotel barcelona catedral,Due to the location and it being a very touri...,31,We had such a great experience here The staff...,138,1,10.0,2259.0,1 extra-large double bed,There is a gym and a rooftop chill-out terrace...,Positive,1
3,petit palace boqueria garden,No Negative,0,Free coffee if wanted during breakfast withou...,41,1,7.9,2068.0,Beds: 1 double or 2 singles,The Petit Palace Boqueria Garden is set in a h...,Positive,0
4,hotel ronda lesseps,Semi transparent wall of bathroom It cause un...,14,Easy to reach from and to airport by metro wi...,41,4,7.9,1352.0,1 double bed,The Hotel Ronda Lesseps is a quiet and familia...,Positive,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6003,capri by fraser barcelona,There was a faint sewage smell in the bathroo...,24,Lovely studio and apartment hotel with large ...,43,13,9.6,1347.0,Beds: 1 double or 2 singles,This motorcycle-friendly hotel is 3 Metro stop...,Positive,1
6004,petit palace boqueria garden,The streets were very noisy from 3 6am No cof...,18,Great location,4,10,7.9,2068.0,Beds: 1 double or 2 singles,The Petit Palace Boqueria Garden is set in a h...,Negative,0
6005,hotel villa emilia,The room was ready early when we arrived afte...,21,Very nice hotel and extremely helpful and fri...,10,1,10.0,1607.0,Beds: 1 double or 2 singles,This stylish design hotel is located 150 metre...,Negative,0
6006,olivia balmes hotel,No Negative,0,Staff were lovely very useful The shower was ...,29,15,9.2,1904.0,Beds: 1 double or 2 singles,"Offering an outdoor swimming pool, Olivia Balm...",Positive,1


In [304]:
merged_df['Prices'].max()

np.float64(17266.0)

In [303]:
merged_df['Prices'].min()

np.float64(1005.0)

In [316]:
merged_df['Prices'].mean()

np.float64(2042.6183422103861)

In [320]:
# Bin edges and labels used to discretise Reviewer Score and Prices.
# pd.cut builds right-closed intervals and include_lowest=True also closes the
# first interval on the left, so scores fall into [0, 5], (5, 8], (8, 10].
# NOTE(review): a score of exactly 8 lands in 'Medium' here, while
# recommend_hotel treats scores >= 8 as recommendable — confirm this is intended.
reviewer_bins = [0, 5, 8, 10]
reviewer_labels = ['Low', 'Medium', 'High']

# NOTE(review): 1000 and 2043 look hand-picked from the minimum (1005) and the
# mean (~2042.6) of Prices inspected earlier — confirm. Prices below 1000 would
# fall outside every bin and become NaN.
price_bins = [1000, 2043, float('inf')]
price_labels = ['Affordable', 'Expensive']

# (source column, binned column, edges, labels) for each variable to discretise
binning_plan = [
    ('Reviewer_Score', 'Reviewer_Score_Binned', reviewer_bins, reviewer_labels),
    ('Prices', 'Price_Binned', price_bins, price_labels),
]
for raw_col, binned_col, edges, names in binning_plan:
    merged_df[binned_col] = pd.cut(
        merged_df[raw_col], bins=edges, labels=names, include_lowest=True
    )

# Spot-check the raw values next to their bins
merged_df[['Reviewer_Score', 'Reviewer_Score_Binned', 'Prices', 'Price_Binned']].head()


Unnamed: 0,Reviewer_Score,Reviewer_Score_Binned,Prices,Price_Binned
0,9.6,High,1418.0,Affordable
1,8.8,High,1584.0,Affordable
2,10.0,High,2259.0,Expensive
3,7.9,Medium,2068.0,Expensive
4,7.9,Medium,1352.0,Affordable


In [321]:
merged_df

Unnamed: 0,name_processed,Negative_Review,Review_Total_Negative_Word_Counts,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Prices,Descriptions,Full_Descriptions,Sentiment,Hotel_Recommendation,Reviewer_Score_Binned,Price_Binned
0,room mate anna,Nothing to dislike,4,Location Culture bars and restaurants Transpo...,13,2,9.6,1418.0,1 double bed,Featuring a seasonal rooftop plunge pool with ...,Positive,1,High,Affordable
1,hotel barcelona center,No Negative,0,The hotel was in a great spot near to everyth...,29,7,8.8,1584.0,Multiple bed types,Barcelona Center is 400 metres from Barcelona’...,Positive,1,High,Affordable
2,hotel barcelona catedral,Due to the location and it being a very touri...,31,We had such a great experience here The staff...,138,1,10.0,2259.0,1 extra-large double bed,There is a gym and a rooftop chill-out terrace...,Positive,1,High,Expensive
3,petit palace boqueria garden,No Negative,0,Free coffee if wanted during breakfast withou...,41,1,7.9,2068.0,Beds: 1 double or 2 singles,The Petit Palace Boqueria Garden is set in a h...,Positive,0,Medium,Expensive
4,hotel ronda lesseps,Semi transparent wall of bathroom It cause un...,14,Easy to reach from and to airport by metro wi...,41,4,7.9,1352.0,1 double bed,The Hotel Ronda Lesseps is a quiet and familia...,Positive,0,Medium,Affordable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6003,capri by fraser barcelona,There was a faint sewage smell in the bathroo...,24,Lovely studio and apartment hotel with large ...,43,13,9.6,1347.0,Beds: 1 double or 2 singles,This motorcycle-friendly hotel is 3 Metro stop...,Positive,1,High,Affordable
6004,petit palace boqueria garden,The streets were very noisy from 3 6am No cof...,18,Great location,4,10,7.9,2068.0,Beds: 1 double or 2 singles,The Petit Palace Boqueria Garden is set in a h...,Negative,0,Medium,Expensive
6005,hotel villa emilia,The room was ready early when we arrived afte...,21,Very nice hotel and extremely helpful and fri...,10,1,10.0,1607.0,Beds: 1 double or 2 singles,This stylish design hotel is located 150 metre...,Negative,0,High,Affordable
6006,olivia balmes hotel,No Negative,0,Staff were lovely very useful The shower was ...,29,15,9.2,1904.0,Beds: 1 double or 2 singles,"Offering an outdoor swimming pool, Olivia Balm...",Positive,1,High,Affordable


In [None]:

# Conditional probability table of Hotel_Recommendation given the three parent
# variables: within each (Sentiment, score bin, price bin) group, take the
# relative frequency of each recommendation value.
cpt_parents = ['Sentiment', 'Reviewer_Score_Binned', 'Price_Binned']
recommendation_by_parents = merged_df.groupby(cpt_parents, observed=False)['Hotel_Recommendation']
recommendation_freqs = recommendation_by_parents.value_counts(normalize=True)

# One column per recommendation value; outcomes missing from a group get probability 0
cpt_recommendation = recommendation_freqs.unstack(fill_value=0)


cpt_recommendation

Unnamed: 0_level_0,Unnamed: 1_level_0,Hotel_Recommendation,0,1
Sentiment,Reviewer_Score_Binned,Price_Binned,Unnamed: 3_level_1,Unnamed: 4_level_1
Negative,Low,Affordable,1.0,0.0
Negative,Low,Expensive,1.0,0.0
Negative,Medium,Affordable,1.0,0.0
Negative,Medium,Expensive,1.0,0.0
Negative,High,Affordable,1.0,0.0
Negative,High,Expensive,1.0,0.0
Neutral,Low,Affordable,1.0,0.0
Neutral,Low,Expensive,1.0,0.0
Neutral,Medium,Affordable,1.0,0.0
Neutral,Medium,Expensive,1.0,0.0


In [323]:
# Number of reviews behind each (Sentiment, score bin, price bin) combination.
# `observed=False` is passed explicitly: the two binned keys are categoricals,
# and pandas 2.x emits a deprecation warning when `observed` is left out for
# categorical groupers. It also keeps combinations that have no rows (size 0),
# consistent with the groupby used to build the CPT.
group_sizes = merged_df.groupby(
    ['Sentiment', 'Reviewer_Score_Binned', 'Price_Binned'],
    observed=False,
).size()
print(group_sizes)


Sentiment  Reviewer_Score_Binned  Price_Binned
Negative   Low                    Affordable       200
                                  Expensive         70
           Medium                 Affordable       818
                                  Expensive        311
           High                   Affordable       731
                                  Expensive        389
Neutral    Low                    Affordable         7
                                  Expensive          1
           Medium                 Affordable        44
                                  Expensive         13
           High                   Affordable        90
                                  Expensive         39
Positive   Low                    Affordable        22
                                  Expensive          8
           Medium                 Affordable       334
                                  Expensive        106
           High                   Affordable      1993
                  

  group_sizes = merged_df.groupby(['Sentiment', 'Reviewer_Score_Binned', 'Price_Binned']).size()


In [309]:
# Recompute the per-combination review counts and summarise how thin the groups are.
# `observed=False` is passed explicitly: the two binned keys are categoricals,
# and pandas 2.x emits a deprecation warning when `observed` is left out for
# categorical groupers. Combinations without any rows are kept with size 0, so
# they are included in the summary statistics below.
group_sizes = merged_df.groupby(
    ['Sentiment', 'Reviewer_Score_Binned', 'Price_Binned'],
    observed=False,
).size()
print(group_sizes.describe())  # Check min, max, mean sizes
print(group_sizes[group_sizes == 1])  # See if there are many groups with only 1 entry


count      75.000000
mean       80.106667
std       304.714513
min         0.000000
25%         0.000000
50%         0.000000
75%        16.000000
max      2173.000000
dtype: float64
Sentiment  Reviewer_Score_Binned  Price_Binned
Neutral    High                   Luxury          1
Positive   Low                    Affordable      1
           Medium                 Luxury          1
dtype: int64


  group_sizes = merged_df.groupby(['Sentiment', 'Reviewer_Score_Binned', 'Price_Binned']).size()
