## IMPORTS

In [1]:
import math
import re
import sqlite3
import statistics as st

import pandas as pd
from scipy.stats import pearsonr

## FETCH DATA FROM CSV

In [2]:
# Load each website's hotel data from its CSV file into a DataFrame.
booking = pd.read_csv(filepath_or_buffer='booking_nyc.csv')
# Extra datasets can be parsed similarly and added to website_list
trivago = pd.read_csv(filepath_or_buffer='trivago_nyc.csv')

## FETCH DATA FROM DATABASE

In [None]:
# Read both hotel tables from the SQLite database. The connection is closed in
# the `finally` clause so the database handle is released even if a query fails.
conn = sqlite3.connect('hotel_booking_nyc.db')
try:
    booking = pd.read_sql('SELECT * FROM booking_hotel_table', conn)
    trivago = pd.read_sql('SELECT * FROM trivago_hotel_table', conn) # Extra datasets can be parsed similarly and added to website_list
finally:
    conn.close()

# Preview only the first rows instead of dumping the whole table into the output.
print(trivago.head())

## PRE-PROCESSING

In [None]:
# Registry of every dataset taking part in the comparison, keyed by website name.
# Extra dataset need to be appended to this dictionary
website_list = dict(booking=booking, trivago=trivago)
# Website names in registry order; the scoring functions iterate over this list.
websites = [*website_list]

def convert_price(price_str):
    """Convert a displayed price such as ``"$1,234"`` into an integer.

    The number is located inside the text instead of assuming that exactly one
    leading character is a currency symbol, so prices without a symbol
    (``"1,234"``), with surrounding whitespace, with a multi-character prefix
    (``"US$ 2,500"``) or with decimals are all parsed. Commas are treated as
    thousands separators and a dot as the decimal point.

    Args:
        price_str: Price as text, or a value that is already an ``int`` or
            ``float``.

    Returns:
        int: The amount, with any fractional part truncated.

    Raises:
        ValueError: If no number can be found in ``price_str`` (or it is NaN).
    """
    # Already numeric (e.g. the column was parsed as a number): nothing to clean.
    if isinstance(price_str, (int, float)):
        return int(price_str)

    # First run of digits, optionally with thousands separators and decimals.
    match = re.search(r'\d[\d,]*(?:\.\d+)?', str(price_str))
    if match is None:
        raise ValueError(f"No numeric price found in {price_str!r}")

    digits = match.group().replace(',', '')
    # Keep only the whole part so the result stays an exact int.
    return int(digits.split('.', 1)[0])


# Dictionary to segregate data of each hotel website on basis of stars:
# website_price_data[website][star] -> list of (hotel name, price, rating) tuples
website_price_data = {}
for name, website_df in website_list.items():
    website_price_data[name] = {}
    for star in range(1, 6):
        filtered_df = website_df.loc[website_df['Star'] == star]
        # .assign() returns a new frame; writing the converted column straight
        # into the filtered slice triggers pandas' SettingWithCopyWarning.
        filtered_df = filtered_df.assign(Price=filtered_df['Price'].apply(convert_price))
        website_price_data[name][star] = list(
            filtered_df[['Hotel Name', 'Price', 'Rating']].itertuples(index=False, name=None)
        )


# Dictionary to store name and price of each hotel on website:
# hotel_price_dict[website][hotel name] -> integer price
# Built in a single pass, so no intermediate string-priced dictionary or
# throwaway global is left behind. When a hotel name appears more than once on
# a website, the last listed price is kept.
hotel_price_dict = {
    name: {
        hotel: convert_price(price)
        for hotel, price in website_df.set_index('Hotel Name')['Price'].to_dict().items()
    }
    for name, website_df in website_list.items()
}


## RATE COMPARISON

1. Common hotel price competitiveness (scoring out of 5)

In [9]:
def common_hotel_price_competitiveness(price_dict=None, site_names=None):
    """Score each website on how often it has the lowest price for shared hotels.

    Only hotels listed on every website are compared. For each of them the
    website with the lowest price earns one point (on a tie the website that
    comes first in ``site_names`` earns it), and the points are rescaled to a
    score out of 5. The scores are printed and returned.

    Args:
        price_dict: Mapping ``website -> {hotel name: price}``. Defaults to the
            notebook-level ``hotel_price_dict``.
        site_names: Websites to compare. Defaults to the notebook-level
            ``websites``.

    Returns:
        dict: ``website -> score`` between 0 and 5. Every score is 0.0 when the
        websites have no hotel in common; the dict is empty when there are no
        websites.
    """
    if price_dict is None:
        price_dict = hotel_price_dict
    if site_names is None:
        site_names = websites
    site_names = list(site_names)

    # Nothing to compare (previously an IndexError on websites[0]).
    if not site_names:
        return {}

    # Hotels present on every website: start from the first and intersect the rest.
    common_hotels = set(price_dict[site_names[0]])
    for website in site_names[1:]:
        common_hotels.intersection_update(price_dict[website])

    # Count how often each website offers the lowest price for a common hotel.
    min_price_count = {website: 0 for website in site_names}
    for hotel in common_hotels:
        prices = {website: price_dict[website][hotel] for website in site_names}
        min_price_website = min(prices, key=prices.get)
        min_price_count[min_price_website] += 1

    total_common_hotels = len(common_hotels)
    if total_common_hotels == 0:
        # No overlap between the websites: avoid dividing by zero.
        score_cheaper = {website: 0.0 for website in site_names}
    else:
        score_cheaper = {
            website: (count / total_common_hotels) * 5
            for website, count in min_price_count.items()
        }

    # Print price competitiveness scores for each website
    for website, score in score_cheaper.items():
        print(f"The price competitiveness score for {website} is {score}")

    return score_cheaper

2. Star category price competitiveness - Median and CoV (scoring out of 5)

In [10]:
def find_coefficient_of_variance(lst):
    """Return the coefficient of variation (sample stdev / mean) of the prices in ``lst``.

    Args:
        lst: Iterable of tuples whose second item is the price, e.g.
            ``(name, price, ...)``.

    Returns:
        ``stdev / mean`` of the prices, or 0 when that ratio is undefined:
        fewer than two prices (``statistics.stdev`` needs at least two data
        points and ``statistics.mean`` at least one) or a mean of zero.
    """
    price_list = [t[1] for t in lst]  # Extract price from the tuple (name, price, ...)

    # An empty or single-hotel category has no measurable spread.
    if len(price_list) < 2:
        return 0

    mean_price = st.mean(price_list)
    if mean_price == 0:  # Avoid division by zero
        return 0
    return st.stdev(price_list) / mean_price

#Calculate CoV scores for each website
def calculate_cov_score():
    """Turn each website's price dispersion into a score.

    For every website in ``websites`` the coefficients of variation of the
    star categories 1 to 5 are added up and the score is ``2.5 - total / 2``,
    so a smaller spread of prices gives a higher score. The value is not
    clamped. The scores are printed and returned.

    Returns:
        dict: ``website -> CoV score``.
    """
    cov_scores = {}

    for site in websites:
        prices_by_star = website_price_data[site]
        cov_sum = 0
        star = 1
        # Accumulate the CoV of star categories 1..5 in ascending order
        while star <= 5:
            cov_sum = cov_sum + find_coefficient_of_variance(prices_by_star[star])
            star += 1
        cov_scores[site] = 2.5 - (cov_sum / 2)

    # Report the scores only after all of them have been computed
    for website in cov_scores:
        cov_score = cov_scores[website]
        print(f"The CoV score for {website} is {cov_score:.5f}")

    return cov_scores

#Calculate Median 
def calculate_median_scores(price_data=None, site_names=None):
    """Score each website on how often it has the lowest median price per star category.

    For each star category (1 to 5) the website with the lowest median price
    earns one point (on a tie the website that comes first in ``site_names``
    earns it). A website with no hotels in a category is left out of that
    category's comparison, and a category with no hotels on any website awards
    no point. Points are rescaled to a score out of 2.5, printed and returned.

    Args:
        price_data: Mapping ``website -> {star: [(name, price, ...), ...]}``.
            Defaults to the notebook-level ``website_price_data``.
        site_names: Websites to compare. Defaults to the notebook-level
            ``websites``.

    Returns:
        dict: ``website -> median score`` between 0 and 2.5, with one entry per
        website in ``price_data``.
    """
    if price_data is None:
        price_data = website_price_data
    if site_names is None:
        site_names = websites

    lowest_median_count = {website: 0 for website in price_data}

    for star in range(1, 6):
        # Median price of this star category on every website that lists it.
        medians = {}
        for website in site_names:
            star_prices = [t[1] for t in price_data[website][star]]
            if star_prices:  # st.median raises StatisticsError on an empty list
                medians[website] = st.median(star_prices)

        if not medians:
            continue  # No website has hotels in this category

        min_price_website = min(medians, key=medians.get)
        lowest_median_count[min_price_website] += 1

    median_scores = {website: (count / 5) * 2.5 for website, count in lowest_median_count.items()}

    # Print median scores for each website
    for website, score in median_scores.items():
        print(f"The median score for {website} is {score:.5f}")

    return median_scores
    



3. Correlation between rating and price (scoring out of 5)

In [11]:
def correlation_between_price_and_rating(price_data=None, site_names=None):
    """Score each website on how strongly its prices follow its ratings.

    For each star category (1 to 5) the Pearson correlation between price and
    rating is computed per website, and the website with the highest
    coefficient earns one point (on a tie the website that comes first in
    ``site_names`` earns it). Hotels whose rating is zero or NaN are ignored.
    A website whose coefficient is undefined for a category (fewer than two
    usable hotels, or constant prices/ratings) is left out of that category's
    comparison; a category with no defined coefficient awards no point.
    The scores are printed and returned.

    Args:
        price_data: Mapping ``website -> {star: [(name, price, rating), ...]}``.
            Defaults to the notebook-level ``website_price_data``.
        site_names: Websites to compare. Defaults to the notebook-level
            ``websites``.

    Returns:
        dict: ``website -> number of star categories won`` (0 to 5), with one
        entry per website in ``price_data``.
    """
    if price_data is None:
        price_data = website_price_data
    if site_names is None:
        site_names = websites

    def star_correlation(hotels):
        """Return Pearson r between price and rating, or None when undefined."""
        prices, ratings = [], []
        for hotel in hotels:
            rating = float(hotel[2])
            # Zero ratings are excluded in any spelling ('0', '0.0', 0) --
            # presumably they are "no rating" placeholders; NaN is dropped too.
            if rating == 0 or math.isnan(rating):
                continue
            prices.append(hotel[1])
            ratings.append(rating)

        if len(prices) < 2:  # pearsonr needs at least two points
            return None

        # Index 0 is the coefficient on both tuple and result-object returns.
        coefficient = pearsonr(prices, ratings)[0]
        # Constant prices or ratings give NaN, which would corrupt max().
        return None if math.isnan(coefficient) else coefficient

    correlation_score = {website: 0 for website in price_data}

    for star in range(1, 6):
        coefficients = {}
        for website in site_names:
            coefficient = star_correlation(price_data[website][star])
            if coefficient is not None:
                coefficients[website] = coefficient

        if not coefficients:
            continue  # No website has a defined correlation in this category

        max_correlation_website = max(coefficients, key=coefficients.get)
        correlation_score[max_correlation_website] += 1

    # Print correlation scores for each website
    for website, score in correlation_score.items():
        print(f"The correlation score for {website} is {score:.5f}")

    return correlation_score

## FINAL SCORE

In [12]:
# Run the four scoring steps in order; each one prints its own per-website report.
common_hotel_price_competitiveness_score = common_hotel_price_competitiveness()
cov_score = calculate_cov_score()
median_score = calculate_median_scores()
price_rating_correlation_score = correlation_between_price_and_rating()

# Final score of a website = sum of its four component scores.
final_score = {}
for website in websites:
    final_score[website] = (
        common_hotel_price_competitiveness_score[website]
        + cov_score[website]
        + median_score[website]
        + price_rating_correlation_score[website]
    )


The price competitiveness score for booking is 4.51219512195122
The price competitiveness score for trivago is 0.4878048780487805
The CoV score for booking is 0.59073
The CoV score for trivago is 1.18733
The median score for booking is 2.00000
The median score for trivago is 0.50000
The correlation score for booking is 2.00000
The correlation score for trivago is 3.00000


In [14]:
# Report the combined score of every website, in the order they were scored
for website in final_score:
    score = final_score[website]
    print(f"The final score regarding better rates for {website} is {score:.5f} out of 15")

The final score regarding better rates for booking is 9.10292 out of 15
The final score regarding better rates for trivago is 5.17513 out of 15
