In [10]:
import os
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

# Setting the style
sns.set(style="whitegrid")

# Function to create visualizations for each hotel
def visualize_data(df):
    """Plot score distributions and boxplots for one hotel's reviews.

    Two figures are shown, one after the other:

    1. A 2x2 grid of 5-bin histograms (with KDE overlays) for the
       'Rating', 'Normalized_Rating', 'SA' and 'Composite_Score' columns.
    2. A single boxplot comparing those same four columns.

    Args:
        df: DataFrame containing the four score columns listed above. It
            must also carry a string ``name`` attribute, which is used as
            the prefix of both figure titles.

    Returns:
        None. The figures are displayed with ``plt.show()``.
    """
    # (column, subplot title) pairs in row-major order of the 2x2 grid.
    panels = [
        ('Rating', 'Rating Distribution'),
        ('Normalized_Rating', 'Normalized Rating Distribution'),
        ('SA', 'Sentiment Score Distribution'),
        ('Composite_Score', 'Composite Score Distribution'),
    ]
    score_columns = [column for column, _ in panels]

    fig, axs = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle(df.name + ' Review Scores', fontsize=16)

    # No `palette` is passed here: seaborn only applies a palette when a
    # `hue` variable is assigned and otherwise warns that it is ignored.
    for hist_ax, (column, title) in zip(axs.flat, panels):
        sns.histplot(df[column], kde=True, ax=hist_ax, bins=5)
        hist_ax.set_title(title)

    # Reserve the top strip of the figure for the suptitle.
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])

    # Show the plots
    plt.show()

    # Boxplot for a clearer view on the quartiles and outliers.
    # The axes are passed explicitly so the plot, title and tick rotation
    # all target this figure instead of whatever the "current" axes are.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df[score_columns], palette="Set2", ax=ax)
    ax.set_title(df.name + ' Score Boxplots')
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()

# Load data
# Paths are assembled with os.path.join rather than a hard-coded backslash
# so the same cell runs on Windows, Linux and macOS.
input_folder = 'processed reviews'
waterfront_reviews = pd.read_csv(
    os.path.join(input_folder, 'Waterfront_Hotel_and_Casino_processed.csv')
)
bai_reviews = pd.read_csv(
    os.path.join(input_folder, 'bai_Hotel_processed.csv')
)

# Display names: used for the plot titles and for the output file names.
waterfront_reviews.name = 'Waterfront Hotel and Casino'
bai_reviews.name = 'bai Hotel'

hotels = [waterfront_reviews, bai_reviews]

# The output folder is the same for every hotel, so create it once up front.
# exist_ok=True makes this a no-op when the folder is already there.
output_folder = r'scores'
os.makedirs(output_folder, exist_ok=True)

for df in hotels:
    # Convert 'Rating' to numeric; unparseable values become NaN
    df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

    # Normalize ratings to be between -1 and 1 (a rating of 3 maps to 0).
    # Plain column arithmetic gives the same values as a per-row lambda.
    df['Normalized_Rating'] = (df['Rating'] - 3) / 2

    # Calculate composite score as the average of normalized rating and sentiment analysis score
    df['Composite_Score'] = (df['Normalized_Rating'] + df['SA']) / 2

    # Divide composite scores into quartiles.
    # NOTE(review): pd.qcut raises ValueError if heavy ties make the quartile
    # edges non-unique — confirm the data cannot trigger this.
    df['Quartile'] = pd.qcut(df['Composite_Score'], 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])

    # Display the first few rows to verify
    print(df.head())
    visualize_data(df)

    # Save processed data
    output_filename = os.path.join(output_folder, df.name.replace(' ', '_') + '_processed.csv')
    df.to_csv(output_filename, index=False)
    print("Processed data saved to:", output_filename)
    print('\n')

   Rating                                        Review Text      SA  \
0       5  happy recently concluded graduation event wate...  0.9735   
1       4  always book hotel hotel staff friendly always ...  0.9042   
2       5  stayed 5 nights waterfront hotel business trip...  0.9153   
3       3  hotel aok breakfast pool familys primary reaso...  0.4404   
4       3  attended event held hotel decided overnight st... -0.9022   

  Sentiment  Normalized_Rating  Composite_Score Quartile  
0  Positive                1.0          0.98675       Q4  
1  Positive                0.5          0.70210       Q2  
2  Positive                1.0          0.95765       Q4  
3  Positive                0.0          0.22020       Q1  
4  Negative                0.0         -0.45110       Q1  
Processed data saved to: scores\Waterfront_Hotel_and_Casino_processed.csv


   Rating                                        Review Text      SA  \
0       5  hotel restaurants excellent especially staff e...  0.9