In [None]:
import pandas as pd
import numpy as np
from scipy.stats import kendalltau

### Calculating Kendall Tau Correlations between Serendipity-Based System and Ground Truth rankings

In [None]:
# Per-user Kendall tau evaluation of the serendipity-based system.
#
# For every `user_profile` group in the input sheet, this cell correlates three
# candidate scores (system serendipity, system baseline, star rating) with the
# ground-truth `numeric_serendipity` score, prints the per-user details, and
# writes the per-user table plus the NaN-ignoring averages to an Excel workbook.

# Pick the query to evaluate; input and output paths are both derived from it so
# they can never get out of sync. Query names referenced by this notebook:
#   query1_bars, query2_burgers, query3_cafe, query4_nightlife, query5_american
query_name = "query1_bars"
data_dir = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/datasets/ranking_items/restaurants"
file_path = f"{data_dir}/system_rag_{query_name}.xlsx"
output_file = f"{data_dir}/{query_name}_user_kendall_tau_correlations.xlsx"

# Use the workbook as a context manager so its file handle is released as soon
# as the sheet has been parsed.
with pd.ExcelFile(file_path) as xls:
    df = xls.parse('Sheet1')
print("Data loaded from Excel. Number of total records:", len(df))

# Fail early, with a clear message, if the sheet lacks a column used below
# (otherwise the KeyError would only surface part-way through the loop).
required_columns = [
    'user_profile', 'name', 'numeric_serendipity', 'stars',
    'system_numeric_serendipity', 'system_numeric_serendipity_baseline',
]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Input sheet is missing required columns: {missing_columns}")

# Columns of the per-user summary table, in output order.
metric_columns = [
    'System vs Numeric Serendipity',
    'System Baseline vs Numeric Serendipity',
    'Stars vs Numeric Serendipity',
]

user_results = []

# Group the data by user_profile
print("\nCalculating Kendall Tau correlations per user...")
for user, group in df.groupby('user_profile'):
    print("\n" + "="*60)
    # str() keeps the preview working when profile keys are not strings.
    print(f"User Profile (first 60 chars): {str(user)[:60]}...")
    print(f"Number of items: {len(group)}")

    if len(group) < 2:
        print("Skipped (not enough records for correlation)")
        continue

    # Rank columns are for display only (higher value = better rank, ties
    # ranked by order of appearance). The correlations below are computed on
    # the raw scores, not on these rank columns.
    group = group.copy()
    group['rank_numeric_serendipity'] = group['numeric_serendipity'].rank(ascending=False, method='first')
    group['rank_stars'] = group['stars'].rank(ascending=False, method='first')
    group['rank_system_serendipity'] = group['system_numeric_serendipity'].rank(ascending=False, method='first')
    group['rank_system_baseline'] = group['system_numeric_serendipity_baseline'].rank(ascending=False, method='first')

    print("\nItem Rankings:")
    print(group[['name',
                 'numeric_serendipity', 'rank_numeric_serendipity',
                 'system_numeric_serendipity', 'rank_system_serendipity',
                 'system_numeric_serendipity_baseline', 'rank_system_baseline',
                 'stars', 'rank_stars']])

    print("\nCalculating Kendall Tau correlations...")
    # Only the tau statistic is kept; the p-value is discarded.
    tau_system, _ = kendalltau(group['system_numeric_serendipity'], group['numeric_serendipity'])
    tau_baseline, _ = kendalltau(group['system_numeric_serendipity_baseline'], group['numeric_serendipity'])
    tau_stars, _ = kendalltau(group['stars'], group['numeric_serendipity'])

    print(f"Kendall Tau (System Serendipity vs GT Serendipity): {round(tau_system, 3)}")
    # NOTE(review): this label says "Surprise" while the summary column calls the
    # same value "System Baseline" - confirm which wording is intended.
    print(f"Kendall Tau (System Surprise vs GT Serendipity):  {round(tau_baseline, 3)}")
    print(f"Kendall Tau (Stars vs GT Serendipity):            {round(tau_stars, 3)}")

    user_results.append({
        'user_profile': user,
        'System vs Numeric Serendipity': round(tau_system, 3),
        'System Baseline vs Numeric Serendipity': round(tau_baseline, 3),
        'Stars vs Numeric Serendipity': round(tau_stars, 3)
    })

# Passing explicit columns keeps the table well-formed even when every user was
# skipped (an empty result list would otherwise yield a frame with no columns).
user_corr_df = pd.DataFrame(user_results, columns=['user_profile'] + metric_columns)
print("\nSummary of first few per-user Kendall Tau correlations:")
print(user_corr_df.head())

# Calculate average Kendall Tau values (ignoring NaNs), rounded. A column with
# no valid value averages to NaN without triggering numpy's empty-slice warning.
average_corr = {'user_profile': 'Average'}
for metric in metric_columns:
    taus = user_corr_df[metric].astype(float)
    average_corr[metric] = round(np.nanmean(taus), 3) if taus.notna().any() else np.nan
average_corr_df = pd.DataFrame([average_corr])
print("\nAverage Kendall Tau Correlations Across All Users:")
print(average_corr_df)

with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    user_corr_df.to_excel(writer, sheet_name='User Kendall Tau', index=False)
    average_corr_df.to_excel(writer, sheet_name='Average Kendall Tau', index=False)

print(f"\nKendall Tau correlations saved to: {output_file}")

### Calculating Kendall Tau Correlations between Combined Star-Serendipity-Based System and Ground Truth rankings

In [None]:
# Per-user Kendall tau evaluation of the combined star-serendipity system.
#
# For every `user_profile` group in the input sheet, this cell correlates three
# candidate scores (system stars-serendipity, its baseline, star rating) with
# the ground-truth `stars_serendipity` score, prints the per-user details, and
# writes the per-user table plus the NaN-ignoring averages to an Excel workbook.

# Pick the query to evaluate; input and output paths are both derived from it so
# they can never get out of sync. Query names referenced by this notebook:
#   query1_bars, query2_burgers, query3_cafe, query4_nightlife, query5_american
query_name = "query1_bars"
data_dir = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/datasets/ranking_items/restaurants_star_serendipity"
file_path = f"{data_dir}/system_rag_{query_name}_ss.xlsx"
output_file = f"{data_dir}/{query_name}_user_kendall_tau_correlations_ss.xlsx"

# Use the workbook as a context manager so its file handle is released as soon
# as the sheet has been parsed.
with pd.ExcelFile(file_path) as xls:
    df = xls.parse('Sheet1')
print("Data loaded from Excel. Number of total records:", len(df))

# Fail early, with a clear message, if the sheet lacks a column used below
# (otherwise the KeyError would only surface part-way through the loop).
required_columns = [
    'user_profile', 'name', 'stars_serendipity', 'stars',
    'system_stars_serendipity', 'system_stars_serendipity_baseline',
]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Input sheet is missing required columns: {missing_columns}")

# Columns of the per-user summary table, in output order.
metric_columns = [
    'System Stars Serendipity vs Stars Serendipity',
    'System Stars Baseline vs Stars Serendipity',
    'Stars vs Stars Serendipity',
]

user_results = []

# Group the data by user_profile
print("\nCalculating Kendall Tau correlations per user (vs stars_serendipity)...")
for user, group in df.groupby('user_profile'):
    print("\n" + "="*60)
    # str() keeps the preview working when profile keys are not strings.
    print(f"User Profile (first 60 chars): {str(user)[:60]}...")
    print(f"Number of items: {len(group)}")

    if len(group) < 2:
        print("Skipped (not enough records for correlation)")
        continue

    # Rank columns are for display only (higher value = better rank, ties
    # ranked by order of appearance). The correlations below are computed on
    # the raw scores, not on these rank columns.
    group = group.copy()
    group['rank_stars_serendipity'] = group['stars_serendipity'].rank(ascending=False, method='first')
    group['rank_stars'] = group['stars'].rank(ascending=False, method='first')
    group['rank_system_stars_serendipity'] = group['system_stars_serendipity'].rank(ascending=False, method='first')
    group['rank_system_stars_baseline'] = group['system_stars_serendipity_baseline'].rank(ascending=False, method='first')

    print("\nItem Rankings:")
    print(group[['name',
                 'stars_serendipity', 'rank_stars_serendipity',
                 'system_stars_serendipity', 'rank_system_stars_serendipity',
                 'system_stars_serendipity_baseline', 'rank_system_stars_baseline',
                 'stars', 'rank_stars']])

    # Calculate Kendall Tau correlations against stars_serendipity
    # (only the tau statistic is kept; the p-value is discarded).
    print("\nCalculating Kendall Tau correlations...")
    tau_system, _ = kendalltau(group['system_stars_serendipity'], group['stars_serendipity'])
    tau_baseline, _ = kendalltau(group['system_stars_serendipity_baseline'], group['stars_serendipity'])
    tau_stars, _ = kendalltau(group['stars'], group['stars_serendipity'])

    print(f"Kendall Tau (System Stars Serendipity vs Stars Serendipity): {round(tau_system, 3)}")
    print(f"Kendall Tau (System Stars Baseline vs Stars Serendipity):  {round(tau_baseline, 3)}")
    print(f"Kendall Tau (Stars vs Stars Serendipity):            {round(tau_stars, 3)}")

    user_results.append({
        'user_profile': user,
        'System Stars Serendipity vs Stars Serendipity': round(tau_system, 3),
        'System Stars Baseline vs Stars Serendipity': round(tau_baseline, 3),
        'Stars vs Stars Serendipity': round(tau_stars, 3)
    })

# Passing explicit columns keeps the table well-formed even when every user was
# skipped (an empty result list would otherwise yield a frame with no columns).
user_corr_df = pd.DataFrame(user_results, columns=['user_profile'] + metric_columns)
print("\nSummary of first few per-user Kendall Tau correlations:")
print(user_corr_df.head())

# Calculate averages (ignoring NaNs), rounded. A column with no valid value
# averages to NaN without triggering numpy's empty-slice warning.
average_corr = {'user_profile': 'Average'}
for metric in metric_columns:
    taus = user_corr_df[metric].astype(float)
    average_corr[metric] = round(np.nanmean(taus), 3) if taus.notna().any() else np.nan
average_corr_df = pd.DataFrame([average_corr])
print("\nAverage Kendall Tau Correlations Across All Users (vs stars_serendipity):")
print(average_corr_df)

with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    user_corr_df.to_excel(writer, sheet_name='User Kendall Tau', index=False)
    average_corr_df.to_excel(writer, sheet_name='Average Kendall Tau', index=False)

print(f"\nKendall Tau correlations saved to: {output_file}")