In [None]:
import os
import json
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Load the LLM predictions for each participant's target robot.
# Result shape: {participant_id: {robot_name: parsed JSON prediction dict}}
llm_predictions = {}

user_reports_df = pd.read_excel("user-self-reports/target-enjoyment.xlsx", sheet_name='target-enjoyment', header=1)

# Directory that holds the "*-LLM-pred.txt" files (single source of truth for the path)
predictions_dir = "./llm-gemini-2.o-flash-predictions"

# sorted() makes the processing (and any printed diagnostics) order deterministic
for filename in sorted(os.listdir(predictions_dir)):
    if not filename.endswith("-LLM-pred.txt"):
        continue

    # Extract participant ID from filename; skip files that do not carry one
    id_match = re.search(r'P(\d+)', filename)
    if id_match is None:
        print(f"Skipping {filename}: no participant ID found in filename")
        continue
    participant_id = int(id_match.group(1))

    # Look up the robot this participant reported on (once per file, not once per section)
    target_robots = user_reports_df.loc[user_reports_df['PID'] == participant_id, 'Q1-Robot'].values
    if len(target_robots) == 0:
        print(f"Skipping P{participant_id}: participant not found in user self-reports")
        continue
    target_robot = str(target_robots[0]).strip()

    # Read the file content; undecodable bytes are replaced rather than aborting the run
    with open(os.path.join(predictions_dir, filename), "r", encoding="utf-8", errors="replace") as f:
        content = f.read()

    # Each section looks like "Robot: <name>\nResponse: <text containing JSON>"
    robot_sections = content.split("Robot: ")

    for section in robot_sections[1:]:  # Skip whatever precedes the first "Robot: "
        lines = section.strip().split("\n", 1)
        if len(lines) < 2:
            continue

        # strip() guards against trailing spaces or "\r" from Windows line endings
        robot_name = lines[0].strip()

        # Only keep the prediction for the robot the participant actually rated
        if robot_name != target_robot:
            continue

        # Extract the response text
        response_text = lines[1].replace("Response: ", "")

        # Grab everything from the first "{" to the last "}" and parse it as JSON
        json_match = re.search(r'(\{.*\})', response_text, re.DOTALL)
        if json_match is None:
            print(f"No JSON object found in response for P{participant_id}, {robot_name}")
            continue

        try:
            response_json = json.loads(json_match.group(1))
        except json.JSONDecodeError as e:
            print(f"Error parsing response for P{participant_id}, {robot_name}: {e}")
            continue

        # Store the predictions
        llm_predictions.setdefault(participant_id, {})[robot_name] = response_json

# Flatten the nested predictions into one row per (participant, robot).
# The JSON uses keys "Question 1" .. "Question 8"; the frame uses columns Q1 .. Q8.
question_numbers = range(1, 9)
question_cols = [f"Q{q_num}" for q_num in question_numbers]
# Q7 and Q8 are inverted (8 - value) to match the original data
reversed_cols = ["Q7", "Q8"]

llm_data = [
    {
        'user_id': participant_id,
        'robot': robot,
        # Missing questions become None (NaN after the numeric conversion below)
        **{f"Q{q_num}": predictions.get(f"Question {q_num}") for q_num in question_numbers},
    }
    for participant_id, robots in llm_predictions.items()
    for robot, predictions in robots.items()
]

# Explicit columns keep the schema stable even when no prediction was parsed
llm_df = pd.DataFrame(llm_data, columns=['user_id', 'robot', *question_cols])

# LLM output may contain numbers as strings or nulls; coerce so arithmetic cannot fail
llm_df[question_cols] = llm_df[question_cols].apply(pd.to_numeric, errors='coerce')
llm_df[reversed_cols] = 8 - llm_df[reversed_cols]

# Calculate the average score for each participant and robot (by column name, not position)
llm_df["Average"] = llm_df[question_cols].mean(axis=1)

# Sort by user_id to make it easier to read
llm_df = llm_df.sort_values(by=['user_id', 'robot']).reset_index(drop=True)

# Display the DataFrame
print(llm_df.head(10))
print(f"Total number of records: {len(llm_df)}")

# Distribution of the LLM-predicted average rating, split by robot.
# One fixed robot -> colour mapping drives both the histogram and the mean lines,
# so the colours cannot drift apart depending on row order in llm_df.
robot_colors = {'Alice': 'blue', 'Clara': 'red'}

# Create a figure with bigger size
fig, ax = plt.subplots(figsize=(12, 8))

# hue_order pins the hue levels to the robots listed in robot_colors
sns.histplot(data=llm_df, x='Average', hue='robot', hue_order=list(robot_colors),
             palette=robot_colors, kde=True, bins=10, element='step', alpha=0.5, ax=ax)

# Add a vertical line for the mean of each robot
for robot, color in robot_colors.items():
    robot_averages = llm_df.loc[llm_df['robot'] == robot, 'Average']
    if robot_averages.notna().sum() == 0:
        # No data for this robot: skip instead of drawing a NaN line / "nan" label
        continue
    mean_val = robot_averages.mean()
    ax.axvline(x=mean_val, color=color, linestyle='--',
               label=f'Mean for {robot}: {mean_val:.2f}')

# Add titles and labels
ax.set_title('Distribution of LLM-Predicted Average Ratings by Robot', fontsize=16)
ax.set_xlabel('Average Rating', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
ax.legend(fontsize=12)
fig.tight_layout()

# Show the plot
plt.show()

In [None]:
# Export each record's user id and average prediction to CSV (row index omitted)
average_columns = ["user_id", "Average"]
llm_df.loc[:, average_columns].to_csv('predictions-llm.csv', index=False)

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

# Compare LLM predictions with the users' own ratings.
# Both frames are indexed by (user_id, robot) and restricted to the shared keys so
# that rows line up one-to-one.
# NOTE(review): ordered_df is not defined in this cell; it must come from an earlier cell.
ordered_df_indexed = ordered_df.set_index(['user_id', 'robot'])
llm_df_indexed = llm_df.set_index(['user_id', 'robot'])

# Find the common indices (user_id, robot combinations present in both datasets)
common_indices = ordered_df_indexed.index.intersection(llm_df_indexed.index)

# Filter both dataframes to only include common indices
ordered_filtered = ordered_df_indexed.loc[common_indices]
llm_filtered = llm_df_indexed.loc[common_indices]


def _agreement_stats(actual, predicted):
    """Return (mse, pearson_r, p_value) for two equally long rating sequences.

    Pairs where either value is missing are dropped first, because both
    mean_squared_error and pearsonr cannot work with NaN input. Metrics that
    cannot be computed on the remaining pairs are returned as NaN.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    valid = ~(np.isnan(actual) | np.isnan(predicted))
    actual, predicted = actual[valid], predicted[valid]

    if actual.size == 0:
        return np.nan, np.nan, np.nan
    pair_mse = mean_squared_error(actual, predicted)
    if actual.size < 2:
        # pearsonr needs at least two observations
        return pair_mse, np.nan, np.nan
    correlation, p_value = pearsonr(actual, predicted)
    return pair_mse, correlation, p_value


# MSE, RMSE, correlation and p-value for the average ratings
mse, average_correlation, average_p_value = _agreement_stats(
    ordered_filtered['Average'], llm_filtered['Average']
)
rmse = np.sqrt(mse)

# Per-question metrics for every question column (Q1..Q8) present in BOTH frames.
# Q8 is included when the user ratings carry it, matching the eight questions in llm_df.
shared_question_cols = [
    f'Q{q}' for q in range(1, 9)
    if f'Q{q}' in ordered_filtered.columns and f'Q{q}' in llm_filtered.columns
]

question_mse = {}
question_correlation = {}
question_p_values = {}
for q_col in shared_question_cols:
    q_mse, correlation, p_value = _agreement_stats(ordered_filtered[q_col], llm_filtered[q_col])
    question_mse[q_col] = q_mse
    question_correlation[q_col] = correlation
    question_p_values[q_col] = p_value

# Print results
print(f"Mean Squared Error (MSE) between LLM and user average ratings: {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Correlation between LLM and user average ratings: {average_correlation:.4f}")
print(f"P-value for average correlation: {average_p_value:.4e}")
print("\nMSE for each question:")
for q_col, q_mse in question_mse.items():
    print(f"{q_col}: {q_mse:.4f}")

print("\nCorrelation and P-value for each question:")
for q_col in shared_question_cols:
    # Label rows with the column name (Q1, Q2, ...) to match the MSE listing
    print(f"{q_col}: Correlation = {question_correlation[q_col]:.4f}, P-value = {question_p_values[q_col]:.4e}")