In [2]:
import pandas as pd
import numpy as np
from scipy import stats
import io

# 1. Embedded dataset (CSV text with columns: user_id, robot, Average)
data_string = """user_id,robot,Average
4,Alice,4.0
5,Clara,6.285714285714286
6,Clara,6.571428571428571
7,Clara,7.0
8,Alice,6.428571428571429
9,Clara,5.0
10,Clara,4.714285714285714
11,Clara,6.571428571428571
12,Alice,3.5714285714285716
13,Alice,5.285714285714286
14,Clara,3.0
15,Alice,3.0
16,Alice,5.285714285714286
17,Alice,6.857142857142857
18,Alice,1.7142857142857142
19,Alice,6.0
20,Clara,5.428571428571429
21,Alice,6.714285714285714
22,Alice,4.0
23,Clara,6.714285714285714
24,Alice,6.428571428571429
25,Clara,6.857142857142857
27,Clara,7.0
28,Alice,4.714285714285714
29,Alice,6.285714285714286
30,Alice,6.285714285714286
31,Alice,4.142857142857143
32,Clara,5.857142857142857
33,Clara,5.0
34,Clara,3.7142857142857144
35,Alice,4.571428571428571
36,Clara,6.0
37,Clara,2.4285714285714284
38,Alice,6.142857142857143
39,Alice,6.285714285714286
40,Clara,5.0
41,Clara,6.428571428571429
42,Alice,5.571428571428571
"""

# Parse the CSV text via an in-memory text buffer; the buffer is closed
# as soon as pandas has consumed it.
with io.StringIO(data_string) as csv_buffer:
    data = pd.read_csv(csv_buffer)

# Observed 'Average' scores as a plain NumPy array
actual_averages = data['Average'].to_numpy()

# 2. Generate Random Model Predictions *Per Robot Group*
#
# For every robot group, draw one value per row from a normal distribution
# whose mean and standard deviation are that group's sample statistics of
# 'Average'. The result is stored in data['Group_Random_Prediction'].

# Per-group mean, sample standard deviation, and non-null count of 'Average'
group_stats = data.groupby('robot')['Average'].agg(['mean', 'std', 'count'])

print("--- Group Statistics ---")
print(group_stats)
print("\n")

# Seed NumPy's global RNG so the draws below are reproducible
np.random.seed(42)

# Placeholder column; every row of a known group is overwritten in the loop
data['Group_Random_Prediction'] = np.nan

# Groups are visited in the (sorted) order of group_stats' index, which
# fixes the order in which random numbers are consumed.
for robot_name, stats_row in group_stats.iterrows():
    mean_val = stats_row['mean']
    std_val = stats_row['std']

    # The sample std is NaN when a group has a single member; fall back to
    # zero spread so the draw degenerates to the group mean.
    if pd.isna(std_val):
        std_val = 0.0

    # Rows of the original DataFrame that belong to this group
    group_indices = data.index[data['robot'] == robot_name]

    # Size the draw from the rows actually being filled. The aggregated
    # 'count' excludes missing 'Average' values, so using it would produce
    # too few predictions (and a length-mismatch error on assignment)
    # whenever a group contains a NaN.
    count_val = len(group_indices)

    # Independent draws: which row receives which draw within a group is
    # immaterial, so positional assignment is sufficient.
    predictions = np.random.normal(loc=mean_val, scale=std_val, size=count_val)
    data.loc[group_indices, 'Group_Random_Prediction'] = predictions

print("--- Group-Specific Random Model Predictions ---")
print(data[['user_id', 'robot', 'Average', 'Group_Random_Prediction']].round(3))
print("\n")

# Predictions as a NumPy array, in the DataFrame's row order
group_random_predictions = data['Group_Random_Prediction'].values

# Sanity check: per-group mean/std of the generated predictions
print("--- Prediction Statistics (Verification) ---")
print(data.groupby('robot')['Group_Random_Prediction'].agg(['mean', 'std']))
print("\n")


# 3. Pearson correlation (via SciPy) between the actual averages and the
#    group-specific random predictions, followed by a printed report.
correlation_coefficient, p_value = stats.pearsonr(actual_averages, group_random_predictions)

# Squared correlation, reported at the end of the output
r_squared = correlation_coefficient ** 2

# Pick the closing sentence according to the 0.05 significance threshold
if p_value < 0.05:
    significance_note = "Since the p-value is less than 0.05, we might conclude there's a statistically significant (though likely spurious, given the random nature) correlation."
else:
    significance_note = "Since the p-value is greater than or equal to 0.05, we conclude that there is no statistically significant linear correlation between the actual values and the group-specific random predictions, as expected."

# Every entry is emitted on its own line, in this order
report_lines = [
    # Headline numbers
    "--- Correlation Results (Actual vs Group-Specific Random) ---",
    f"Pearson Correlation Coefficient (r): {correlation_coefficient:.4f}",
    f"P-value: {p_value:.4f}",
    # Interpretation
    "\n--- Interpretation ---",
    f"The correlation coefficient (r = {correlation_coefficient:.4f}) measures the linear relationship between the actual 'Average' values and the randomly generated predictions, where predictions were sampled from normal distributions specific to each robot group ('Alice' or 'Clara').",
    "A value close to 0 indicates little to no linear relationship.",
    f"The p-value ({p_value:.4f}) indicates the probability of observing the data (or something more extreme) if there were truly no correlation between the actual values and these group-specific random predictions.",
    significance_note,
    # Summary including R^2
    f"\nCorrelation coefficient (r): {correlation_coefficient:.4f}",
    f"P-value: {p_value:.4f}",
    f"R-squared (r^2): {r_squared:.4f}",
]
print(*report_lines, sep="\n")

# Write user_id, robot and the generated prediction for every row to a CSV
# file in the working directory (no index column).
output_file = "random-predictions.csv"
export_columns = ['user_id', 'robot', 'Group_Random_Prediction']
data.to_csv(output_file, columns=export_columns, index=False)
print(f"\n--- Predictions saved to {output_file} ---")

--- Group Statistics ---
           mean       std  count
robot                           
Alice  5.164286  1.405280     20
Clara  5.531746  1.376187     18


--- Group-Specific Random Model Predictions ---
    user_id  robot  Average  Group_Random_Prediction
0         4  Alice    4.000                    5.862
1         5  Clara    6.286                    7.549
2         6  Clara    6.571                    5.221
3         7  Clara    7.000                    5.625
4         8  Alice    6.429                    4.970
5         9  Clara    5.000                    3.571
6        10  Clara    4.714                    4.783
7        11  Clara    6.571                    5.684
8        12  Alice    3.571                    6.074
9        13  Alice    5.286                    7.305
10       14  Clara    3.000                    3.948
11       15  Alice    3.000                    4.835
12       16  Alice    5.286                    4.835
13       17  Alice    6.857                    7.38