In [2]:
import pandas as pd
from pathlib import Path

# Directory holding one training-trial CSV per subject.
# NOTE(review): this is an absolute, machine-specific path; anyone re-running
# the notebook elsewhere has to edit it.
data_dir = Path("/Users/sm6511/Desktop/Prediction-Accomodation-Exp/TrialFiles/Main2-7")
file_template = "subj{:03d}_training.csv"

dfs = []
missing_subjects = []

# Subject IDs run from 1 to 420 inclusive; each file is tagged with its ID so
# rows stay attributable after concatenation.
for subj in range(1, 421):
    fname = data_dir / file_template.format(subj)
    if fname.exists():
        df = pd.read_csv(fname)
        df["subject"] = subj
        dfs.append(df)
    else:
        missing_subjects.append(subj)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that points at the actual cause instead.
if not dfs:
    raise FileNotFoundError(
        f"No files matching {file_template!r} were found in {data_dir}"
    )

# Missing files used to be skipped silently; report them so a partial load
# is not mistaken for a complete one.
if missing_subjects:
    print(f"Skipped {len(missing_subjects)} subjects with no file: {missing_subjects}")

all_data = pd.concat(dfs, ignore_index=True)

print(f"Loaded {all_data['subject'].nunique()} subjects")
print(f"Total rows: {len(all_data)}")

Loaded 420 subjects
Total rows: 10080


In [6]:
# Divisor that turns row counts into (presumably) subject counts.
# NOTE(review): 24 looks like the number of rows per subject
# (10080 rows / 420 subjects in the load output) -- confirm.
rows_per_subject = 24

# Collapse the two relevant-dimension columns into one set per row, so a
# dimension listed in both columns is only counted once for that row.
relevant_dims_per_row = all_data[["relevant_dim_1", "relevant_dim_2"]].apply(set, axis=1)

# One entry per (row, dimension) pair, tallied and rescaled.
relevant_counts = relevant_dims_per_row.explode().value_counts() / rows_per_subject

print("\nHow often each dimension was relevant:")
print(relevant_counts)



How often each dimension was relevant:
tail     315.0
color    315.0
wing     210.0
Name: count, dtype: float64


In [7]:
def count_unique_per_row(df, column, trials_per_subject=24):
    """Tally the values of ``column`` and rescale the counts by a fixed divisor.

    Rows are grouped by index label and each group is reduced to its set of
    distinct values, so a value repeated under the same index label is
    counted once. With a unique index this is a plain per-row tally.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column whose values are counted.
    trials_per_subject : int or float, default 24
        Divisor applied to the raw counts. The default keeps the previously
        hard-coded value; presumably the number of rows each subject
        contributes, so the result reads as a subject count -- confirm
        against the data.

    Returns
    -------
    pandas.Series
        Counts divided by ``trials_per_subject``, indexed by the distinct
        values of ``column`` (missing values are dropped by ``value_counts``).

    Raises
    ------
    ValueError
        If ``trials_per_subject`` is not strictly positive.
    """
    if trials_per_subject <= 0:
        raise ValueError("trials_per_subject must be a positive number")

    # One set of distinct values per index label.
    values_per_label = df[column].groupby(df.index).apply(set)

    # explode() yields one entry per (label, value) pair before counting.
    return values_per_label.explode().value_counts() / trials_per_subject

# Build all three tables before printing anything, one per "*_high" column.
tail_high_counts, color_high_counts, wing_high_counts = (
    count_unique_per_row(all_data, high_column)
    for high_column in ("tail_high", "color_high", "wing_high")
)

# Print each table under its heading, in tail / color / wing order.
for heading, counts in (
    ("\nTail high (C vs S):", tail_high_counts),
    ("\nColor high (B vs Y):", color_high_counts),
    ("\nWing high (T vs N):", wing_high_counts),
):
    print(heading)
    print(counts)



Tail high (C vs S):
tail_high
S    263.0
C    157.0
Name: count, dtype: float64

Color high (B vs Y):
color_high
B    261.0
Y    159.0
Name: count, dtype: float64

Wing high (T vs N):
wing_high
T    210.0
N    210.0
Name: count, dtype: float64


In [8]:
# Mean of food_amount within each value of the category column.
food_amount = all_data["food_amount"]
avg_food_by_category = food_amount.groupby(all_data["category"]).mean()

print("\nAverage food amount per category:")
print(avg_food_by_category)



Average food amount per category:
category
high      7.024206
low       3.131746
medium    5.014881
Name: food_amount, dtype: float64


In [9]:
# The two feature values each dimension column is compared against below.
feature_map = {
    "tail": ["C", "S"],
    "color": ["B", "Y"],
    "wing": ["T", "N"],
}

rows = []

for dim, values in feature_map.items():
    # Rows where this dimension is named in either relevant-dimension column.
    # It depends only on the dimension, so it is built once per dimension.
    dim_is_relevant = (
        all_data["relevant_dim_1"].eq(dim) | all_data["relevant_dim_2"].eq(dim)
    )

    for val in values:
        has_feature = all_data[dim].eq(val)

        # Two summary rows per feature value: relevant first, then not relevant.
        for relevant, selector in (
            (True, has_feature & dim_is_relevant),
            (False, has_feature & ~dim_is_relevant),
        ):
            rows.append(
                dict(
                    dimension=dim,
                    feature=val,
                    relevant=relevant,
                    avg_food=all_data.loc[selector, "food_amount"].mean(),
                )
            )

feature_food_summary = pd.DataFrame(rows)

print("\nAverage food amount per feature (relevant vs not):")
print(feature_food_summary)



Average food amount per feature (relevant vs not):
   dimension feature  relevant  avg_food
0       tail       C      True  4.694974
1       tail       C     False  5.108730
2       tail       S      True  5.363757
3       tail       S     False  5.086508
4      color       B      True  5.392063
5      color       B     False  5.023016
6      color       Y      True  4.707937
7      color       Y     False  5.048413
8       wing       T      True  5.082540
9       wing       T     False  5.025794
10      wing       N      True  5.050794
11      wing       N     False  5.026587
