In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

remove_attention_failers = 0 #Remove participants who failed attention checks (deprecated for first study as they are all replaced)

In [12]:

def extract_basic_info(csv_path):
    df = pd.read_csv(csv_path)


    # Food reported across training trials
    train_responses = df['slider_train.response'].dropna().tolist()

    #Actual food needed (food needed on each training trial)
    food_amount = df['food_amount'].dropna().tolist() [:-1]
    print(food_amount)
    
    # Feedback (free text response about experiment)
    col = 'feedback_text.text'

    if col not in df.columns:
        warnings.warn(
            f"Missing column '{col}' in {os.path.basename(csv_path)}"
        )
        feedback = []
    else:
        feedback = df[col].dropna().tolist()
    
    # Trial stop time (time it took to finish the training loop)
    isi_values = df['ISI.stopped'].dropna().tolist()

    #Get the ISI value for the last training trial, store it
    trial_stop_time = isi_values[-1] if isi_values else np.nan

    #First row with a non-empty value in 'images_list', which shows the order of testing images presented
    images_row = df[df['images_list'].notna()].iloc[0] if not df[df['images_list'].notna()].empty else None

    #Turn the images from PNGs to names
    images = [img.split('/')[-1].replace('.png','') for img in images_row['images_list'].split(',')]

    #First row with a non-empty value in 'sliderRatings', which shows the ratings for testing images
    ratings_row = df[df['sliderRatings'].notna()].iloc[0] if not df[df['sliderRatings'].notna()].empty else None

    #Turn them into floats split by commas
    ratings = [float(r) for r in ratings_row['sliderRatings'].strip('[]').split(',')]

    #Across the training trials, add information about what feature was presented
    train_tail = df['tail'].dropna().tolist() [:-1]
    train_wing = df['wing'].dropna().tolist() [:-1]
    train_color = df['color'].dropna().tolist() [:-1]

    #Across the training trials, add information about feature relevance
    train_categories =  df['category'].dropna().tolist() [:-1]
    train_tail = df['tail_rel'].dropna().tolist() [:-1]
    train_wing = df['wing_rel'].dropna().tolist() [:-1]
    train_color = df['color_rel'].dropna().tolist() [:-1]

    #Extracting the relevant and irrelevant feature dimension info
    dims = {}
    cols = ['relevant_dim_1', 'relevant_dim_2', 'irrelevant_dim', 
            'color_high', 'color_low', 
            'wing_high', 'wing_low', 
            'tail_high', 'tail_low']

    for col in cols:
        vals = df[col].dropna().unique()

        if len(vals) == 0:
            dims[col] = np.nan
            print(f"Warning: No values found in {col}")

        elif len(vals) == 1:
            dims[col] = vals[0]

        else:
            warnings.warn(
                f"Multiple values found in {col}: {vals}"
            )
            dims[col] = vals[0] 

    print(dims)
    
    #Categories for the testing images, in the order shown
    test_categories = images_row['testing_categories'].split(',')
    
    #Write condition (this is the unique identifier for a certain order of trials)
    condition = images_row['condition'] if images_row is not None and 'condition' in images_row else np.nan

    #Add in the order of images during training
    training_image_order = [img.split('/')[-1].replace('.png','') 
                        for img in df['image_file'].dropna().tolist()] [:-1]


    #Updated code to get slider responses (subjective reports of feature relevance)

    slider_responses = {}
    features = ['wing', 'color', 'tail']
    for feat in features:
        # subset only rows of this feature
        if feat == 'wing':
            sub = df[df['feature'] == 'wings']
        else:
            sub = df[df['feature'] == feat]

        # 1. discrete
        disc = sub['discrete_slider.response']
        disc_val = disc[disc.notna() & (disc != "")].iloc[0] if (disc.notna() & (disc != "")).any() else np.nan
        print(disc_val)
        slider_responses[f'{feat}_discrete_slider.response'] = disc_val

        # 2. direction
        col = 'direction_response_label'

        #Warning if the column is missing commented out
        #As the collection works now, if they answer 'no' for all, this column won't exist - but this is fine as it means no direction to store
        if col not in sub.columns:
            #warnings.warn(
            #    f"Column '{col}' not found for feature '{feat}' in {os.path.basename(csv_path)}"
            #)
            dirc_val = np.nan
        else:
            dirc = sub[col]
            dirc_val = (
                dirc[dirc.notna() & (dirc != "")].iloc[0]
                if (dirc.notna() & (dirc != "")).any() and disc_val != 'No'
                else np.nan
            )

        slider_responses[f'{feat}_direction_slider.response'] = dirc_val


        # 3. continuous
        cont = sub['continuous_slider.response']
        cont_val = cont[cont.notna() & (cont != "")].iloc[0] if (cont.notna() & (cont != "")).any() and disc_val != 'No' else np.nan #only store continuous if discrete was "Yes"
        print(cont_val)
        slider_responses[f'{feat}_continuous_slider.response'] = cont_val

    #Adding attention check result
    att_row = df['answer_3_right.numClicks'].dropna()
    print('attention', att_row)
    if not att_row.empty:
        att_row = int(att_row.iloc[0])
    else:
        warnings.warn("No attention check data found.")

    print('attention', att_row)
    print('type', type(att_row))

    # Convert to pass/fail (1 = correct, 0 = incorrect)
    attention_check = 1 if att_row == 1 else 0


    result = {
        'participant': os.path.basename(csv_path)[:3], #participant number
        'training_responses': train_responses, #Response during training 
        'food_amount': food_amount, #real food amount displayed that trial
        'error': [abs(pred - actual) for pred, actual in zip(train_responses, food_amount)],
        'feedback': feedback,
        'trial_stop_time': trial_stop_time,
        'testing_image_order': images,
        'testing_responses': ratings,
        'training_categories': train_categories,
        'training_tail': train_tail,
        'training_wing': train_wing,
        'training_color': train_color,
        'testing_categories': test_categories,
        'conditionOrder': condition, 
        'training_image_order': training_image_order,
        'attention_check': attention_check,
        **dims
    }
    result.update(slider_responses)
    return result

topdir = '/Users/sm6511/Desktop/Prediction-Accomodation-Exp'
study = 'Study2.0Pilot'
dates = [
    '2026-01-30'
]
datadir = os.path.join(topdir, f'data/{study}/Predict')
cleaneddir = os.path.join(topdir, f'data/{study}/Cleaned')
all_participants = []

for fname in os.listdir(datadir):
    if fname.endswith('.csv') and fname:
        participant_id = fname[:3]
        if not any(d in fname for d in dates):
            continue
        csv_path = os.path.join(datadir, fname)
        print(csv_path)
        info = extract_basic_info(csv_path)
        all_participants.append(info)


df_all = pd.DataFrame(all_participants)
if remove_attention_failers:
    df_all = df_all[df_all['attention_check'] == 1]
    df_all.to_csv(os.path.join(cleaneddir, f'{study}Predict.csv'), index=False)
else:
    df_all.to_csv(os.path.join(cleaneddir, f'{study}Predict.csv'), index=False)

print(df_all[df_all['attention_check'] == 1])

/Users/sm6511/Desktop/Prediction-Accomodation-Exp/data/Study2.0Pilot/Predict/012_test_2026-01-30_12h21.14.417.csv
[8.0, 7.0, 4.0, 9.0, 3.0, 5.0, 3.0, 9.0, 3.0, 5.0, 3.0, 7.0, 5.0, 4.0, 5.0, 9.0, 5.0, 6.0, 4.0, 4.0, 6.0, 2.0, 3.0, 6.0]
{'relevant_dim_1': 'tail', 'relevant_dim_2': 'color', 'irrelevant_dim': 'wing', 'color_high': 'B', 'color_low': 'Y', 'wing_high': 'T', 'wing_low': 'N', 'tail_high': 'S', 'tail_low': 'C'}
Yes
3.0
No
nan
Yes
6.0
attention 39    1.0
Name: answer_3_right.numClicks, dtype: float64
attention 1
type <class 'int'>
/Users/sm6511/Desktop/Prediction-Accomodation-Exp/data/Study2.0Pilot/Predict/009_test_2026-01-30_13h45.02.240.csv
[7.0, 8.0, 5.0, 5.0, 8.0, 4.0, 3.0, 5.0, 4.0, 6.0, 5.0, 7.0, 2.0, 7.0, 4.0, 5.0, 4.0, 7.0, 4.0, 1.0, 6.0, 6.0, 8.0, 6.0]
{'relevant_dim_1': 'tail', 'relevant_dim_2': 'wing', 'irrelevant_dim': 'color', 'color_high': 'B', 'color_low': 'Y', 'wing_high': 'T', 'wing_low': 'N', 'tail_high': 'C', 'tail_low': 'S'}
Yes
3.0
No
nan
No
nan
attention 39 

In [14]:
print(np.sort(df_all['conditionOrder']))

[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149 150]


In [24]:
print(len(df_all['training_categories'][0]))

24


In [5]:
print(df_all['testing_categories'])

0    [medium, low, medium, medium, high, high, medi...
1    [high, medium, low, medium, medium, high, low,...
2    [medium, medium, high, low, high, medium, low,...
3    [medium, high, high, low, medium, medium, low,...
Name: testing_categories, dtype: object
