In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

remove_attention_failers = 0

In [7]:

def extract_basic_info(csv_path):
    df = pd.read_csv(csv_path)


    # Food reported across training trials
    train_responses = df['slider_train.response'].dropna().tolist()

    #Actual food needed (food needed on each training trial)
    food_amount = df['food_amount'].dropna().tolist()
    
    # Feedback (free text response about experiment)
    feedback = df['feedback_text.text'].dropna().tolist()
    
    # Trial stop time (time it took to finish the training loop)
    isi_values = df['ISI.stopped'].dropna().tolist()

    #Get the ISI value for the last training trial, store it
    trial_stop_time = isi_values[-1] if isi_values else np.nan

    #First row with a non-empty value in 'images_list', which shows the order of testing images presented
    images_row = df[df['images_list'].notna()].iloc[0] if not df[df['images_list'].notna()].empty else None

    #Turn the images from PNGs to names
    images = [img.split('/')[-1].replace('.png','') for img in images_row['images_list'].split(',')]

    #First row with a non-empty value in 'sliderRatings', which shows the ratings for testing images
    ratings_row = df[df['sliderRatings'].notna()].iloc[0] if not df[df['sliderRatings'].notna()].empty else None

    #Turn them into floats split by commas
    ratings = [float(r) for r in ratings_row['sliderRatings'].strip('[]').split(',')]

    #Across the training trials, add information about what feature was presented
    train_tail = df['tail'].dropna().tolist()
    train_shape = df['shape'].dropna().tolist()
    train_color = df['color'].dropna().tolist()

    #Categories for training images, and the actual relevance of each feature during training
    train_categories =  df['category'].dropna().tolist()
    train_tail = df['tail_rel'].dropna().tolist()
    train_shape = df['shape_rel'].dropna().tolist()
    train_color = df['color_rel'].dropna().tolist()
    
    #Categories for the testing images, in the order shown
    test_categories = images_row['testing_categories'].split(',')
    
    #Write condition (this is the unique identifier for a certain order of trials)
    condition = images_row['condition'] if images_row is not None and 'condition' in images_row else np.nan

    #Add in the order of images during training
    training_image_order = [img.split('/')[-1].replace('.png','') 
                        for img in df['image_file'].dropna().tolist()]


    #Updated code to get slider responses (subjective reports of feature relevance)

    slider_responses = {}
    features = ['shape', 'color', 'tail']
    for feat in features:
        # subset only rows of this feature
        sub = df[df['feature'] == feat]

        # 1. discrete
        disc = sub['discrete_slider.response']
        disc_val = disc[disc.notna() & (disc != "")].iloc[0] if (disc.notna() & (disc != "")).any() else np.nan
        slider_responses[f'{feat}_discrete_slider.response'] = disc_val
        print(disc_val)
        # 2. direction (only for "Yes" responses)
        dirc = sub['direction_response_label']
        dirc_val = dirc[dirc.notna() & (dirc != "")].iloc[0] if (dirc.notna() & (dirc != "")).any() and disc_val != 'No' else np.nan
        slider_responses[f'{feat}_direction_slider.response'] = dirc_val

        # 3. continuous
        cont = sub['continuous_slider.response']
        cont_val = cont[cont.notna() & (cont != "")].iloc[0] if (cont.notna() & (cont != "")).any() and disc_val != 'No' else np.nan
        slider_responses[f'{feat}_continuous_slider.response'] = cont_val
        print(cont_val)

    #Adding attention check result
    att_rows = df[df['answer_3_right.numClicks'].notna()]
    if not att_rows.empty:
        att_val = att_rows.iloc[0]['answer_3_right.numClicks']
    else:
        att_val = np.nan

    # Convert to pass/fail (1 = correct, 0 = incorrect)
    attention_check = 1 if att_val == 1 else 0


    result = {
        'participant': os.path.basename(csv_path)[:3], #participant number
        'training_responses': train_responses, #Response during training 
        'food_amount': food_amount, #real food amount displayed that trial
        'error': [abs(pred - actual) for pred, actual in zip(train_responses, food_amount)],
        'feedback': feedback,
        'trial_stop_time': trial_stop_time,
        'testing_image_order': images,
        'testing_responses': ratings,
        'training_categories': train_categories,
        'training_tail': train_tail,
        'training_shape': train_shape,
        'training_color': train_color,
        'testing_categories': test_categories,
        'conditionOrder': condition, #
        'training_image_order': training_image_order,
        'attention_check': attention_check
    }
    result.update(slider_responses)
    return result

topdir = '/Users/sm6511/Desktop/Prediction-Accomodation-Exp'
study = 'PrePilot2.0'
date = '2025-12-16'
datadir = os.path.join(topdir, f'data/{study}/Predict')
cleaneddir = os.path.join(topdir, f'data/{study}/Cleaned')
all_participants = []

for fname in os.listdir(datadir):
    if fname.endswith('.csv') and fname:
        participant_id = fname[:3]
        if date not in fname:  # skip files
            continue
        csv_path = os.path.join(datadir, fname)
        print(csv_path)
        info = extract_basic_info(csv_path)
        all_participants.append(info)


df_all = pd.DataFrame(all_participants)
if remove_attention_failers:
    df_all = df_all[df_all['attention_check'] == 1]
    df_all.to_csv(os.path.join(cleaneddir, f'{study}Predict.csv'), index=False)
else:
    df_all.to_csv(os.path.join(cleaneddir, f'{study}Predict.csv'), index=False)

#print(df_all[df_all['attention_check'] == 0])
print(df_all)

/Users/sm6511/Desktop/Prediction-Accomodation-Exp/data/PrePilot2.0/Predict/002_test_2025-12-16_14h11.08.917.csv
No
nan
Yes
5.0
Yes
6.0
/Users/sm6511/Desktop/Prediction-Accomodation-Exp/data/PrePilot2.0/Predict/004_test_2025-12-16_11h27.18.921.csv
Yes
5.0
Yes
7.0
Yes
6.0
/Users/sm6511/Desktop/Prediction-Accomodation-Exp/data/PrePilot2.0/Predict/001_test_2025-12-16_13h03.45.534.csv
Yes
4.0
Yes
5.0
Yes
5.0
/Users/sm6511/Desktop/Prediction-Accomodation-Exp/data/PrePilot2.0/Predict/003_test_2025-12-16_14h42.31.889.csv
No
nan
Yes
4.0
Yes
6.0
  participant                                 training_responses  \
0         002  [5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...   
1         004  [5.0, 9.0, 8.0, 8.0, 7.0, 8.0, 5.0, 6.0, 7.0, ...   
2         001  [2.0, 2.0, 4.0, 7.0, 5.0, 4.0, 7.0, 5.0, 6.0, ...   
3         003  [3.0, 5.0, 5.0, 4.0, 3.0, 3.0, 6.0, 7.0, 5.0, ...   

                                         food_amount  \
0  [8.0, 6.0, 6.0, 3.0, 6.0, 4.0, 3.0, 10.0, 8.0,...   
1  [1

In [4]:
print(df_all['conditionOrder'])

0    2
1    4
2    1
3    3
Name: conditionOrder, dtype: int64


In [5]:
print(df_all['testing_categories'])

0    [medium, low, medium, medium, high, high, medi...
1    [high, medium, low, medium, medium, high, low,...
2    [medium, medium, high, low, high, medium, low,...
3    [medium, high, high, low, medium, medium, low,...
Name: testing_categories, dtype: object
