# Python Session 1

## We will practice cleaning some Food choice task data

We are going to generate data from 20 individuals to practice our skills. In the task, participants rate 50 foods for healthiness, tastiness and choice. We are simulating this data below.



In [1]:
import pandas as pd
import numpy as np
import random

In [3]:
import random
import pandas as pd

# Define blocks and trial structure
blocks = ['health', 'taste', 'choice']#a list
trials_per_block = 75
participants = range(1, 21)

# Generate 50 unique foods
base_foods = [
    'apple', 'banana', 'burger', 'carrot', 'donut', 'eggs', 'fries', 'grapes', 'ice cream', 'kale',
    'pizza', 'yogurt', 'spinach', 'steak', 'candy', 'popcorn', 'mango', 'nuts', 'cheese', 'chicken',
    'broccoli', 'chocolate', 'granola', 'lettuce', 'pasta', 'salmon', 'tofu', 'soda', 'rice', 'beans',
    'cucumber', 'peach', 'bacon', 'cereal', 'toast', 'avocado', 'beef', 'peanut butter', 'cake', 'milk',
    'watermelon', 'pear', 'turkey', 'onion rings', 'oatmeal', 'cranberries', 'syrup', 'waffles', 'cookie', 'shrimp'
]
assert len(base_foods) == 50#assert is a true/false check

# Assign fat and sugar levels randomly
food_properties = {}
for food in base_foods:
    fat = random.choices(['high', 'low'], weights=[0.4, 0.6])[0]
    sugar = random.choices(['high', 'low'], weights=[0.5, 0.5])[0]
    food_properties[food] = {'fat': fat, 'sugar': sugar}

# Generate trials
all_trials = []

for participant in participants:
    for block in blocks:
        for trial_num in range(1, trials_per_block + 1):
            food = random.choice(base_foods)
            rt_missing = random.random() < 0.02  # 2% chance of missing RT
            reaction_time = None if rt_missing else round(random.uniform(0.5, 4.0), 2)
            rating = None if reaction_time is None else random.randint(1, 10)

            fat = food_properties[food]['fat']
            sugar = food_properties[food]['sugar']

            trial = {
                'participant': participant,
                'block': block,
                'trial_number': trial_num,
                'food': food,
                'reaction_time': reaction_time,
                'rating': rating,
                'fat': fat,
                'sugar': sugar
            }
            all_trials.append(trial)

# Create DataFrame
df = pd.DataFrame(all_trials)

# Validate logic: rating is only missing if RT is missing
assert all(df[df['rating'].isna()]['reaction_time'].isna())

The data are stored in a dataframe object, which we have called df
To access items in the dataframe, we need to type "df"

In [4]:
#If we want to see the data, we can just type
df

Unnamed: 0,participant,block,trial_number,food,reaction_time,rating,fat,sugar
0,1,health,1,cheese,2.04,7.0,low,low
1,1,health,2,pear,2.16,8.0,low,low
2,1,health,3,pear,1.07,10.0,low,low
3,1,health,4,cheese,0.89,8.0,low,low
4,1,health,5,pizza,3.38,10.0,low,high
...,...,...,...,...,...,...,...,...
4495,20,choice,71,granola,1.14,7.0,low,low
4496,20,choice,72,cereal,3.65,8.0,low,high
4497,20,choice,73,chocolate,3.23,2.0,low,high
4498,20,choice,74,cereal,0.90,4.0,low,high


In [None]:
# To see anything in df we will need to reference df first
df.columns

Index(['participant', 'block', 'trial_number', 'food', 'reaction_time',
       'rating', 'fat', 'sugar'],
      dtype='object')

In [None]:
# We can also look at the values of columns
# All of these will access the food column
df.food
df['food']
df.iloc[:,3]

0         chicken
1           beans
2         granola
3            kale
4        cucumber
          ...    
4495        syrup
4496    ice cream
4497       salmon
4498        candy
4499       burger
Name: food, Length: 4500, dtype: object

In [6]:
# Try here with RT
df.reaction_time
df['reaction_time']
df.iloc[:,5]
df.loc[:'reaction_time']

SyntaxError: invalid syntax (<ipython-input-6-1873539907>, line 5)

In [7]:
# To analyze this data, we will first need to remove any missing trials
# let's find the missing values
df.reaction_time[df.reaction_time.isna()==True]

df =

In [None]:
max(df.reaction_time)

In [8]:
df.reaction_time[df.reaction_time > 3]

# What would we change to see RTs < 2 only?

Unnamed: 0,reaction_time
4,3.38
8,3.59
13,3.22
18,3.14
24,3.35
...,...
4491,3.07
4492,3.21
4493,3.60
4496,3.65


In [None]:
# make a new data frame with no missing values
df1 = df[df.reaction_time.isna()==True]

In [None]:
# Now we want to perform some calculations on this data-set
# let's start by summarizing, for one person the health rating

# Filter for participant 1 and the 'health' block
participant_id = 1
health_block = df[(df['participant'] == participant_id) & (df['block'] == 'health')]

# Remove missing ratings (i.e., where RT was missing)
valid_ratings = health_block['rating'].dropna()

# Calculate the average health rating
average_health_rating = valid_ratings.mean()

print(f"Participant {participant_id}'s average health rating: {average_health_rating:.2f}")


Participant 1's average health rating: 5.12


In [9]:
#Try for health only for low and high-fat
participant_id = 1
health_block = df[(df['participant'] == participant_id) & (df['block'] == 'health') & (df['fat'] == 'low')]

#advanced - try grouping by fat content
health_block.groupby('fat').mean('rating')




Unnamed: 0_level_0,participant,trial_number,reaction_time,rating
fat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
low,1.0,36.642857,2.310179,5.892857


In [None]:
#Now let's create a new dataframe and store each persons average RT and rating for high and low fat foods

# Group by participant, block, and fat level
summary_df = (
    df
    .dropna(subset=['rating', 'reaction_time'])  # Exclude trials with missing values
    .groupby(['participant', 'block', 'fat'])
    .agg(
        average_rating=('rating', 'mean'),
        average_reaction_time=('reaction_time', 'mean'),
        trial_count=('rating', 'count')  # Optional: to see how many valid trials per group
    )
    .reset_index()
)

print(summary_df.head())


   participant   block   fat  average_rating  average_reaction_time  \
0            1  choice  high        5.720000               2.154800   
1            1  choice   low        5.446809               2.444468   
2            1  health  high        5.612903               2.066452   
3            1  health   low        4.772727               2.563182   
4            1   taste  high        5.040000               2.114400   

   trial_count  
0           25  
1           47  
2           31  
3           44  
4           25  


In [None]:
# Pivot to wide format
wide_df = summary_df.pivot_table(
    index='participant',
    columns=['block', 'fat'],
    values=['average_rating', 'average_reaction_time']
)




In [None]:
# Step 3: Flatten column names
wide_df.columns = [f'{stat}_{block}_{fat}' for stat, block, fat in wide_df.columns]
wide_df = wide_df.reset_index()

print(wide_df.head())

   participant  average_rating_choice_high  average_rating_choice_low  \
0            1                    5.720000                   5.446809   
1            2                    5.538462                   5.086957   
2            3                    5.750000                   5.681818   
3            4                    5.533333                   5.600000   
4            5                    6.000000                   5.800000   

   average_rating_health_high  average_rating_health_low  \
0                    5.612903                   4.772727   
1                    5.514286                   6.000000   
2                    5.862069                   4.906977   
3                    5.714286                   5.638298   
4                    5.851852                   5.795455   

   average_rating_taste_high  average_rating_taste_low  \
0                   5.040000                  6.240000   
1                   6.607143                  4.666667   
2                   5.5454

In [None]:
# Here try and simulate a different dataset - a monetary choice task where the participant
# selects between an immediate vs delayed reward. Compare the RT between when the participant
# chooses the immediate vs delayed option

In [None]:
# navigate to the directory
data=pd.read_csv("https://raw.githubusercontent.com/CaitlinLloyd/Psychology_Programming2025/refs/heads/main/Data/DelayDisc_example.csv")

In [None]:
# figure out whether left or right column is delayed (1 is left, 2 is right)
data['delayed_opt']= "none"
data.loc[data['delay_left'] < data['delay_right'],'delayed_opt'] ==2
data.loc[data['delay_left'] > data['delay_right'],'delayed_opt'] ==1

0      False
1      False
2      False
3      False
4      False
       ...  
109    False
111    False
112    False
114    False
117    False
Name: delayed_opt, Length: 62, dtype: bool

In [None]:
# Now summarize the RT for each person when they chose delayed vs chose sooner reward



In [None]:
## Here calculate the average earnings per person and the number of times they chose delayed vs sooner

## Upload solution to Github

## Extra
## These are hard exercises - not homework, for extra practice

In [None]:
# Hard

# Here simulate your own Delay Discounting Task and calculate some average metrics

In [None]:
# Very hard
# One outcome of interest is the discount rate, k, which denotes extent to which someone discounts
# value of delayed rewards (higher values = less patient)

# Here you can use chatGPT to get the formula for k - see whether you can calculate for each person
# in your dataset