In [1]:
import numpy as np
import pandas as pd

## Convert .xlsx file into pandas dataframe

In [2]:
# Read every column as generic Python objects so the digit strings in
# 'left'/'right' and the integer counts are taken exactly as pandas parsed them.
raw = pd.read_excel("fiedler_raw.xlsx", header=0, dtype='object')

columns1 = ['type', 'pid', 'id', 'difficulty', 'trial', 'position', 'cue', 'value', 'fraction_sampled', 'sampled_cues', 'max_cues',
            'chosen', 'target', 'accuracy', 'cue_choice_aligned']
columns2 = ['type', 'pid', 'id', 'dP', 'trial', 'accuracy', 'cues', 'max_cues']

# Difficulty label for each nominal delta. Looking the label up per trial
# (instead of three independent `if delta == ...` tests) guarantees that an
# unexpected delta can never silently inherit the previous trial's label.
difficulty_labels = {0.4: 'easy', 0.2: 'moderate', 0.1: 'hard'}

# Plain Python records are collected and turned into DataFrames once at the
# end. Building (and concatenating) one single-row DataFrame per sample is
# slow, and the all-None placeholder rows for zero-cue trials can make the
# dtype inference of pd.concat depend on the pandas version.
rows1 = []  # one record per sampled cue, plus one placeholder per zero-cue trial
rows2 = []  # one record per trial

sid = 0  # running integer id; incremented once per participant_id, in order of first appearance
for pid in raw['participant_id'].unique():
    print(f'adding data for {pid}')
    d1 = raw.query("participant_id==@pid")
    for trial, timestamp in enumerate(d1['timestamp'].unique()):
        # A trial is identified by its timestamp; only the first matching row is read.
        first = d1.query("timestamp==@timestamp").iloc[0]
        sampled_cues = first['ticks']
        max_cues = first['sample_size']
        started = 'R' if first['started_right'] == 1 else 'L'
        other = 'L' if started == 'R' else 'R'
        # 'left' / 'right' are read as one digit per sample drawn from that side.
        Ls = [int(d) for d in str(first['left'])]
        Rs = [int(d) for d in str(first['right'])]
        delta = float(first['delta'])
        # round() absorbs float noise that an exact == comparison would miss.
        difficulty = difficulty_labels.get(round(delta, 6))
        if difficulty is None:
            raise ValueError(
                f"participant {pid}, trial {trial}: unexpected delta {delta!r}; "
                f"expected one of {sorted(difficulty_labels)}"
            )
        chosen = 'R' if first['selected_right'] == 1 else 'L'
        target = 'R' if first['p2'] > first['p1'] else 'L'
        accuracy = 1.0 if chosen == target else 0.0
        for p in range(sampled_cues):
            # Samples alternate between sides, beginning with the starting side.
            cue = started if p % 2 == 0 else other
            # The p-th sample overall is the (p // 2)-th sample of its own side.
            value = Rs[p // 2] if cue == 'R' else Ls[p // 2]
            fraction_sampled = (p + 1) / sampled_cues
            # +1 when the sample supports the final choice: a 1 from the chosen
            # side or a 0 from the other side; -1 in every other case.
            supports_choice = (value == 1) if cue == chosen else (value == 0)
            cue_choice_aligned = 1.0 if supports_choice else -1.0
            rows1.append([
                'human', pid, sid, difficulty, trial, p, cue, value, fraction_sampled, sampled_cues, max_cues,
                chosen, target, accuracy, cue_choice_aligned
            ])
        if sampled_cues == 0:  # participant chose before sampling any cues
            # Placeholder row (position -1, no cue data) so the trial still appears in fiedler1.
            rows1.append([
                'human', pid, sid, difficulty, trial, -1, None, None, None, 0, max_cues, chosen, target, accuracy, None
            ])
        rows2.append(['human', pid, sid, delta, trial, accuracy, sampled_cues, max_cues])
    sid += 1

# NOTE(review): numeric columns that are None on placeholder rows ('value',
# 'fraction_sampled', 'cue_choice_aligned') are expected to come out as float
# with NaN here - confirm downstream code does not rely on integer 'value'.
fiedler1 = pd.DataFrame(rows1, columns=columns1)
fiedler2 = pd.DataFrame(rows2, columns=columns2)

adding data for 11e92cd2764348faa18918c94947d4fa
adding data for e00c31bc24424be5986b63504ef2572c
adding data for 58c54d6d2775404a9c3a3cde65c32a71
adding data for 159be9483bdc4dac871458482bbe7c64
adding data for 54bbae95c6954858b3c4546072c7c7a6
adding data for a1d579a793c1449b944b66bd66e498c9
adding data for 4c000592ba1641d4b00cda3032684321
adding data for d0dd4b374ab345b5ad09786280bf9ecd
adding data for d552b2c7fb97451d97de0c3d489d46a4
adding data for 147db44cbc514f589f7b158c93cbe072
adding data for 81289118898b4d4a8f65e96e38a10e43
adding data for 3148f74b1ffb43cea29844334282c2bd
adding data for c8ee025f8b60429bb194e1000f0297a1
adding data for 47e5820c96ec48078f611d29178ecd3e
adding data for fdcba947f0ec44d1b3e2ee0053c9449f
adding data for 75bc02b1b30147bfbeef0623ef76c315
adding data for 729d32346fde461a910c7887d7860f39
adding data for 9ab9798ae227487385e959254bfbd1d8
adding data for af39bdbe8af44ac4a48e709f78a6ac4d
adding data for 54b5eb179e8d4105aebda1e14df103f4
adding data for 7109

  fiedler1 = pd.concat(dfs1, ignore_index=True)


## Drop participants that don't meet the inclusion criteria

In [3]:
# Inclusion thresholds applied to each participant.
min_trials = 30           # participants with fewer distinct trials than this are cut
max_zero_cue_trials = 30  # participants with MORE than this many zero-cue trials are cut

exclude = []
for pid in fiedler2['pid'].unique():
    is_pid = fiedler2['pid'] == pid
    n_trials = len(fiedler2.loc[is_pid, 'trial'].unique())
    n_zero_cue = len(fiedler2.loc[is_pid & (fiedler2['cues'] == 0), 'trial'].unique())
    if n_trials < min_trials:
        # too few completed trials
        print(f"cutting {pid}, who only completed {n_trials} trials")
        exclude.append(pid)
    elif n_zero_cue > max_zero_cue_trials:
        # chose before sampling any evidence on too many trials
        print(f"cutting {pid}, who failed to sample any cues on {n_zero_cue} trials")
        exclude.append(pid)

# Remove every row belonging to an excluded participant from both tables.
for pid in exclude:
    fiedler1 = fiedler1[fiedler1['pid'] != pid]
    fiedler2 = fiedler2[fiedler2['pid'] != pid]

# For the remaining participants, drop the zero-cue placeholder rows from the
# per-sample table only; the per-trial table keeps those trials.
fiedler1 = fiedler1[fiedler1['sampled_cues'] != 0]

fiedler1.to_pickle("fiedler_position.pkl")  # one row per sample within a trial
fiedler2.to_pickle("fiedler_trial.pkl")  # one row per trial (collapsed)

cutting 11e92cd2764348faa18918c94947d4fa, who only completed 1 trials
cutting d552b2c7fb97451d97de0c3d489d46a4, who failed to sample any cues on 902 trials
cutting c8ee025f8b60429bb194e1000f0297a1, who only completed 18 trials
cutting 75bc02b1b30147bfbeef0623ef76c315, who only completed 11 trials
cutting 729d32346fde461a910c7887d7860f39, who only completed 6 trials
cutting 9ab9798ae227487385e959254bfbd1d8, who failed to sample any cues on 834 trials
cutting af39bdbe8af44ac4a48e709f78a6ac4d, who only completed 2 trials
cutting 54b5eb179e8d4105aebda1e14df103f4, who only completed 21 trials


In [6]:
# Spot check: trials of participant id 1 in the max_cues == 12 condition.
fiedler2[(fiedler2['id'] == 1) & (fiedler2['max_cues'] == 12)]

Unnamed: 0,type,pid,id,dP,trial,accuracy,cues,max_cues
1,human,e00c31bc24424be5986b63504ef2572c,1,0.4,0,1.0,22,12
3,human,e00c31bc24424be5986b63504ef2572c,1,0.4,2,1.0,21,12
8,human,e00c31bc24424be5986b63504ef2572c,1,0.4,7,0.0,0,12
10,human,e00c31bc24424be5986b63504ef2572c,1,0.1,9,0.0,1,12
13,human,e00c31bc24424be5986b63504ef2572c,1,0.1,12,1.0,24,12
15,human,e00c31bc24424be5986b63504ef2572c,1,0.1,14,1.0,24,12
16,human,e00c31bc24424be5986b63504ef2572c,1,0.2,15,1.0,24,12
17,human,e00c31bc24424be5986b63504ef2572c,1,0.2,16,1.0,14,12
18,human,e00c31bc24424be5986b63504ef2572c,1,0.2,17,1.0,24,12
21,human,e00c31bc24424be5986b63504ef2572c,1,0.4,20,1.0,16,12


In [9]:
# Sanity check: reload the per-trial table that the exclusion cell wrote to
# "fiedler_trial.pkl" and display it.
pd.read_pickle("fiedler_trial.pkl")

Unnamed: 0,type,pid,id,dP,trial,accuracy,cues,max_cues
1,human,e00c31bc24424be5986b63504ef2572c,1,0.4,0,1.0,22,12
2,human,e00c31bc24424be5986b63504ef2572c,1,0.2,1,1.0,18,18
3,human,e00c31bc24424be5986b63504ef2572c,1,0.4,2,1.0,21,12
4,human,e00c31bc24424be5986b63504ef2572c,1,0.2,3,1.0,13,18
5,human,e00c31bc24424be5986b63504ef2572c,1,0.1,4,1.0,36,18
...,...,...,...,...,...,...,...,...
13394,human,a347f4a790a24f8fbcc5a41b458329cf,62,0.2,157,1.0,10,12
13395,human,a347f4a790a24f8fbcc5a41b458329cf,62,0.4,158,1.0,14,18
13396,human,a347f4a790a24f8fbcc5a41b458329cf,62,0.1,159,0.0,33,18
13397,human,a347f4a790a24f8fbcc5a41b458329cf,62,0.1,160,1.0,22,18


## Bin and average ```fraction_sampled``` and ```cue_choice_aligned``` data from ```fiedler1``` (saved above as ```fiedler_position.pkl```)

In [4]:
# Bin edges over fraction_sampled; each bin is the half-open interval (left, right].
bins = np.arange(0.0, 1.2, 0.2)
columns = ['type', 'pid', 'id', 'difficulty', 'max_cues', 'fraction_sampled', 'mean_cue_choice_aligned', 'std_cue_choice_aligned']

# Collect plain records and build the DataFrame once, instead of creating and
# concatenating a one-row DataFrame per (participant, condition, bin).
records = []
for pid in fiedler1['pid'].unique():
    of_pid = fiedler1[fiedler1['pid'] == pid]
    sid = of_pid['id'].unique()[0]
    for difficulty in fiedler1['difficulty'].unique():
        for max_cues in fiedler1['max_cues'].unique():
            data = of_pid[(of_pid['difficulty'] == difficulty) & (of_pid['max_cues'] == max_cues)]
            # Coerce to float so np.std also works if these columns were stored as object dtype.
            fractions = data['fraction_sampled'].to_numpy(dtype=float)
            aligned = data['cue_choice_aligned'].to_numpy(dtype=float)
            for left, right in zip(bins[:-1], bins[1:]):
                midpoint = (left + right) / 2  # the bin is reported by its midpoint
                in_bin = aligned[(fractions > left) & (fractions <= right)]
                if in_bin.size:
                    mean = np.mean(in_bin)
                    std = np.std(in_bin)
                else:
                    # Empty bin: still record NaN (as np.mean/np.std of an empty
                    # array would), but without triggering RuntimeWarnings.
                    mean = std = np.nan
                records.append(['human', pid, sid, difficulty, max_cues, midpoint, mean, std])
fiedler3 = pd.DataFrame(records, columns=columns)
fiedler3.to_pickle("fiedler_binned.pkl")  # one row per participant x difficulty x max_cues x bin (mean and std)

## Old

In [None]:
def old_extraction():
    """Legacy extraction: clean "fiedler_raw.xlsx" and pickle it as "fiedler.pkl".

    Steps, in order:
      1. drop rows whose ``tag`` marks them as test sessions, then drop ``tag``;
      2. rename ``sample_size`` -> ``max_cues``, ``ticks`` -> ``cues``,
         ``left``/``right`` -> ``A``/``B`` and ``p1``/``p2`` -> ``pA``/``pB``;
      3. add ``accuracy`` (1.0 when ``selected_right`` agrees with ``pB > pA``,
         otherwise 0.0);
      4. drop bookkeeping columns that are no longer needed;
      5. drop a fixed list of excluded participants.

    Takes no arguments and returns nothing; the result is written to disk.
    """
    data = pd.read_excel("fiedler_raw.xlsx", header=0)

    # Remove test data, then the now-redundant "tag" field. Rows without a
    # tag (NaN) never match the list below and are therefore kept.
    test_tags = ["Testversion", "TEST JOHANNES / LOL", "999", "Johannes Test 2"]
    data = data[~data['tag'].isin(test_tags)]
    data = data.drop(columns=['tag'])

    # rename sample_size to max_cues
    data = data.rename(columns={'sample_size': 'max_cues'})
    # rename ticks to cues, left to A, and right to B, p1 to pA, p2 to pB
    data = data.rename(columns={'ticks': 'cues', 'left': 'A', 'right': 'B', 'p1': 'pA', 'p2': 'pB'})

    # Add an "accuracy" column: the choice is correct when "selected_right"
    # matches whether option B has the higher probability.
    chosen_answer = data['selected_right'].to_numpy()
    correct_answer = data['pB'].to_numpy() > data['pA'].to_numpy()
    correct = chosen_answer == correct_answer
    data['accuracy'] = 1.0*correct

    data = data.drop(columns=[
        'timestamp', 'subrange_key', 'duration_ms', 'empirical_delta', 'empirical_p1', 'empirical_p2',
        'selected_right', 'started_right', 'last_left', 'last_right',
    ])

    # remove bad participants (choose before 1st cue, or have insufficient trials)
    bad_participants = [
        "11e92cd2764348faa18918c94947d4fa",
        "d552b2c7fb97451d97de0c3d489d46a4",
        "9ab9798ae227487385e959254bfbd1d8",
        "c49874f64a5346c6afa3d847f48ea9e6",
    ]
    data = data[~data['participant_id'].isin(bad_participants)]
    data.to_pickle("fiedler.pkl")

In [None]:
def condense_fiedler():
    """Collapse "data/fiedler2021.pkl" to one summary row per participant and dP.

    For every participant ``id`` and each dP in (0.4, 0.2, 0.1) the mean and
    the standard deviation of ``cues`` and ``accuracy`` are computed. Note
    that the output columns named "var cues" / "var acc" hold the result of
    ``.std()``. The summary is pickled to "data/fiedler2021_condensed.pkl";
    nothing is returned.
    """
    columns = ('type', 'id', 'dP', 'mean cues', 'var cues', 'mean acc', 'var acc')
    trials = pd.read_pickle("data/fiedler2021.pkl")
    summaries = []
    for participant in trials['id'].unique():
        of_participant = trials['id'] == participant
        for delta in (0.4, 0.2, 0.1):
            cell = trials[of_participant & (trials['dP'] == delta)]
            cues = cell['cues']
            acc = cell['accuracy']
            summary = ['human', participant, delta, cues.mean(), cues.std(), acc.mean(), acc.std()]
            summaries.append(pd.DataFrame([summary], columns=columns))
    condensed = pd.concat(summaries, ignore_index=True)
    condensed.to_pickle("data/fiedler2021_condensed.pkl")

In [None]:
def plot_condensed():
    """Plot mean accuracy against mean cues from the condensed Fiedler data.

    Reads "data/fiedler2021_condensed.pkl" and draws two figures:
      * a 1x3 matplotlib figure (one panel per dP in 0.4, 0.2, 0.1) of
        "mean acc" vs "mean cues" with horizontal error bars from "var cues";
      * a seaborn FacetGrid scatter plot of the same quantities, saved to
        "plots/facet2.svg" and "plots/facet2.png".

    Returns nothing.
    """
    # NOTE(review): `plt`, `sns` and `palette` are not defined anywhere in the
    # visible part of this notebook; they must be imported/defined (e.g. in the
    # import cell) before this function can run - confirm the intended palette.
    new_emp = pd.read_pickle("data/fiedler2021_condensed.pkl")  # was a bare `read_pickle` (NameError)
    fig, axes = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(7, 2))
    for i, dP in enumerate([0.4, 0.2, 0.1]):
        subdata = new_emp.query("dP==@dP")
        x = subdata['mean cues'].to_numpy()
        y = subdata['mean acc'].to_numpy()
        xerr = subdata['var cues'].to_numpy()
        yerr = subdata['var acc'].to_numpy()
        # yerr is multiplied by zero, so no vertical error bars are drawn.
        axes[i].errorbar(x, y, xerr=xerr, yerr=0*yerr, fmt="o")
    plt.tight_layout()

    # Separate name for the FacetGrid so the matplotlib figure above is not shadowed.
    grid = sns.FacetGrid(new_emp, row="type", col="dP", col_order=[0.4, 0.2, 0.1], palette=palette, height=2, aspect=1)
    grid.map_dataframe(sns.scatterplot, x="mean cues", y="mean acc")
    grid.set_xlabels("Mean Cues")
    grid.set_ylabels("Mean Accuracy")
    grid.set(yticks=[50, 60, 70, 80, 90, 100])
    grid.add_legend()
    grid.savefig("plots/facet2.svg")
    grid.savefig("plots/facet2.png", dpi=600)

In [None]:
def remake_fiedler():
    """Rebuild "data/fiedler2021.pkl" from "data/empirical.pkl".

    Participants are visited in order of first appearance and labelled with
    their enumeration index as a string. Within each participant, trials are
    grouped by dP (0.4, then 0.2, then 0.1) and numbered from 0 inside each
    group. ``accuracy`` is stored as ``100 * correct``. One output row is
    written per input row; nothing is returned.
    """
    emp = pd.read_pickle("data/empirical.pkl")
    columns = ('type', 'id', 'dP', 'trial', 'accuracy', 'cues', 'max_cues')
    frames = []
    for i, pid in enumerate(emp['participant_id'].unique()):
        of_participant = emp['participant_id'] == pid
        for dP in (0.4, 0.2, 0.1):
            subset = emp[of_participant & (emp['delta'] == dP)]
            # The trial counter restarts for every (participant, dP) pair.
            for trial, (_, row) in enumerate(subset.iterrows()):
                record = ["human", f"{i}", dP, trial, 100*row['correct'], row['cues'], row['maxSamples']]
                frames.append(pd.DataFrame([record], columns=columns))
    result = pd.concat(frames, ignore_index=True)
    result.to_pickle("data/fiedler2021.pkl")