In [1]:
# this code is adapted from Tyler's analysis code in his first replication:

import warnings; warnings.simplefilter('ignore')
import json, scipy
import pandas as pd
import numpy as np
import sys
import os



In [2]:
% pylab inline

UsageError: Line magic function `%` not found.


In [66]:
def preprocess_subject(subject_data, key_map): 
    
    subject_gen = subject_data[subject_data.stage=='generalization']
    subject_gen = subject_gen[subject_gen.valence!='control']
    subject_gen['distance'] = np.abs(subject_gen['distance'])
    
    # defines which key was positive and which was negative (p or q) (these are counterbalanced across subjects)
    if key_map=='first': 
        responses = {81:'negative',80:'positive',32:'neutral',None:None}
    else: 
        responses = {80:'negative',81:'positive',32:'neutral',None:None}
        
    subject_gen['response'] = [responses[i] for i in subject_gen.key_press.values]
    
    return subject_gen

In [67]:
def generalization_curves(gen_data, data_type, name): 

    gen_curves = {}
    distances = np.sort(gen_data['distance'].unique())
    colores = {'positive':'red', 'negative':'blue'}

    for i_valence in ['positive', 'negative']: 

        gen_rates = [] 
        gen_sems = []
        
        for i_distance in distances: 

            conditions = (gen_data.valence==i_valence) * (gen_data.distance==i_distance)
            
            if data_type == 'valence': 
                choices = gen_data.response[conditions] == gen_data.valence[conditions]
            elif data_type == 'rt': 
                choices = gen_data.rt[conditions]
                choices = [i for i in choices if i != None]
            
            gen_rates.append(mean(choices))
            gen_sems.append(scipy.stats.sem(choices))
            
        gen_rates = np.array(gen_rates)
        gen_sems = np.array(gen_sems)
        gen_curves[i_valence] = gen_rates
        plot(distances, gen_rates, linewidth=3, label=i_valence, color=colores[i_valence], alpha=.5)
        fill_between(distances, gen_rates+gen_sems, gen_rates-gen_sems, alpha=.2, color=colores[i_valence])
    
    title("%s's pilot data!"%name)
    
    if data_type == 'valence': ylabel('p( association_valence | tone )')
    elif data_type == 'rt': 
        ylabel('Reaction time')
        ylim([0,2000])
    xlabel('Distance from original tone')
    legend()


In [68]:
def get_pilot_subject(data,subj_id): 
    person_data = data[data.run_id==subj_id]
    # get all the data 
    ignore = ['trial_index', 
              'time_elapsed', 
              'stimulus', 
              'correct_response', 
              'internal_node_id', 
              'trial_type']

    person_data = pd.DataFrame()

    for one_trial in person_data: 
        q = {i:one_trial['trial_data'][i] for i in list(one_trial['trial_data'].keys()) if i not in ignore}
        q['subject'] = one_trial['worker_id']
        person_data = person_data.append(q, ignore_index=True)
    
    return person_data

In [85]:
# runs the extract_data() function in the pilotA_dataclean.py file, and then adds some more processing
# trial_data = extract_data()

# extracts data for pilot A
def extract_data_pA():
    # gets all filenames for subject data
    # use glob for the final analysis so that you don't have to manually type in every file name
    path = os.path.abspath("../data")
    sys.path.append(path)
    subjectfiles = ["/pilotA/pilotA-rescue-251_1.csv", "/pilotA/pilotA-rescue-251_3.csv"]
    
    # initialize data frame
    subject_trial_data = pd.DataFrame()
    
    # iterate over each subject (file)
    for filename in subjectfiles:
        fullname = path + filename
        df_curr = pd.read_csv(fullname)
        subject_trial_data = pd.concat([subject_trial_data,df_curr])
    
    subject_trial_data['positive_key'] = np.nan
    subject_trial_data['negative_key'] = np.nan
    subject_trial_data['association'] = np.nan
    
    null_values = subject_trial_data['correct_response'].isnull()
    subject_trial_data['cr_is_null'] = null_values
    
    return subject_trial_data

# takes in the extracted subject_trial_data df and adds mappings
def add_mappings(extracted_df):
    mapping = {'p':80, 'q':81, 'space':32}
    
    for i_subject in extracted_df.run_id.unique(): 
        for i_valence in ['positive', 'negative']: 
            conditions = (extracted_df.run_id==i_subject) * (extracted_df.valence==i_valence) * (extracted_df.cr_is_null==False)#got rid of the *stage=instrumental
            
            # # get the key value and map it to the numerical value
            assert(not extracted_df[conditions].empty)
            
            print("printing the correct responses:")
            responses = extracted_df[conditions]['correct_response']
            i_key = []
            print(responses)

            
            # for res in responses:
            #     print('one section:')
            #     print(res)
            #     i_key.append(mapping[res])
            #     print(mapping[res])
            #     print("fin")
            
            # i_key = mapping[np.unique(subject_trial_data[conditions]['correct_response'])]
            i_key = extracted_df[conditions]['correct_response'].map(mapping)
            
            print('begin:')
            print(i_key)
            print("fin")
        
            # update subject valence-key mapping 
            extracted_df['%s_key'%i_valence][extracted_df.subject==i_subject] = int(i_key)

    return extracted_df # will return a pandas dataframe with all of the subjects data concatenated, where each row is a subject's trial

In [86]:
trial_data = extract_data_pA()
cleaned_data = add_mappings(trial_data)
print(cleaned_data)

printing the correct responses:
2          q
22         q
33     space
39         q
51     space
57     space
60     space
68         q
86         q
106    space
112    space
118        q
121    space
124    space
132        q
147        q
164    space
170    space
179    space
185    space
188        q
Name: correct_response, dtype: object
begin:
2      81
22     81
33     32
39     81
51     32
57     32
60     32
68     81
86     81
106    32
112    32
118    81
121    32
124    32
132    81
147    81
164    32
170    32
179    32
185    32
188    81
Name: correct_response, dtype: int64
fin


TypeError: cannot convert the series to <class 'int'>

In [None]:

generalization_data = trial_data[trial_data.stage=='generalization']
generalization_data = generalization_data[generalization_data.valence!='control']

response_mapping = {80:'positive',81:'negative',32:'neutral'}
generalization_data['response'] = [response_mapping[i] for i in generalization_data.key_press.values]
generalization_data['distance'] = abs(generalization_data['distance'])

main_question_conditions = (generalization_data.valence != 'control')
valence_by_distance = generalization_data[main_question_conditions][['response', 'distance', 'valence']]

In [63]:
# runs the extract_data() function in the pilotA_dataclean.py file, and then adds some more processing
# trial_data = extract_data()
import sys
import os

# extracts data for pilot A
def extract_data():
    # gets all filenames for subject data
    path = os.path.abspath("../data")
    sys.path.append(path)
    subjectfiles = ["/pilotA/pilotA-rescue-251_1.csv", "/pilotA/pilotA-rescue-251_3.csv"]
    
    # initialize data frame
    subject_trial_data = pd.DataFrame()
    
    # iterate over each subject (file)
    for filename in subjectfiles:
        fullname = path + filename
        df_curr = pd.read_csv(fullname)
        subject_trial_data = pd.concat([subject_trial_data,df_curr])
    
    subject_trial_data['positive_key'] = np.nan
    subject_trial_data['negative_key'] = np.nan
    subject_trial_data['association'] = np.nan

    mapping = {'p':80, 'q':81, 'space':32}
    
    for i_subject in subject_trial_data.run_id.unique(): 
        for i_valence in ['positive', 'negative']: 
            conditions = (subject_trial_data.run_id==i_subject) * (subject_trial_data.valence==i_valence) * (subject_trial_data.stage!='')#got rid of the *stage=instrumental
            
            # # get the key value and map it to the numerical value
            assert(not subject_trial_data[conditions].empty)
            
            print("printing the correct responses:")
            responses = subject_trial_data[conditions]['correct_response']
            i_key = []
            print(responses)
            
            for res in responses:
                if (res.isnull().values):
                    print("caught")
                    continue
                print(res)
                i_key.append(mapping[res])
                print(res)
            
            # i_key = mapping[np.unique(subject_trial_data[conditions]['correct_response'])]
        
        # # update subject valence-key mapping 
        # subject_trial_data['%s_key'%i_valence][data.subject==i_subject] = int(i_key)

    return subject_trial_data # will return a pandas dataframe with all of the subjects data concatenated, where each row is a subject's trial

trial_data = extract_data()

generalization_data = trial_data[trial_data.stage=='generalization']
generalization_data = generalization_data[generalization_data.valence!='control']

response_mapping = {80:'positive',81:'negative',32:'neutral'}
generalization_data['response'] = [response_mapping[i] for i in generalization_data.key_press.values]
generalization_data['distance'] = abs(generalization_data['distance'])

main_question_conditions = (generalization_data.valence != 'control')
valence_by_distance = generalization_data[main_question_conditions][['response', 'distance', 'valence']]

printing the correct responses:
2          q
4        NaN
16       NaN
22         q
33     space
39         q
51     space
57     space
60     space
68         q
74       NaN
86         q
89       NaN
106    space
112    space
118        q
121    space
124    space
132        q
135      NaN
147        q
150      NaN
164    space
170    space
179    space
185    space
188        q
Name: correct_response, dtype: object


AttributeError: 'str' object has no attribute 'isnull'

In [None]:
# look at the df to see what it looks like
valence_by_distance[0:20]

In [None]:
# make basic pilot graphs
colores = {'positive':'red', 'negative':'blue'}

distances = np.sort(generalization_data.distance.unique())
all_things = {'positive':[], 'negative':[]}

for i_subject in generalization_data.subject.unique(): 
    
    for i_valence in ['positive', 'negative']: 
        
        things = [] 
        
        for i_distance in distances: 
        
            gen = generalization_data[generalization_data.subject==i_subject]
            conditions = (gen.valence==i_valence)*(gen.distance==i_distance)
            things.append(mean(gen.response[conditions] == gen.valence[conditions]))
        
        all_things[i_valence].append(things)
        plot(distances, things, color=colores[i_valence], alpha=.1, linewidth=3)

plot(distances, np.mean(all_things['positive'],0), color='red', linewidth=3, label='positive')
plot(distances, np.mean(all_things['negative'],0), color='blue', linewidth=3, label='negative')
title('Pilot A generalization curves')
legend() ; 