In [1]:
%pylab inline
import json, csv, pandas, numpy, statsmodels, seaborn
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scikits.bootstrap as bootstrap 
from scipy import stats
from soph import boot,t_test_text_1samp,t_test_text_2samp,plot_by_one,plot_by_two


seaborn.set_style("whitegrid")

Populating the interactive namespace from numpy and matplotlib


In [2]:
datafile = "../data/artificial-4-concepts/HITResultsFor3WUVMVA7OB8D4J99ULOW700SB28AZU.csv"

In [44]:
label_key = {                              #MTurk ruins labels so we have to recover them
    "Answer 1": "object level training",
    "Answer 2": "num_examples_to_show",
    "Answer 3": "num_examples_clicked",
    "Answer 4": "instruction_wait_time",
    "Answer 5": "feedback_wait_time",
    "Answer 6": "condition",
    "Answer 7": "pseudo_obj",
    "Answer 8": "border_values",
    "Answer 9": "pattern_values",
#     "Answer 7": "r_questions_pretest",
#     "Answer 8": "r_answers_pretest",
    "Answer 10": "training_complete",
    "Answer 11": "r_questions_posttest",
    "Answer 12": "r_answers_posttest",
    "Answer 13": "about",
    "Answer 14": "better",
    "Answer 15": "unclear",
    "Answer 16": "comment",
    "Answer 17": "external_aid",
    "Answer 18": "strategy",
    "Answer 19": "age",
    "Answer 20": "gender",
    "Answer 21": "exp_total_time",
    "WorkerId": "subids",
}
# here are the columns after this is finished ['AcceptTime', 'Annotation', 'AssignmentId', 'HitId', 'HitTitle',
#        'Status', 'SubmitTime', 'subids', 'about', 'age', 'better', 'block',
#        'comment', 'condition', 'correct', 'exp_total_time', 'external_aid',
#        'feedback_wait_time', 'gender', 'instruction_wait_time', 'level',
#        'num_examples_clicked', 'num_examples_to_show', 'phase', 'question',
#        'response', 'shape', 'strategy', 'training_complete',
#        'trial_num_within_block', 'trial_time', 'trial_type', 'unclear']
rel_key = {                               # answer key for relational questions of the form "are all x also y"
    "square_rectangle": True,
    "square_rhombus": True,
    "square_parallelogram": True,
    "rectangle_square": False,
    "rectangle_rhombus": False,
    "rectangle_parallelogram": True,
    "rhombus_square": False,
    "rhombus_rectangle": False,
    "rhombus_parallelogram": True,
    "parallelogram_square": False,
    "parallelogram_rhombus": False,
    "parallelogram_rectangle": False,
}
bool_key = {"Yes": True, "No": False}

In [45]:
with open(datafile, 'r') as f:
    csv_lines = list(csv.reader(f))
#     csv_lines = [ row for row in csv_lines]
# print(csv_lines[0])
# print(csv_lines[1])
labels = csv_lines.pop(0)

labels = [label_key[l] if l in label_key else l for l in labels]

subject_dicts = []

for row in csv_lines:
    
    new_row = []
    for item in row:
        try: 
            new_row.append(json.loads(item))
        except:
            new_row.append(item)
            
    subject_dicts.append(dict(zip(labels,new_row)))

In [46]:
keep_cols = ['subids','exp_total_time', 'condition']
    
tidy_dicts = []

for subj in subject_dicts:

    #grab object-level data
    obj_data = subj.pop("object level training")

    #grab relational-level data
#     pretest_answers = [bool_key[a] for a in subj.pop("r_answers_pretest")]
#     pretest_questions =  [q.replace("r_pretest_","") for q in subj.pop("r_questions_pretest")]
#     pretest_correct = [a == rel_key[q] for a,q in zip(pretest_answers,pretest_questions)]
#     pretest_dict = dict(zip(pretest_questions, pretest_correct))
    print(subj["r_answers_posttest"])
    posttest_answers = [bool_key[a] for a in subj.pop("r_answers_posttest")]
    posttest_questions =  [q.replace("r_posttest_","") for q in subj.pop("r_questions_posttest")]
    posttest_correct = [a == rel_key[q] for a,q in zip(posttest_answers,posttest_questions)]
    posttest_dict = dict(zip(posttest_questions, posttest_correct))
    
    subj = dict([(k,v) for k,v in subj.items() if k in keep_cols])
    
    # create a row for every object-level datapoint
    for obj in obj_data:
        new_row = subj.copy()
        new_row.update(obj)
        new_row["phase"] = "training"
        new_row["level"] = "object"
        tidy_dicts.append(new_row)
    

    # create a row for every relational question
    for q in posttest_dict:
        
        # pretest row
#         new_row = subj.copy()
#         new_row.update({
#             "phase": "pretest",
#             "level": "relational",
#             "question": q,
#             "correct": pretest_dict[q]
#         })
#         tidy_dicts.append(new_row)
        
        # posttest row
        new_row = subj.copy()
        new_row.update({
            "phase": "posttest",
            "level": "relational",
            "question": q,
            "correct": posttest_dict[q]
        })
        tidy_dicts.append(new_row)
        
tidy_df = pandas.DataFrame.from_dict(tidy_dicts)

['Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No']
label-order


KeyError: 'l'

In [39]:
subject_dicts[2]

{'AcceptTime': 'Mon Mar 20 10:19:27 PDT 2017',
 'Annotation': ' ',
 'AssignmentId': '3PJUZCGDJ7LN2OWBJSFLM5NQY7U98L',
 'HitId': '3WUVMVA7OB8D4J99ULOW700SB28AZU',
 'HitTitle': '$3 for 15 min. | Concept learning | University of Louisville',
 'Status': 'Approved',
 'SubmitTime': 'Mon Mar 20 10:49:34 PDT 2017',
 'about': True,
 'age': 413387,
 'better': 3,
 'border_values': 'Categorizing shapes as nonsense words like sime and dramand',
 'comment': ['r_posttest_square_rectangle',
  'r_posttest_rectangle_parallelogram',
  'r_posttest_parallelogram_square',
  'r_posttest_square_rhombus',
  'r_posttest_rectangle_square',
  'r_posttest_square_parallelogram',
  'r_posttest_rectangle_rhombus',
  'r_posttest_parallelogram_rhombus',
  'r_posttest_parallelogram_rectangle',
  'r_posttest_rhombus_square',
  'r_posttest_rhombus_rectangle',
  'r_posttest_rhombus_parallelogram'],
 'condition': ['dash', 'solid'],
 'exp_total_time': '29',
 'external_aid': " I have never felt so stupid in my life (okay, hyp

In [6]:
block_to_section = lambda x: x if x in ["pretest", "posttest"] else "training"
tidy_df["section"] = tidy_df.block.map(block_to_section)
# baseline_data["question_text"] = ["Are all {}s also {}s?".format(r.question,r["shape"]) 
#                                   for i,r in baseline_data.iterrows()]

tidy_df["block_reached_crit"] = 15
tidy_df["blocks_to_go"] = 0
tidy_df["reached_crit"] = False
tidy_df["incorrect"] = [int(~i) for i in tidy_df.correct]
for subj in tidy_df.subids.unique():
    subj_i = (tidy_df.subids == subj)&(tidy_df.section=='training')
    subj_blocks = tidy_df.block[subj_i]
#     print(max(subj_blocks))
    tidy_df.block_reached_crit[tidy_df.subids == subj] = max(subj_blocks)
    tidy_df.blocks_to_go[subj_i] = max(subj_blocks) - subj_blocks +1
    tidy_df.reached_crit[tidy_df.subids == subj] = max(subj_blocks) != 15

AttributeError: 'DataFrame' object has no attribute 'block'