In [1]:
import pandas as pd
import numpy as np

# Load the raw trial-level data and preview its first rows.
all_data = pd.read_csv('symmetry_raw_data.csv')
all_data.head(11)

Unnamed: 0,sub_num,trial_num,trial_type,relation1,relation2,pair1_word1,pair1_word2,pair2_word1,pair2_word2,rating,RT
0,1,1,between-subtype,9h,9b,ingestion,digestion,mine,coal,1,7623
1,1,2,between-subtype,4a,4g,rich,poor,hard,simple,1,7019
2,1,3,between-subtype,9d,9b,kitchen,pot,dairy,milk,1,8010
3,1,4,between-subtype,6h,6g,discourage,confident,hurry,lazily,1,14199
4,1,5,between-subtype,5a,5b,fire,heat,volatile,combustible,4,6833
5,1,6,within-subtype,8a,8a,accident,damage,injury,pain,5,9784
6,1,7,between-subtype,2g,2a,house,bricks,bicycle,wheel,7,4386
7,1,8,between-subtype,9g,9e,adulthood,job,margin,paper,1,4330
8,1,9,between-type,7c,9h,shooter,gun,step,journey,1,5117
9,1,10,within-subtype,3e,3e,associate,partner,puppy,dog,1,4366


In [2]:
# Find the "bad" subjects--those who failed too many attention checks.
#
# An attention check counts as failed when:
#   * both pairs start with the same word and the rating is not 7, or
#   * the pairs start with different words and the rating is not 1.
# NOTE(review): presumably 7 / 1 are the two ends of the rating scale -- confirm.

# A subject with at least this many failed checks is excluded.
fail_threshold = 2

attn_checks = all_data[all_data['trial_type'] == 'attention check']

# The rating each check should have received, decided by whether the first
# words of the two pairs match.
same_words = attn_checks['pair1_word1'] == attn_checks['pair2_word1']
expected_rating = np.where(same_words, 7, 1)
failed_check = attn_checks['rating'] != expected_rating

# Count failures per subject in one pass instead of re-filtering the frame once
# per subject. sort=False keeps subjects in order of first appearance.
num_failed_by_sub = failed_check.astype(int).groupby(attn_checks['sub_num'],
                                                     sort=False).sum()
bad_subs = num_failed_by_sub[num_failed_by_sub >= fail_threshold].index.tolist()

# Single-argument print() produces the same output on Python 2 and Python 3.
print('Number of "bad" subjects: {}'.format(len(bad_subs)))

Number of "bad" subjects: 99


In [6]:
# Get the "good" data: every row whose subject is not in `bad_subs`.
#
# `isin` does the exclusion in one pass and also works when `bad_subs` is empty
# (reducing an empty list of masks would raise a TypeError).
good_rows = ~all_data['sub_num'].isin(bad_subs)
good_data = all_data[good_rows]
print('Number of "good" subjects: {}'.format(len(good_data['sub_num'].unique())))

# Attention checks are only used for screening; drop them from the analysis.
real_trials = good_data[good_data['trial_type'] != 'attention check']

# NOTE(review): dividing by 1000 assumes RT is recorded in milliseconds -- confirm.
print('Mean RT (seconds): {}'.format(real_trials['RT'].mean() / 1000.0))
print('Mean overall rating: {}'.format(real_trials['rating'].mean()))
print('')

# Mean rating for each kind of comparison.
for trial_type in ('within-subtype', 'between-subtype', 'between-type'):
    type_ratings = real_trials.loc[real_trials['trial_type'] == trial_type, 'rating']
    print('Mean {} rating: {}'.format(trial_type, type_ratings.mean()))

Number of "good" subjects: 1003
Mean RT (seconds): 10.6806863809
Mean overall rating: 4.17122632104

Mean within-subtype rating: 5.08619595758
Mean between-subtype rating: 3.62911266201
Mean between-type rating: 2.80408773679


In [5]:
# Create a dictionary of all individual ratings for each comparison. Each entry has
# the ratings for both the forward and backward directions.

def get_indiv_ratings_dict(data):
    """Group individual ratings by comparison, split by presentation order.

    A comparison is keyed by the tuple
    ``(relation1, relation2, (pair1_word1, pair1_word2), (pair2_word1, pair2_word2))``.
    The direction in which a comparison is first encountered in ``data`` becomes
    its key and is treated as the "forward" direction. Rows presenting the same
    two pairs in the opposite order are treated as "backward".

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``relation1``, ``relation2``, ``pair1_word1``,
        ``pair1_word2``, ``pair2_word1``, ``pair2_word2`` and ``rating``.

    Returns
    -------
    dict
        Maps each comparison key to a tuple ``(forward_ratings, backward_ratings)``
        of lists, with ratings in the order they appear in ``data``. If a
        comparison is its own reverse (same relations and same pairs on both
        sides), all of its ratings end up in the forward list.
    """
    ratings_dict = {}

    # Walk the needed columns in lockstep rather than building a Series per row
    # with ``data.iloc[r]``; this is much cheaper and runs on Python 2 and 3.
    needed_columns = ('relation1', 'relation2',
                      'pair1_word1', 'pair1_word2',
                      'pair2_word1', 'pair2_word2',
                      'rating')
    rows = zip(*(data[col] for col in needed_columns))

    for rel1, rel2, p1_w1, p1_w2, p2_w1, p2_w2, rating in rows:
        comp = (rel1, rel2, (p1_w1, p1_w2), (p2_w1, p2_w2))
        rev_comp = (rel2, rel1, (p2_w1, p2_w2), (p1_w1, p1_w2))

        if comp in ratings_dict:
            # Same presentation order as the stored key: forward rating.
            ratings_dict[comp][0].append(rating)
        elif rev_comp in ratings_dict:
            # Opposite presentation order of a stored key: backward rating.
            ratings_dict[rev_comp][1].append(rating)
        else:
            # First time this comparison is seen in either direction.
            ratings_dict[comp] = ([rating], [])

    return ratings_dict

# Group the ratings of the non-attention-check trials (`real_trials`) by comparison.
all_indiv_ratings = get_indiv_ratings_dict(real_trials)

In [9]:
# Conduct a t-test for each comparison's forward and backward ratings

from scipy.stats import ttest_ind
import math

# Number of comparisons that were actually t-tested. A comparison with fewer
# than two ratings in either direction has no defined sample SD or t statistic,
# so it is skipped and not counted: counting it would inflate the number of
# trials used by any follow-up test on `num_sig` / `num_comps`.
num_comps = 0

# Significance level for each individual t-test.
alpha = 0.05
num_sig = 0

# Single-argument print() produces the same output on Python 2 and Python 3.
print('Comparisons with significantly different forward and backward relational similarity ratings:')
print('')

for comp, (fwd_ratings, bwd_ratings) in all_indiv_ratings.items():
    if len(fwd_ratings) < 2 or len(bwd_ratings) < 2:
        continue
    num_comps += 1

    # Independent two-sample t-test with SciPy's default settings.
    _, prob = ttest_ind(fwd_ratings, bwd_ratings)

    # A NaN p-value (e.g. no variance at all) compares False and is treated as
    # not significant.
    if not prob < alpha:
        continue

    num_sig += 1

    # Descriptive statistics are only needed for the comparisons we report.
    fwd_mean = np.mean(fwd_ratings)
    fwd_sd = np.std(fwd_ratings, ddof=1)
    bwd_mean = np.mean(bwd_ratings)
    bwd_sd = np.std(bwd_ratings, ddof=1)

    print(comp)
    print('mean forward rating: {} (SD = {})'.format(fwd_mean, fwd_sd))
    print('mean backward rating: {} (SD = {})'.format(bwd_mean, bwd_sd))
    print('t-test p-value: {}'.format(prob))
    print('')

Comparisons with significantly different forward and backward relational similarity ratings:

('7f', '6b', ('tip', 'waiter'), ('inexorable', 'halted'))
mean forward rating: 1.84 (SD = 1.23486758771)
mean backward rating: 2.54 (SD = 1.60623784042)
t-test p-value: 0.0163536017794

('2a', '2b', ('fish', 'fin'), ('album', 'songs'))
mean forward rating: 5.82352941176 (SD = 1.30699475673)
mean backward rating: 4.84 (SD = 1.85560375784)
t-test p-value: 0.00264295453185

('4h', '4h', ('pimple', 'skin'), ('leukemia', 'blood'))
mean forward rating: 5.06 (SD = 1.91033707892)
mean backward rating: 5.72 (SD = 1.10730413572)
t-test p-value: 0.0370904825533

('9i', '9d', ('shirt', 'button'), ('garage', 'wrench'))
mean forward rating: 3.74509803922 (SD = 2.14329780716)
mean backward rating: 4.76 (SD = 1.59795788047)
t-test p-value: 0.0083000846465

('8f', '5c', ('punch', 'pain'), ('ice', 'cold'))
mean forward rating: 5.88 (SD = 1.25584348433)
mean backward rating: 4.90196078431 (SD = 1.87888160309)
t-

In [12]:
# Conduct a binomial test to see if the number of significant t-tests is
# more than expected under the null hypothesis that presentation order
# doesn't matter. Under that null, each t-test comes out significant with
# probability `alpha`, so the same `alpha` used for the t-tests is passed here.
#
# NOTE(review): the test is run with SciPy's default (two-sided) alternative,
# while the hypothesis above is directional ("more than expected") -- confirm
# whether a one-sided test is intended.

try:
    # SciPy >= 1.7; `binom_test` is deprecated there and removed in SciPy 1.12.
    from scipy.stats import binomtest
except ImportError:
    binomtest = None
    from scipy.stats import binom_test

if binomtest is not None:
    p = binomtest(num_sig, num_comps, alpha).pvalue
else:
    p = binom_test(num_sig, num_comps, alpha)

# Single-argument print() produces the same output on Python 2 and Python 3.
print('Binomial test p-value: {}'.format(p))

Binomial test p-value: 3.9249911628e-17
