# 2.2 Experiment 2B

## 2.2.1 Imports

In [4]:
# Analytical Tools
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# General Utilities
import itertools
import json
import pprint

# Custom Utilities
import utilities.processing as processing
import utilities.plotting as plotting
import utilities.describe as describe

# Some settings
log = pprint.pprint
%matplotlib inline
pd.options.mode.chained_assignment = None

## 2.2.2 Up-Vote Data

In [5]:
# Design constants for the up-vote experiment.
GROUPS = 2               # score groups: 0 = below SCORE_THRESHOLD, 1 = otherwise
QUESTIONS = 5            # questions per score group
JUDGMENTS = 6            # judgments (ratings) collected per question
SCORE_THRESHOLD = 240.9  # cut-off used by _get_group to split the two groups

FILE_NAMES = [
    'data/exp-12-data/raw-3.json',
]

# Every file is read as JSON Lines: one participant record per line.
master_responses = []
for name in FILE_NAMES:
    with open(name) as file:
        # Lines read from a file keep their trailing newline, so a blank line
        # is the truthy string '\n'.  Strip before testing so blank lines are
        # skipped instead of being handed to json.loads (which would raise).
        master_responses.extend(
            json.loads(line) for line in file if line.strip()
        )

In [6]:
def _get_group(score):
    if score < SCORE_THRESHOLD:
        return 0
    return 1

In [7]:
# Column store for the up-vote table: one list per output column, appended to
# participant by participant by the parsing loop.
data = {'id': [], 'consent': [], 'attention': []}

# Insertion order fixes the eventual column order: for each (group, question)
# the score, the index, then one column per judgment.
for group in range(GROUPS):
    for question in range(QUESTIONS):
        prefix = 'g{}_q{}'.format(group, question)
        data[prefix + '_score'] = []
        data[prefix + '_index'] = []
        data.update(
            ('{}_j{}'.format(prefix, judgment), [])
            for judgment in range(JUDGMENTS)
        )

In [8]:
# Keys of the six judgment answers inside one decoded 'responses' payload.
ANSWER_KEYS = ('Q0', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5')

for person in master_responses:
    # The first trial record carries the participant-level metadata.
    trials = person['data']
    intro = trials[0]
    data['id'].append(intro['participantID'])

    question_scores = intro['questionScores']
    question_indices = intro['questionIndices']

    # Label each question 'g{group}_q{k}', where k counts how many questions
    # of that score group this participant has already been assigned.
    seen_per_group = [0, 0]
    question_labels = []
    for score in question_scores:
        group = _get_group(score)
        question_labels.append('g{}_q{}'.format(group, seen_per_group[group]))
        seen_per_group[group] += 1

    # Store the per-question score and index under the matching label.
    for suffix, values in (('score', question_scores),
                           ('index', question_indices)):
        for q_label, value in zip(question_labels, values):
            data['{}_{}'.format(q_label, suffix)].append(value)

    # Consent: 1 when the Q0 answer on the first trial starts with 'I consent'.
    consent_answer = json.loads(intro['responses'])
    data['consent'].append(int(consent_answer['Q0'].startswith('I consent')))

    # Judgments: the rating trials start at record 2.  zip stops at the shorter
    # input, so only as many records as there are question labels are read.
    # judgmentIndices says which judgment column each answer key feeds.
    judgment_indices = intro['judgmentIndices']
    for q_label, trial_pos in zip(question_labels, range(2, 14)):
        answer = json.loads(trials[trial_pos]['responses'])
        for j_label, key in zip(judgment_indices, ANSWER_KEYS):
            data['{}_j{}'.format(q_label, j_label)].append(int(answer[key]))

    # Attention check: record 7 must answer '6' to item Q6.  (Record 7 is also
    # read by the judgment loop above when there are at least six questions.)
    attention_answer = json.loads(trials[7]['responses'])
    data['attention'].append(int(attention_answer['Q6'] == '6'))

In [9]:
# Turn the column lists into a table and record the number of participants
# before any exclusion is applied (used later for the inclusion rate).
data = pd.DataFrame(data)
sample_size = data.shape[0]
data.head()

Unnamed: 0,id,consent,attention,g0_q0_score,g0_q0_index,g0_q0_j0,g0_q0_j1,g0_q0_j2,g0_q0_j3,g0_q0_j4,...,g1_q3_j4,g1_q3_j5,g1_q4_score,g1_q4_index,g1_q4_j0,g1_q4_j1,g1_q4_j2,g1_q4_j3,g1_q4_j4,g1_q4_j5
0,sana26r21r9md90gqj3o7fm1rm4rpw2p,1,1,36,2,4,3,5,0,5,...,1,2,24039,40,3,2,4,5,1,5
1,68d2brqo4tuf7kstwx0evwcuu5hqf9o2,1,1,30,10,2,2,3,2,2,...,3,3,24051,0,1,3,1,1,1,1
2,rxt9ompoqypdk5v67wmwpseu29te3s08,1,1,14,32,6,2,5,2,3,...,5,5,24047,23,2,0,1,6,6,1
3,e4zr4k2wtn02qngsjapkp4w911s169qu,1,0,15,45,6,0,4,0,6,...,6,6,24051,21,6,6,0,3,6,0
4,wul66ydj8kvrksvrb3u57s0eap457scs,1,1,24,41,5,0,3,4,6,...,6,4,24032,42,2,0,4,6,5,4


In [10]:
# Report the row count / cell count before and after each exclusion step.
print(len(data), data.size)
# Keep only participants who consented ...
data = data[data.consent == 1]
print(len(data), data.size)
# ... and who passed the attention check.  The explicit .copy() makes the
# filtered table an independent frame: later cells add new columns to it, and
# without the copy that only stays quiet because the notebook switches
# pandas' chained-assignment warning off globally.
data = data[data.attention == 1].copy()
print(len(data), data.size)

# Share of the original sample that survived both exclusions.
print('Inclusion: {:.4}% '.format(len(data) / sample_size * 100))

264 21912
264 21912
243 20169
Inclusion: 92.05% 


In [11]:
# For every (group, judgment) pair, average each participant's ratings across
# the group's questions and store the result as 'g{group}_j{judgment}_mean'.
for group in range(GROUPS):
    for judgment in range(JUDGMENTS):
        rating_columns = [
            'g{}_q{}_j{}'.format(group, question, judgment)
            for question in range(QUESTIONS)
        ]
        mean_column = 'g{}_j{}_mean'.format(group, judgment)
        data[mean_column] = data[rating_columns].mean(axis=1)

In [12]:
# Labels and layout settings for the summary figure.
PLOTTING_OPTIONS = {
    'ylabel': 'Ratings',
    'ticks': [
        'Curiosity', 'Confidence', 'Usefulness',
        'Popularity', 'Surprise', 'Social Utility',
    ],
    'legend': ['Low Scores', 'High Scores'],
    'title': 'All Questions',
    'size': (12, 6),
}

# The summary is computed on every included participant (an earlier
# resampling variant of this cell was left disabled).
sample = data

# Per judgment: mean and standard error of the participant-level means,
# separately for group 0 (g0_*) and group 1 (g1_*).
g0_means, g0_errs = [], []
g1_means, g1_errs = [], []
for judgment in range(JUDGMENTS):
    for group, mean_list, err_list in ((0, g0_means, g0_errs),
                                       (1, g1_means, g1_errs)):
        values = sample['g{}_j{}_mean'.format(group, judgment)]
        mean_list.append(values.mean())
        err_list.append(stats.sem(values))

In [13]:
def r_format(values):
    """Join ``values`` into one comma-separated string."""
    return ', '.join(str(value) for value in values)


# Print group 0 then group 1 for each statistic; the first row ends with a
# comma so the two rows read as one continuous list (presumably for pasting
# into another tool -- confirm intended use).
for heading, first_row, second_row in (('Means', g0_means, g1_means),
                                       ('Errors', g0_errs, g1_errs)):
    print(heading)
    print(r_format(first_row) + ',')
    print(r_format(second_row))

Means
3.06090534979424, 1.7168724279835395, 2.5720164609053504, 1.8008230452674885, 2.416460905349795, 2.2255144032921814,
3.4814814814814805, 1.7934156378600823, 2.8551440329218116, 4.471604938271604, 3.0567901234567882, 2.5465020576131705
Errors
0.08708556835127079, 0.08677867302581319, 0.0822069278140041, 0.08603070837739102, 0.07974193108745946, 0.08334014977729128,
0.08441716032837629, 0.09057077574386016, 0.08402893743392, 0.08017365370278555, 0.08332530277562553, 0.08671826757953739


## 2.2.3 Baseline Data

In [15]:
# Baseline experiment: a single set of questions, no score groups.
QUESTIONS = 10  # questions per participant
JUDGMENTS = 4   # judgments (ratings) collected per question

# Original author's note: raw.json is the initial export and may contain
# duplicates; only its non-duplicate records are meant to be used.
# NOTE(review): nothing in this cell de-duplicates -- both files are loaded in
# full and concatenated, so confirm the two files are already disjoint.
FILE_NAMES = [
    'data/exp-13-data/baseline_final.json',
    'data/exp-13-data/raw.json',
]

# Every file is read as JSON Lines: one participant record per line.
master_responses = []
for name in FILE_NAMES:
    with open(name) as file:
        # Lines read from a file keep their trailing newline, so a blank line
        # is the truthy string '\n'.  Strip before testing so blank lines are
        # skipped instead of being handed to json.loads (which would raise).
        master_responses.extend(
            json.loads(line) for line in file if line.strip()
        )

In [16]:
# Column store for the baseline table: one list per output column, appended
# to participant by participant by the parsing loop.
data = {'id': [], 'consent': [], 'attention': []}

# Insertion order fixes the eventual column order: for each question the
# score, the index, then one column per judgment.
for question in range(QUESTIONS):
    prefix = 'q{}'.format(question)
    data[prefix + '_score'] = []
    data[prefix + '_index'] = []
    data.update(
        ('{}_j{}'.format(prefix, judgment), [])
        for judgment in range(JUDGMENTS)
    )

In [17]:
# Keys of the four judgment answers inside one decoded 'responses' payload.
ANSWER_KEYS = ('Q0', 'Q1', 'Q2', 'Q3')

for person in master_responses:
    # The first trial record carries the participant-level metadata.
    trials = person['data']
    intro = trials[0]
    data['id'].append(intro['participantID'])

    question_scores = intro['questionScores']
    question_indices = intro['questionIndices']

    # Questions keep their presentation position: the k-th score / index goes
    # to column 'q{k}_score' / 'q{k}_index' (at most QUESTIONS of each).
    for suffix, values in (('score', question_scores),
                           ('index', question_indices)):
        for position, value in zip(range(QUESTIONS), values):
            data['q{}_{}'.format(position, suffix)].append(value)

    # Consent: 1 when the Q0 answer on the first trial starts with 'I consent'.
    consent_answer = json.loads(intro['responses'])
    data['consent'].append(int(consent_answer['Q0'].startswith('I consent')))

    # Judgments: the rating trials are records 2..11, one per question.
    # judgmentIndices says which judgment column each answer key feeds.
    judgment_indices = intro['judgmentIndices']
    for position, trial_pos in zip(range(QUESTIONS), range(2, 12)):
        answer = json.loads(trials[trial_pos]['responses'])
        for j_label, key in zip(judgment_indices, ANSWER_KEYS):
            data['q{}_j{}'.format(position, j_label)].append(int(answer[key]))

    # Attention check: record 7 must answer '6' to item Q4.
    attention_answer = json.loads(trials[7]['responses'])
    data['attention'].append(int(attention_answer['Q4'] == '6'))

In [18]:
# Turn the column lists into a table and record the number of participants
# before any exclusion is applied (used later for the inclusion rate).
data = pd.DataFrame(data)
sample_size = data.shape[0]
data.head()

Unnamed: 0,id,consent,attention,q0_score,q0_index,q0_j0,q0_j1,q0_j2,q0_j3,q1_score,...,q8_j0,q8_j1,q8_j2,q8_j3,q9_score,q9_index,q9_j0,q9_j1,q9_j2,q9_j3
0,sfmcx2twvmsm0d9qpnq19vpmrd27oqtr,1,1,,18,4,0,0,0,,...,6,2,3,3,,37,0,6,6,3
1,qrw87p430tabkol0mxj9q3dkm1p4o423,1,1,,5,5,1,4,2,,...,4,4,5,1,,23,4,1,3,1
2,2yfjc32hqne6avf0a6kaxbhftw6ydo47,1,0,,26,0,1,0,1,,...,4,3,4,4,,41,0,0,0,0
3,076dogp6qxwgm1aqfmdryebkh30e4w9m,1,1,,20,3,0,1,1,,...,4,0,3,1,,40,0,0,0,0
4,um8loq05sq31yj9f54c4rhxfhjsx7uac,1,1,,38,1,0,0,0,,...,3,4,1,2,,27,3,3,2,2


In [19]:
# Report the row count / cell count before and after each exclusion step.
print(len(data), data.size)
# Keep only participants who consented ...
data = data[data.consent == 1]
print(len(data), data.size)
# ... and who passed the attention check.  The explicit .copy() makes the
# filtered table an independent frame: the next cell adds new columns to it,
# and without the copy that only stays quiet because the notebook switches
# pandas' chained-assignment warning off globally.
data = data[data.attention == 1].copy()
print(len(data), data.size)

# Share of the original sample that survived both exclusions.
print('Inclusion: {:.4}% '.format(len(data) / sample_size * 100))

298 18774
298 18774
278 17514
Inclusion: 93.29% 


In [20]:
# For every judgment, average each participant's ratings across all questions
# and store the result as 'j{judgment}_mean'.
for judgment in range(JUDGMENTS):
    rating_columns = [
        'q{}_j{}'.format(question, judgment) for question in range(QUESTIONS)
    ]
    data['j{}_mean'.format(judgment)] = data[rating_columns].mean(axis=1)

In [22]:
# Mean and standard error of the participant-level means, one entry per
# judgment, printed as two comma-separated rows.
per_judgment = [data['j{}_mean'.format(judgment)] for judgment in range(JUDGMENTS)]
means = [values.mean() for values in per_judgment]
errs = [stats.sem(values) for values in per_judgment]

print(r_format(means))
print(r_format(errs))

3.484172661870501, 1.6143884892086335, 2.6496402877697847, 2.2496402877697848
0.07087030628635274, 0.07352797928986694, 0.07247601012825244, 0.0717032227640521


In [23]:
# Judgment names, presumably matching the baseline columns j0..j3 in order -- TODO confirm.
['Curiosity', 'Confidence', 'Usefulness', 'Social Utility']

['Curiosity', 'Confidence', 'Usefulness', 'Social Utility']

# HI

In [5]:
# Split participants by group_number: groups 0 and 1 together form the
# control sample, group 2 is the up-vote sample.
in_control = response_data.group_number.eq(0) | response_data.group_number.eq(1)
in_upvote = response_data.group_number.eq(2)
control_responses = response_data[in_control]
upvote_responses = response_data[in_upvote]

print(len(control_responses), len(upvote_responses))

299 301


In [6]:
# Keep participants with test_one == 1 or test_two == 1, i.e. at least one of
# the two checks passed (presumably attention checks -- confirm).
control_passed = (control_responses.test_one == 1) | (control_responses.test_two == 1)
upvote_passed = (upvote_responses.test_one == 1) | (upvote_responses.test_two == 1)

# .copy() makes each filtered table an independent frame, because later cells
# add the '*_low' / '*_high' / '*_diff' columns to both of them.
control_responses = control_responses[control_passed].copy()
upvote_responses = upvote_responses[upvote_passed].copy()
print(len(control_responses), len(upvote_responses))

295 297


## 2.1.3 Analysis

In [7]:
# Design constants for this analysis.
NUM_QUESTIONS = 10
NUM_JUDGEMENTS = 7
GROUP_SIZE = NUM_QUESTIONS // 2  # half of the questions per score group
THRESHOLD = 240.9

CONDITIONS = ['Control', 'Post Number', 'Upvotes']
# Note: QUESTIONS here is a list of judgment names, whereas earlier cells in
# this file bind the same name to an integer count.
QUESTIONS = [
    'Curiosity', 'Confidence', 'Usefulness',
    'Popularity', 'Writing', 'Surprise', 'Social Utility',
]

# Question labels come from the project helper; the first GROUP_SIZE entries
# are treated as the low group and the rest as the high group.
ALL_LABELS = processing.get_all_labels(NUM_QUESTIONS)
LOW_LABELS = ALL_LABELS[:GROUP_SIZE]
HIGH_LABELS = ALL_LABELS[GROUP_SIZE:]
JUDGMENT_LABELS = processing.get_judgment_labels(NUM_JUDGEMENTS)

In [8]:
# For each judgment, add three per-participant columns to the up-vote table:
# the mean rating over the low questions, the mean over the high questions,
# and high minus low.
for j_label in JUDGMENT_LABELS:
    half_means = {}
    for half, q_labels in (('low', LOW_LABELS), ('high', HIGH_LABELS)):
        columns = ['{}_{}'.format(q_label, j_label) for q_label in q_labels]
        half_means[half] = upvote_responses[columns].mean(axis=1)

    upvote_responses['{}_low'.format(j_label)] = half_means['low']
    upvote_responses['{}_high'.format(j_label)] = half_means['high']
    upvote_responses['{}_diff'.format(j_label)] = (
        half_means['high'] - half_means['low']
    )
upvote_responses.head()

Unnamed: 0,consent,participant_id,group_number,response_type,test_one,test_two,low_q0_score,low_q0_index,low_q0_choice,low_q0_j0,...,j3_diff,j4_low,j4_high,j4_diff,j5_low,j5_high,j5_diff,j6_low,j6_high,j6_diff
12,1.0,r7gvngs3mjdqo4lb3nd6hm3b3uobqsrb,2,0,1.0,1.0,28.0,26.0,1.0,3.0,...,0.0,1.2,1.2,0.0,1.2,1.4,0.2,1.2,1.2,0.0
49,1.0,3burenqmmt71nbk9sznc0t6gadf8wrlk,2,0,1.0,1.0,23.0,48.0,1.0,3.0,...,1.2,3.4,3.6,0.2,3.4,3.6,0.2,3.8,3.6,-0.2
54,1.0,9543ar5qbz3ectkkdqoexf9lurur57za,2,0,1.0,1.0,25.0,27.0,0.0,4.0,...,4.8,2.0,3.2,1.2,3.4,2.0,-1.4,2.0,2.2,0.2
55,1.0,kd74sbkm5u7u0b0424a6fm5d76wagaw6,2,0,0.0,1.0,25.0,1.0,1.0,4.0,...,3.2,2.4,3.4,1.0,3.6,2.6,-1.0,2.6,3.0,0.4
77,1.0,hczxorcd6py0pwrty95zbp2rzqdvv66e,2,0,1.0,1.0,16.0,43.0,1.0,6.0,...,3.4,4.4,3.0,-1.4,1.6,4.2,2.6,2.6,3.4,0.8


In [20]:
# Give the control table the same '*_low' / '*_high' / '*_diff' columns as the
# up-vote table.  For control, low and high both hold the mean over all
# questions (low and high labels together), so the difference is fixed at 0.
for j_label in JUDGMENT_LABELS:
    all_headers = [
        '{}_{}'.format(q_label, j_label)
        for q_labels in (LOW_LABELS, HIGH_LABELS)
        for q_label in q_labels
    ]
    overall_mean = control_responses[all_headers].mean(axis=1)

    for half in ('low', 'high'):
        control_responses['{}_{}'.format(j_label, half)] = overall_mean
    control_responses['{}_diff'.format(j_label)] = 0
control_responses.head()

Unnamed: 0,consent,participant_id,group_number,response_type,test_one,test_two,low_q0_score,low_q0_index,low_q0_choice,low_q0_j0,...,j3_diff,j4_low,j4_high,j4_diff,j5_low,j5_high,j5_diff,j6_low,j6_high,j6_diff
9,1.0,6rdp183ar5kr3jagqdoxb7nh9d7kyd0u,0,0,0.0,1.0,35.0,9.0,0.0,0.0,...,0,0.0,0.0,0,,,0,0.0,0.0,0
48,1.0,7ov3go08xy77yt73ou8tc79enl4c67mj,0,0,1.0,1.0,24.0,7.0,1.0,3.0,...,0,3.3,3.3,0,,,0,2.8,2.8,0
50,1.0,qwt23r6c9kn5tl73h6kh6kcdqm1nz4oq,0,0,1.0,1.0,16.0,27.0,1.0,6.0,...,0,3.8,3.8,0,,,0,0.6,0.6,0
51,1.0,1a6hvsu5ghm07dusso38okbt09960cyk,0,0,1.0,1.0,24.0,10.0,0.0,3.0,...,0,4.9,4.9,0,,,0,2.3,2.3,0
58,1.0,p1nehat5h8la1qhyxhmdf83upctpjl0q,0,0,1.0,0.0,14.0,11.0,1.0,3.0,...,0,4.0,4.0,0,,,0,2.4,2.4,0


In [32]:
# Mean and standard error per judgment for three series: up-vote/low,
# control, and up-vote/high.
low, control, high = [], [], []
low_err, control_err, high_err = [], [], []

# (source table, column suffix, list of means, list of errors).  Control reads
# its '*_low' column, which holds the all-question mean for that table.
series_specs = (
    (upvote_responses, 'low', low, low_err),
    (control_responses, 'low', control, control_err),
    (upvote_responses, 'high', high, high_err),
)

for j_label in JUDGMENT_LABELS:
    # 'j4' is left out of the summary (presumably the 'Writing' judgment,
    # index 4 of QUESTIONS -- confirm).
    if j_label == 'j4':
        continue
    for frame, half, mean_list, err_list in series_specs:
        values = frame['{}_{}'.format(j_label, half)]
        mean_list.append(values.mean())
        err_list.append(stats.sem(values))

In [34]:
def r_format(values):
    """Join ``values`` into one comma-separated string."""
    return ', '.join(str(value) for value in values)


# Print low, control, high for each statistic; every row but the last ends
# with a comma so the three rows read as one continuous list.
for heading, rows in (('Means', (low, control, high)),
                      ('Errors', (low_err, control_err, high_err))):
    print(heading)
    for row in rows[:-1]:
        print(r_format(row) + ',')
    print(r_format(rows[-1]))

Means
2.983164983164984, 1.7299663299663297, 2.4296296296296296, 1.827609427609427, 2.358922558922558, 2.120538720538721,
3.349152542372882, 1.721694915254238, 2.649830508474576, 2.889152542372881, nan, 2.415593220338984,
3.4383838383838388, 1.7804713804713799, 2.7649831649831627, 4.22087542087542, 2.9845117845117857, 2.593265993265991
Errors
0.07704766995023894, 0.0761468044766546, 0.07494190877181982, 0.0773848545844434, 0.06855260153760509, 0.07420865263871017,
0.07459157311856353, 0.07813852069409588, 0.07253521205245628, 0.06467555441801905, nan, 0.07264566557405276,
0.08113552957078504, 0.08088870837335543, 0.07629995629160503, 0.07084813180600136, 0.07135194230169387, 0.07778459263755035
