In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

**Goal**: Use the model trace to reconstruct the main outputs of the paper. 

#### Contents 

1. <a href='#goldstandard'>Import gold standard.</a>
2. <a href='#baseline'>Construct majority voting baseline</a>
3. <a href='#validationhard'>Get validation error on difficult posts</a>
4. <a href='#validationeasy'>Get validation error on easy posts</a>
5. <a href='#paths'>Plot skill paths</a>



<a id='goldstandard'></a>
#### 1. Import gold standard

* In two waves, we collected all posts for which the model and majority voting conflicted and passed them to a Tribe employee to evaluate ground truth.


* First we import those gold standard cases.

In [2]:
# Import the two waves of Tribe manual verification.
ground_truth1 = pd.read_csv(os.path.join("input", "Rohan Mturk Conflicts 2017-12-04.csv"))
ground_truth2 = pd.read_csv(os.path.join("input", "Rohan Mturk Conflicts 2017-12-22.csv"))
# The second wave spells the decision column "Tribe Decision"; align it with the first wave.
ground_truth2 = ground_truth2.rename(columns={"Tribe Decision":"Tribe decision"})
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two waves the same way.
ground_truth = pd.concat([ground_truth1, ground_truth2], sort=False)
# keep columns of interest
ground_truth = ground_truth[["text","Tribe decision"]]
ground_truth = ground_truth.rename(columns={"Tribe decision":"z_obs"})
# keep unambiguous cases; .copy() so the cast below writes to an independent frame, not a slice
ground_truth = ground_truth[ground_truth["z_obs"].isin(["0","1"])].copy()
ground_truth["z_obs"] = ground_truth["z_obs"].astype(int)


<a id='baseline'></a>
#### 2. Construct majority voting baseline

* We lift cleaning steps from **tribe_010218.py**.

In [3]:
def _inner_merge_nonempty(left, right, on):
    """Inner-join ``left`` and ``right`` on ``on`` and fail if no rows survive."""
    merged = left.merge(right, on=on)
    assert len(merged) > 0, "merge on %s produced no rows" % (on,)
    return merged


def get_accessory(tribe_csvs, condense_repeat_votes=True, keep_prolific_cut=20):
    """Build the per-post majority-voting table from raw vote CSVs.

    Parameters
    ----------
    tribe_csvs : list of str
        Paths of CSV files; the code reads the columns "mturker", "post_hash",
        "brand_id", "worker_id", "answer", "text" and "model_decision".
    condense_repeat_votes : bool, default True
        If true, average each worker's repeated votes on the same post. An
        average of exactly 0.5 is kept twice (once as answer 1, once as
        answer 0); other averages are thresholded at 0.5.
    keep_prolific_cut : int, default 20
        Only workers with strictly more than this many rows are kept.

    Returns
    -------
    pandas.DataFrame
        Columns "post_hash", "r_maj" (mean answer per post), "worker_ct"
        (number of votes per post), "brand_name", "text", "model_decision".

    Raises
    ------
    AssertionError
        If any of the inner merges yields an empty table.
    """
    # import
    df_brands = [pd.read_csv(csv) for csv in tribe_csvs]

    # merge; .copy() so the column assignment below does not write into a slice
    tribe_df = pd.concat(df_brands)
    tribe_df = tribe_df.loc[tribe_df["mturker"]==True,:].copy()

    # deal with brand ambiguity: make the post key unique per brand
    tribe_df["post_hash"] = tribe_df["post_hash"].astype(str) + "_" + tribe_df["brand_id"].astype(str)
    text_lkup = tribe_df[["post_hash","text"]].drop_duplicates()

    # condense
    if condense_repeat_votes:
        rows_before = tribe_df.shape[0]
        # numeric_only=True: the frame still carries the string "text" column,
        # which pandas >= 2.0 refuses to average instead of silently dropping.
        tribe_df = (
            tribe_df
            .groupby(["brand_id","worker_id","post_hash"])
            .mean(numeric_only=True)
            .reset_index()
        )
        is_tie = tribe_df["answer"]==0.5
        # hard cases: the worker's own votes split evenly
        hard_cases_df = tribe_df.loc[is_tie,:]
        # easy cases: threshold the averaged vote
        easy_cases_df = tribe_df.loc[~is_tie,:]
        easy_cases_df = easy_cases_df.assign(answer=1*(easy_cases_df["answer"]>0.5))
        # a tied worker contributes one vote for each label
        tribe_df = pd.concat([
            easy_cases_df,
            hard_cases_df.assign(answer=1),
            hard_cases_df.assign(answer=0),
        ])
        print("rows dropped due to condensing = %i" % (rows_before-tribe_df.shape[0]))

    # get workers with more than `keep_prolific_cut` labels
    workload = tribe_df.groupby("worker_id").count().reset_index()
    worker_subset = workload[workload["post_hash"]>keep_prolific_cut].worker_id.values
    rows_before = tribe_df.shape[0]
    tribe_df = tribe_df[tribe_df.worker_id.isin(worker_subset)].copy()
    print("rows dropped due to non prolific worker = %i" % (rows_before-tribe_df.shape[0]))

    # Select the column before aggregating so non-numeric columns (present
    # when condense_repeat_votes is False) never reach mean().
    tribe_df["r_maj"] = tribe_df["answer"]*1
    r_maj = tribe_df \
        .groupby(["post_hash","brand_id"])["r_maj"] \
        .mean() \
        .reset_index()

    worker_ct = tribe_df \
        .groupby(["post_hash","brand_id"])["answer"] \
        .count() \
        .reset_index().rename(columns={"answer":"worker_ct"})

    model_decision_df = tribe_df[["post_hash","model_decision"]].drop_duplicates()

    # The original asserted `_merge == "both"` after each inner merge; for an
    # inner join that can only fail when the result is empty, so the helper
    # checks exactly that.
    accessory_df = _inner_merge_nonempty(r_maj, worker_ct, on=["post_hash","brand_id"])

    # Inner join: posts whose brand_id is not listed here are dropped.
    brand_lookup = pd.DataFrame({
        "brand_id":[18795,11977,13584,18792,15004,14937,15336],
        "brand_name":["braun","mac","mufe","kate","patagonia","ross","simple"]
    })
    accessory_df = _inner_merge_nonempty(accessory_df, brand_lookup, on=["brand_id"])
    accessory_df = _inner_merge_nonempty(accessory_df, text_lkup, on=["post_hash"])
    accessory_df = _inner_merge_nonempty(accessory_df, model_decision_df, on=["post_hash"])

    return accessory_df[["post_hash","r_maj","worker_ct","brand_name","text","model_decision"]]

In [4]:
# Vote exports to load; every file lives in the "input" directory.
tribe_csv_names = [
    "rohan_11977_1513051779.csv",
    "rohan_13584_1513051779.csv",
    "rohan_14937_1513051907.csv",
    "rohan_15004_1513051907.csv",
    "rohan_15336_1513051907.csv",
    "rohan_17231_1513051779.csv",
    "rohan_17334_1513051779.csv",
    "rohan_18792_1513051907.csv",
    "rohan_18795_1513051907.csv",
    "rohan_19123_1513051779.csv",
    "rohan_19141_1513051779.csv",
    "rohan_19154_1513051779.csv",
    "rohan_19321_1513051779.csv",
    "rohan_19491_1513050263.csv",
    "rohan_19491_1513051779.csv",
]
tribe_csvs = [os.path.join("input", csv_name) for csv_name in tribe_csv_names]

# Majority-voting table: repeat votes condensed, workers with more than 20 labels kept.
accessory_df = get_accessory(tribe_csvs, condense_repeat_votes=True, keep_prolific_cut=20)

rows dropped due to condensing = 5057
rows dropped due to non prolific worker = 3953


<a id='validationhard'></a>
#### 3. Get validation error on difficult posts

* Get inferred label from annotation models.

In [5]:
def get_predictions(pickle_in, suffix):
    """Load a pickled model run and return one inferred label per post.

    Parameters
    ----------
    pickle_in : str
        Path to a pickle holding a mapping with the keys "z_obs", "trace",
        "ii_transformer" and "flag_hashtag".
    suffix : str
        Appended to "z_inferred_" to name the prediction column.

    Returns
    -------
    pandas.DataFrame
        Columns "post_hash" (cast to str, taken from the index of
        ``ii_transformer``), "z_inferred_<suffix>" and "flag_hashtag".
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trace
    # files produced by this project.
    data = pd.read_pickle(pickle_in)
    trace = data["trace"]
    ii_transformer = data["ii_transformer"]

    z_inferred = np.array(data["z_obs"], dtype=float)
    # -999 marks entries without an observed label. Build the mask on the
    # float array so it also works when "z_obs" was stored as a plain list.
    is_missing = z_inferred == -999
    # Fill them with the mean of trace["z_missing"] over axis 0
    # (presumably the sampling axis -- confirm against the model code).
    z_inferred[is_missing] = np.mean(trace["z_missing"], axis=0)

    pred_df = pd.DataFrame({
        "post_hash":ii_transformer.index,
        "z_inferred_"+suffix:z_inferred,
        "flag_hashtag":data["flag_hashtag"]})
    pred_df["post_hash"] = pred_df["post_hash"].astype(str)

    return pred_df

In [6]:
# One entry per model configuration: where its trace pickle lives and the
# suffix used for its prediction column.
cfg1 = {"pickle_in":os.path.join("output","trace_cfg1.pkl"), "suffix":"cfg1"}
cfg3 = {"pickle_in":os.path.join("output","trace_cfg3.pkl"), "suffix":"cfg3"}
cfg11 = {"pickle_in":os.path.join("output","trace_cfg11.pkl"), "suffix":"cfg11"}
cfg12 = {"pickle_in":os.path.join("output","trace_cfg12.pkl"), "suffix":"cfg12"}

cfg_list = [
    cfg1,
    cfg3,
    cfg11,
    cfg12,
]

for cfg in cfg_list:
    pred_df = get_predictions(**cfg)
    # Drop any column this merge is about to (re)create. On a first run this
    # only removes the previous configuration's "flag_hashtag", so the final
    # table keeps the last configuration's flag. On a re-run of the cell it
    # also removes the earlier predictions, so the merge no longer produces
    # "_x"/"_y" duplicates followed by a KeyError.
    stale_cols = [
        col for col in pred_df.columns
        if col != "post_hash" and col in accessory_df.columns
    ]
    accessory_df = accessory_df.drop(columns=stale_cols).merge(pred_df, on=["post_hash"])
    # Inner join: the only way it can go wrong silently is by matching nothing.
    assert len(accessory_df) > 0
        

* collect cases when majority voting and model conflict.

In [7]:
# Prediction columns, one per model configuration.
z_inferred_list = [
    "z_inferred_cfg1",
    "z_inferred_cfg3",
    "z_inferred_cfg11",
    "z_inferred_cfg12",
]

majority_says_0 = accessory_df["r_maj"] < 0.5
majority_says_1 = accessory_df["r_maj"] > 0.5

# Default label, then exact ties, then any post where at least one
# configuration lands on the other side of 0.5 from the majority vote.
accessory_df["conflict"] = "none"
accessory_df.loc[accessory_df["r_maj"] == 0.5, "conflict"] = "50/50 split"
for var in z_inferred_list:
    model_disagrees = (
        (majority_says_0 & (accessory_df[var] > 0.5))
        | (majority_says_1 & (accessory_df[var] < 0.5))
    )
    accessory_df.loc[model_disagrees, "conflict"] = "contradict majority"


* list accuracy on difficult cases

In [8]:
# Attach the majority vote and model outputs to every gold-standard post, matched on text.
validation_df = ground_truth.merge(accessory_df, how="left", on=["text"], indicator=True)
# Leave out posts whose majority vote is an exact 50/50 split.
conf_df = validation_df[validation_df["r_maj"] != 0.5]

conf_acc = []
for cfg in z_inferred_list:
    model_vote = conf_df[cfg] > 0.5
    majority_vote = conf_df["r_maj"] > 0.5
    # Difficult cases: this configuration and the majority vote disagree.
    disagree_df = conf_df[model_vote != majority_vote]
    model_label = (disagree_df[cfg] > 0.5).astype(int)
    conf_acc.append((model_label == disagree_df["z_obs"]).mean())
pd.DataFrame({"config": z_inferred_list, "accuracy - difficult cases": conf_acc})

Unnamed: 0,config,accuracy - difficult cases
0,z_inferred_cfg1,0.664516
1,z_inferred_cfg3,0.840237
2,z_inferred_cfg11,0.833333
3,z_inferred_cfg12,0.615942


<a id='validationeasy'></a>
#### 4. Get validation error on easy posts

* Repeat for posts which Tribe employees had initially already labeled. This constitutes the "easy" validation set.

In [9]:
# Load the stored validation set; its first CSV column is the index.
validation_csv = os.path.join("output","validation_df.csv")
validation_df = pd.read_csv(validation_csv, index_col=0)
# Rebuild the majority-voting table from the raw CSVs (default prolific-worker cut).
accessory_df = get_accessory(tribe_csvs, condense_repeat_votes=True)

rows dropped due to condensing = 5057
rows dropped due to non prolific worker = 3953


In [10]:
for cfg in cfg_list:
    pred_df = get_predictions(**cfg)
    # Drop any column this merge is about to (re)create. On a first run this
    # only removes the previous configuration's "flag_hashtag", so the final
    # table keeps the last configuration's flag. On a re-run of the cell it
    # also removes the earlier predictions, so the merge no longer produces
    # "_x"/"_y" duplicates followed by a KeyError.
    stale_cols = [
        col for col in pred_df.columns
        if col != "post_hash" and col in validation_df.columns
    ]
    validation_df = validation_df.drop(columns=stale_cols).merge(pred_df, on=["post_hash"])
    # Inner join: the only way it can go wrong silently is by matching nothing.
    assert len(validation_df) > 0

In [11]:
# Bring in the majority vote for each validation post.
validation_df = validation_df.merge(accessory_df, on=["post_hash"])
# Leave out posts whose majority vote is an exact 50/50 split.
conf_df = validation_df[validation_df["r_maj"] != 0.5]

# Share of posts where the thresholded model output equals the observed label.
conf_acc = []
for cfg in z_inferred_list:
    model_label = (conf_df[cfg] > 0.5).astype(int)
    conf_acc.append((model_label == conf_df["z_obs"]).mean())
pd.DataFrame({"config": z_inferred_list, "accuracy - easy cases": conf_acc})

Unnamed: 0,config,accuracy - easy cases
0,z_inferred_cfg1,0.823171
1,z_inferred_cfg3,0.914634
2,z_inferred_cfg11,0.914634
3,z_inferred_cfg12,0.792683


<a id='paths'></a>
#### 5. Plot paths of annotator skill

In [12]:
# Reload the cfg3 run for the skill-path plots.
pickle_in = os.path.join("output", "trace_cfg3.pkl")
data = pd.read_pickle(pickle_in)
trace = data["trace"]
# J = number of entries in jj_transformer; used below to enumerate the alpha_walk series.
J = len(data["jj_transformer"])


In [15]:
# Rank workers by the length of their alpha_walk0 series (second axis) and keep the ten longest.
Tj = [trace["alpha_walk0"+str(j)].shape[1] for j in range(J)]

volatility_pe = np.mean(trace["volatility0"], axis=0)
rank_df = pd.DataFrame({"volatility_pe":volatility_pe, "Tj":Tj})
rank_df = rank_df.sort_values(by=["Tj"], ascending=False)
most_prolific = rank_df.index[:10]

* The output of this cell has been cleared for the sake of confidentiality.

In [None]:
plt.figure(figsize=(10,4))

# (trace key prefix, colour, legend label) for the two series drawn in each panel.
walk_styles = [
    ("alpha_walk0", "blue", 'credibility when z=0'),
    ("alpha_walk1", "purple", 'credibility when z=1'),
]

for i, j in enumerate(most_prolific):
    # one panel per worker on a 2 x 5 grid
    plt.subplot(2,5,i+1)
    for walk_prefix, walk_color, walk_label in walk_styles:
        walk = trace[walk_prefix+str(j)]
        # logistic transform of every draw, then mean +/- one std across axis 0
        cred = 1/(1+np.exp(-1*walk))
        cred_mean = np.mean(cred, axis=0)
        cred_std = np.std(cred, axis=0)
        plt.fill_between(
            x=range(walk.shape[1]),
            y1=cred_mean + cred_std,
            y2=cred_mean - cred_std,
            alpha=0.3,
            lw=0,
            color=walk_color,
        )
        # The line is the logistic of the mean walk, not the mean of the
        # logistic-transformed draws that centres the band.
        plt.plot(1/(1+np.exp(-1*np.mean(walk, axis=0))), color=walk_color, label=walk_label)
    plt.ylim([0,1])
    plt.xlim([0,400])
    # x ticks only on the bottom row
    if i<5:
        plt.xticks([])
    else:
        plt.xticks([0,200,400])
    # y ticks only on the first column
    if i in (0, 5):
        plt.yticks([0,0.5,1])
    else:
        plt.yticks([])
    plt.axhline(y=0.5, ls='--', lw=0.5, color='gray')
    # Shared legend, y-axis caption and x-axis label are attached to specific panels.
    if i==3:
        plt.legend(ncol=2, bbox_to_anchor=(1.2,-1.7))
    if i==0:
        plt.text(-170, -0.1, 'Credibility', va='center', rotation='vertical', fontsize=12)
    if i==7:
        plt.xlabel("Objects Labeled (20s)", fontsize=12)