# Introduction to VICE

## Load libraries

In [17]:
import numpy as np
import pandas as pd

## Processing Data

In [18]:
# Read the raw triplet data. Each row lists three items (item1, item2, item3)
# and the chosen item, plus an `exclude` flag that is used for filtering below.
# The CSV's unnamed first column is loaded as "Unnamed: 0".
triplets_raw = pd.read_csv("data/things48triplets.csv")
# Display the frame (pandas shows only the first and last rows of a long frame)
triplets_raw

Unnamed: 0,worker_no,exclude,item1,item2,item3,chosen_item,isrepeat,RT
0,1,0,telegraph,bumper,canvas,telegraph,0,12260
1,1,0,punching bag,tarantula,music box,tarantula,0,2450
2,1,0,telegraph,chariot,rolling pin,chariot,1,1916
3,1,0,rudder,bumper,washcloth,washcloth,0,2453
4,1,0,flagpole,bucket,pizza,pizza,0,4671
...,...,...,...,...,...,...,...,...
52267,121,0,candelabra,file,jet,candelabra,1,1632
52268,121,0,dice,tea,pizza,pizza,0,3525
52269,121,0,dice,canvas,telegraph,canvas,1,1606
52270,121,0,washcloth,dice,bookshelf,bookshelf,0,2083


In [19]:
# Keep only the rows that are not flagged for exclusion, then keep only the
# three item columns and the choice column.
triplets_filtered = (
    triplets_raw
    .loc[triplets_raw['exclude'].eq(0)]
    .loc[:, ['item1', 'item2', 'item3', 'chosen_item']]
)

In [20]:
# Sanity check: np.unique returns the unique values in sorted order. The
# converter below relies on this to assign integer IDs to item names when no
# explicit item_names array is given.
np.unique(["a","c","b"])

array(['a', 'b', 'c'], dtype='<U1')

In [40]:
def convert_triplets_to_apns(
    triplets_df, item_col_names=None, choice_col_name=None, item_names=None
):
    """
    Convert triplet choices to integer anchor-positive-negative (APN) triplets.

    Each input row names three items and the one item that was chosen.  In the
    output row the chosen item is the negative (last column) and the two
    remaining items are the anchor and the positive, in their original order:

        chosen item   anchor     positive   negative
        1st item      2nd item   3rd item   1st item
        2nd item      1st item   3rd item   2nd item
        3rd item      1st item   2nd item   3rd item

    If the chosen name occurs in more than one item column of a row, the last
    matching column is treated as the chosen one.

    Parameters
    ----------
    triplets_df : pandas.DataFrame
        Frame with at least four columns containing item names.
    item_col_names : sequence of str, optional
        The three column names of the item columns.
        If None, the first 3 columns are assumed to be the item columns.
    choice_col_name : str, optional
        The column name of the choice column.
        If None, the 4th column is assumed to be the choice column.
    item_names : array-like of str, optional
        The 0-indexed sequence where element i is the item name that will be
        mapped to integer i.  If None, the sorted array of unique item names
        found in the four columns is used to construct the mapping.

    Returns
    -------
    apns : numpy.ndarray of int, shape (len(triplets_df), 3)
        Item IDs in (anchor, positive, negative) order, one row per input row,
        in the same row order as triplets_df.
    item_names : the name sequence used for the mapping
        The `item_names` argument as passed in, or the sorted numpy array of
        unique names when it was None.

    Raises
    ------
    AssertionError
        If triplets_df has fewer than 4 columns.
    ValueError
        If a name in triplets_df is missing from `item_names`, or if the chosen
        item of some row is not one of that row's three items.
    """
    assert len(triplets_df.columns) >= 4, "triplets_df must have at least 4 columns"
    if item_col_names is None:
        item_col_names = triplets_df.columns[:3]
    if choice_col_name is None:
        choice_col_name = triplets_df.columns[3]

    # The three item columns followed by the choice column.
    name_columns = [
        triplets_df[item_col_names[0]],
        triplets_df[item_col_names[1]],
        triplets_df[item_col_names[2]],
        triplets_df[choice_col_name],
    ]
    if item_names is None:
        item_names = np.unique(np.concatenate(name_columns))

    # dict for converting name to ID
    n2id = {name: idx for idx, name in enumerate(item_names)}

    # Map every column to integer IDs.  Names absent from the mapping come back
    # as NaN from Series.map; reject them here instead of letting NaN be cast
    # into the integer output array.
    id_columns = []
    unknown_names = set()
    for names in name_columns:
        ids = names.map(n2id)
        missing = ids.isna().to_numpy()
        if missing.any():
            unknown_names.update(names[missing].tolist())
        else:
            # Plain numpy arrays: everything below is purely positional and
            # independent of the DataFrame's index labels.
            id_columns.append(ids.to_numpy(dtype=int))
    if unknown_names:
        raise ValueError(
            "item_names is missing names found in triplets_df: "
            + ", ".join(sorted(repr(name) for name in unknown_names))
        )
    first, second, third, chosen = id_columns

    # Position (0, 1 or 2) of the chosen item within the triplet; -1 marks rows
    # whose chosen item matches none of the three items.  Later assignments
    # override earlier ones, so the last matching column wins.
    choice = np.full(len(triplets_df), -1)
    choice[first == chosen] = 0
    choice[second == chosen] = 1
    choice[third == chosen] = 2
    unmatched = choice == -1
    if unmatched.any():
        raise ValueError(
            "{} row(s) have a chosen item that is not one of the three items; "
            "first offending index labels: {}".format(
                int(unmatched.sum()), triplets_df.index[unmatched][:5].tolist()
            )
        )

    # APN assignment
    # choice anchor positive negative
    #      0      1        2        0
    #      1      0        2        1
    #      2      0        1        2
    apns = np.empty((len(triplets_df), 3), dtype=int)
    apns[:, 0] = np.where(choice == 0, second, first)
    apns[:, 1] = np.where(choice == 2, second, third)
    apns[:, 2] = chosen  # the chosen item is always the negative
    return apns, item_names

In [41]:
# Convert the filtered triplets to integer APN rows. With the default arguments
# the first three columns are the items and the fourth is the choice;
# item_names[i] is the item name that was mapped to integer ID i.
apns, item_names = convert_triplets_to_apns(triplets_filtered)

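### Explanation of APN format (Skippable)

Each row of `apns` holds three integer item IDs in the order (anchor, positive, negative). The chosen item of a triplet becomes the negative, and the two items that were not chosen become the anchor and the positive. Indexing `item_names` with a row of `apns` recovers the item names, as the example below shows.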

In [47]:
# Item names of the APN row at position 22, in (anchor, positive, negative) order
item_names[apns[22]]

array(['trophy', 'headphones', 'furnace'], dtype=object)

In [48]:
# Use positional indexing so this is guaranteed to be the row behind apns[22]:
# apns rows follow the row order of triplets_filtered, whereas its index labels
# can have gaps after filtering, so a label lookup could return a different row.
triplets_filtered.iloc[22]

item1             furnace
item2              trophy
item3          headphones
chosen_item       furnace
Name: 22, dtype: object