In [1]:
import pandas as pd
import numpy as np
import sys
import random
from sqlalchemy import create_engine

from snorkel.labeling import labeling_function

## Load Data

### DataFrame

In [2]:
## load the data by keeping each row with probability p (p = 0.01 below, i.e. roughly 1% of the data)
## is the data ordered? am I missing a large chunk due to my partial loading?
# p = 0.01
# responses = pd.read_csv('../data/moralmachine/SharedResponses.csv', skiprows=lambda i: i>0 and random.random() > p)

### SQL Engine

In [3]:
# Open a SQLAlchemy engine on the SQLite database that holds the response data.
# The path is relative to the notebook's working directory; echo=False keeps
# SQLAlchemy from logging each SQL statement it emits.
engine = create_engine("sqlite:///../data/moralmachine.db", echo=False)

In [4]:
# demo
# responses = pd.read_sql("SELECT * FROM sharedresponses ORDER BY RANDOM() LIMIT 100000", con=engine)
# responses.columns

In [5]:
# ## For viewing the possible values for each feature
# for feature in responses.columns:
#     print(feature)
#     print(responses[feature].unique())
#     print(responses[feature].unique().shape)

In [6]:
# ## What does a single user's session/response look like?
# responses_random = responses[responses['ScenarioTypeStrict'] == 'Random']
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     ## see a random user's session
# #     display(responses_random[responses_random['ExtendedSessionID'] == responses_random['ExtendedSessionID'].sample().values[0]])
    
#     ## see the user with nth most responses
#     n = 100
#     display(responses_random[responses_random['ResponseID'] == responses_random.groupby(by='ResponseID').size().sort_values(ascending=False).index[n]]) 

In [7]:
# ## group alternatives pairwise!
# # take only responses where both alternatives are present in the random sample - for a full sample, this step isn't necessary
# responses_grouped = responses.groupby(by='ResponseID').filter(lambda g: g.shape[0] < 2)
# responses_grouped.shape

## Data Exploration

In [8]:
# responses.describe()

## Snorkel Labeling
Using https://www.snorkel.org/use-cases/01-spam-tutorial

### Sample Data

In [9]:
# Pull a random sample of "Random"-scenario rows from the database, then keep
# only the responses for which more than one row landed in the sample (i.e.
# both alternatives of the pair are present), ordered by ResponseID.
query = """
    --SELECT * FROM (
        SELECT * FROM sharedresponses
            WHERE ScenarioTypeStrict LIKE 'Random'
        ORDER BY RANDOM()
        LIMIT 100000
    --)
    --GROUP BY ResponseID, ExtendedSessionID, UserID
    --HAVING COUNT(ResponseID) > 1
"""
sampled_rows = pd.read_sql(query, con=engine)
paired_rows = sampled_rows.groupby(by='ResponseID').filter(lambda grp: len(grp) > 1)
df = paired_rows.sort_values('ResponseID')
df.head()

Unnamed: 0,ResponseID,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,Barrier,CrossingSignal,AttributeLevel,ScenarioTypeStrict,...,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat
27198,25YSB5WoFBWfDxTfj,1405699273_5449630788398191.0,5449630788398190,11,1,0,0,1,Rand,Random,...,0,1,0,0,0,0,0,0,0,0
94856,25YSB5WoFBWfDxTfj,1405699273_5449630788398191.0,5449630788398190,11,0,0,1,0,Rand,Random,...,0,0,0,1,1,0,0,0,0,0
45283,2HpB4GxtEtjvYHXXS,591729313_3804871123016334.0,3804871123016340,8,0,0,0,0,Rand,Random,...,0,0,0,0,0,0,2,0,0,0
52017,2HpB4GxtEtjvYHXXS,591729313_3804871123016334.0,3804871123016340,8,1,0,1,0,Rand,Random,...,0,0,0,0,0,0,0,0,0,0
85417,2LuGsZGm99rCgNHho,936372580_1655509386,1655509386,1,0,0,1,0,Rand,Random,...,0,0,1,0,0,0,0,0,0,0


For comparison to Noothigattu et al., how many pairwise comparisons per voter in this sample dataset?

In [10]:
# number of pairwise comparisons per voter?
# = number of response IDs per voter
df.groupby('UserID')['ResponseID'].nunique().mean()

1.0124688279301746

In [11]:
df.columns

Index(['ResponseID', 'ExtendedSessionID', 'UserID', 'ScenarioOrder',
       'Intervention', 'PedPed', 'Barrier', 'CrossingSignal', 'AttributeLevel',
       'ScenarioTypeStrict', 'ScenarioType', 'DefaultChoice',
       'NonDefaultChoice', 'DefaultChoiceIsOmission', 'NumberOfCharacters',
       'DiffNumberOFCharacters', 'Saved', 'Template', 'DescriptionShown',
       'LeftHand', 'UserCountry3', 'Man', 'Woman', 'Pregnant', 'Stroller',
       'OldMan', 'OldWoman', 'Boy', 'Girl', 'Homeless', 'LargeWoman',
       'LargeMan', 'Criminal', 'MaleExecutive', 'FemaleExecutive',
       'FemaleAthlete', 'MaleAthlete', 'FemaleDoctor', 'MaleDoctor', 'Dog',
       'Cat'],
      dtype='object')

Out of the variables above, these are the variables that are treated as per-alternative, i.e. they can differ between the two rows of a response pair:
> 'Intervention', 'Barrier', 'CrossingSignal', 'NumberOfCharacters', 'DiffNumberOFCharacters', 'Saved', 'DescriptionShown',
'LeftHand', 'Man', 'Woman', 'Pregnant', 'Stroller',
'OldMan', 'OldWoman', 'Boy', 'Girl', 'Homeless', 'LargeWoman',
'LargeMan', 'Criminal', 'MaleExecutive', 'FemaleExecutive',
'FemaleAthlete', 'MaleAthlete', 'FemaleDoctor', 'MaleDoctor', 'Dog',
'Cat'

The remaining columns (e.g. 'Template', 'UserCountry3') are handled as per-response fields in the preprocessing below.

N.B. in each pair of responses, only one is the result of an intervention, and only one is saved. 

In [12]:
df[["ResponseID","Intervention","Saved"]].head()

Unnamed: 0,ResponseID,Intervention,Saved
27198,25YSB5WoFBWfDxTfj,1,0
94856,25YSB5WoFBWfDxTfj,0,1
45283,2HpB4GxtEtjvYHXXS,0,1
52017,2HpB4GxtEtjvYHXXS,1,0
85417,2LuGsZGm99rCgNHho,0,1


### Preprocessing

First, select the fields that are unique to each scenario (the fields that vary within pairs of responses). Then split the dataset into two disjoint sets of alternatives: one in which an intervention occurs, and one in which there is no intervention.

In [13]:
# Columns that describe one alternative of a scenario (plus the ResponseID key).
scenario_fields = [
    'ResponseID', 'Barrier', 'NumberOfCharacters', 'DiffNumberOFCharacters', 'Saved',
    'DescriptionShown', 'LeftHand', 'Man', 'Woman', 'Pregnant', 'Stroller',
    'OldMan', 'OldWoman', 'Boy', 'Girl', 'Homeless', 'LargeWoman', 'LargeMan',
    'Criminal', 'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete',
    'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat', "CrossingSignal",
]

# One frame per alternative, both keyed by ResponseID.
is_intervention = df['Intervention'] == 1
is_no_intervention = df['Intervention'] == 0
intervention = df.loc[is_intervention, scenario_fields].set_index('ResponseID')
no_intervention = df.loc[is_no_intervention, scenario_fields].set_index('ResponseID')

# Spot-check one randomly drawn response in both frames.
sample_response = df["ResponseID"].sample()
for caption, alternative in [
    ("Alternative w/ intervention:", intervention),
    ("Alternative w/o intervention:", no_intervention),
]:
    print(caption)
    display(alternative.loc[sample_response])

Alternative w/ intervention:


Unnamed: 0_level_0,Barrier,NumberOfCharacters,DiffNumberOFCharacters,Saved,DescriptionShown,LeftHand,Man,Woman,Pregnant,Stroller,...,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat,CrossingSignal
ResponseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q3cXtd4CSMAhwhRXo,1,5,3,1,1,1,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0


Alternative w/o intervention:


Unnamed: 0_level_0,Barrier,NumberOfCharacters,DiffNumberOFCharacters,Saved,DescriptionShown,LeftHand,Man,Woman,Pregnant,Stroller,...,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat,CrossingSignal
ResponseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q3cXtd4CSMAhwhRXo,0,2,3,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


Next, combine the datasets on response ID, separating the variable characteristics with suffixes.

In [14]:
# Place the two alternatives of every response side by side; the suffix on each
# column records which alternative it came from.
df_endo = intervention.join(
    no_intervention,
    how='inner',
    lsuffix='_int',
    rsuffix='_noint',
)
df_endo.loc[sample_response]

Unnamed: 0_level_0,Barrier_int,NumberOfCharacters_int,DiffNumberOFCharacters_int,Saved_int,DescriptionShown_int,LeftHand_int,Man_int,Woman_int,Pregnant_int,Stroller_int,...,Criminal_noint,MaleExecutive_noint,FemaleExecutive_noint,FemaleAthlete_noint,MaleAthlete_noint,FemaleDoctor_noint,MaleDoctor_noint,Dog_noint,Cat_noint,CrossingSignal_noint
ResponseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q3cXtd4CSMAhwhRXo,1,5,3,1,1,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


Get data for the columns from the raw data that _didn't_ change within response pairs. Remember to eliminate duplicate response pairs.

In [15]:
# Everything that is not a per-alternative scenario field; ResponseID is kept as the key.
exo_columns = [col for col in df.columns if col == 'ResponseID' or col not in scenario_fields]
df_exo = df[exo_columns].set_index('ResponseID')
# Every response contributes several raw rows; keep only the first one per ResponseID.
first_occurrence = ~df_exo.index.duplicated(keep='first')
df_exo = df_exo.loc[first_occurrence]
df_exo.loc[sample_response]

Unnamed: 0_level_0,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,AttributeLevel,ScenarioTypeStrict,ScenarioType,DefaultChoice,NonDefaultChoice,DefaultChoiceIsOmission,Template,UserCountry3
ResponseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Q3cXtd4CSMAhwhRXo,461521818_31823446047272.0,31823446047272,10,1,0,Rand,Random,Random,,,,Desktop,DEU


Then join that data in with the combined endogenous variables to get a full tuple for each pairwise comparison presented to a user.

In [16]:
# Attach the per-response columns to the paired alternatives (matched on the ResponseID index).
df_joined = df_exo.join(other=df_endo, how='inner')
df_joined.loc[sample_response]

Unnamed: 0_level_0,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,AttributeLevel,ScenarioTypeStrict,ScenarioType,DefaultChoice,NonDefaultChoice,...,Criminal_noint,MaleExecutive_noint,FemaleExecutive_noint,FemaleAthlete_noint,MaleAthlete_noint,FemaleDoctor_noint,MaleDoctor_noint,Dog_noint,Cat_noint,CrossingSignal_noint
ResponseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q3cXtd4CSMAhwhRXo,461521818_31823446047272.0,31823446047272,10,1,0,Rand,Random,Random,,,...,0,0,1,0,0,0,0,0,0,1


How much data did we lose with all these joins? Shouldn't be any - all we're doing is dividing the dataset in half.

In [17]:
# Print the shape of the raw sample and of every intermediate frame.
for frame_name, frame in [
    ("df", df),
    ("df_endo", df_endo),
    ("df_exo", df_exo),
    ("df_joined", df_joined),
]:
    print(f"{frame_name}: {frame.shape}")

df: (1624, 41)
df_endo: (812, 54)
df_exo: (812, 13)
df_joined: (812, 67)


Finally, let's make it easier to interpret the target variable. For each response, we know whether the user chose to save one set of characters (\_int) by intervention, or save another set (\_noint) by not intervening. Let's call that variable "Intervened" to indicate whether or not the user intervened (swerved the AV).

In [18]:
# Derive the binary target and drop the columns it was derived from.
# This cell consumes 'Saved_int'; re-running it without first re-running the
# join that builds df_joined raises a KeyError.
# NOTE(review): this treats Saved_int == 1 as "the user swerved", which holds
# only if an Intervention == 1 row lists the characters who are spared by
# swerving. Confirm against the Moral Machine data description -- if such a row
# instead lists the characters who die when the car swerves, this label (and
# the "saves" wording used when scenarios are printed) is inverted.
df_joined = (
    df_joined
    .assign(Intervened=(df_joined['Saved_int'] == 1).astype(int))
    .drop(columns=['Saved_int', 'Saved_noint', 'Intervention'])
)
df_joined.loc[sample_response]

Unnamed: 0_level_0,ExtendedSessionID,UserID,ScenarioOrder,PedPed,AttributeLevel,ScenarioTypeStrict,ScenarioType,DefaultChoice,NonDefaultChoice,DefaultChoiceIsOmission,...,MaleExecutive_noint,FemaleExecutive_noint,FemaleAthlete_noint,MaleAthlete_noint,FemaleDoctor_noint,MaleDoctor_noint,Dog_noint,Cat_noint,CrossingSignal_noint,Intervened
ResponseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q3cXtd4CSMAhwhRXo,461521818_31823446047272.0,31823446047272,10,0,Rand,Random,Random,,,,...,0,1,0,0,0,0,0,0,1,1


Now choose which features to use:

In [19]:
# include user countries and other metadata or not? decided not - experts writing LFs are trying to decide most moral response, not predict what an individual from a certain background would do
# but if it helps generalization, technically useful... try both?
# `features`: every column kept for modelling, including the "Intervened" target.
features = [
    "Intervened", "PedPed", "CrossingSignal_int", "CrossingSignal_noint", 'NumberOfCharacters_int', 'Man_int', 'Woman_int',
    'Pregnant_int', 'Stroller_int', 'OldMan_int', 'OldWoman_int', 'Boy_int',
    'Girl_int', 'Homeless_int', 'LargeWoman_int', 'LargeMan_int',
    'Criminal_int', 'MaleExecutive_int', 'FemaleExecutive_int',
    'FemaleAthlete_int', 'MaleAthlete_int', 'FemaleDoctor_int', 'Barrier_int',
    'MaleDoctor_int', 'Dog_int', 'Cat_int', 'Barrier_noint', 'NumberOfCharacters_noint',
    'Man_noint', 'Woman_noint', 'Pregnant_noint', 'Stroller_noint', 'OldMan_noint', 'OldWoman_noint',
    'Boy_noint', 'Girl_noint', 'Homeless_noint', 'LargeWoman_noint',
    'LargeMan_noint', 'Criminal_noint', 'MaleExecutive_noint',
    'FemaleExecutive_noint', 'FemaleAthlete_noint', 'MaleAthlete_noint',
    'FemaleDoctor_noint', 'MaleDoctor_noint', 'Dog_noint', 'Cat_noint', 
#     "Template", "UserCountry3", 'DescriptionShown_int', 'LeftHand_int', 'DescriptionShown_noint', 'LeftHand_noint'
]
# `cat_features`: categorical columns; none are enabled at the moment (the candidates are commented out).
cat_features = [
#     "Template", "UserCountry3"
]
# `num_features`: columns coerced to numeric below. Currently the same names as
# `features` minus the "Intervened" target, listed in a different order -- keep
# the two lists in step when adding or removing a feature.
num_features = [
    "PedPed", "CrossingSignal_int", "CrossingSignal_noint", "NumberOfCharacters_int", "NumberOfCharacters_noint", "Man_int", "Man_noint", "Woman_int", "Woman_noint", 
    "Pregnant_int", "Pregnant_noint", "Stroller_int", "Stroller_noint", "OldMan_int", "OldMan_noint", "OldWoman_int", "OldWoman_noint", "Boy_int", "Boy_noint",
    "LargeMan_noint", "LargeMan_int", "Criminal_int", "Criminal_noint", "MaleExecutive_int", "MaleExecutive_noint", "FemaleExecutive_int",
    "FemaleExecutive_noint", "Girl_int", "Girl_noint", "LargeWoman_int", "LargeWoman_noint", "FemaleAthlete_int", "FemaleAthlete_noint",
    "MaleAthlete_int", "MaleAthlete_noint", "FemaleDoctor_int", "FemaleDoctor_noint", "MaleDoctor_int", "MaleDoctor_noint", "Dog_int",
    "Dog_noint", "Cat_int", "Cat_noint", "Homeless_int", "Barrier_noint", "Homeless_noint", "Barrier_int"
#   'DescriptionShown_int', 'LeftHand_int', 'DescriptionShown_noint', 'LeftHand_noint'
]

Now deal with NA's or missing values:

In [20]:
# Select the modelling columns into an explicit copy, so the assignments below
# never write through to df_joined and never raise SettingWithCopyWarning.
df_proc = df_joined.loc[:, features].copy()
# Convert to numeric; values that cannot be parsed become NaN.
# Whole-column assignment (df_proc[f] = ...) replaces the column, so the
# numeric dtype returned by pd.to_numeric is kept. `.loc[:, f] = ...` writes
# into the existing column on recent pandas and can leave an object column
# as object.
for f in num_features:
    df_proc[f] = pd.to_numeric(df_proc[f], errors='coerce')
df_proc.shape

(812, 48)

In [21]:
# Discard every row in which at least one numeric feature is NaN.
df_proc = df_proc.dropna(subset=num_features)
df_proc.shape

(812, 48)

### Train/Test Split

A standard train test split for testing:

In [22]:
from sklearn.model_selection import train_test_split

# Target is "Intervened"; the features are every other column.
y = df_proc["Intervened"]
X = df_proc.drop(columns=["Intervened"])

# 80/20 train/test first; then 20% of train is held out as validation and
# 20% of test is held out as the dev set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)
X_test, X_dev, y_test, y_dev = train_test_split(X_test, y_test, test_size=0.2, random_state=1)
display(X_train.head(), y_train.head())

Unnamed: 0_level_0,PedPed,CrossingSignal_int,CrossingSignal_noint,NumberOfCharacters_int,Man_int,Woman_int,Pregnant_int,Stroller_int,OldMan_int,OldWoman_int,...,LargeMan_noint,Criminal_noint,MaleExecutive_noint,FemaleExecutive_noint,FemaleAthlete_noint,MaleAthlete_noint,FemaleDoctor_noint,MaleDoctor_noint,Dog_noint,Cat_noint
ResponseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7bCirWXbdbudHR7pn,0,2,0,2,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
XyBynoC3uusLvcWdi,0,1,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
n26MuwfGDm68JPerW,0,1,0,5,1,0,1,1,0,0,...,0,0,0,0,0,1,1,0,0,0
37pjQAoQgB3J7xvvj,0,0,1,5,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
tjWH5KuXHNr8qXa6r,0,0,2,5,0,1,1,0,0,1,...,0,0,1,0,0,0,0,0,0,0


ResponseID
7bCirWXbdbudHR7pn    1
XyBynoC3uusLvcWdi    1
n26MuwfGDm68JPerW    0
37pjQAoQgB3J7xvvj    1
tjWH5KuXHNr8qXa6r    0
Name: Intervened, dtype: int64

### Labeling Model

Some sample labeling functions, constructed with the help of the effect sizes in the Moral Machine experiment.

In [23]:
# Make the project-local heuristics directory importable, and reload its
# modules so that edits made since the last import are picked up.
sys.path.insert(1, '../heuristics')
import importlib
import labeling_functions
import utils
importlib.reload(labeling_functions)
importlib.reload(utils)

from snorkel.labeling import LFAnalysis, PandasLFApplier

# Labeling functions to apply, looked up by name on the freshly reloaded module.
selected_lf_names = [
    "doctors",
    "utilitarian",
    "utilitarian_anthro",
    "inaction",
    "pedestrians",
    "females",
    "fitness",
    "status",
    "legal",
    "illegal",
    "youth",
    "criminals",
    "homeless",
    "pets",
    "spare_strollers",
    "spare_girl",
    "spare_boy",
    "spare_pregnant",
]
lfs = [getattr(labeling_functions, lf_name) for lf_name in selected_lf_names]

# Label matrices for the train, dev and validation splits (applied in that order).
applier = PandasLFApplier(lfs=lfs)
L_train, L_dev, L_valid = (applier.apply(df=split) for split in (X_train, X_dev, X_val))

# Per-LF summary on the dev split, best "Correct" count first.
lf_report = LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=y_dev.values)
lf_report.sort_values("Correct", ascending=False)

  from pandas import Panel
100%|██████████| 519/519 [00:00<00:00, 679.30it/s]
100%|██████████| 33/33 [00:00<00:00, 826.31it/s]
100%|██████████| 130/130 [00:00<00:00, 828.51it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
utilitarian_anthro,2,"[0, 1]",0.848485,0.848485,0.818182,20,8,0.714286
youth,10,"[0, 1]",0.757576,0.757576,0.727273,18,7,0.72
utilitarian,1,"[0, 1]",0.787879,0.787879,0.757576,16,10,0.615385
inaction,3,[0],1.0,1.0,0.969697,15,18,0.454545
pedestrians,4,"[0, 1]",0.818182,0.818182,0.818182,14,13,0.518519
females,5,"[0, 1]",0.575758,0.575758,0.545455,13,6,0.684211
doctors,0,"[0, 1]",0.454545,0.454545,0.454545,11,4,0.733333
fitness,6,"[0, 1]",0.666667,0.666667,0.636364,11,11,0.5
status,7,"[0, 1]",0.424242,0.424242,0.424242,9,5,0.642857
spare_boy,16,"[0, 1]",0.333333,0.333333,0.30303,8,3,0.727273


## Aggregation

Recall that there are no true labels for this problem - really, we're just measuring the similarity of the heuristic labels to real voters' responses.

**Baseline**: majority label voting.

In [24]:
from snorkel.labeling import MajorityLabelVoter

# Baseline aggregator: each point gets the label most labeling functions voted for.
model_majority = MajorityLabelVoter()
# Predicted label per training point.
# NOTE(review): the -1 entries in the output look like Snorkel's "abstain" value
# (ties / no votes) -- confirm against the MajorityLabelVoter documentation.
preds_train = model_majority.predict(L=L_train)
preds_train

array([ 1,  1, -1,  1,  1,  1,  0,  0,  0,  0, -1, -1,  0,  1,  0,  0,  1,
        0,  1, -1,  0,  0, -1,  0,  0,  1,  1,  0, -1,  1,  0,  0,  0,  0,
       -1,  1,  0,  0,  0,  1, -1,  1,  0,  0,  1,  1,  1,  0,  0,  0,  1,
       -1,  1, -1,  0,  1,  0,  0,  1,  1,  1,  0,  1,  0,  1,  0,  1,  1,
        1,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  1,  0,  1,  0,  1,  1,
        0,  1,  1,  0,  1,  0,  0,  0,  0,  1,  0,  1,  0, -1,  0,  1,  1,
        1,  1,  0,  1,  0,  0,  1,  1, -1,  0,  0,  0,  0,  1,  0,  0,  0,
        0,  0,  0,  0,  1,  0,  1,  1,  1,  0,  1, -1,  0,  0,  0,  0,  1,
        0,  1,  0,  1,  1, -1,  0,  0,  0,  0,  1,  1,  1,  0,  0,  1,  0,
        1,  0,  1,  1,  0,  0,  0, -1,  1,  1,  0,  0,  1,  1,  0,  1,  0,
        0,  0,  0,  1,  0,  1,  1,  0,  1,  0,  0,  0,  0,  1,  0,  0,  0,
       -1,  0,  1,  0,  0, -1,  0,  0,  0,  1,  0,  0,  0,  1,  1,  1, -1,
        0, -1,  0,  1,  1,  0,  0,  0, -1,  0,  0,  0,  1, -1,  1,  1,  0,
        0,  1,  0,  1,  0

**Label Model**: Snorkel aggregator. Chooses weights to combine the labeling functions based on learned conditional probabilities.

In [25]:
from snorkel.labeling import LabelModel

# Two classes: intervene (1) vs. do not intervene (0).
model_label = LabelModel(cardinality=2, verbose=True)
model_label.fit(L_train=L_train, n_epochs=500, lr=.001, log_freq=100, seed=1)

# Show each labeling function's name above the weight the label model reports for it.
weight_row_names = [lf.name for lf in lfs]
weight_row_values = model_label.get_weights()
pd.DataFrame([weight_row_names, weight_row_values])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,doctors,utilitarian,utilitarian_anthro,inaction,pedestrians,females,fitness,status,legal,illegal,youth,criminals,homeless,pets,spare_strollers,spare_girl,spare_boy,spare_pregnant
1,0.731025,0.999199,1,0.5,0.542265,0.779617,0.522652,0.650394,0.488887,0.446887,0.773165,1,1,0.960062,0.687356,0.865729,0.69175,0.676495


How much does the label model improve on the majority voter?

In [26]:
# Score both aggregators on the validation label matrix, breaking ties at random.
for model in (model_majority, model_label):
    scores = model.score(L=L_valid, Y=y_val.values, tie_break_policy="random")
    acc = scores["accuracy"]
    print(f"{str(model)} {'Vote Accuracy:':<25} {acc*100:.1f}")

MajorityLabelVoter() Vote Accuracy:            60.8
LabelModel() Vote Accuracy:            62.3


### Eye Test - Debugging Label Model

A handy function for viewing pairwise alternatives:

In [75]:
from labeling_functions import characters_all

def pictofy(response, characters=None):
    """Print a small text/emoji picture of one pairwise scenario.

    Parameters
    ----------
    response : pandas.DataFrame or pandas.Series
        The scenario to draw. A DataFrame is read from its first row; a Series
        is treated as one row. It must provide ``CrossingSignal_<side>``,
        ``Barrier_<side>`` and ``<character>_<side>`` entries for both the
        ``int`` and the ``noint`` side.
    characters : iterable of str, optional
        Character column prefixes to list for each side. Defaults to
        ``characters_all`` imported from ``labeling_functions``.

    Returns
    -------
    None
        The picture is printed, not returned.
    """
    if characters is None:
        characters = characters_all
    # Reduce the input to a single row once, so every lookup below yields a
    # plain scalar (calling int() on a one-element Series is deprecated).
    row = response.iloc[0] if isinstance(response, pd.DataFrame) else response

    def light(side):
        # 2 -> red light, 1 -> green light, anything else -> no light drawn
        signal = row["CrossingSignal_{}".format(side)]
        if signal == 2:
            return "🔴"
        if signal == 1:
            return "🟢"
        return " "

    sides = ["noint", "int"]
    crossing_light = [light(side) for side in sides]
    # A barrier on that side is drawn as a roadblock, otherwise as a crossing.
    pedped = ["🚧" if row["Barrier_{}".format(side)] == 1 else "🚸" for side in sides]
    out_string = (
        "What should the self-driving car do?\n\n"
        "\t    🚘 \n"
        "\t   |\t\\ \n"
        "\t   v \t v\n"
        + "\t{}{} \t{}{}\n".format(crossing_light[0], *pedped, crossing_light[1])
        + "\t NOINT\tINT\t\n"
    )
    # NOTE(review): the "saves" wording assumes the *_int / *_noint columns list
    # the characters spared by that choice -- confirm against the dataset docs.
    for k, s in {"INT": "_int", "NOINT": "_noint"}.items():
        counts = {col: row["{}{}".format(col, s)] for col in characters}
        out_list = []
        for c, v in counts.items():
            # repeat the character name once per individual present
            out_list.extend(c.split("_")[0] for _ in range(int(v)))
        out_string += "{} saves: \n{}\n".format(k, out_list)
    print(out_string)
    
pictofy(X.sample())

	    🚘 
	   |	\ 
	   v 	 v
	 🚧 	🚸 
	 NOINT	INT	
INT saves: 
['OldWoman', 'Girl', 'Homeless']
NOINT saves: 
['Girl', 'LargeWoman', 'LargeWoman', 'MaleExecutive', 'MaleDoctor']



Now, use the label model to create probabilistic labels for the dev set. Rounding off, create binary predictions.

In [48]:
from snorkel.analysis import get_label_buckets

# Decision threshold applied to the label model's probabilities.
threshold = 0.5

# Probabilistic labels for the dev split, then a boolean array of the same
# shape. The comparison uses `threshold` (not a repeated literal), so changing
# the value above actually changes the predictions.
probs_dev = model_label.predict_proba(L=L_dev)
preds_dev = probs_dev >= threshold

Create label buckets for eyeball debugging (groups TP, FP, TN, FN).

In [67]:
# confusion matrix
from sklearn.metrics import confusion_matrix

# Column 1 of preds_dev is the thresholded prediction used throughout below.
predicted_flags = preds_dev[:, 1]
actual_flags = y_dev.values.astype(bool)
cm = pd.crosstab(actual_flags, predicted_flags, rownames=['Actual'], colnames=['Predicted'])
print(cm)

# Row positions of the dev points, grouped by (actual, predicted) label pair.
buckets = get_label_buckets(y_dev.values, predicted_flags)

Predicted  False  True 
Actual                 
False         11      4
True           5     13


#### False Negatives
Here, the user chose to intervene, while the label model did not. 

In [69]:
# false negatives: bucket (actual=1, predicted=0)
fn_positions = buckets[(1, 0)]
# .copy() gives an independent frame, so adding a column below neither raises
# SettingWithCopyWarning nor depends on whether .iloc returned a view of X_dev
df_fn_dev = X_dev.iloc[fn_positions].copy()
# attach the label model's probability (column 1) for each false negative
df_fn_dev["probability"] = probs_dev[fn_positions, 1]
# check out one of them (fixed seed, so the same example is shown every run)
pictofy(df_fn_dev.sample(random_state=3))

ResponseID
pqGArknH6LdTNiYyc    1
Name: Barrier_noint, dtype: int64
What should the self-driving car do?

	    🚘 
	   |	\ 
	   v 	 v
	 🚧 	🚸🔴
	 NOINT	INT	
INT saves: 
['Girl', 'Homeless', 'LargeMan', 'MaleExecutive']
NOINT saves: 
['Woman', 'Stroller', 'OldWoman', 'Homeless', 'FemaleAthlete']



#### False Positives
Here, the user chose not to intervene, but the label model did.

In [70]:
# false positives: bucket (actual=0, predicted=1)
fp_positions = buckets[(0, 1)]
# .copy() gives an independent frame, so adding a column below neither raises
# SettingWithCopyWarning nor depends on whether .iloc returned a view of X_dev
df_fp_dev = X_dev.iloc[fp_positions].copy()
# attach the label model's probability (column 1) for each false positive
df_fp_dev["probability"] = probs_dev[fp_positions, 1]
# check out one of them (fixed seed, so the same example is shown every run)
pictofy(df_fp_dev.sample(random_state=3))

ResponseID
ajKNqxwtS3eAPM3n6    0
Name: Barrier_noint, dtype: int64
What should the self-driving car do?

	    🚘 
	   |	\ 
	   v 	 v
	 🚸 	🚸 
	 NOINT	INT	
INT saves: 
['Pregnant', 'Boy', 'Girl', 'LargeWoman', 'FemaleAthlete']
NOINT saves: 
['LargeMan', 'MaleAthlete', 'MaleDoctor']



## Classification

https://www.snorkel.org/use-cases/01-spam-tutorial#5-training-a-classifier

### Featurization

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
# Categorical branch: fill missing values with the literal 'missing', then one-hot encode
# (categories unseen during fit are ignored at transform time).
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('oh_enc', OneHotEncoder(handle_unknown='ignore'))
])
# Numeric branch: fill missing values with the column median, then standardise.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Route each column list to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ]
)
clf = Pipeline([
    ('preprocessor', preprocessor),
    # Fixed seed (the same value used for the splits and the label model) so the
    # forest, and every accuracy reported from it, is reproducible across runs.
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=1))
#     ('classifier', LogisticRegression(C=.001))
])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Using just the labels (no label model):

In [20]:
from snorkel.analysis import metric_score

def test_accuracy(clf, X=None, y=None):
    """Print the accuracy of a fitted classifier on a held-out set.

    Parameters
    ----------
    clf : estimator
        Fitted object exposing ``predict``.
    X, y : optional
        Features and gold labels to evaluate on. When omitted, the notebook's
        ``X_test`` / ``y_test`` are used, which is the original behaviour.
        Pass both or neither so that features and labels stay aligned.

    Returns
    -------
    None
        The accuracy is printed as a percentage with one decimal place.
    """
    if X is None:
        X = X_test
    if y is None:
        y = y_test
    # predictions are rounded before scoring, as in the original implementation
    preds = np.round(clf.predict(X))
    acc = metric_score(golds=y, preds=preds, metric="accuracy")
    print(f"Test Accuracy: {acc * 100:.1f}%")

# Baseline: train on the real responses, then evaluate on the held-out test split.
clf.fit(X=X_train, y=y_train)
test_accuracy(clf)

Test Accuracy: 82.9%


Using the label model, filter out unlabeled points:

In [21]:
from snorkel.labeling import filter_unlabeled_dataframe

# Probabilistic labels for the training split, produced by the label model.
probs_train = model_label.predict_proba(L=L_train)
# Keep only the training points that are labeled, together with their probabilities.
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=X_train,
    y=probs_train,
    L=L_train,
)
df_train_filtered

Unnamed: 0_level_0,PedPed,CrossingSignal,NumberOfCharacters_int,Man_int,Woman_int,Pregnant_int,Stroller_int,OldMan_int,OldWoman_int,Boy_int,...,LargeMan_noint,Criminal_noint,MaleExecutive_noint,FemaleExecutive_noint,FemaleAthlete_noint,MaleAthlete_noint,FemaleDoctor_noint,MaleDoctor_noint,Dog_noint,Cat_noint
ResponseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
amAqXftPdTjeq2BqK,1,2,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,2,1,0
y74L4Em6gR8BwaRez,0,0,3,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
W6Qc5xfozrnebhPRR,0,1,4,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
327AQ3oN82RnGCdb3,0,1,3,0,0,1,0,0,0,1,...,0,0,0,0,0,0,1,1,0,0
SXf9eXPHAWZdoB5wG,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47wdS2HrqS4Y8Ayg2,0,0,2,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PZ4zjX3dZAKxhKmgD,0,0,3,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
BuTuepkymLAgEcRi8,0,2,4,0,0,0,0,2,0,0,...,0,0,0,0,1,0,0,0,0,0
yaLtEmdbzh2juTTw3,0,2,4,0,0,0,0,0,0,0,...,0,2,0,1,0,0,0,0,0,0


In [22]:
from snorkel.utils import probs_to_preds

# Turn the label model's probabilities into the label array the classifier is trained on.
preds_train_filtered = probs_to_preds(probs=probs_train_filtered)
# Refit the same pipeline on the label-model labels (this replaces the earlier fit of `clf`),
# then report its accuracy via test_accuracy.
clf.fit(X=df_train_filtered, y=preds_train_filtered)
test_accuracy(clf)

Test Accuracy: 66.7%
