In [1]:
import pandas as pd
import numpy as np
import sys
import random
from sqlalchemy import create_engine

from snorkel.labeling import labeling_function

## Load Data

### DataFrame

In [2]:
## load the data by choosing a random 10% of the data
## is the data ordered? am I missing a large chunk due to my partial loading?
# p = 0.01
# responses = pd.read_csv('../data/moralmachine/SharedResponses.csv', skiprows=lambda i: i>0 and random.random() > p)

### SQL Engine

In [3]:
# access SQL DB with data
engine = create_engine("sqlite:///../data/moralmachine.db", echo=False)

In [4]:
# demo
# responses = pd.read_sql("SELECT * FROM sharedresponses ORDER BY RANDOM() LIMIT 100000", con=engine)
# responses.columns

In [5]:
# ## For viewing the possible values for each feature
# for feature in responses.columns:
#     print(feature)
#     print(responses[feature].unique())
#     print(responses[feature].unique().shape)

In [6]:
# ## What does a single user's session/response look like?
# responses_random = responses[responses['ScenarioTypeStrict'] == 'Random']
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     ## see a random user's session
# #     display(responses_random[responses_random['ExtendedSessionID'] == responses_random['ExtendedSessionID'].sample().values[0]])
    
#     ## see the user with nth most responses
#     n = 100
#     display(responses_random[responses_random['ResponseID'] == responses_random.groupby(by='ResponseID').size().sort_values(ascending=False).index[n]]) 

In [7]:
# ## group alternatives pairwise!
# # take only responses where both alternatives are present in the random sample - for a full sample, this step isn't necessary
# responses_grouped = responses.groupby(by='ResponseID').filter(lambda g: g.shape[0] < 2)
# responses_grouped.shape

## Data Exploration

In [8]:
# responses.describe()

## Snorkel Labeling
Using https://www.snorkel.org/use-cases/01-spam-tutorial

In [9]:
# query random sample of responses grouped by ResponseID; only take responses for which both instances are present
query = """
    --SELECT * FROM (
        SELECT * FROM sharedresponses
            WHERE ScenarioTypeStrict LIKE 'Random'
        ORDER BY RANDOM()
        LIMIT 100000
    --)
    --GROUP BY ResponseID, ExtendedSessionID, UserID
    --HAVING COUNT(ResponseID) > 1
"""
df = pd.read_sql(query, con=engine).groupby(by='ResponseID').filter(lambda g: g.shape[0] > 1).sort_values('ResponseID')
df

Unnamed: 0,ResponseID,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,Barrier,CrossingSignal,AttributeLevel,ScenarioTypeStrict,...,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat
75354,2MiCgDBtDEqD26wa2,-1952801970_1656066627886452.0,1656066627886450,10,1,0,1,0,Rand,Random,...,0,0,0,0,0,0,0,1,0,0
90359,2MiCgDBtDEqD26wa2,-1952801970_1656066627886452.0,1656066627886450,10,0,0,0,1,Rand,Random,...,0,0,0,0,0,0,0,0,0,0
20477,2NXph6YggTsPyW7Hf,-1901794190_3687017707098259.0,3687017707098260,11,1,0,1,0,Rand,Random,...,0,0,0,0,0,0,0,0,0,0
8604,2NXph6YggTsPyW7Hf,-1901794190_3687017707098259.0,3687017707098260,11,0,0,0,2,Rand,Random,...,1,1,0,0,0,0,0,0,0,0
41847,2YLruszpQxGtHWnqQ,-825740645_516289185,516289185,2,0,0,1,0,Rand,Random,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30583,zrRKAjffabiGF6E83,-99643939_1062068271,1062068271,11,1,1,0,2,Rand,Random,...,0,0,0,0,0,0,0,0,0,0
4227,ztWnoj6bHwxrMf9bW,-218458164_24114949,24114949,2,1,0,1,0,Rand,Random,...,0,0,0,0,0,0,1,0,1,0
90368,ztWnoj6bHwxrMf9bW,-218458164_24114949,24114949,2,0,0,0,0,Rand,Random,...,0,2,0,0,0,0,0,0,1,0
44048,zzJDCNTx2BrKhCoft,1629226_4181225833514974.0,4181225833514980,4,1,0,0,1,Rand,Random,...,0,0,0,0,0,1,0,0,0,0


Need to have both alternatives in the same tuple, marking them by whether or not they are the intervention - specifically the factors:
> 'NumberOfCharacters', 'DiffNumberOfCharacters', 'Saved', 'Template', 'DescriptionShown',
'LeftHand', 'UserCountry3', 'Man', 'Woman', 'Pregnant', 'Stroller',
'OldMan', 'OldWoman', 'Boy', 'Girl', 'Homeless', 'LargeWoman',
'LargeMan', 'Criminal', 'MaleExecutive', 'FemaleExecutive',
'FemaleAthlete', 'MaleAthlete', 'FemaleDoctor', 'MaleDoctor', 'Dog',
'Cat'

In [10]:
df[["ResponseID","Intervention","Saved"]]

Unnamed: 0,ResponseID,Intervention,Saved
75354,2MiCgDBtDEqD26wa2,1,0
90359,2MiCgDBtDEqD26wa2,0,1
20477,2NXph6YggTsPyW7Hf,1,0
8604,2NXph6YggTsPyW7Hf,0,1
41847,2YLruszpQxGtHWnqQ,0,0
...,...,...,...
30583,zrRKAjffabiGF6E83,1,0
4227,ztWnoj6bHwxrMf9bW,1,1
90368,ztWnoj6bHwxrMf9bW,0,0
44048,zzJDCNTx2BrKhCoft,1,0


In [11]:
scenario_fields = [
    'ResponseID', 'Barrier', 'NumberOfCharacters', 'DiffNumberOFCharacters', 'Saved', 'DescriptionShown', 'LeftHand', 'Man', 'Woman', 'Pregnant', 'Stroller', \
    'OldMan', 'OldWoman', 'Boy', 'Girl', 'Homeless', 'LargeWoman', 'LargeMan', 'Criminal', 'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete', \
    'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat'
]
intervention = df[df['Intervention'] == 1][scenario_fields].set_index('ResponseID')
no_intervention = df[df['Intervention'] == 0][scenario_fields].set_index('ResponseID')
df_joined = intervention.join(no_intervention, lsuffix='_int', rsuffix='_noint', how='inner')
df_joined = df[[col for col in df.columns if col not in scenario_fields or col == 'ResponseID']].set_index('ResponseID').join(df_joined)
df_joined['Intervened'] = (df_joined['Saved_int'] == 1).astype(int)
df_joined = df_joined.drop(axis='columns', labels=['Saved_{}'.format(s) for s in ['int', 'noint']])
df_joined

Unnamed: 0_level_0,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,CrossingSignal,AttributeLevel,ScenarioTypeStrict,ScenarioType,DefaultChoice,...,Criminal_noint,MaleExecutive_noint,FemaleExecutive_noint,FemaleAthlete_noint,MaleAthlete_noint,FemaleDoctor_noint,MaleDoctor_noint,Dog_noint,Cat_noint,Intervened
ResponseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2MiCgDBtDEqD26wa2,-1952801970_1656066627886452.0,1656066627886450,10,1,0,0,Rand,Random,Random,,...,0,0,0,0,0,0,0,0,0,0
2MiCgDBtDEqD26wa2,-1952801970_1656066627886452.0,1656066627886450,10,0,0,1,Rand,Random,Random,,...,0,0,0,0,0,0,0,0,0,0
2NXph6YggTsPyW7Hf,-1901794190_3687017707098259.0,3687017707098260,11,1,0,0,Rand,Random,Random,,...,1,0,0,0,0,0,0,0,0,0
2NXph6YggTsPyW7Hf,-1901794190_3687017707098259.0,3687017707098260,11,0,0,2,Rand,Random,Random,,...,1,0,0,0,0,0,0,0,0,0
2YLruszpQxGtHWnqQ,-825740645_516289185,516289185,2,0,0,0,Rand,Random,Random,,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zrRKAjffabiGF6E83,-99643939_1062068271,1062068271,11,1,1,2,Rand,Random,Random,,...,0,0,0,1,0,0,0,1,0,0
ztWnoj6bHwxrMf9bW,-218458164_24114949,24114949,2,1,0,0,Rand,Random,Random,,...,2,0,0,0,0,0,0,1,0,1
ztWnoj6bHwxrMf9bW,-218458164_24114949,24114949,2,0,0,0,Rand,Random,Random,,...,2,0,0,0,0,0,0,1,0,1
zzJDCNTx2BrKhCoft,1629226_4181225833514974.0,4181225833514980,4,1,0,1,Rand,Random,Random,,...,1,0,0,1,0,1,0,0,0,0


A standard train test split for testing:

In [12]:
from sklearn.model_selection import train_test_split

X = df_joined.drop(labels=["Intervened"], axis='columns', inplace=False)
y = df_joined["Intervened"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)
X_test, X_dev, y_test, y_dev = train_test_split(X_test, y_test, test_size=0.2, random_state=1)

Some sample labeling functions:

In [35]:
sys.path.insert(1, '../hmm-app')
import heuristics
import importlib
importlib.reload(heuristics)
from snorkel.labeling import PandasLFApplier

lfs = [
    heuristics.lf_doctors,
    heuristics.lf_utilitarian,
    heuristics.lf_inaction,
    heuristics.lf_pedestrians,
    heuristics.lf_females,
    heuristics.lf_fitness,
    heuristics.lf_status,
    heuristics.lf_legal,
    heuristics.lf_illegal,
    heuristics.lf_youth,
    heuristics.lf_criminals,
    heuristics.lf_homeless,
    heuristics.lf_pets,
    heuristics.lf_spare_strollers,
    heuristics.lf_spare_girl,
    heuristics.lf_spare_boy,
    heuristics.lf_spare_pregnant,
]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=X_train)
L_dev = applier.apply(df=X_dev)
L_train

  from pandas import Panel


  0%|          | 0/1009 [00:00<?, ?it/s][A[A

  6%|▌         | 62/1009 [00:00<00:01, 618.47it/s][A[A

 13%|█▎        | 127/1009 [00:00<00:01, 625.27it/s][A[A

 19%|█▉        | 193/1009 [00:00<00:01, 635.20it/s][A[A

 26%|██▌       | 263/1009 [00:00<00:01, 650.72it/s][A[A

 33%|███▎      | 332/1009 [00:00<00:01, 661.86it/s][A[A

 39%|███▉      | 398/1009 [00:00<00:00, 660.88it/s][A[A

 46%|████▌     | 464/1009 [00:00<00:00, 659.93it/s][A[A

 53%|█████▎    | 530/1009 [00:00<00:00, 658.81it/s][A[A

 59%|█████▉    | 599/1009 [00:00<00:00, 666.89it/s][A[A

 66%|██████▌   | 668/1009 [00:01<00:00, 672.13it/s][A[A

 73%|███████▎  | 737/1009 [00:01<00:00, 676.99it/s][A[A

 80%|███████▉  | 806/1009 [00:01<00:00, 679.80it/s][A[A

 87%|████████▋ | 874/1009 [00:01<00:00, 675.29it/s][A[A

100%|██████████| 1009/1009 [00:01<00:00, 669.31it/s][A[A


100%|██████████| 64/64 [00:00<00:00, 700.30it/s]


array([[ 1, -1,  0, ...,  1, -1,  0],
       [-1,  1,  0, ..., -1, -1, -1],
       [ 0,  0,  0, ..., -1, -1, -1],
       ...,
       [-1,  1,  0, ..., -1,  0, -1],
       [ 1,  1,  0, ..., -1, -1, -1],
       [-1, -1,  0, ...,  0, -1,  1]])

In [36]:
from snorkel.labeling import LFAnalysis

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_doctors,0,"[0, 1]",0.409316,0.409316,0.395441
lf_utilitarian,1,"[0, 1]",0.806739,0.806739,0.772052
lf_inaction,2,[0],1.0,1.0,0.956392
lf_pedestrians,3,"[0, 1]",0.878097,0.878097,0.848365
lf_females,4,"[0, 1]",0.592666,0.592666,0.572844
lf_fitness,5,"[0, 1]",0.622398,0.622398,0.60555
lf_status,6,"[0, 1]",0.361744,0.361744,0.346878
lf_legal,7,"[0, 1]",0.173439,0.173439,0.166501
lf_illegal,8,"[0, 1]",0.214073,0.214073,0.214073
lf_youth,9,"[0, 1]",0.587711,0.587711,0.587711


In [40]:
LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=y_dev.values).sort_values("Emp. Acc.", ascending=False)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
lf_pets,12,[1],0.015625,0.015625,0.015625,1,0,1.0
lf_homeless,11,[1],0.015625,0.015625,0.015625,1,0,1.0
lf_criminals,10,"[0, 1]",0.046875,0.046875,0.046875,3,0,1.0
lf_utilitarian,1,"[0, 1]",0.734375,0.734375,0.703125,30,17,0.638298
lf_youth,9,"[0, 1]",0.546875,0.546875,0.546875,22,13,0.628571
lf_spare_pregnant,16,"[0, 1]",0.28125,0.28125,0.28125,11,7,0.611111
lf_females,4,"[0, 1]",0.578125,0.578125,0.546875,22,15,0.594595
lf_doctors,0,"[0, 1]",0.453125,0.453125,0.421875,16,13,0.551724
lf_fitness,5,"[0, 1]",0.515625,0.515625,0.46875,18,15,0.545455
lf_status,6,"[0, 1]",0.359375,0.359375,0.34375,11,12,0.478261


In [41]:
X_train.columns

Index(['ExtendedSessionID', 'UserID', 'ScenarioOrder', 'Intervention',
       'PedPed', 'CrossingSignal', 'AttributeLevel', 'ScenarioTypeStrict',
       'ScenarioType', 'DefaultChoice', 'NonDefaultChoice',
       'DefaultChoiceIsOmission', 'Template', 'UserCountry3', 'Barrier_int',
       'NumberOfCharacters_int', 'DiffNumberOFCharacters_int',
       'DescriptionShown_int', 'LeftHand_int', 'Man_int', 'Woman_int',
       'Pregnant_int', 'Stroller_int', 'OldMan_int', 'OldWoman_int', 'Boy_int',
       'Girl_int', 'Homeless_int', 'LargeWoman_int', 'LargeMan_int',
       'Criminal_int', 'MaleExecutive_int', 'FemaleExecutive_int',
       'FemaleAthlete_int', 'MaleAthlete_int', 'FemaleDoctor_int',
       'MaleDoctor_int', 'Dog_int', 'Cat_int', 'Barrier_noint',
       'NumberOfCharacters_noint', 'DiffNumberOFCharacters_noint',
       'DescriptionShown_noint', 'LeftHand_noint', 'Man_noint', 'Woman_noint',
       'Pregnant_noint', 'Stroller_noint', 'OldMan_noint', 'OldWoman_noint',
       'Boy_n