In [1]:
import pandas as pd
import numpy as np
import sys
import random
from sqlalchemy import create_engine

from snorkel.labeling import labeling_function

## Load Data

### DataFrame

In [2]:
## load the data by choosing a random 10% of the data
## is the data ordered? am I missing a large chunk due to my partial loading?
# p = 0.01
# responses = pd.read_csv('../data/moralmachine/SharedResponses.csv', skiprows=lambda i: i>0 and random.random() > p)

### SQL Engine

In [3]:
# access SQL DB with data
engine = create_engine("sqlite:///../data/moralmachine.db", echo=False)

In [4]:
# demo
responses = pd.read_sql("SELECT * FROM sharedresponses ORDER BY RANDOM() LIMIT 100000", con=engine)
responses.columns

Index(['ResponseID', 'ExtendedSessionID', 'UserID', 'ScenarioOrder',
       'Intervention', 'PedPed', 'Barrier', 'CrossingSignal', 'AttributeLevel',
       'ScenarioTypeStrict', 'ScenarioType', 'DefaultChoice',
       'NonDefaultChoice', 'DefaultChoiceIsOmission', 'NumberOfCharacters',
       'DiffNumberOFCharacters', 'Saved', 'Template', 'DescriptionShown',
       'LeftHand', 'UserCountry3', 'Man', 'Woman', 'Pregnant', 'Stroller',
       'OldMan', 'OldWoman', 'Boy', 'Girl', 'Homeless', 'LargeWoman',
       'LargeMan', 'Criminal', 'MaleExecutive', 'FemaleExecutive',
       'FemaleAthlete', 'MaleAthlete', 'FemaleDoctor', 'MaleDoctor', 'Dog',
       'Cat'],
      dtype='object')

In [5]:
# ## For viewing the possible values for each feature
for feature in responses.columns:
    print(feature)
    print(responses[feature].unique())
    print(responses[feature].unique().shape)

ResponseID
['HKQEHx4aZqESmBY8S' '3onZQNgPLe2vsyn9f' 'aLGfC379nQvMMGKnR' ...
 'KhkFWJfcN9mmMqcus' 'uD4GEBYQZbi4mbG34' 'DMSbecfNPBe7oKmaJ']
(99913,)
ExtendedSessionID
['-856676726_1470502651' '794211492_8632120756420629.0'
 '-1755150261_9197295003499718.0' ... '-1697102036_2322956926.0'
 '653309876_8543287825499266.0' '186431870_2178752375268680.0']
(98151,)
UserID
['1470502651' '8632120756420630' '9197295003499720' ... '7574217179790140'
 '8543287825499270' '2178752375268680']
(84098,)
ScenarioOrder
[ 5 10  6  8 11  2  9 13  4  7  1 12  3]
(13,)
Intervention
[0 1]
(2,)
PedPed
[0 1]
(2,)
Barrier
[1 0]
(2,)
CrossingSignal
[0 1 2]
(3,)
AttributeLevel
['Pets' 'Low' 'Fat' 'Old' 'Fit' 'Young' 'Female' 'Less' 'Rand' 'More'
 'Hoomans' 'Male' 'High']
(13,)
ScenarioTypeStrict
['Species' 'Social Status' 'Fitness' 'Age' 'Gender' 'Utilitarian' 'Random']
(7,)
ScenarioType
['Species' 'Social Status' 'Fitness' 'Age' 'Gender' 'Utilitarian' 'Random']
(7,)
DefaultChoice
['Hoomans' 'High' 'Fit' 'Young' 'Ma

In [10]:
# ## What does a single user's session/response look like?
responses_random = responses[responses['ScenarioTypeStrict'] == 'Random']
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    ## see a random user's session
#     display(responses_random[responses_random['ExtendedSessionID'] == responses_random['ExtendedSessionID'].sample().values[0]])
    
    ## see the user with nth most responses
    n = 100
    display(responses_random[responses_random['ResponseID'] == responses_random.groupby(by='ResponseID').size().sort_values(ascending=False).index[n]]) 

Unnamed: 0,ResponseID,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,Barrier,CrossingSignal,AttributeLevel,ScenarioTypeStrict,ScenarioType,DefaultChoice,NonDefaultChoice,DefaultChoiceIsOmission,NumberOfCharacters,DiffNumberOFCharacters,Saved,Template,DescriptionShown,LeftHand,UserCountry3,Man,Woman,Pregnant,Stroller,OldMan,OldWoman,Boy,Girl,Homeless,LargeWoman,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat
69559,Mit9jfwRXWaLPy84C,-1467026630_805057287878687.0,805057287878687,6,0,0,0,2,Rand,Random,Random,,,,3,2,1,Desktop,1,1,AUS,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0


In [7]:
# ## group alternatives pairwise!
# # take only responses where both alternatives are present in the random sample - for a full sample, this step isn't necessary
responses_grouped = responses.groupby(by='ResponseID').filter(lambda g: g.shape[0] < 2)
responses_grouped.shape

(99826, 41)

## Data Exploration

In [9]:
responses.describe()

Unnamed: 0,ScenarioOrder,Intervention,PedPed,Barrier,CrossingSignal,NumberOfCharacters,DiffNumberOFCharacters,Saved,Man,Woman,...,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,6.72484,0.49974,0.4479,0.27544,0.60401,3.08786,0.5709,0.50243,0.32676,0.32446,...,0.16017,0.06059,0.10816,0.10764,0.19283,0.19278,0.09581,0.0967,0.17271,0.17298
std,3.773957,0.500002,0.497281,0.446738,0.813799,1.459857,1.150846,0.499997,0.611565,0.610958,...,0.447703,0.25919,0.352679,0.349993,0.525594,0.525394,0.331529,0.333332,0.565159,0.565686
min,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,7.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,10.0,1.0,1.0,1.0,1.0,4.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,13.0,1.0,1.0,1.0,2.0,5.0,4.0,1.0,5.0,5.0,...,5.0,4.0,4.0,4.0,5.0,5.0,4.0,5.0,5.0,5.0


## Snorkel Labeling
Using https://www.snorkel.org/use-cases/01-spam-tutorial

In [4]:
# query random sample of responses grouped by ResponseID; only take responses for which both instances are present
query = """
    --SELECT * FROM (
        SELECT * FROM sharedresponses
            WHERE ScenarioTypeStrict LIKE 'Random'
        ORDER BY RANDOM()
        LIMIT 100000
    --)
    --GROUP BY ResponseID, ExtendedSessionID, UserID
    --HAVING COUNT(ResponseID) > 1
"""
df = pd.read_sql(query, con=engine).groupby(by='ResponseID').filter(lambda g: g.shape[0] > 1).sort_values('ResponseID')
df

Unnamed: 0,ResponseID,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,Barrier,CrossingSignal,AttributeLevel,ScenarioTypeStrict,...,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat
96129,24EHhs2PQzQSTnC5C,278916114_7676518387914383.0,7676518387914380,9,0,0,1,0,Rand,Random,...,0,0,0,1,0,0,0,0,0,0
52695,24EHhs2PQzQSTnC5C,278916114_7676518387914383.0,7676518387914380,9,1,0,0,1,Rand,Random,...,0,1,0,1,0,0,0,0,0,0
87780,24JyGibrLNCNH6oJ9,-69112924_4041911432.0,4041911432,8,0,1,0,2,Rand,Random,...,1,0,0,0,0,1,0,0,0,1
21446,24JyGibrLNCNH6oJ9,-69112924_4041911432.0,4041911432,8,1,1,0,1,Rand,Random,...,1,0,0,0,1,0,0,0,0,0
43534,27B7WuixvASdoZNeS,-1595907107_9289891760278082.0,9289891760278080,3,0,0,0,2,Rand,Random,...,0,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16845,zrujvrc9NztzmM7tn,-92475683_8866245258428904.0,8866245258428900,3,0,0,1,0,Rand,Random,...,0,0,0,0,0,1,0,0,0,0
87684,zx4r7WTKPC7ZchE9G,-208611158_3060625628.0,3060625628,4,1,0,0,1,Rand,Random,...,0,0,0,0,0,0,0,0,0,0
52535,zx4r7WTKPC7ZchE9G,-208611158_3060625628.0,3060625628,4,0,0,1,0,Rand,Random,...,0,0,0,0,0,0,0,0,0,0
99327,zyCoqekKFMy84xPdz,-1952339671_1059207222,1059207222,7,1,0,1,0,Less,Random,...,0,0,0,0,0,0,0,0,0,1


Need to have both alternatives in the same tuple, marking them by whether or not they are the intervention - specifically the factors:
> 'NumberOfCharacters', 'DiffNumberOfCharacters', 'Saved', 'Template', 'DescriptionShown',
'LeftHand', 'UserCountry3', 'Man', 'Woman', 'Pregnant', 'Stroller',
'OldMan', 'OldWoman', 'Boy', 'Girl', 'Homeless', 'LargeWoman',
'LargeMan', 'Criminal', 'MaleExecutive', 'FemaleExecutive',
'FemaleAthlete', 'MaleAthlete', 'FemaleDoctor', 'MaleDoctor', 'Dog',
'Cat'

In [5]:
df[["ResponseID","Intervention","Saved"]]

Unnamed: 0,ResponseID,Intervention,Saved
96129,24EHhs2PQzQSTnC5C,0,1
52695,24EHhs2PQzQSTnC5C,1,0
87780,24JyGibrLNCNH6oJ9,0,0
21446,24JyGibrLNCNH6oJ9,1,1
43534,27B7WuixvASdoZNeS,0,1
...,...,...,...
16845,zrujvrc9NztzmM7tn,0,0
87684,zx4r7WTKPC7ZchE9G,1,0
52535,zx4r7WTKPC7ZchE9G,0,1
99327,zyCoqekKFMy84xPdz,1,1


In [15]:
scenario_fields = [
    'ResponseID', 'Barrier', 'NumberOfCharacters', 'DiffNumberOFCharacters', 'Saved', 'DescriptionShown', 'LeftHand', 'Man', 'Woman', 'Pregnant', 'Stroller', \
    'OldMan', 'OldWoman', 'Boy', 'Girl', 'Homeless', 'LargeWoman', 'LargeMan', 'Criminal', 'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete', \
    'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat'
]
intervention = df[df['Intervention'] == 1][scenario_fields].set_index('ResponseID')
no_intervention = df[df['Intervention'] == 0][scenario_fields].set_index('ResponseID')
df_joined = intervention.join(no_intervention, lsuffix='_int', rsuffix='_noint', how='inner')
df_joined = df[[col for col in df.columns if col not in scenario_fields or col == 'ResponseID']].set_index('ResponseID').join(df_joined)
df_joined['Intervened'] = (df_joined['Saved_int'] == 1).astype(int)
df_joined = df_joined.drop(axis='columns', labels=['Saved_{}'.format(s) for s in ['int', 'noint']])
df_joined

Unnamed: 0_level_0,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,CrossingSignal,AttributeLevel,ScenarioTypeStrict,ScenarioType,DefaultChoice,...,Criminal_noint,MaleExecutive_noint,FemaleExecutive_noint,FemaleAthlete_noint,MaleAthlete_noint,FemaleDoctor_noint,MaleDoctor_noint,Dog_noint,Cat_noint,Intervened
ResponseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24EHhs2PQzQSTnC5C,278916114_7676518387914383.0,7676518387914380,9,0,0,0,Rand,Random,Random,,...,0,0,1,0,0,0,0,0,0,0
24EHhs2PQzQSTnC5C,278916114_7676518387914383.0,7676518387914380,9,1,0,1,Rand,Random,Random,,...,0,0,1,0,0,0,0,0,0,0
24JyGibrLNCNH6oJ9,-69112924_4041911432.0,4041911432,8,0,1,2,Rand,Random,Random,,...,0,0,0,0,1,0,0,0,1,1
24JyGibrLNCNH6oJ9,-69112924_4041911432.0,4041911432,8,1,1,1,Rand,Random,Random,,...,0,0,0,0,1,0,0,0,1,1
27B7WuixvASdoZNeS,-1595907107_9289891760278082.0,9289891760278080,3,0,0,2,Rand,Random,Random,,...,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zrujvrc9NztzmM7tn,-92475683_8866245258428904.0,8866245258428900,3,0,0,0,Rand,Random,Random,,...,0,0,0,0,1,0,0,0,0,1
zx4r7WTKPC7ZchE9G,-208611158_3060625628.0,3060625628,4,1,0,1,Rand,Random,Random,,...,0,0,0,0,0,0,0,0,0,0
zx4r7WTKPC7ZchE9G,-208611158_3060625628.0,3060625628,4,0,0,0,Rand,Random,Random,,...,0,0,0,0,0,0,0,0,0,0
zyCoqekKFMy84xPdz,-1952339671_1059207222,1059207222,7,1,0,0,Less,Random,Utilitarian,More,...,0,0,0,0,0,0,1,0,1,1


A standard train test split for testing:

In [16]:
from sklearn.model_selection import train_test_split

X = df_joined.drop(labels=["Intervened"], axis='columns', inplace=False)
y = df_joined["Intervened"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)
X_test, X_dev, y_test, y_dev = train_test_split(X_test, y_test, test_size=0.2, random_state=1)

Some sample labeling functions:

In [17]:
sys.path.insert(1, '../hmm-app')
import heuristics
from snorkel.labeling import PandasLFApplier

lfs = [heuristics.lf_doctors, heuristics.lf_utilitarian]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=X_train)
L_dev = applier.apply(df=X_dev)
L_train

  from pandas import Panel
100%|██████████| 1112/1112 [00:00<00:00, 7948.43it/s]
100%|██████████| 70/70 [00:00<00:00, 6853.44it/s]


array([[-1,  0],
       [ 0,  0],
       [-1,  1],
       ...,
       [-1,  0],
       [ 1,  1],
       [-1,  0]])

In [18]:
from snorkel.labeling import LFAnalysis

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_doctors,0,"[0, 1]",0.367806,0.286871,0.100719
lf_utilitarian,1,"[0, 1]",0.801259,0.286871,0.100719


In [19]:
LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=y_dev.values)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
lf_doctors,0,"[0, 1]",0.314286,0.257143,0.1,10,12,0.454545
lf_utilitarian,1,"[0, 1]",0.828571,0.257143,0.1,18,40,0.310345


In [20]:
from snorkel.analysis import get_label_buckets

# misclassified - labeled intervention, actually not
buckets = get_label_buckets(y_dev, L_dev[:, 1])
X_dev.iloc[buckets[0,1]]

Unnamed: 0_level_0,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,CrossingSignal,AttributeLevel,ScenarioTypeStrict,ScenarioType,DefaultChoice,...,LargeMan_noint,Criminal_noint,MaleExecutive_noint,FemaleExecutive_noint,FemaleAthlete_noint,MaleAthlete_noint,FemaleDoctor_noint,MaleDoctor_noint,Dog_noint,Cat_noint
ResponseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5Gdx39dHpbQQS9daS,346078433_1729138077,1729138077,4,0,0,1,Rand,Random,Random,,...,0,0,0,0,0,1,0,0,1,0
wpRqwfJuwQPvS2feY,-799996382_2255699979599972.0,2255699979599970,3,0,0,1,Rand,Random,Random,,...,0,0,0,1,0,0,0,0,0,1
bsE69LbQZnoBumSkJ,-491178125_597475253,597475253,4,1,0,0,Rand,Random,Random,,...,0,0,0,0,0,0,0,1,0,0
tZwYK6gt2dZ9XAzWe,900284825_1356727652,1356727652,4,0,0,0,Rand,Random,Random,,...,0,0,0,0,0,0,0,0,2,0
2qe3ksrAat2QADvGM,-767981200_8634147426915814.0,8634147426915820,1,1,0,0,Rand,Random,Random,,...,0,0,0,0,0,0,0,0,0,0
DhQiCCam4KH96mnmn,-782682737_3458713931.0,3458713931,9,0,0,0,Rand,Random,Random,,...,0,0,0,0,0,0,0,0,0,0
gTKvGDqDqLK5iJgXN,-805841631_7900968578207694.0,7900968578207700,13,1,0,0,Rand,Random,Random,,...,0,1,0,0,0,0,0,1,0,0
55T9CN25QkJDtmcp6,1165001890_1296421467,1296421467,11,1,0,0,Rand,Random,Random,,...,0,0,0,0,0,0,1,0,0,0
KdqLujDv8Ru6HCMyr,-2054552385_5830778472591192.0,5830778472591190,8,1,0,2,Rand,Random,Random,,...,0,0,0,2,0,0,0,0,0,0
XBexQYg2PB6cwTAFL,680891227_9347577392361782.0,9347577392361780,10,1,0,0,Rand,Random,Random,,...,0,0,0,0,0,0,0,0,0,0


In [21]:
## TODO: check this - probably buggy
# false positives? (intervene when not supposed to?)
# this is where doctors voted intervene
X_train.iloc[L_train[:, 1] == 1].sample(10, random_state=1)
# this is where doctors abstained, but utilitarian voted
X_train.iloc[buckets[(0, -1)]].sample(10, random_state=1)

ValueError: Cannot take a larger sample than population when 'replace=False'

In [22]:
X_train.columns

Index(['ExtendedSessionID', 'UserID', 'ScenarioOrder', 'Intervention',
       'PedPed', 'CrossingSignal', 'AttributeLevel', 'ScenarioTypeStrict',
       'ScenarioType', 'DefaultChoice', 'NonDefaultChoice',
       'DefaultChoiceIsOmission', 'Template', 'UserCountry3', 'Barrier_int',
       'NumberOfCharacters_int', 'DiffNumberOFCharacters_int',
       'DescriptionShown_int', 'LeftHand_int', 'Man_int', 'Woman_int',
       'Pregnant_int', 'Stroller_int', 'OldMan_int', 'OldWoman_int', 'Boy_int',
       'Girl_int', 'Homeless_int', 'LargeWoman_int', 'LargeMan_int',
       'Criminal_int', 'MaleExecutive_int', 'FemaleExecutive_int',
       'FemaleAthlete_int', 'MaleAthlete_int', 'FemaleDoctor_int',
       'MaleDoctor_int', 'Dog_int', 'Cat_int', 'Barrier_noint',
       'NumberOfCharacters_noint', 'DiffNumberOFCharacters_noint',
       'DescriptionShown_noint', 'LeftHand_noint', 'Man_noint', 'Woman_noint',
       'Pregnant_noint', 'Stroller_noint', 'OldMan_noint', 'OldWoman_noint',
       'Boy_n

In [29]:
X_train[["Barrier_int", "Barrier_noint"]]

Unnamed: 0_level_0,Barrier_int,Barrier_noint
ResponseID,Unnamed: 1_level_1,Unnamed: 2_level_1
y2gKhi2zM3T5bFEqQ,0,1
Y93YFpjJ8sWakkWkL,1,0
DM36v4Zqc7cse2r4Q,0,1
Y9EdrtNokH2cJxPi5,0,1
52MkYefvjjkQMNLRS,0,0
...,...,...
BBrK4uRkddrzMXTwT,0,1
9sF5PsGKgeygH2GsC,0,0
eAag4Kz4hCvdGoesZ,1,0
R35yznt2233mFjB54,0,1
