In [1]:
import pandas as pd
import random
from sqlalchemy import create_engine

from snorkel.labeling import labeling_function

## Load Data

### DataFrame

In [2]:
## load the data by keeping each row with probability p (p = 0.01 keeps roughly 1% of the rows, not 10%)
## TODO: is the data ordered? would a partial (non-random) load miss a large chunk of it?
# p = 0.01
# responses = pd.read_csv('../data/moralmachine/SharedResponses.csv', skiprows=lambda i: i>0 and random.random() > p)

### SQL Engine

In [3]:
# Create the SQLAlchemy engine used by every pd.read_sql call in this notebook.
# The URL points at the SQLite file ../data/moralmachine.db (a relative path);
# echo=False keeps SQLAlchemy from logging the SQL statements it issues.
engine = create_engine("sqlite:///../data/moralmachine.db", echo=False)

In [4]:
# Pull a random sample of at most 1,000,000 rows from the `sharedresponses` table into a DataFrame.
# ORDER BY RANDOM() shuffles the rows inside the database, so the sample differs on every execution.
# NOTE(review): no seed is involved, so downstream outputs are not reproducible run-to-run — confirm this is acceptable.
responses = pd.read_sql("SELECT * FROM sharedresponses ORDER BY RANDOM() LIMIT 1000000", con=engine)
# last expression of the cell: show the column names of the sampled frame
responses.columns

Index(['ResponseID', 'ExtendedSessionID', 'UserID', 'ScenarioOrder',
       'Intervention', 'PedPed', 'Barrier', 'CrossingSignal', 'AttributeLevel',
       'ScenarioTypeStrict', 'ScenarioType', 'DefaultChoice',
       'NonDefaultChoice', 'DefaultChoiceIsOmission', 'NumberOfCharacters',
       'DiffNumberOFCharacters', 'Saved', 'Template', 'DescriptionShown',
       'LeftHand', 'UserCountry3', 'Man', 'Woman', 'Pregnant', 'Stroller',
       'OldMan', 'OldWoman', 'Boy', 'Girl', 'Homeless', 'LargeWoman',
       'LargeMan', 'Criminal', 'MaleExecutive', 'FemaleExecutive',
       'FemaleAthlete', 'MaleAthlete', 'FemaleDoctor', 'MaleDoctor', 'Dog',
       'Cat'],
      dtype='object')

In [5]:
# ## For viewing the possible values for each feature
# For every column of `responses`, print its name, its distinct values, and the
# shape of the distinct-value array (i.e. how many distinct values it has).
for feature in responses.columns:
    print(feature)
    # Compute the distinct values once and reuse them for both prints; calling
    # .unique() twice per column doubles the work on a frame of this size.
    unique_values = responses[feature].unique()
    print(unique_values)
    print(unique_values.shape)

ResponseID
['EW8bpjLJf2hR2xQnT' '2uh3HfyfP2QfxgbYc' 'mZpDfuPpHcjivbwTh' ...
 'aNP2shDqMxZxNB7QS' 'p3sAhqQrDKkMKmZz9' '9NjTMqqDYmd2Jp2BT']
(993114,)
ExtendedSessionID
['421947033_2730632126.0' '573476814_5485599329590620.0'
 '1811985714_3400382452.0' ... '-274831822_2116635015694961.0'
 '1481330532_4138729598.0' '1070856770_1958463420529225.0']
(859342,)
UserID
['2730632126' '5485599329590620' '3400382452' ... '5705685083702870'
 '2116635015694960' '1958463420529220']
(639729,)
ScenarioOrder
[ 3  2 13  4  1  6  5  8 11 12  7  9 10]
(13,)
Intervention
[0 1]
(2,)
PedPed
[1 0]
(2,)
Barrier
[0 1]
(2,)
CrossingSignal
[1 0 2]
(3,)
AttributeLevel
['More' 'Hoomans' 'Rand' 'Pets' 'Fat' 'Young' 'Old' 'Fit' 'Low' 'Female'
 'Less' 'Male' 'High']
(13,)
ScenarioTypeStrict
['Utilitarian' 'Species' 'Fitness' 'Age' 'Social Status' 'Gender' 'Random']
(7,)
ScenarioType
['Utilitarian' 'Species' 'Random' 'Fitness' 'Age' 'Social Status' 'Gender']
(7,)
DefaultChoice
['More' 'Hoomans' '' 'Fit' 'Young' 'High' '

In [8]:
# ## What does a single user's session/response look like?
# Restrict the sample to rows whose strict scenario type is 'Random'.
responses_random = responses[responses['ScenarioTypeStrict'] == 'Random']

## rank the ResponseIDs by how many rows each has in the sample (most rows first)
## and pick the ResponseID at position n of that ranking
n = 100
rows_per_response = responses_random.groupby(by='ResponseID').size().sort_values(ascending=False)
nth_response_id = rows_per_response.index[n]

# lift pandas' row/column display limits so the selected rows are shown in full
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    ## alternative view: all rows of one randomly drawn session
#     display(responses_random[responses_random['ExtendedSessionID'] == responses_random['ExtendedSessionID'].sample().values[0]])

    ## every row that belongs to the selected ResponseID
    display(responses_random[responses_random['ResponseID'] == nth_response_id])

Unnamed: 0,ResponseID,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,Barrier,CrossingSignal,AttributeLevel,ScenarioTypeStrict,ScenarioType,DefaultChoice,NonDefaultChoice,DefaultChoiceIsOmission,NumberOfCharacters,DiffNumberOFCharacters,Saved,Template,DescriptionShown,LeftHand,UserCountry3,Man,Woman,Pregnant,Stroller,OldMan,OldWoman,Boy,Girl,Homeless,LargeWoman,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat
91964,iDskDYWyipPafkkku,-1656963704_479791456395692.0,479791456395692,8,1,1,0,2,Rand,Random,Random,,,,4,3,1,Desktop,0,0,CAN,0,0,2,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
471498,iDskDYWyipPafkkku,-1656963704_479791456395692.0,479791456395692,8,0,1,0,1,Rand,Random,Random,,,,1,3,0,Desktop,0,1,CAN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [7]:
# ## group alternatives pairwise!
# # take only responses where both alternatives are present in the random sample - for a full sample, this step isn't necessary
# groupby(...).filter keeps the rows of every group for which the predicate returns True,
# so the predicate has to accept groups with at least two rows (both alternatives sampled).
# A `< 2` test would do the opposite and keep only the incomplete responses.
responses_grouped = responses.groupby(by='ResponseID').filter(lambda g: len(g) >= 2)
# last expression of the cell: (rows, columns) of the paired subset
responses_grouped.shape

## Data Exploration

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of `df`.
# NOTE(review): in the visible part of the notebook `df` is only assigned further down, in the
# "Snorkel Labeling" section, so this cell appears to rely on out-of-order execution — TODO confirm
# and move it below that assignment. The saved output (count = 500) also does not match that
# query's LIMIT 100000, which suggests it was produced from a different `df`.
df.describe()

Unnamed: 0,ScenarioOrder,Intervention,PedPed,Barrier,CrossingSignal,NumberOfCharacters,DiffNumberOFCharacters,Saved,Man,Woman,...,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,...,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,6.748,0.0,0.416,0.214,0.726,3.11,0.54,0.476,0.33,0.352,...,0.166,0.07,0.074,0.11,0.21,0.208,0.098,0.094,0.172,0.172
std,3.839007,0.0,0.493387,0.410538,0.876619,1.449864,1.076437,0.499924,0.56368,0.636322,...,0.4414,0.292011,0.276907,0.331844,0.538963,0.566903,0.329565,0.359754,0.592552,0.589161
min,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,6.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,10.0,0.0,1.0,0.0,2.0,4.25,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,13.0,0.0,1.0,1.0,2.0,5.0,4.0,1.0,2.0,3.0,...,2.0,2.0,2.0,2.0,3.0,5.0,2.0,3.0,5.0,5.0


## Snorkel Labeling

In [13]:
# Draw a random sample of at most 100,000 rows whose ScenarioTypeStrict matches 'Random'.
# The GROUP BY / HAVING lines inside the query are SQL comments (prefixed with `--`), so the
# rows are NOT grouped by ResponseID: a response can end up in `df` with only one of its rows.
# ORDER BY RANDOM() means the sample differs on every execution.
query = """
    SELECT * FROM sharedresponses
        WHERE ScenarioTypeStrict LIKE 'Random'
    --GROUP BY ResponseID
        --HAVING COUNT(ResponseID) > 1
    ORDER BY RANDOM()
    LIMIT 100000
"""
df = pd.read_sql(query, con=engine)

In [14]:
# Show every row of the ResponseID that has the n-th most rows in `df`
# (n = 0 selects the ResponseID with the most rows).
# A groupby(...).filter(...) statement whose result was neither assigned nor displayed
# is intentionally not part of this cell: it only cost time and had no effect.
n = 0
response_sizes = df.groupby(by='ResponseID').size().sort_values(ascending=False)
display(df[df['ResponseID'] == response_sizes.index[n]])

Unnamed: 0,ResponseID,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,Barrier,CrossingSignal,AttributeLevel,ScenarioTypeStrict,...,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat
10131,zzzyn5cTnLLFf9hXz,-2129308287_3976863958077909.0,3976863958077910,3,0,0,1,0,More,Random,...,0,0,0,0,0,0,0,1,0,0
