In [1]:
import pandas as pd
import numpy as np
import sys
import random
from sqlalchemy import create_engine

from snorkel.labeling import labeling_function

## Load Data

### DataFrame

In [2]:
## load the data by choosing a random 10% of the data
## is the data ordered? am I missing a large chunk due to my partial loading?
# p = 0.01
# responses = pd.read_csv('../data/moralmachine/SharedResponses.csv', skiprows=lambda i: i>0 and random.random() > p)

### SQL Engine

In [3]:
# access SQL DB with data
engine = create_engine("sqlite:///../data/moralmachine.db", echo=False)

In [4]:
# demo
# responses = pd.read_sql("SELECT * FROM sharedresponses ORDER BY RANDOM() LIMIT 100000", con=engine)
# responses.columns

In [5]:
# ## For viewing the possible values for each feature
# for feature in responses.columns:
#     print(feature)
#     print(responses[feature].unique())
#     print(responses[feature].unique().shape)

In [6]:
# ## What does a single user's session/response look like?
# responses_random = responses[responses['ScenarioTypeStrict'] == 'Random']
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     ## see a random user's session
# #     display(responses_random[responses_random['ExtendedSessionID'] == responses_random['ExtendedSessionID'].sample().values[0]])
    
#     ## see the user with nth most responses
#     n = 100
#     display(responses_random[responses_random['ResponseID'] == responses_random.groupby(by='ResponseID').size().sort_values(ascending=False).index[n]]) 

In [7]:
# ## group alternatives pairwise!
# # take only responses where both alternatives are present in the random sample - for a full sample, this step isn't necessary
# responses_grouped = responses.groupby(by='ResponseID').filter(lambda g: g.shape[0] < 2)
# responses_grouped.shape

## Data Exploration

In [8]:
# responses.describe()

## Snorkel Labeling
Using https://www.snorkel.org/use-cases/01-spam-tutorial

In [9]:
# query random sample of responses grouped by ResponseID; only take responses for which both instances are present
query = """
    --SELECT * FROM (
        SELECT * FROM sharedresponses
            WHERE ScenarioTypeStrict LIKE 'Random'
        ORDER BY RANDOM()
        LIMIT 100000
    --)
    --GROUP BY ResponseID, ExtendedSessionID, UserID
    --HAVING COUNT(ResponseID) > 1
"""
df = pd.read_sql(query, con=engine).groupby(by='ResponseID').filter(lambda g: g.shape[0] > 1).sort_values('ResponseID')
df

Unnamed: 0,ResponseID,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,Barrier,CrossingSignal,AttributeLevel,ScenarioTypeStrict,...,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat
93142,228spSh7JKQDk4qHe,958238556_3661797005.0,3661797005,8,1,1,0,0,Rand,Random,...,0,0,1,0,0,0,0,1,0,0
68609,228spSh7JKQDk4qHe,958238556_3661797005.0,3661797005,8,0,1,0,0,Rand,Random,...,1,0,1,0,0,0,1,0,1,0
3731,26Z7TG3uQnTbzCbJQ,650348675_1751123414774660.0,1751123414774660,6,0,0,0,2,Rand,Random,...,0,0,0,0,0,0,0,0,1,0
26441,26Z7TG3uQnTbzCbJQ,650348675_1751123414774660.0,1751123414774660,6,1,0,1,0,Rand,Random,...,0,0,0,0,0,0,0,0,0,0
28635,28wbiMJBB3dA795EW,1766930390_7670675362614470.0,7670675362614470,1,0,0,1,0,Rand,Random,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80957,zsNG4bsonzxBQq4YK,-1810355574_1611943219,1611943219,10,0,0,0,1,Rand,Random,...,0,1,0,0,0,0,0,1,0,0
38500,zzBqnxsCjEJYGjypj,-575581380_6222850737263892.0,6222850737263890,8,1,0,1,0,Rand,Random,...,0,0,1,0,1,0,0,0,0,0
66175,zzBqnxsCjEJYGjypj,-575581380_6222850737263892.0,6222850737263890,8,0,0,0,0,Rand,Random,...,0,0,0,1,1,0,0,0,1,0
63220,zzLheRMmfqEQzLfjL,2119264851_4982232198937873.0,4982232198937870,1,1,0,0,2,Rand,Random,...,1,0,0,0,0,0,0,0,0,1


Need to have both alternatives in the same tuple, marking them by whether or not they are the intervention - specifically the factors:
> 'NumberOfCharacters', 'DiffNumberOfCharacters', 'Saved', 'Template', 'DescriptionShown',
'LeftHand', 'UserCountry3', 'Man', 'Woman', 'Pregnant', 'Stroller',
'OldMan', 'OldWoman', 'Boy', 'Girl', 'Homeless', 'LargeWoman',
'LargeMan', 'Criminal', 'MaleExecutive', 'FemaleExecutive',
'FemaleAthlete', 'MaleAthlete', 'FemaleDoctor', 'MaleDoctor', 'Dog',
'Cat'

In [10]:
df[["ResponseID","Intervention","Saved"]]

Unnamed: 0,ResponseID,Intervention,Saved
93142,228spSh7JKQDk4qHe,1,1
68609,228spSh7JKQDk4qHe,0,0
3731,26Z7TG3uQnTbzCbJQ,0,1
26441,26Z7TG3uQnTbzCbJQ,1,0
28635,28wbiMJBB3dA795EW,0,1
...,...,...,...
80957,zsNG4bsonzxBQq4YK,0,0
38500,zzBqnxsCjEJYGjypj,1,0
66175,zzBqnxsCjEJYGjypj,0,1
63220,zzLheRMmfqEQzLfjL,1,0


In [11]:
scenario_fields = [
    'ResponseID', 'Barrier', 'NumberOfCharacters', 'DiffNumberOFCharacters', 'Saved', 'DescriptionShown', 'LeftHand', 'Man', 'Woman', 'Pregnant', 'Stroller', \
    'OldMan', 'OldWoman', 'Boy', 'Girl', 'Homeless', 'LargeWoman', 'LargeMan', 'Criminal', 'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete', \
    'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat'
]
intervention = df[df['Intervention'] == 1][scenario_fields].set_index('ResponseID')
no_intervention = df[df['Intervention'] == 0][scenario_fields].set_index('ResponseID')
df_joined = intervention.join(no_intervention, lsuffix='_int', rsuffix='_noint', how='inner')
df_joined = df[[col for col in df.columns if col not in scenario_fields or col == 'ResponseID']].set_index('ResponseID').join(df_joined)
df_joined['Intervened'] = (df_joined['Saved_int'] == 1).astype(int)
df_joined = df_joined.drop(axis='columns', labels=['Saved_{}'.format(s) for s in ['int', 'noint']])
df_joined

Unnamed: 0_level_0,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,CrossingSignal,AttributeLevel,ScenarioTypeStrict,ScenarioType,DefaultChoice,...,Criminal_noint,MaleExecutive_noint,FemaleExecutive_noint,FemaleAthlete_noint,MaleAthlete_noint,FemaleDoctor_noint,MaleDoctor_noint,Dog_noint,Cat_noint,Intervened
ResponseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
228spSh7JKQDk4qHe,958238556_3661797005.0,3661797005,8,1,1,0,Rand,Random,Random,,...,0,1,0,0,0,1,0,1,0,1
228spSh7JKQDk4qHe,958238556_3661797005.0,3661797005,8,0,1,0,Rand,Random,Random,,...,0,1,0,0,0,1,0,1,0,1
26Z7TG3uQnTbzCbJQ,650348675_1751123414774660.0,1751123414774660,6,0,0,2,Rand,Random,Random,,...,0,0,0,0,0,0,0,1,0,0
26Z7TG3uQnTbzCbJQ,650348675_1751123414774660.0,1751123414774660,6,1,0,0,Rand,Random,Random,,...,0,0,0,0,0,0,0,1,0,0
28wbiMJBB3dA795EW,1766930390_7670675362614470.0,7670675362614470,1,0,0,0,Rand,Random,Random,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zsNG4bsonzxBQq4YK,-1810355574_1611943219,1611943219,10,0,0,1,Rand,Random,Random,,...,1,0,0,0,0,0,1,0,0,1
zzBqnxsCjEJYGjypj,-575581380_6222850737263892.0,6222850737263890,8,1,0,0,Rand,Random,Random,,...,0,0,1,1,0,0,0,1,0,0
zzBqnxsCjEJYGjypj,-575581380_6222850737263892.0,6222850737263890,8,0,0,0,Rand,Random,Random,,...,0,0,1,1,0,0,0,1,0,0
zzLheRMmfqEQzLfjL,2119264851_4982232198937873.0,4982232198937870,1,1,0,2,Rand,Random,Random,,...,0,0,0,1,0,1,0,0,0,0


A standard train test split for testing:

In [12]:
from sklearn.model_selection import train_test_split

X = df_joined.drop(labels=["Intervened"], axis='columns', inplace=False)
y = df_joined["Intervened"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)
X_test, X_dev, y_test, y_dev = train_test_split(X_test, y_test, test_size=0.2, random_state=1)

Some sample labeling functions:

In [21]:
sys.path.insert(1, '../heuristics')
import labeling_functions
import utils
import importlib
importlib.reload(labeling_functions)
importlib.reload(utils)

from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis


lfs = [
    labeling_functions.doctors,
    labeling_functions.utilitarian,
    labeling_functions.utilitarian_anthro,
    labeling_functions.inaction,
    labeling_functions.pedestrians,
    labeling_functions.females,
    labeling_functions.fitness,
    labeling_functions.status,
    labeling_functions.legal,
    labeling_functions.illegal,
    labeling_functions.youth,
    labeling_functions.criminals,
    labeling_functions.homeless,
    labeling_functions.pets,
    labeling_functions.spare_strollers,
    labeling_functions.spare_girl,
    labeling_functions.spare_boy,
    labeling_functions.spare_pregnant
]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=X_train)
L_dev = applier.apply(df=X_dev)
L_valid = applier.apply(df=X_val)
LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=y_dev.values).sort_values("Correct", ascending=False)

  from pandas import Panel
100%|██████████| 1012/1012 [00:01<00:00, 984.17it/s]
100%|██████████| 64/64 [00:00<00:00, 1069.97it/s]
100%|██████████| 253/253 [00:00<00:00, 1000.85it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
utilitarian,1,"[0, 1]",0.828125,0.828125,0.765625,40,13,0.754717
utilitarian_anthro,2,"[0, 1]",0.765625,0.765625,0.71875,39,10,0.795918
inaction,3,[0],1.0,1.0,0.921875,27,37,0.421875
pedestrians,4,"[0, 1]",0.84375,0.84375,0.765625,22,32,0.407407
youth,10,"[0, 1]",0.5,0.5,0.46875,21,11,0.65625
fitness,6,"[0, 1]",0.578125,0.578125,0.546875,18,19,0.486486
females,5,"[0, 1]",0.53125,0.53125,0.46875,17,17,0.5
status,7,"[0, 1]",0.453125,0.453125,0.4375,15,14,0.517241
doctors,0,"[0, 1]",0.375,0.375,0.34375,12,12,0.5
spare_boy,16,"[0, 1]",0.15625,0.15625,0.15625,8,2,0.8


## Aggregation

Recall that there are no true labels for this problem - really, we're just measuring similarity of the heuristic labels to real voter's responses. 

**Baseline**: majority label voting.

In [23]:
from snorkel.labeling import MajorityLabelVoter

model_majority = MajorityLabelVoter()
preds_train = model_majority.predict(L=L_train)
preds_train

array([0, 1, 1, ..., 0, 1, 1])

Snorkel label model.

In [25]:
from snorkel.labeling import LabelModel

# cardinality is num classes
model_label = LabelModel(cardinality=2, verbose=True)
model_label.fit(L_train=L_train, n_epochs=500, lr=.001, log_freq=100, seed=1)

LabelModel()

In [27]:
for model in [model_majority, model_label]:
    acc = model.score(L=L_valid, Y=y_val.values, tie_break_policy="random")["accuracy"]
    print(f"{str(model)} {'Vote Accuracy:':<25} {acc*100:.1f}")

MajorityLabelVoter() Vote Accuracy:            62.1
LabelModel() Vote Accuracy:            69.2


In [37]:
from snorkel.analysis import get_label_buckets

probs_dev = model_label.predict_proba(L=L_dev)
preds_dev = probs_dev >= 0.5
buckets = get_label_buckets(y_dev.values, preds_dev[:, 1])

df_fn_dev = X_dev.iloc[buckets[(1, 0)]]
df_fn_dev["probability"] = probs_dev[buckets[(1, 0)], 1]

df_fn_dev.sample(5, random_state=3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,CrossingSignal,AttributeLevel,ScenarioTypeStrict,ScenarioType,DefaultChoice,...,Criminal_noint,MaleExecutive_noint,FemaleExecutive_noint,FemaleAthlete_noint,MaleAthlete_noint,FemaleDoctor_noint,MaleDoctor_noint,Dog_noint,Cat_noint,probability
ResponseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
egjhwPsSWAc8Hoqqr,-1192149401_2263512426.0,2263512426,7,1,1,0,Rand,Random,Random,,...,0,0,0,1,0,0,0,0,1,9.214667e-09
eGdsz6At3RT4zm7Hf,1475514958_4635321376047822.0,4635321376047820,1,0,0,0,Rand,Random,Random,,...,0,0,1,0,0,0,0,1,0,2.396009e-05
4kDXtBboa7AFyRpcc,129316629_4125547988.0,4125547988,2,0,0,2,Rand,Random,Random,,...,0,0,0,1,0,0,0,0,1,1.905956e-05
cRns874frwXE533cv,1229710027_277512319248932.0,277512319248932,13,1,0,2,Rand,Random,Random,,...,1,0,0,0,0,0,1,0,0,0.4924776
FSotvdoFntsTRvDdq,-1015461331_4067195114.0,4067195114,5,0,0,0,Rand,Random,Random,,...,0,0,0,0,0,0,1,1,0,1.242088e-13


Filter out unlabeled data points.

In [39]:
## TODO - this is probably buggy

from snorkel.labeling import filter_unlabeled_dataframe

df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=X_train, y=model_label.predict_proba(L=L_train), L=L_train
)
df_train_filtered

Unnamed: 0_level_0,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,CrossingSignal,AttributeLevel,ScenarioTypeStrict,ScenarioType,DefaultChoice,...,LargeMan_noint,Criminal_noint,MaleExecutive_noint,FemaleExecutive_noint,FemaleAthlete_noint,MaleAthlete_noint,FemaleDoctor_noint,MaleDoctor_noint,Dog_noint,Cat_noint
ResponseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
uBbzLEPvAMJSnEavg,1657388442_3175740424965196.0,3175740424965200,5,1,0,2,Rand,Random,Random,,...,0,0,0,0,0,0,0,0,0,0
E6E9vj685MJMYwCJB,324625214_4346078261217463.0,4346078261217460,2,1,0,0,Rand,Random,Random,,...,1,1,0,0,1,0,0,0,0,0
XZD3MHbn3x8hsYtWr,1475844801_2835032345293249.0,2835032345293250,10,1,0,2,Rand,Random,Random,,...,1,0,0,0,0,0,0,0,0,0
mC5BraWXpaLf2kbTG,-1654723965_4641465912102962.0,4641465912102960,9,1,0,2,Rand,Random,Random,,...,1,0,0,0,0,0,0,0,0,0
Kzk8maBGowtGTHXLb,-596949408_5433938059366248.0,5433938059366250,12,0,1,1,Rand,Random,Random,,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RZno79kHpfWXxxsQT,-785105394_8534311523911013.0,8534311523911010,6,1,0,0,Rand,Random,Random,,...,0,1,1,0,0,0,0,0,0,0
ED4sQYkJt7R9ZAGB9,1349208551_3316682535.0,3316682535,9,1,0,0,Rand,Random,Random,,...,0,1,1,0,0,1,1,0,0,0
JA9o5aDqhyXJSnJpe,1623832462_8683987851228011.0,8683987851228010,8,0,0,1,Rand,Random,Random,,...,0,0,0,1,0,0,0,1,0,0
KMGPi4anc4jkfaQQT,-2094872049_2986678616278260.0,2986678616278260,8,0,0,2,Rand,Random,Random,,...,0,0,0,0,1,0,0,0,0,0


## Classification

https://www.snorkel.org/use-cases/01-spam-tutorial#5-training-a-classifier

### Featurization

In [79]:
features = [
    "PedPed", "CrossingSignal", "Template", "UserCountry3",
    'NumberOfCharacters_int', 'DescriptionShown_int', 'LeftHand_int', 'Man_int', 'Woman_int',
    'Pregnant_int', 'Stroller_int', 'OldMan_int', 'OldWoman_int', 'Boy_int',
    'Girl_int', 'Homeless_int', 'LargeWoman_int', 'LargeMan_int',
    'Criminal_int', 'MaleExecutive_int', 'FemaleExecutive_int',
    'FemaleAthlete_int', 'MaleAthlete_int', 'FemaleDoctor_int',
    'MaleDoctor_int', 'Dog_int', 'Cat_int', 'Barrier_noint',
    'NumberOfCharacters_noint', 'DescriptionShown_noint', 'LeftHand_noint', 
    'Man_noint', 'Woman_noint', 'Pregnant_noint', 'Stroller_noint', 'OldMan_noint', 'OldWoman_noint',
    'Boy_noint', 'Girl_noint', 'Homeless_noint', 'LargeWoman_noint',
    'LargeMan_noint', 'Criminal_noint', 'MaleExecutive_noint',
    'FemaleExecutive_noint', 'FemaleAthlete_noint', 'MaleAthlete_noint',
    'FemaleDoctor_noint', 'MaleDoctor_noint', 'Dog_noint', 'Cat_noint'
]
cat_features = [
    "Template", "UserCountry3"
]
num_features = [
    "PedPed", "CrossingSignal", "NumberOfCharacters_int", "NumberOfCharacters_noint", "DescriptionShown_int", "DescriptionShown_noint",
    "LeftHand_int", "LeftHand_noint", "Man_int", "Man_noint", "Woman_int", "Woman_noint", "Pregnant_int", "Pregnant_noint",
    "Stroller_int", "Stroller_noint", "OldMan_int", "OldMan_noint", "OldWoman_int", "OldWoman_noint", "Boy_int", "Boy_noint",
    "LargeMan_noint", "LargeMan_int", "Criminal_int", "Criminal_noint", "MaleExecutive_int", "MaleExecutive_noint", "FemaleExecutive_int",
    "FemaleExecutive_noint", "Girl_int", "Girl_noint", "LargeWoman_int", "LargeWoman_noint", "FemaleAthlete_int", "FemaleAthlete_noint",
    "MaleAthlete_int", "MaleAthlete_noint", "FemaleDoctor_int", "FemaleDoctor_noint", "MaleDoctor_int", "MaleDoctor_noint", "Dog_int",
    "Dog_noint", "Cat_int", "Cat_noint", "Homeless_int", "Barrier_noint", "Homeless_noint"
]
# are there any nan?
X_train[num_features].describe()

Unnamed: 0,PedPed,CrossingSignal,NumberOfCharacters_int,NumberOfCharacters_noint,Man_int,Man_noint,Woman_int,Woman_noint,Pregnant_int,Pregnant_noint,...,FemaleDoctor_noint,MaleDoctor_int,MaleDoctor_noint,Dog_int,Dog_noint,Cat_int,Cat_noint,Homeless_int,Barrier_noint,Homeless_noint
count,1012.0,1012.0,1012.0,1012.0,1012.0,1012.0,1012.0,1012.0,1012.0,1012.0,...,1012.0,1012.0,1012.0,1012.0,1012.0,1012.0,1012.0,1012.0,1012.0,1012.0
mean,0.136364,0.79249,3.007905,3.074111,0.158103,0.137352,0.162055,0.137352,0.146245,0.136364,...,0.145257,0.12747,0.139328,0.156126,0.162055,0.172925,0.168972,0.12747,0.431818,0.161067
std,0.343344,0.839447,1.405773,1.413319,0.401165,0.355691,0.406941,0.372002,0.372597,0.36836,...,0.382153,0.348171,0.381774,0.38944,0.374011,0.418109,0.395456,0.342442,0.495574,0.391232
min,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,2.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,1.0,2.0,5.0,5.0,3.0,2.0,2.0,3.0,2.0,2.0,...,2.0,2.0,3.0,2.0,2.0,3.0,2.0,2.0,1.0,2.0


In [90]:
from snorkel.analysis import metric_score
from snorkel.utils import preds_to_probs

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

def transform_X(X):
    X = X[features]
    # convert to numeric, changing literals to NaN
    for f in num_features:
        X[f] = pd.to_numeric(X[f], errors='coerce')
    return X

X_train = transform_X(X_train)
X_dev = transform_X(X_dev)
X_val = transform_X(X_val)
X_test = transform_X(X_test)

# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('oh_enc', OneHotEncoder())
])
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ]
)

clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100))
])
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             