In [None]:
import sys
sys.path.insert(1, '..')
import importlib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sqlalchemy import create_engine

from snorkel.labeling import labeling_function

## Load Data

### SQL Engine

In [None]:
# Open the local SQLite database file that holds the `sharedresponses` table
# (path is relative to this notebook); echo=False keeps SQL statements out of the output.
engine = create_engine("sqlite:///../data/moralmachine.db", echo=False)

## Data Exploration

### Sample Data

In [None]:
# Load the responses of complete sessions only: a session qualifies when it has
# 13 distinct ResponseIDs and 26 rows in total (presumably both rows of every
# response pair - confirm against the data dictionary).
# NOTE: LIMIT without an ORDER BY yields an arbitrary subset of sessions, not a
# random one; with the default sample_size it is effectively "no limit".
sample_size = 100000000
# Integer counts are compared with "=" (the original used LIKE, which compares
# the text form of the numbers and is rejected by stricter SQL dialects).
# The limit is formatted with {0:d}, so only an integer can be interpolated.
query = """
    SELECT gc.session_count, sr.* FROM sharedresponses sr
    INNER JOIN (
        SELECT ExtendedSessionID, COUNT(DISTINCT ResponseID) AS session_count FROM sharedresponses
        WHERE UserID <> ''

        /* Get only full sessions. */
        GROUP BY ExtendedSessionID
            HAVING COUNT(DISTINCT ResponseID) = 13
                AND COUNT(ResponseID) = 26

        LIMIT {0:d}
    ) gc
        ON gc.ExtendedSessionID = sr.ExtendedSessionID
    ORDER BY sr.UserID
""".format(sample_size)
df = pd.read_sql(query, con=engine)
df

In [None]:
# summary statistics of the loaded responses (row count per column, mean, spread, quartiles)
df.describe()

In [None]:
# number of distinct users (UserID values) in the sample
df['UserID'].nunique()

For comparison with Noothigattu et al.: how many pairwise comparisons did each voter answer, on average, in this sample dataset?

In [None]:
# average number of pairwise comparisons per voter
# = mean number of distinct ResponseIDs per UserID
df.groupby('UserID')['ResponseID'].nunique().mean()

In [None]:
# relative frequency of each scenario type
scenario_counts = df['ScenarioType'].value_counts()
scenario_share = scenario_counts / scenario_counts.sum()
pd.DataFrame(scenario_share).to_csv("../figures/data/freq_scenario.csv")

# relative frequency of each character type, summed over all responses
characters = ['Man', 'Woman', 'Pregnant', 'Stroller', 'OldMan', 'OldWoman', 'Boy', 'Girl', 'Homeless', 'LargeWoman', 'LargeMan', 'Criminal', 'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete', 'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat']
character_totals = df[characters].sum()
character_share = character_totals / character_totals.sum()
pd.DataFrame(character_share, columns=['frequency']).to_csv('../figures/data/freq_character.csv')

In [None]:
# share of response rows per user country (UserCountry3), written out for plotting
freqs = df['UserCountry3'].value_counts()
pd.DataFrame(freqs/freqs.sum()).to_csv('../figures/data/freq_countries.csv')

In [None]:
# peek at the first 26 rows (the row count of one full session), ordered by ScenarioOrder
df.iloc[:26,:].sort_values('ScenarioOrder')

Out of the variables above, these are the variables that vary within response pairs:
> 'NumberOfCharacters', 'DiffNumberOfCharacters', 'Saved', 'Template', 'DescriptionShown',
'LeftHand', 'UserCountry3', 'Man', 'Woman', 'Pregnant', 'Stroller',
'OldMan', 'OldWoman', 'Boy', 'Girl', 'Homeless', 'LargeWoman',
'LargeMan', 'Criminal', 'MaleExecutive', 'FemaleExecutive',
'FemaleAthlete', 'MaleAthlete', 'FemaleDoctor', 'MaleDoctor', 'Dog',
'Cat'

N.B. in each pair of responses, only one is the result of an intervention, and only one is saved. 

In [None]:
# rows sharing a ResponseID form one response pair; compare their Intervention and Saved flags
df[["ResponseID","Intervention","Saved"]].head()

### Preprocessing

First, let's convert to abstract features for ease of labeling.

In [None]:
import hmm.labeling.utils
importlib.reload(hmm.labeling.utils)
from hmm.labeling.utils import transform_abstract

# Key every row by its positional id plus the three identifier columns;
# verify_integrity makes pandas raise if that combined key is not unique.
ids = ['ResponseID', 'ExtendedSessionID', 'UserID']
df_i = df.set_index(ids, append=True, verify_integrity=True)
# the appended positional level has no name yet; call it UUID
df_i.index.names = ['UUID'] + ids
df_abstract = transform_abstract(df_i)
df_abstract.head()

Now let's check our work.

In [None]:
# pick one row at random (unseeded, so it differs between runs) and show it in the indexed raw data
sample = df_i.sample().index
df_i.loc[sample]

In [None]:
# the same row after the abstract-feature transformation, for a side-by-side check
df_abstract.loc[sample]

In [None]:
# column names available after the transformation
df_abstract.columns

First, select the fields that are unique to each scenario (the fields that vary within pairs of responses). Then split the dataset into two disjoint sets of alternatives: one in which an intervention occurs, and one in which there is no intervention.

In [None]:
ids = ["ResponseID", "ExtendedSessionID", "UserID"]
# fields that differ between the two rows of a response pair
scenario_fields = [
    'Saved', 'NumberOfCharacters', 'DescriptionShown', 'LeftHand', 'Male', 'Female', 'Young', 'Old', 'Infancy', 'Pregnancy',
    'Fat', 'Fit', 'Working', 'Medical', 'Homeless', 'Criminal', 'Human',
    'Non-human', 'Passenger', 'Law Abiding', 'Law Violating'
]

# Move UUID out of the index so that both rows of a pair share the same key
# (ResponseID, ExtendedSessionID, UserID). Guarded so that re-running this cell
# does not fail once UUID has already become a column.
if 'UUID' in df_abstract.index.names:
    df_abstract = df_abstract.reset_index(level='UUID')
# one random row's key (unseeded); it selects the pair in both halves below
sample_response = df_abstract.sample().index
intervention = df_abstract[df_abstract['Intervene'] == 1][scenario_fields]
no_intervention = df_abstract[df_abstract['Intervene'] == 0][scenario_fields]

print("Alternative w/ intervention:")
display(intervention.loc[sample_response])
print("Alternative w/o intervention:")
display(no_intervention.loc[sample_response])

Next, combine the datasets on response ID, separating the variable characteristics with suffixes.

In [None]:
# inner join on the shared index; the identically named columns of the two halves
# are told apart by the _int / _noint suffixes
df_endo = intervention.join(no_intervention, lsuffix='_int', rsuffix='_noint', how='inner')
df_endo.loc[sample_response]

Get data for the columns from the raw data that _didn't_ change within response pairs. Remember to eliminate duplicate response pairs - now that we have a combined tuple for each pairwise comparison, they're no longer necessary.

In [None]:
# columns that are not scenario fields, i.e. those that are constant within a pair
exo_columns = [name for name in df_abstract.columns if name not in scenario_fields]
# both rows of a pair carry the same index key; keep only the first occurrence
first_of_pair = ~df_abstract.index.duplicated(keep='first')
df_exo = df_abstract.loc[first_of_pair, exo_columns]
df_exo.loc[sample_response]

Then join that data in with the combined endogenous variables to get a full tuple for each pairwise comparison presented to a user.

In [None]:
# attach the pair-level columns to the combined scenario columns, then put UUID back
# as the last index level; verify_integrity raises if the resulting index has duplicates
df_joined = df_exo.join(df_endo, how='inner').set_index(['UUID'], append=True, verify_integrity=True)
# one random comparison (unseeded) for inspection
sample = df_joined.sample().index
df_joined.loc[sample]

In [None]:
import hmm.labeling.utils
importlib.reload(hmm.labeling.utils)
from hmm.labeling.utils import pictofy
    
# works with abstract
# (pictofy is a project helper; here it is given the sampled comparison from the previous cell)
pictofy(df_joined.loc[sample])

How much data did we lose with all these joins? Shouldn't be any - all we're doing is dividing the dataset in half.

In [None]:
# (rows, columns) of every intermediate frame, in pipeline order
for frame_name, frame in [("df", df), ("df_endo", df_endo), ("df_exo", df_exo), ("df_joined", df_joined)]:
    print("{}: {}".format(frame_name, frame.shape))

Finally, let's make it easier to interpret the target variable. For each response, we know whether the user chose to save one set of characters (\_int) by intervention, or save another set (\_noint) by not intervening. Let's call that variable "Intervened" to indicate whether or not the user intervened (swerved the AV).

In [None]:
# Derive the target once. The guard keeps the cell re-runnable: after the first
# run the Saved_* / Intervene columns are gone, and the original code would raise
# on the second pass (both the drop and list.remove fail).
if 'Saved_int' in df_joined.columns:
    df_joined = (
        df_joined
        # 1 when the characters in the "intervention" alternative were saved
        .assign(Intervened=(df_joined['Saved_int'] == 1).astype(int))
        .drop(columns=['Saved_int', 'Saved_noint', 'Intervene'])
    )
# 'Saved' is now encoded in the target, so it is no longer a scenario feature
scenario_fields = [field for field in scenario_fields if field != 'Saved']
df_joined.columns

Now choose which features to use:

In [None]:
# include user countries and other metadata or not? decided not - experts writing LFs are trying to decide most moral response, not predict what an individual from a certain background would do
# but if it helps generalization, technically useful... try both?
target = ["Intervened"]
# categorical (string-valued) features
cat_features = ["Template", "UserCountry3"]
# every scenario field appears twice: once per alternative (_int / _noint)
num_features = [
    "{}_{}".format(field, suffix)
    for field in scenario_fields
    for suffix in ["int", "noint"]
]
features = cat_features + num_features
df_joined[features].head()

Now explicitly type cast and deal with NA's or missing values:

In [None]:
# keep only the model features and the target; the numeric columns are cast afterwards
df_proc = df_joined.loc[:, features + target]

def transform_num(df, num_features):
    """Cast the numeric feature columns to numbers and drop rows that fail.

    Args:
        df: DataFrame that contains (at least) the columns in ``num_features``.
        num_features: list of column names to convert with ``pd.to_numeric``.

    Returns:
        A new DataFrame in which every ``num_features`` column has a numeric
        dtype and every row with a missing value in any of those columns has
        been removed. The frame passed in is left unmodified.
    """
    # Work on a copy. The original assigned through ``df.loc[:, f]``, which
    # mutated the caller's frame and, on recent pandas, writes into the existing
    # object column so the converted values never received a numeric dtype.
    df = df.copy()
    for f in num_features:
        # whole-column replacement; values that cannot be parsed become NaN
        df[f] = pd.to_numeric(df[f], errors='coerce')
    # are there any nan in the numerical features? usually just one
    nan = df[df[num_features].isna().any(axis=1)]
    print("Dropping these NaN:")
    display(nan)
    return df.dropna(axis=0, how='any', subset=num_features)

# cast the numeric features and drop rows with unparseable values (the dropped rows are displayed)
df_proc = transform_num(df_proc, num_features)
df_proc.columns

### Train/Test Split

A standard train test split for testing:

In [None]:
import hmm.classification
importlib.reload(hmm.classification)
from hmm.classification import train_test_val_dev_split

def make_X_y(df):
    """Split a processed frame into a feature matrix and the target column.

    Args:
        df: DataFrame that contains an ``Intervened`` column.

    Returns:
        ``(X, y)`` where ``X`` is ``df`` without ``Intervened`` (a new frame)
        and ``y`` is the ``Intervened`` column.
    """
    y = df["Intervened"]
    X = df.drop(columns=["Intervened"])
    return X, y

X, y = make_X_y(df_proc)
# project helper; returns the four feature splits followed by the four target splits
X_train, X_test, X_val, X_dev, y_train, y_test, y_val, y_dev = train_test_val_dev_split(X, y)
display(X_train.head())
display(y_train.head())
# sizes of the train / test / validation / dev feature matrices, in that order
for split in (X_train, X_test, X_val, X_dev):
    print(split.shape)

How many unique scenarios are there (vs total scenarios)?

In [None]:
from scipy import stats

# Count how often each distinct feature row (scenario) occurs.
# X mixes string columns (Template, UserCountry3) with numeric ones, so X.values
# is an object array, and np.unique(..., axis=0) raises TypeError for object
# dtype. Grouping on all columns gives the same per-row counts for any dtype;
# dropna=False keeps rows that contain missing values as their own groups.
scenario_counts = X.groupby(list(X.columns), dropna=False, sort=False).size()
# same (unique rows, counts) pair layout as np.unique(..., return_counts=True)
counts = (scenario_counts.index, scenario_counts.to_numpy())
stats.describe(counts[1])

### Labeling Model

Some sample labeling functions, constructed with the help of the effect sizes in the Moral Machine experiment.

In [None]:
import hmm.labeling.moralmachine as mm
import hmm.labeling.models
import hmm.labeling.utils
from snorkel.labeling import LFAnalysis

# Reload the project modules in place so that local edits are picked up.
# importlib.reload updates the existing module objects, so `mm` stays valid.
for module in (mm, hmm.labeling.models, hmm.labeling.utils):
    importlib.reload(module)
from hmm.labeling.models import Labeler

# labeling functions to apply (all defined in hmm.labeling.moralmachine)
lfs = [
    mm.doctors,
    mm.utilitarian,
    mm.utilitarian_anthro,
    mm.action,
    mm.pedestrians,
    mm.females,
    mm.fitness,
    mm.status,
    mm.legal,
    mm.illegal,
    mm.youth,
#     mm.elderly,
    mm.criminals,
    mm.homeless,
    mm.pets,
    mm.spare_strollers,
    mm.spare_pregnant
]

labeler = Labeler(lfs)
# one label matrix per split, returned in the order the splits are passed in
L_train, L_dev, L_val = labeler.label([X_train, X_dev, X_val])
# per-LF summary on the dev split, best-performing LFs first
lf_summary_dev = LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=y_dev.values)
lf_summary_dev.sort_values("Correct", ascending=False)

*Experiment: LF Density*

In [None]:
# using the validation set (since tuning is done)
analysis = LFAnalysis(L=L_val, lfs=lfs).lf_summary(Y=y_val.values)
analysis.to_csv("../figures/data/lfanalysis.csv")
# labeling density: the raw label matrix, one column per labeling function
# NOTE(review): this exports the dev-set matrix (L_dev) although the summary above
# uses the validation set (L_val) - confirm which split is intended
pd.DataFrame(L_dev, columns=[lf.name for lf in lfs]).to_csv("../figures/data/mm-density.csv")

## Aggregation

Recall that there are no true labels for this problem - really, we're just measuring the similarity of the heuristic labels to real voters' responses. 

**Baseline**: majority label voting.

In [None]:
from snorkel.labeling import MajorityLabelVoter

# baseline aggregator: unweighted majority vote over the labeling-function outputs
model_majority = MajorityLabelVoter()
# NOTE(review): preds_train does not appear to be read before a later cell overwrites it
preds_train = model_majority.predict(L=L_train)

**Label Model**: Snorkel aggregator. Chooses weights to combine the labeling functions based on learned conditional probabilities.

*Experiment: LF weights*

In [None]:
# cardinality is num classes
# NOTE(review): reloading the module here does not change the already constructed `labeler` instance
importlib.reload(hmm.labeling.models)

# fit the label model on the training label matrix; the dev labels are passed as Y_dev
model_label = labeler.fit(L_train, Y_dev=y_dev.values, fit_params={'n_epochs': 200, 'log_freq': 50})
analysis = LFAnalysis(L=L_val, lfs=lfs).lf_summary(Y=y_val.values)
# attach the learned weight of each LF, aligned to the summary by LF name
analysis['weight'] = pd.Series(model_label.get_weights(), index=[lf.name for lf in lfs])
analysis.to_csv('../figures/data/mm-weights.csv')
analysis.sort_values('Emp. Acc.')
# X[['Passenger_int', 'Passenger_noint', 'Law Abiding_int', 'Law Abiding_noint']]

How much does the label model improve on the majority voter?

In [None]:
# refresh the module, then bind Labeler from the reloaded code
importlib.reload(hmm.labeling.models)
from hmm.labeling.models import Labeler

# score the majority-vote baseline first, then the label model, on the validation split
for model in [model_majority, model_label]:
    Labeler.score(model, L_val, y_val)

*Ranking by Effect Size*

In [None]:
from snorkel.labeling import MajorityLabelVoter
import scipy as sp

class WeightedMajorityLabelVoter(MajorityLabelVoter):
    """Majority vote in which every labeling function (LF) has its own weight.

    Each non-abstaining LF adds its weight ``mu[j]`` (instead of 1) to the class
    it votes for; the class or classes with the largest weighted total share the
    probability mass equally.
    """

    # Per-LF weights. None until set_mu / interp_mu is called; predict_proba
    # then treats every LF as weight 1 (a plain majority vote) instead of
    # failing with AttributeError.
    mu = None

    def predict_proba(self, L: np.ndarray) -> np.ndarray:
        """Return an (n, cardinality) matrix of vote-based class probabilities.

        Args:
            L: (n, m) label matrix; entry -1 means that LF j abstained on row i,
                any other entry is used as a class index.

        Raises:
            ValueError: if the number of weights differs from the number of LFs.
        """
        n, m = L.shape
        weights = np.ones(m) if self.mu is None else np.asarray(self.mu, dtype=float)
        if weights.shape != (m,):
            raise ValueError(
                "expected {} weights (one per labeling function), got shape {}".format(m, weights.shape)
            )
        Y_p = np.zeros((n, self.cardinality))
        for i in range(n):
            counts = np.zeros(self.cardinality)
            for j in range(m):
                if L[i, j] != -1:
                    # add a weighted count instead of a whole count
                    counts[L[i, j]] += weights[j]
            # Weighted totals are floating-point sums, so detect ties with a
            # tolerance; exact equality lets rounding noise pick a winner.
            # All-abstain rows (all zeros) end up uniform.
            Y_p[i, :] = np.where(np.isclose(counts, counts.max()), 1, 0)
        # split the mass evenly between tied classes
        Y_p /= Y_p.sum(axis=1).reshape(-1, 1)
        return Y_p

    def interp_mu(self, borda, ordered_keys=None):
        """Set the weights by rescaling Borda scores linearly onto [0, 1].

        Args:
            borda: without ``ordered_keys``, an array of scores (one per LF, in
                LF order). With ``ordered_keys``, an object whose ``.mean()``
                can be indexed by those keys (presumably a DataFrame of scores
                with one column per LF - confirm with the caller).
            ordered_keys: optional keys selecting and ordering the mean scores.

        Note that the lowest score maps to weight 0, so that LF's votes add nothing.
        """
        if ordered_keys is not None:
            means = borda.mean()
            self.set_mu(self._rescale(means[ordered_keys].values, means.min(), means.max()))
        else:
            self.set_mu(self._rescale(borda, borda.min(), borda.max()))
            print(self.mu)

    @staticmethod
    def _rescale(values, lo, hi):
        """Map ``values`` linearly from [lo, hi] onto [0, 1]."""
        if lo == hi:
            # np.interp needs increasing sample points; with identical scores
            # there is nothing to rank, so weight every LF equally.
            return np.ones(len(values))
        return np.interp(values, (lo, hi), (0, 1))

    def set_mu(self, mu):
        """Store the per-LF weights (one entry per column of the label matrix)."""
        self.mu = mu

    @staticmethod
    def borda(x, key):
        """Count the rows of ``x`` whose effect is strictly smaller than ``key``'s.

        Args:
            x: DataFrame with ``key`` and ``effect`` columns.
            key: value of the ``key`` column whose rank is wanted; its first
                matching row supplies the reference effect.

        Returns:
            The number of other keys with a strictly smaller effect.
        """
        key_val = x[x['key'] == key]['effect'].values[0]
        other_effects = x[x['key'] != key]['effect'].values
        return int((key_val > other_effects).sum())
        
# TODO - try weighting this model by the learned LF bordas
# effect size per labeling function, keyed by the LF name
effect_by_key = {
    'action': 0.07,
    'pedestrians': .12,
    'females': .14,
    'fitness': .18,
    'status': .33,
    'legal': .35,
    'illegal': .35,
    'youth': 0.5,
    'utilitarian': 0.51,
    'utilitarian_anthro': 0.55,
    'spare_strollers': .18,
    'spare_pregnant': .15,
    'criminals': .12,
    'homeless': 0.02,
    'pets': 0.59,
    'doctors': 0.07,
}
effect_sizes = pd.DataFrame(list(effect_by_key.items()), columns=['key', 'effect'])
# Borda score of a key = number of other keys with a strictly smaller effect size
effect_sizes['borda'] = [
    WeightedMajorityLabelVoter.borda(effect_sizes, key) for key in effect_sizes['key']
]

In [None]:
model_majority_weighted = WeightedMajorityLabelVoter()
# Borda score of every labeling function, in the same order as `lfs`
# (raises IndexError if an LF name has no row in effect_sizes)
borda = np.array([effect_sizes[effect_sizes['key'] == lf.name]['borda'].values[0] for lf in lfs])
# rescale the scores to weights in [0, 1]; this call also prints the weights
model_majority_weighted.interp_mu(borda)
preds_train = model_majority_weighted.predict(L=L_train)

# LF summary on the validation split with the effect-size-based weights attached by LF name
analysis = LFAnalysis(L=L_val, lfs=lfs).lf_summary(Y=y_val.values)
analysis['weight'] = pd.Series(model_majority_weighted.mu, index=[lf.name for lf in lfs])
analysis.to_csv('../figures/data/mm-weights_icml.csv')
analysis.sort_values('Emp. Acc.')

How much does the weighted majority voter improve on the majority voter?

In [None]:
# refresh the module, then bind Labeler from the reloaded code
importlib.reload(hmm.labeling.models)
from hmm.labeling.models import Labeler

# score the weighted voter first, then the plain majority voter, on the validation split
for model in [model_majority_weighted, model_majority]:
    Labeler.score(model, L_val, y_val)

*Experiment: Accuracy by Scenario*

In [None]:
# what is the label model accuracy per scenario type?
# create a dataframe with scenariotype, gold label, probabilistic label, votes for each LF
## TODO REMOVE DEPRECATED SECOND ARG
# one row per validation example, one column per labeling-function vote
preds_scenario = pd.DataFrame(L_val, columns=[lf.name for lf in lfs])
# .values drops the index so the columns line up by position with the rows of L_val
preds_scenario['scenario'] = df_joined.loc[X_val.index]['ScenarioType'].values
preds_scenario['actual'] = y_val.values
probs = labeler.model.predict_proba(L=L_val)
preds_scenario['pred'] = Labeler.probs_to_preds(probs)
preds_scenario.to_csv("../figures/data/mm-preds_scenario.csv")

### Eye Test - Debugging Label Model

Now, use the label model to create probabilistic labels for the dev set. Then threshold the probabilities at 0.5 to obtain binary predictions.

In [None]:
from snorkel.analysis import get_label_buckets

threshold = 0.5
# one row per dev example, one column per class
probs_dev = model_label.predict_proba(L=L_dev)
# Binary prediction = probability of class 1 (intervene) at or above the threshold.
# The original compared the whole two-column matrix with the threshold, which
# yields a boolean matrix rather than one prediction per example.
preds_dev = (probs_dev[:, 1] >= threshold).astype(int)

Create label buckets for eyeball debugging (groups TP, FP, TN, FN).

In [None]:
# confusion matrix of the label model on the dev split
print(labeler.get_confusion_matrix(L_dev, y_dev))
# positions of the dev examples grouped by outcome; presumably keyed by
# (actual, predicted) label pairs - the next cell treats (1, 0) as false negatives
buckets = labeler.get_label_buckets(L_dev, y_dev)

What is the accuracy for each scenario type?

In [None]:
# false negatives
df_fn_dev = X_dev.iloc[buckets[(1, 0)]]
# false positives
df_fp_dev = X_dev.iloc[buckets[(0, 1)]]
# all misclassified dev examples
df_n_dev = X_dev.iloc[np.concatenate([buckets[(1, 0)], buckets[(0, 1)]])]
# acc = 1 - errors / total for each scenario type
errors_by_type = df_joined.loc[df_n_dev.index, 'ScenarioType'].value_counts()
total_by_type = df_joined.loc[X_dev.index, 'ScenarioType'].value_counts()
# A scenario type without a single error is absent from errors_by_type; count it
# as 0 errors so that its accuracy is 1.0 instead of NaN after index alignment.
acc = 1 - errors_by_type.reindex(total_by_type.index, fill_value=0) / total_by_type
acc.sort_values()

#### False Negatives
Here, the user chose to intervene, while the label model did not. 

In [None]:
# get the corresponding posteriori probability for each false negative
# (assign returns a new frame, so nothing is written through a slice of X_dev
# and the cell can be re-run without a SettingWithCopyWarning)
df_fn_dev = df_fn_dev.assign(probability=probs_dev[buckets[(1, 0)], 1])
# check out a few; seeded like the false-positive cell so the picture is reproducible
pictofy(df_fn_dev.sample(random_state=3))

Which scenario types does the model tend to get wrong?

In [None]:
# share of each scenario type among the false negatives
df_joined.loc[df_fn_dev.index, 'ScenarioType'].value_counts().plot.pie()

#### False Positives
Here, the user chose not to intervene, but the label model did.

In [None]:
# get the corresponding posteriori probability for each false positive
# (assign returns a new frame, so nothing is written through a slice of X_dev
# and the cell can be re-run without a SettingWithCopyWarning)
df_fp_dev = df_fp_dev.assign(probability=probs_dev[buckets[(0, 1)], 1])
# check out a few
pictofy(df_fp_dev.sample(random_state=3))

In [None]:
# share of each scenario type among the false positives
df_joined.loc[df_fp_dev.index, 'ScenarioType'].value_counts().plot.pie()

## Classification

https://www.snorkel.org/use-cases/01-spam-tutorial#5-training-a-classifier

Let's design a simple machine learning classifier for this problem, then test it on both the gold standard labels and the heuristic labels.

In [None]:
import hmm.classification
importlib.reload(hmm.classification)
from hmm.classification import Classifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# project classifier wrapper, configured with the full / numeric / categorical feature lists
clf = Classifier(features, num_features, cat_features)
# 5 shuffled folds with a fixed seed so the fold assignment is repeatable
kf = KFold(n_splits=5, shuffle=True, random_state=11)

def kf_cross_val(kf, X_n, y_n, return_majority=False):
    """Cross-validate the classifier on gold labels and on heuristic labels.

    For every fold the notebook-level classifier ``clf`` is trained twice: once
    on the gold labels of the training part, and once on labels produced by a
    label model fitted on the outputs of the notebook-level ``lfs``. Both are
    scored against the gold labels of the held-out part.

    Args:
        kf: splitter providing ``split(X)`` (e.g. ``sklearn.model_selection.KFold``).
        X_n: feature DataFrame.
        y_n: gold labels, aligned with ``X_n``.
        return_majority: when True, also score the notebook-level
            ``model_majority`` and ``model_majority_weighted`` voters directly
            on each held-out part. The original computed these on every fold
            (an extra labeling pass) and then discarded them; they are now only
            computed on request.

    Returns:
        ``(gold_acc, lm_acc)``, lists with one accuracy per fold. With
        ``return_majority=True``:
        ``(gold_acc, lm_acc, mv_acc, mv_weighted_acc)``.
    """
    gold_acc, lm_acc = [], []
    mv_acc, mv_weighted_acc = [], []

    for i_train, i_test in kf.split(X_n):
        # train/test split by fold
        X_train_n, X_test_n = X_n.iloc[i_train], X_n.iloc[i_test]
        y_train_n, y_test_n = y_n.iloc[i_train], y_n.iloc[i_test]

        # gold accuracy
        clf.fit(X_train_n, y_train_n)
        gold_acc.append(clf.score(X_test_n, y_test_n, verbose=False))

        # label-model accuracy: a fresh labeler per fold, so nothing carries over
        labeler = Labeler(lfs)
        L_train_n = labeler.label(X_train_n, verbose=False)
        labeler.fit(L_train_n, Y_dev=y_train_n)
        # keep only the training points that the label model can label
        X_train_filtered_n, probs_train_filtered_n = labeler.filter_probs(X_train_n, L_train_n)
        preds_train_filtered_n = Labeler.probs_to_preds(probs_train_filtered_n)
        # fit on the heuristic labels, score against the gold labels
        clf.fit(X_train_filtered_n, preds_train_filtered_n)
        lm_acc.append(clf.score(X_test_n, y_test_n, verbose=False))

        if return_majority:
            L_test_n = labeler.label(X_test_n, verbose=False)
            mv_acc.append(accuracy_score(y_true=y_test_n, y_pred=model_majority.predict(L=L_test_n)))
            mv_weighted_acc.append(accuracy_score(y_true=y_test_n, y_pred=model_majority_weighted.predict(L=L_test_n)))

    if return_majority:
        return gold_acc, lm_acc, mv_acc, mv_weighted_acc
    return gold_acc, lm_acc

# Cross-validate on a random subset to keep the run time manageable. The sample is
# seeded (same seed as the KFold above) so that the reported accuracies are
# reproducible, and capped so that a dataset smaller than 10000 rows does not raise.
cv_sample_size = min(10000, len(X))
sample = X.sample(cv_sample_size, random_state=11).index
gold_acc, lm_acc = kf_cross_val(kf, X.loc[sample], y.loc[sample])

#### Training on Gold Labels

Using just the labels (no label model):

In [None]:
# mean accuracy over the folds when training on gold labels
np.mean(gold_acc)

#### Training on Heuristic Labels

Using the label model, filter out unlabeled points:

In [None]:
# mean accuracy over the folds when training on label-model (heuristic) labels
np.mean(lm_acc)

### Grid Search Testing

**LF perturbations**

In [None]:
# baseline model: all labeling functions
labeler = Labeler(lfs)
L_train, L_val = labeler.label([X_train, X_val], verbose=False)
model_label = labeler.fit(L_train, Y_dev=y_train)
acc_full = Labeler.score(model_label, L_val, y_val)

# perturbed models: leave one labeling function out at a time
lf_diffs = []
for lf in lfs:
    lfs_perturb = [other for other in lfs if other != lf]
    labeler_perturb = Labeler(lfs_perturb)
    # Separate names, so the full label matrices (L_train / L_val) and the
    # baseline labeler are not overwritten by the last perturbed run.
    L_train_perturb, L_val_perturb = labeler_perturb.label([X_train, X_val], verbose=False)
    # Fit the perturbed labeler itself (the original refitted the baseline
    # `labeler` here) and pass Y_dev exactly as for the baseline, so the only
    # difference between the two models is the missing labeling function.
    lm_perturb = labeler_perturb.fit(L_train_perturb, Y_dev=y_train)
    acc_perturb = Labeler.score(lm_perturb, L_val_perturb, y_val, verbose=False)
    # value added by this LF = accuracy lost when it is removed
    lf_diffs.append((lf.name, acc_full - acc_perturb))
    print("{}: {}".format(lf.name, acc_full - acc_perturb))
pd.DataFrame(lf_diffs, columns=['heuristic', 'value_added']).to_csv("../figures/data/mm-perturb.csv")

**Which models perform best?**

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

# Heuristic training labels for the comparison. They are built here explicitly,
# with the same recipe kf_cross_val uses, because no notebook-level cell defines
# X_train_filtered / preds_train_filtered (the loop raised NameError on a fresh kernel).
labeler_clf = Labeler(lfs)
L_train_clf = labeler_clf.label(X_train, verbose=False)
labeler_clf.fit(L_train_clf, Y_dev=y_train)
X_train_filtered, probs_train_filtered = labeler_clf.filter_probs(X_train, L_train_clf)
preds_train_filtered = Labeler.probs_to_preds(probs_train_filtered)

# NOTE(review): the loop rebinds the notebook-level `clf`; a later cell that uses
# `clf` without re-creating it gets the last model of this dict - confirm that is intended.
for name, model in {
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Log Reg": LogisticRegression(C=.001),
    "KNN": KNeighborsClassifier(3),
    "SVC Linear": SVC(kernel="linear", C=0.025),
    "SVC Nonlinear": SVC(gamma=2, C=1),
    "GP": GaussianProcessClassifier(1.0 * RBF(1.0)),
    "Decision Tree": DecisionTreeClassifier(max_depth=5),
    "RF": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "MLP": MLPClassifier(alpha=1, max_iter=1000),
    "AdaBoost": AdaBoostClassifier(),
    "Naive Bayes": GaussianNB(),
    "QDA": QuadraticDiscriminantAnalysis()
}.items():
    print("## {} ##".format(name))
    clf = Classifier(features, num_features, cat_features, clf=model)

    # train on gold labels, evaluate on the gold test labels
    clf.fit(X_train, y_train)
    acc = clf.score(X_test, y_test, verbose=False)
    print("Accuracy with gold labels: {}".format(acc))

    # train on heuristic labels, evaluate on the same gold test labels
    clf.fit(X_train_filtered, preds_train_filtered)
    acc = clf.score(X_test, y_test, verbose=False)
    print("Accuracy with heuristic labels: {}".format(acc))

**How does performance change as the number of voters is increased?** 

In this case, evaluate performance voter-wise by splitting the data after stratifying by voter. Matches Noothigattu Fig. 1.

In [None]:
from sklearn.model_selection import train_test_split

data = []
users = df_proc.groupby(['UserID'])
# one integer id per voter; shuffled once per epoch
a = np.arange(users.ngroups)
# group id of every row, computed once instead of on every inner iteration
user_group = users.ngroup()

trials = 200
epochs = 50
np.random.seed(21)
# numbers of voters to evaluate: 1-4, then every 5th value up to `trials`
num_range = list(range(1,5))+list(range(5,trials,5))

# NOTE(review): `clf` and `labeler` are whatever earlier cells last bound to these
# names - confirm that they are the intended classifier and labeler.
# All per-iteration data uses *_n names so that the notebook-level X, y and the
# train/test splits are not overwritten by the last voter subset.
for i in range(epochs):

    print("# Epoch {} #".format(i))

    acc_gold = []
    acc_lm = []

    # shuffle the voter IDs
    np.random.shuffle(a)

    for n in num_range:
#         print("## Testing N={}##".format(n))
        # rows of the first n voters of this epoch's shuffle
        n_respondents = df_proc[user_group.isin(a[:n])]
        # the first 8 rows of every voter are used for training, the rest for testing
        train_index = n_respondents.groupby('UserID').head(8).index

        X_n, y_n = make_X_y(n_respondents)
        in_train = X_n.index.isin(train_index)
        X_train_n, y_train_n = X_n.loc[train_index], y_n.loc[train_index]
        X_test_n, y_test_n = X_n.loc[~in_train], y_n.loc[~in_train]
        # label the data
        L_train_n, L_test_n = labeler.label([X_train_n, X_test_n], verbose=False)

        # baseline gold model accuracy
        clf.fit(X_train_n, y_train_n)
        acc_gold.append(clf.score(X_test_n, y_test_n, verbose=False))

        # label model accuracy
#         lm = labeler.fit(L_train_n)
#         Labeler.score(lm, L_test_n, y_test_n, verbose=False)
#         X_train_filtered_n, probs_train_filtered_n = labeler.filter_probs(X_train_n, L_train_n)
#         preds_train_filtered_n = Labeler.probs_to_preds(probs_train_filtered_n)
#         clf.fit(X_train_filtered_n, preds_train_filtered_n)
#         acc_lm.append(clf.score(X_test_n, y_test_n, verbose=False))

        # unweighted majority vote accuracy
        preds_train_n = model_majority.predict(L=L_train_n)
        clf.fit(X_train_n, preds_train_n)
        acc_lm.append(clf.score(X_test_n, y_test_n, verbose=False))

    data.append([
        acc_gold,
        acc_lm
    ])
# shape: (epochs, 2, len(num_range))
summary = np.array(data)
# rows: n_voters, mean gold, mean heuristic, std gold, std heuristic - transposed into columns.
# Named voter_stats so that the `stats` imported from scipy is not shadowed.
voter_stats = np.concatenate((np.array([num_range]), np.mean(summary, axis=0), np.std(summary, axis=0)), axis=0).transpose()
accs = pd.DataFrame(voter_stats, columns=["n_voters", "acc_gold", "acc_heuristic", "std_gold", "std_heuristic"])
accs['n_voters'] = accs['n_voters'].astype(int)
accs.to_csv("../figures/data/mm-accs_voters_icml.csv")

In [None]:
# mean accuracy per evaluated number of voters (x axis is the row position, not n_voters)
accs[['acc_gold', 'acc_heuristic']].plot(kind='line')
# NOTE(review): writes the same file as the previous cell again - likely redundant
accs.to_csv("../figures/data/mm-accs_voters_icml.csv")

**Performance by training set size**

In [None]:
clf = Classifier(features, num_features, cat_features)
results = []
kf = KFold(n_splits=5, shuffle=True, random_state=11)
# Rebuild X and y from the full processed data so that this cell does not depend
# on whatever an earlier cell last assigned to these names.
X, y = make_X_y(df_proc)
# one seeded generator for all draws, so the whole sweep is reproducible
rng = np.random.RandomState(11)

for n in list(range(50, 1000, 10))+list(range(1100,10000,100)):
    if n > len(X):
        # sampling without replacement cannot return more rows than exist
        print("Skipping n={}: only {} rows available".format(n, len(X)))
        continue
    sample = X.sample(n, random_state=rng).index
    X_n = X.loc[sample]
    y_n = y.loc[sample]

    gold_acc, lm_acc = kf_cross_val(kf, X_n, y_n)

    # store results
    res = (n, np.mean(gold_acc), np.std(gold_acc), np.mean(lm_acc), np.std(lm_acc))
    print(res)
    results.append(res)
pd.DataFrame(results, columns=["n_rows", "acc_gold", "std_gold", "acc_heuristic", "std_heuristic"]).to_csv("../figures/data/mm-accs_data.csv")