In [None]:
# Enable IPython's autoreload extension so edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2

In [None]:
import random

import pandas as pd
from snorkel.labeling import labeling_function, PandasLFApplier
from snorkel.labeling.model import LabelModel

In [None]:
# Generate synthetic training data.
# Each instance gets a uniform integer `value` in [0, 100) and a noisy binary
# `label` that is 1 when a second uniform draw is <= value, i.e.
# P(label = 1 | value) = (value + 1) / 100.
# A dedicated, seeded generator makes the data identical on every
# "Restart & Run All" without touching the global `random` state.
N_INSTANCES = 10000
DATA_SEED = 42
data_rng = random.Random(DATA_SEED)
values = [data_rng.randrange(0, 100) for _ in range(N_INSTANCES)]
labels = [1 if data_rng.randrange(0, 100) <= v else 0 for v in values]
df = pd.DataFrame({"value": values, "label": labels})
print(df.label.sum())  # number of positive instances
df.head()

In [None]:
# Integer codes returned by the labeling functions in this notebook.
ABSTAIN, NEGATIVE, POSITIVE = -1, 0, 1

In [None]:
# labeling functions

def is_ge(threshold):
    """Build a labeling function named ``is_ge_<threshold>``.

    The returned labeling function votes POSITIVE for rows whose ``value`` is
    greater than or equal to ``threshold`` and abstains otherwise.
    """
    lf_name = "is_ge_" + str(threshold)

    def fn(row):
        if row.value >= threshold:
            return POSITIVE
        return ABSTAIN

    # Equivalent to decorating `fn` with @labeling_function(name=...).
    return labeling_function(name=lf_name)(fn)

def is_lt(threshold):
    """Build a labeling function named ``is_lt_<threshold>``.

    The returned labeling function votes NEGATIVE for rows whose ``value`` is
    strictly below ``threshold`` and abstains otherwise.
    """
    lf_name = "is_lt_" + str(threshold)

    @labeling_function(name=lf_name)
    def fn(row):
        # Guard clause: anything not strictly below the threshold gets no vote.
        if not row.value < threshold:
            return ABSTAIN
        return NEGATIVE

    return fn

def is_lt2(threshold):
    """Build a labeling function named ``is_lt2_<threshold>``.

    Same voting rule as ``is_lt`` (NEGATIVE when ``value`` is strictly below
    ``threshold``, otherwise abstain) but registered under a different name.
    """
    def fn(row):
        is_below = row.value < threshold
        if is_below:
            return NEGATIVE
        return ABSTAIN

    make_lf = labeling_function(name="is_lt2_" + str(threshold))
    return make_lf(fn)

def is_ge_lt(threshold):
    """Build a labeling function named ``is_ge_lt_<threshold>`` that never abstains.

    The returned labeling function votes POSITIVE for rows whose ``value`` is
    greater than or equal to ``threshold`` and NEGATIVE for all other rows.
    """
    lf_name = "is_ge_lt_" + str(threshold)

    @labeling_function(name=lf_name)
    def fn(row):
        if row.value >= threshold:
            vote = POSITIVE
        else:
            vote = NEGATIVE
        return vote

    return fn

def is_in_range(low, high):
    """Build a labeling function named ``is_in_range_<low>_<high>``.

    The returned labeling function votes POSITIVE for rows whose ``value`` lies
    in the half-open band ``[low, high)`` and abstains otherwise.
    """
    lf_name = "is_in_range_" + str(low) + "_" + str(high)

    def fn(row):
        in_band = low <= row.value and row.value < high
        if in_band:
            return POSITIVE
        return ABSTAIN

    return labeling_function(name=lf_name)(fn)


## Try is_ge_lt

In [None]:
# One always-voting labeling function per threshold 10, 20, ..., 90,
# applied to every row of `df` to obtain the vote matrix `train`.
lfs = list(map(is_ge_lt, range(10, 100, 10)))
train = PandasLFApplier(lfs=lfs).apply(df)

In [None]:
# Sanity check: vote-matrix shape, then the first three data rows next to their votes.
print(train.shape, df[:3], train[:3], sep="\n")

In [None]:
# Fit Snorkel's label model on the labeling-function votes in `train` and store
# its prediction for every row in df["predict"].
label_model = LabelModel(cardinality=2, verbose=True)
# `seed` fixes the training randomness so the reported score is reproducible
# on a fresh kernel instead of changing from run to run.
label_model.fit(train, n_epochs=500, log_freq=50, class_balance=[0.5, 0.5], seed=123)
# With tie_break_policy="abstain", ties are reported as abstentions (per the
# Snorkel docs) rather than being broken arbitrarily.
df["predict"] = label_model.predict(L=train, tie_break_policy="abstain")
df.head(10)

In [None]:
# Number of rows whose predicted label equals the ground-truth label.
int((df["label"] == df["predict"]).sum())

## Try is_ge and is_lt

In [None]:
# Experiment: replace each always-voting threshold function with a pair of
# one-sided functions (POSITIVE-or-abstain and NEGATIVE-or-abstain).
lfs = (
    [is_ge(threshold) for threshold in range(10, 100, 10)]
    + [is_lt(threshold) for threshold in range(10, 100, 10)]
)
train = PandasLFApplier(lfs).apply(df)
label_model = LabelModel(cardinality=2, verbose=True)
# `seed` fixes the training randomness so this experiment's score is reproducible.
label_model.fit(train, n_epochs=500, log_freq=50, class_balance=[0.5, 0.5], seed=123)
df["predict"] = label_model.predict(L=train, tie_break_policy="abstain")
# Number of rows whose predicted label equals the ground-truth label.
len(df[df["label"] == df["predict"]])

In [None]:
# First five rows of the vote matrix.
train[:5]

### Try is_in_range

In [None]:
# Experiment: ten POSITIVE-or-abstain band functions covering [0, 10), ..., [90, 100)
# combined with the NEGATIVE-or-abstain threshold functions.
lfs = (
    [is_in_range(threshold, threshold + 10) for threshold in range(0, 100, 10)]
    + [is_lt(threshold) for threshold in range(10, 100, 10)]
)
train = PandasLFApplier(lfs).apply(df)
label_model = LabelModel(cardinality=2, verbose=True)
# `seed` fixes the training randomness so this experiment's score is reproducible.
label_model.fit(train, n_epochs=500, log_freq=50, class_balance=[0.5, 0.5], seed=123)
df["predict"] = label_model.predict(L=train, tie_break_policy="abstain")
# Number of rows whose predicted label equals the ground-truth label.
len(df[df["label"] == df["predict"]])

### Try repeating high-confidence labeling functions

In [None]:
# Experiment: the one-sided threshold functions plus extra votes at the extremes
# (two top bands voting POSITIVE, two low thresholds voting NEGATIVE a second
# time under the `is_lt2_*` names).
lfs = (
    [is_ge(threshold) for threshold in range(10, 100, 10)]
    + [is_lt(threshold) for threshold in range(10, 100, 10)]
    + [is_in_range(80, 90), is_in_range(90, 100), is_lt2(10), is_lt2(20)]
)
train = PandasLFApplier(lfs).apply(df)
label_model = LabelModel(cardinality=2, verbose=True)
# `seed` fixes the training randomness so this experiment's score is reproducible.
label_model.fit(train, n_epochs=500, log_freq=50, class_balance=[0.5, 0.5], seed=123)
df["predict"] = label_model.predict(L=train, tie_break_policy="abstain")
# Number of rows whose predicted label equals the ground-truth label.
len(df[df["label"] == df["predict"]])

### Try modifying class balance

In [None]:
# Experiment: same always-voting threshold functions as the first run, but the
# label model is given a skewed class prior instead of [0.5, 0.5].
lfs = [is_ge_lt(threshold) for threshold in range(10, 100, 10)]
train = PandasLFApplier(lfs).apply(df)
label_model = LabelModel(cardinality=2, verbose=True)
# `seed` fixes the training randomness, so any score difference from the
# balanced run comes from `class_balance` rather than from random initialisation.
label_model.fit(train, n_epochs=500, log_freq=50, class_balance=[0.65, 0.35], seed=123)
df["predict"] = label_model.predict(L=train, tie_break_policy="abstain")
# Number of rows whose predicted label equals the ground-truth label.
len(df[df["label"] == df["predict"]])

### Try a "perfect" model

In [None]:
def predict(row):
    """Return 1 when ``row.value`` is at least 50, otherwise 0."""
    return int(row.value >= 50)

# Apply the fixed-threshold rule row by row, then count agreements with the true label.
df["predict"] = df.apply(predict, axis=1)
int(df["label"].eq(df["predict"]).sum())

### Try a simple majority vote

In [None]:
# Rebuild the vote matrix from the nine always-voting threshold functions
# (cutoffs 10, 20, ..., 90) so every column holds a 0/1 vote.
lfs = [is_ge_lt(cutoff) for cutoff in range(10, 100, 10)]
train = PandasLFApplier(lfs=lfs).apply(df=df)

def predict(train):
    """Return the simple-majority label for each row of a 0/1 vote matrix.

    Parameters
    ----------
    train : 2-D array of shape (n_rows, n_labeling_functions)
        Votes encoded as 0 (negative) or 1 (positive). Abstain votes (-1) are
        NOT supported: they would be subtracted from the positive count.

    Returns
    -------
    Integer array of length n_rows: 1 where strictly more than half of the
    labeling functions voted 1, otherwise 0 (so an exact tie yields 0).
    """
    # Derive the majority threshold from the number of labeling functions
    # instead of hard-coding 5; for nine columns this is the same as `>= 5`.
    n_voters = train.shape[1]
    positive_votes = train.sum(axis=1)
    return (positive_votes > n_voters / 2).astype(int)

# Store the majority-vote labels and count how many rows agree with the true label.
df["predict"] = predict(train)
int((df["predict"] == df["label"]).sum())

### Questions
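1. Is Snorkel's `LabelModel` better than a simple majority vote?
2. Does adding high-confidence labeling functions multiple times help?
3. Does splitting each Pos/Neg labeling function (`is_ge_lt`) into a Pos/Abstain (`is_ge`) and an Abstain/Neg (`is_lt`) pair help?
4. Does banding (range-based labeling functions such as `is_in_range`) help?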