In [1]:
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter

import pandas as pd
import numpy as np
import random

### Create a toy dataset

In [11]:
data = [random.randrange(1,40) for x in range(20)]

In [12]:
data

[5, 21, 25, 18, 32, 37, 10, 20, 10, 26, 2, 32, 34, 11, 22, 36, 12, 20, 31, 25]

In [24]:
data = [5, 21, 1, 29, 32, 37, 10, 20, 10, 26, 2, 37, 34, 11, 22, 36, 12, 20, 31, 25]

In [25]:
df = pd.DataFrame(data, columns=['Number'])

### Creating a small validation set

In [25]:
validation = [random.randrange(1,40) for x in range(5)]

In [26]:
validation

[25, 19, 7, 2, 32]

In [26]:
validation = [22, 11, 7, 2, 32]

In [27]:
df_val = pd.DataFrame(validation, columns=['Number'])

### Ground truth

In [28]:
true_labels = np.array([0, 1, 1, 1, 0])

In [29]:
df_tl = pd.DataFrame(true_labels, columns=['Number'])

### Labeling functions

In [30]:
ABSTAIN = -1
NON_PRIME = 0
PRIME = 1

In [31]:
@labeling_function()
def is_odd(record):
    if record["Number"]%2 == 1:
        return ABSTAIN
    else:
        return NON_PRIME

In [32]:
@labeling_function()
def is_even(record):
    if record["Number"]%2 == 0:
        return NON_PRIME
    else:
        return ABSTAIN

In [33]:
@labeling_function()
def is_two(record):
    if record["Number"] == 2:
        return PRIME
    else:
        return ABSTAIN

In [34]:
@labeling_function()
def is_known_prime(record):
    known_primes = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29]
    if record["Number"] in known_primes:
        return PRIME
    else:
        return ABSTAIN

In [35]:
lfs = [
        is_odd,
        is_even, 
        is_two,
        is_known_prime
      ]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df)
L_valid = applier.apply(df_val)
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

100%|████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 1816.35it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 2497.20it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
is_odd,0,[0],0.55,0.55,0.05
is_even,1,[0],0.55,0.55,0.05
is_two,2,[1],0.05,0.05,0.05
is_known_prime,3,[1],0.2,0.05,0.05


In [189]:
LFAnalysis(L_valid, lfs).lf_summary(true_labels)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
is_odd,0,[0],0.6,0.6,0.2,2,1,0.666667
is_even,1,[0],0.6,0.6,0.2,2,1,0.666667
is_two,2,[1],0.2,0.2,0.2,1,0,1.0
is_known_prime,3,[1],0.6,0.2,0.2,3,0,1.0


In [191]:
df_val = pd.DataFrame(validation, columns=['Number'])
df_val["is_odd"] = df_val.apply(is_odd, axis=1)
df_val["is_even"] = df_val.apply(is_even, axis=1)
df_val["is_two"] = df_val.apply(is_two, axis=1)
df_val["is_known_prime"] = df_val.apply(is_known_prime, axis=1)
df_val["ground_truth"] = true_labels
df_val

Unnamed: 0,Number,is_odd,is_even,is_two,is_known_prime,ground_truth
0,22,0,0,-1,-1,0
1,11,-1,-1,-1,1,1
2,7,-1,-1,-1,1,1
3,2,0,0,1,1,1
4,32,0,0,-1,-1,0


In [177]:
LFAnalysis(L=L_train, lfs=lfs).lf_conflicts()

array([0.05, 0.05, 0.05, 0.05])

In [14]:
df["is_odd"] = df.apply(is_odd, axis=1)
df["is_even"] = df.apply(is_even, axis=1)
df["is_two"] = df.apply(is_two, axis=1)
df["is_known_prime"] = df.apply(is_known_prime, axis=1)

In [15]:
df[["Number", "is_odd", "is_even", "is_two", "is_known_prime"]]

Unnamed: 0,Number,is_odd,is_even,is_two,is_known_prime
0,5,-1,-1,-1,1
1,21,-1,-1,-1,-1
2,1,-1,-1,-1,-1
3,29,-1,-1,-1,1
4,32,0,0,-1,-1
5,37,-1,-1,-1,-1
6,10,0,0,-1,-1
7,20,0,0,-1,-1
8,10,0,0,-1,-1
9,26,0,0,-1,-1


##### Coverage

In [52]:
print(df[df!= -1].count()/df.shape[0])

Number            1.00
is_odd            0.55
is_even           0.55
is_two            0.05
is_known_prime    0.20
dtype: float64


#### Overlaps

#### Conflicts

In [78]:
df[
    (df["is_known_prime"] != -1) &
    ((df["is_known_prime"] != df["is_two"]) |
    (df["is_known_prime"] != df["is_even"]) |
    (df["is_known_prime"] != df["is_odd"]))
]

Unnamed: 0,Number,is_odd,is_even,is_two,is_known_prime,majority_pred,preds_labelingModel
0,5,-1,-1,-1,1,1,1
3,29,-1,-1,-1,1,1,1
10,2,0,0,1,1,-1,0
13,11,-1,-1,-1,1,1,1


In [63]:
df

Unnamed: 0,Number,is_odd,is_even,is_two,is_known_prime,majority_pred
0,5,-1,-1,-1,1,1
1,21,-1,-1,-1,-1,-1
2,1,-1,-1,-1,-1,-1
3,29,-1,-1,-1,1,1
4,32,0,0,-1,-1,0
5,37,-1,-1,-1,-1,-1
6,10,0,0,-1,-1,0
7,20,0,0,-1,-1,0
8,10,0,0,-1,-1,0
9,26,0,0,-1,-1,0


### Random Voter

In [19]:
from snorkel.labeling.model import RandomVoter

In [54]:
random_model = RandomVoter()
preds_train_random = random_model.predict(L=L_train, tie_break_policy='abstain')
preds_valid_random = random_model.predict(L=L_valid)
df["preds_train_random"] = preds_train_random
df[df["Number"] == 2]

Unnamed: 0,Number,is_odd,is_even,is_two,is_known_prime,preds_train_random,majority_pred
10,2,0,0,1,1,0,-1


In [78]:
preds_train_random

array([1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0])

In [49]:
metrics = random_model.score(preds_valid_random, true_labels, metrics=['accuracy'], tie_break_policy='abstain')
metrics



{'accuracy': 0.4}

### MajorityClassVoter

In [50]:
from snorkel.labeling.model import MajorityClassVoter

In [79]:
majorityClass_model = MajorityClassVoter()
majorityClass_model.fit(balance=np.array([0.7, 0.3]))
majorityClass_train_random = majorityClass_model.predict(L=L_train)
majorityClass_valid_random = majorityClass_model.predict(L=L_valid)
df["majorityClass_pred"] = majorityClass_train_random
df[df["Number"] == 2]

Unnamed: 0,Number,is_odd,is_even,is_two,is_known_prime,preds_train_random,majority_pred,majorityClass_pred,majorityLabel_pred
10,2,0,0,1,1,0,-1,0,-1


In [80]:
majorityClass_train_random

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### MajorityLabelVoter

In [72]:
majority_model = MajorityLabelVoter()
preds_train_majority = majority_model.predict(L=L_train)
preds_valid_majority = majority_model.predict(L=L_valid)

In [73]:
preds_train_majority

array([ 1, -1, -1,  1,  0, -1,  0,  0,  0,  0, -1, -1,  0,  1,  0,  0,  0,
        0, -1, -1])

In [74]:
preds_valid_majority

array([ 0,  1,  1, -1,  0])

In [75]:
df["majorityLabel_pred"] = preds_train_majority

In [76]:
df[df["Number"] == 2]

Unnamed: 0,Number,is_odd,is_even,is_two,is_known_prime,preds_train_random,majority_pred,majorityClass_pred,majorityLabel_pred
10,2,0,0,1,1,0,-1,1,-1


### LabelingModel

In [36]:
label_model = LabelModel()
label_model.fit(L_train=L_train, n_epochs=200, seed=100)
preds_train_label = label_model.predict(L=L_train)
preds_valid_label = label_model.predict(L=L_valid)
L_valid = applier.apply(df_val)
LFAnalysis(L_valid, lfs).lf_summary()

preds_train_labelingModel = label_model.predict(L=L_train)
preds_valid_labelingModel = label_model.predict(L=L_valid)

100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 4823.26it/s]


In [37]:
preds_train_labelingModel

array([ 1, -1, -1,  1,  0, -1,  0,  0,  0,  0,  0, -1,  0,  1,  0,  0,  0,
        0, -1, -1])

In [38]:
df["preds_labelingModel"] = preds_train_labelingModel

In [39]:
df

Unnamed: 0,Number,preds_labelingModel
0,5,1
1,21,-1
2,1,-1
3,29,1
4,32,0
5,37,-1
6,10,0
7,20,0
8,10,0
9,26,0


In [116]:
# 7/20 prime numbers
label_model = LabelModel()
label_model.fit(L_train=L_train, n_epochs=100, class_balance = [0.7, 0.3], seed=100)
preds_train_label = label_model.predict(L=L_train)
preds_valid_label = label_model.predict(L=L_valid)
L_valid = applier.apply(df_val)
LFAnalysis(L_valid, lfs).lf_summary()

preds_train_labelingModel = label_model.predict(L=L_train)
df["preds_labelingModel_wclassBalance"] = preds_train_labelingModel

  from pandas import Panel
100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 1661.37it/s]


In [144]:
df

Unnamed: 0,Number,preds_labelingModel,is_odd,is_even,is_two,is_known_prime
0,5,1,-1,-1,-1,1
1,21,-1,-1,-1,-1,-1
2,1,-1,-1,-1,-1,-1
3,29,1,-1,-1,-1,1
4,32,0,0,0,-1,-1
5,37,-1,-1,-1,-1,-1
6,10,0,0,0,-1,-1
7,20,0,0,0,-1,-1
8,10,0,0,0,-1,-1
9,26,0,0,0,-1,-1


In [40]:
np.round(label_model.get_weights(), 2)

array([0.91, 0.91, 0.76, 0.85])

In [41]:
np.round(label_model.get_conditional_probs(), 3)

array([[[0.   , 0.937],
        [0.99 , 0.053],
        [0.01 , 0.01 ]],

       [[0.   , 0.937],
        [0.99 , 0.053],
        [0.01 , 0.01 ]],

       [[0.907, 0.924],
        [0.01 , 0.01 ],
        [0.083, 0.066]],

       [[0.911, 0.66 ],
        [0.01 , 0.01 ],
        [0.079, 0.33 ]]])

In [42]:
label_model.predict_proba(L_train)[1]

array([0.5, 0.5])

In [120]:
L = np.array([[0, 0, -1], [-1, 0, 1], [1, -1, 0]])
Y_dev = [0, 1, 0]

In [119]:
L

array([[ 0,  0, -1],
       [-1,  0,  1],
       [ 1, -1,  0]])

In [121]:
label_model = LabelModel()
label_model.fit(L)
#label_model.fit(L, Y_dev=Y_dev, seed=2020, lr=0.05)
label_model.fit(L, class_balance=[0.7, 0.3], n_epochs=200, l2=0.4)

In [122]:
pred = label_model.predict(L=L)
pred

array([0, 0, 1])

In [111]:
label_model.score(L, Y=np.array([1, 1, 1]))



{'accuracy': 0.6666666666666666}