# Snorkel for Bias
Label model for the weakly supervised bias classification problem.

In [25]:
%load_ext autoreload
%autoreload 2

import os
import pickle
os.chdir("/Users/sabrieyuboglu/Documents/sabri/school/cs_224u/attention_analysis")

import torch
from scipy import sparse
from metal.label_model import LabelModel  

from src.utils import *
from tasks.bias_classification.lib.tagging.features import Featurizer

os.chdir("/Users/sabrieyuboglu/Documents/sabri/school/cs_224u/attention_analysis")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Get Weak Labels
Load the weak labels for the entire bias classification dataset.

Output: an `[n, m]` `scipy.sparse` label matrix of noisy labels, where `n` = number of data points and `m` = number of labeling functions.

In [2]:
# Load the pickled output of each labeler (one dataset object per labeling function).
# Files are opened with `with` so the handles are closed deterministically.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
with open('tasks/bias_classification/data/labels/pos_weak_labels.pkl', 'rb') as pos_file:
    pos_dataset = pickle.load(pos_file)
with open('tasks/bias_classification/data/labels/word2vec_weak_labels.pkl', 'rb') as word2vec_file:
    word2vec_dataset = pickle.load(word2vec_file)
with open('tasks/bias_classification/data/labels/marta_weak_labels.pkl', 'rb') as marta_file:
    marta_dataset = pickle.load(marta_file)

In [4]:
# Labeling functions as (dataset, key) pairs: `key` is the field under which
# that dataset's entries store the labeler's output. The position in this list
# becomes the LF's column in the label matrix.
lfs = list(zip(
    (pos_dataset, word2vec_dataset, marta_dataset),
    ("pos_weak_labels", "word2vec_weak_labels", "marta_weak_labels"),
))

In [6]:
# n: number of rows in the label matrix = largest example index + 1.
# Converted to a plain Python int so it can be used safely as a numpy shape
# (torch.max returns a 0-dim tensor).
# NOTE(review): sized from marta_dataset only; assumes no other labeler
# contains a larger index -- TODO confirm.
n = int(torch.max(marta_dataset.data["index"])) + 1
# m: number of labeling functions (one matrix column each).
m = len(lfs)

In [47]:
# Create the dense [n, m] labeling-function matrix.
# Every cell starts at the sentinel -1 ("this LF produced nothing for this
# data point") and is overwritten with the LF's label shifted by +1.
# NOTE(review): the +1 shift presumably maps labels into MeTaL's 1-based
# class convention (0 = abstain) -- TODO confirm against the labelers.
lf_matrix = np.full((int(n), m), fill_value=-1)
lf_to_idx = {}  # LF key -> column index in lf_matrix
for lf_idx, (labeler, key) in enumerate(lfs):
    lf_to_idx[key] = lf_idx
    for entry in labeler:
        entry_idx = entry["index"]
        label = entry[key]
        lf_matrix[entry_idx, lf_idx] = label + 1

# Remove incomplete rows: keep only data points labeled by every LF.
# idx_to_id maps a row of the filtered matrix back to its original example
# index; it is built in ascending order so the mapping is deterministic
# (the previous set difference did not guarantee any order).
complete_rows = (lf_matrix != -1).all(axis=1)
idx_to_id = np.flatnonzero(complete_rows).tolist()
# Fixed: the original indexed with the undefined name `idx_to_index`.
lf_matrix = lf_matrix[idx_to_id, :]
lf_matrix = sparse.csr_matrix(lf_matrix)

In [48]:
# Analyze the labeling functions: the summary table reports, per LF (row),
# its polarity, coverage, overlaps and conflicts over the label matrix.
from metal.analysis import lf_summary

# Bare last expression so the notebook renders the returned table.
lf_summary(lf_matrix)

Unnamed: 0,Polarity,Coverage,Overlaps,Conflicts
0,"[1, 2]",1.0,1.0,0.401135
1,"[1, 2]",1.0,1.0,0.401135
2,"[1, 2]",1.0,1.0,0.401135


## Train Label Model

In [49]:
# NOTE(review): lf_summary is already imported in the analysis cell above and
# is not used in this cell.
from metal.analysis import lf_summary

# Instantiate the MeTaL label model with a fixed seed.
# k=2 is presumably the number of classes, consistent with the [1, 2]
# polarities in the LF summary -- TODO confirm.
label_model = LabelModel(k=2, seed=123)

In [50]:
# Fit the label model on the weak-label matrix; per-epoch training loss is
# printed as it runs.
label_model.train_model(lf_matrix)

Computing O...
Estimating \mu...
[1 epo]: TRAIN:[loss=1.401]
[2 epo]: TRAIN:[loss=1.343]
[3 epo]: TRAIN:[loss=1.234]
[4 epo]: TRAIN:[loss=1.084]
[5 epo]: TRAIN:[loss=0.903]
[6 epo]: TRAIN:[loss=0.704]
[7 epo]: TRAIN:[loss=0.504]
[8 epo]: TRAIN:[loss=0.321]
[9 epo]: TRAIN:[loss=0.176]
[10 epo]: TRAIN:[loss=0.083]
[11 epo]: TRAIN:[loss=0.049]
[12 epo]: TRAIN:[loss=0.072]
[13 epo]: TRAIN:[loss=0.134]
[14 epo]: TRAIN:[loss=0.210]
[15 epo]: TRAIN:[loss=0.273]
[16 epo]: TRAIN:[loss=0.303]
[17 epo]: TRAIN:[loss=0.293]
[18 epo]: TRAIN:[loss=0.249]
[19 epo]: TRAIN:[loss=0.186]
[20 epo]: TRAIN:[loss=0.121]
[21 epo]: TRAIN:[loss=0.068]
[22 epo]: TRAIN:[loss=0.034]
[23 epo]: TRAIN:[loss=0.020]
[24 epo]: TRAIN:[loss=0.022]
[25 epo]: TRAIN:[loss=0.035]
[26 epo]: TRAIN:[loss=0.051]
[27 epo]: TRAIN:[loss=0.067]
[28 epo]: TRAIN:[loss=0.077]
[29 epo]: TRAIN:[loss=0.082]
[30 epo]: TRAIN:[loss=0.079]
[31 epo]: TRAIN:[loss=0.071]
[32 epo]: TRAIN:[loss=0.058]
[33 epo]: TRAIN:[loss=0.044]
[34 epo]: TRAIN:[lo

## Bias Label

In [51]:
# Predict one label per row of lf_matrix, then subtract 1 to undo the +1
# offset that was applied to the weak labels when the matrix was built.
labels = label_model.predict(lf_matrix) - 1

In [56]:
# Original example index of each pos_dataset entry, in iteration order.
# Used to check that the predicted labels line up with the dataset rows.
# (Comprehension replaces the manual append loop and its unused enumerate index.)
dataset_idx_to_id = [entry["index"] for entry in pos_dataset]

In [57]:
# Sanity check: row i of the filtered label matrix must correspond to entry i
# of pos_dataset, otherwise the predicted labels would be attached to the
# wrong examples. Fail loudly instead of only displaying the comparison.
ids_aligned = dataset_idx_to_id == idx_to_id
assert ids_aligned, (
    "pos_dataset order does not match the rows kept in lf_matrix; "
    "labels would be misaligned"
)
ids_aligned

True

In [58]:
# Attach the label-model predictions to pos_dataset under the key "bias_label".
# Relies on `labels` being in the same order as pos_dataset's entries (see the
# dataset_idx_to_id == idx_to_id comparison above).
# NOTE(review): re-running this cell may add the data again -- depends on
# add_data's behavior, which is not visible here.
pos_dataset.add_data(labels, key_name="bias_label")

In [59]:
# Persist the dataset (now carrying "bias_label") to disk. The `with` block
# guarantees the file is flushed and closed even if pickling raises.
with open("tasks/bias_classification/data/labels/bias_labels.pkl", "wb") as out_file:
    pickle.dump(pos_dataset, out_file)