# Data programming with Snorkel: Labeling the PICO dataset


In [32]:
# Importations.
import pandas as pd
import numpy as np
import snorkel
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter
from snorkel.labeling.model import LabelModel
from snorkel.analysis import get_label_buckets
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import re
import string # For punctuation.
import os
import random
import pickle

In [2]:
# Load the pickled `first_sentence_file` object; the labeling functions below
# index it by `x.file` and test token membership in the result.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# pickles produced by this project.
with open('first_sentence_file.pickle', 'rb') as handle:
    first_sentence_file = pickle.load(handle)
    
# meeting notes:

# LF idea:
# many false positives in LF results
# predicting tokens as I when not I, ex. is treatment
# is there 'treatment' as a I in any abstracts?
    # should probably remove 'treatment' from LF
# see if performance is improved

# to develop LFs:
# 1. need to create LFs by looking at abstracts closely, what words are being labeled as I?
# 2. browse through ground truth files
# 3. encode patterns into LFs
# 4. better to have held out development set for developing LFs separate from training/testing, look at 100-200, and observe patterns. 

# to see if LFs worked:
# 1. run LFs on same abstracts in snorkel again and again, look at error generation files, if it's catching those errors or not.
# 2. look at each LF's performance metrics individually too, in addition to looking at overall performance.
# 3. check if bert labels those tokens as I, if it can't then maybe just not possible to catch that token as an I


# share what LFs are being used on slack and in github.


## Process keyword data

In [3]:
# Load the suffix / keyword lists (read without a header row; only column 0 is used).
# Drug stems source:
# https://druginfo.nlm.nih.gov/drugportal/jsp/drugportal/DrugNameGenericStems.jsp
df_drugs = pd.read_csv("lf_datasets/suffixes/drug_suffixes.txt", header = None)
df_surgery = pd.read_csv("lf_datasets/suffixes/surgical_suffixes.txt", header = None)
df_psych = pd.read_csv("lf_datasets/suffixes/psychotherapy_keywords.txt", header = None)

# Lowercase every list so the labeling functions can match case-insensitively.
for _frame in (df_psych, df_surgery, df_drugs):
    _frame[0] = _frame[0].str.lower()


In [4]:
# Read in FDA data (Purple Book, Orange Book and Drugs@FDA product tables).

df_purple = pd.read_csv("lf_datasets/fda_approved_drugs/products_purplebook.csv")
df_orange = pd.read_csv("lf_datasets/fda_approved_drugs/products_orangebook.txt", 
                        sep = "~")
# Some rows of the Drugs@FDA export have an extra field. `error_bad_lines` was
# deprecated in pandas 1.3 and removed in 2.0; `on_bad_lines="warn"` keeps the
# same behaviour (skip the malformed row and emit a warning about it).
df_drugs_at_fda = pd.read_csv("lf_datasets/fda_approved_drugs/products_drugs_at_fda.txt", 
                              sep = "\t", 
                              on_bad_lines = "warn")




  df_drugs_at_fda = pd.read_csv("lf_datasets/fda_approved_drugs/products_drugs_at_fda.txt",
b'Skipping line 35225: expected 8 fields, saw 9\nSkipping line 35226: expected 8 fields, saw 9\nSkipping line 35227: expected 8 fields, saw 9\n'


In [5]:
# Keep only the drug suffixes that are longer than three characters.
drug_suffixes = [suffix.lower() for suffix in df_drugs[0] if len(suffix) > 3]


In [6]:
# Concatenate FDA drug data: brand (proprietary) names and ingredient (proper)
# names from the three FDA tables.
set_proprietary = list(df_drugs_at_fda["DrugName"]) + list(df_purple["Proprietary Name"]) + list(df_orange["Trade_Name"])
set_proper = list(df_drugs_at_fda["ActiveIngredient"]) + list(df_purple["Proper Name"]) + list(df_orange["Ingredient"])

# Keep only real strings (drops NaN floats and any integer cells) and lowercase
# them. The type check must run before `.lower()`: the previous version
# lowercased first, so an integer entry raised AttributeError before the
# integer filter was ever reached.
# Building sets directly also removes duplicates.
set_proprietary = {item.lower() for item in set_proprietary if isinstance(item, str)}
set_proper = {item.lower() for item in set_proper if isinstance(item, str)}

# All lowercased FDA names, proprietary and proper combined.
set_fda = set_proprietary | set_proper


## Read data

In [7]:
# Load the development dataframe from its pickle file.
with open('df_dev.pickle', 'rb') as handle:
    df_dev_orig = pickle.load(handle)

# Drop every row that has a missing value (original note: removes None type
# labels, BERT does the same) and renumber the remaining rows from 0.
df_dev = df_dev_orig.dropna().reset_index(drop = True)


## Labeling functions

Labeling functions will be written to cover the following intervention categories, as used by the manual annotators of this [dataset](https://github.com/bepnye/EBM-NLP):

- Surgical.
- Physical.
- Drug.
- Educational.
- Psychological.
- Other.
- Control.

In [8]:
# Label macros: the three votes a labeling function can return.
ABSTAIN = -1  # no vote
NOT_I = 0     # token is not an intervention
I = 1         # token is an intervention

# Data for labeling functions.
# Keywords matched (as substrings) by the `is_generic` labeling function.
generic_interventions = ["pretreatment", "placebo"]
# Fetch the NLTK stop-word corpus (a no-op when it is already up to date).
nltk.download("stopwords")
# English stop words, used by `is_stop_word`.
stop_words = stopwords.words("english")
print("Total stop words =", len(stop_words))

Total stop words = 179


[nltk_data] Downloading package stopwords to /home/rcw258/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Labeling functions
**All labeling functions label tokens. The corresponding gold labels are from the "starting span" labels.**

In [38]:
# Labeling function for tokens: votes I when the token is present in the first sentence (title).
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def in_title(x, first_sentence_file):
    """Vote I if the lowercased token is in `first_sentence_file[x.file]`, else abstain."""
    token = x.token.lower()
    if token in first_sentence_file[x.file]:
        return I
    return ABSTAIN
# abstain or not_i?

# Labeling function for tokens: same rule as `in_title` (duplicate copy).
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def in_title2(x, first_sentence_file):
    """Vote I if the lowercased token is in `first_sentence_file[x.file]`, else abstain."""
    token = x.token.lower()
    if token in first_sentence_file[x.file]:
        return I
    return ABSTAIN
# abstain or not_i?

# Labeling function for tokens: same rule as `in_title` (duplicate copy).
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def in_title3(x, first_sentence_file):
    """Vote I if the lowercased token is in `first_sentence_file[x.file]`, else abstain."""
    token = x.token.lower()
    if token in first_sentence_file[x.file]:
        return I
    return ABSTAIN
# abstain or not_i?

# Labeling function for tokens: same rule as `in_title` (duplicate copy).
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def in_title4(x, first_sentence_file):
    """Vote I if the lowercased token is in `first_sentence_file[x.file]`, else abstain."""
    token = x.token.lower()
    if token in first_sentence_file[x.file]:
        return I
    return ABSTAIN
# abstain or not_i?


# Labeling function for tokens: votes NOT_I when the token is absent from the first sentence (title).
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def not_in_title(x, first_sentence_file):
    """Vote NOT_I if the lowercased token is not in `first_sentence_file[x.file]`, else abstain."""
    token = x.token.lower()
    if token in first_sentence_file[x.file]:
        return ABSTAIN
    return NOT_I
# abstain or not_i?


@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def surround_in_title(x, first_sentence_file):
    """Vote I when both neighbouring tokens are in `first_sentence_file[x.file]`.

    Abstains for the first and last token of the abstract (they lack a
    neighbour on one side) and whenever either neighbour is not found.
    """
    position = x.token_index
    # Both a previous and a next token must exist.
    if position <= 0 or position >= len(x.abstract) - 1:
        return ABSTAIN

    if x.abstract[position - 1].lower() not in first_sentence_file[x.file]:
        return ABSTAIN
    if x.abstract[position + 1].lower() not in first_sentence_file[x.file]:
        return ABSTAIN
    return I
# abstain or not_i?


# Labeling function for tokens that contain drug suffixes.
@labeling_function()
def contains_drug_suffix(x):
    """Vote I if any entry of `drug_suffixes` occurs anywhere in the lowercased token."""
    token = x.token.lower()
    for suffix in drug_suffixes:
        if suffix.lower() in token:
            return I
    return ABSTAIN
    
# Labeling function for tokens that contain surgical suffixes.
@labeling_function()
def contains_surgical_suffix(x):
    """Vote I if any entry of `df_surgery[0]` occurs anywhere in the lowercased token."""
    token = x.token.lower()
    for suffix in df_surgery[0]:
        if suffix.lower() in token:
            return I
    return ABSTAIN

# Labeling function for tokens that contain psychological / psychotherapeutic keywords.
@labeling_function()
def contains_psych_term(x):
    """Vote I if any entry of `df_psych[0]` occurs anywhere in the lowercased token."""
    token = x.token.lower()
    for keyword in df_psych[0]:
        if keyword.lower() in token:
            return I
    return ABSTAIN

# Labeling function for tokens that contain generic intervention keywords.
@labeling_function()
def is_generic(x):
    """Vote I if any entry of `generic_interventions` occurs in the lowercased token."""
    token = x.token.lower()
    for term in generic_interventions:
        if term.lower() in token:
            return I
    return ABSTAIN

# Labeling function for tokens that contain 'placebo'.
@labeling_function()
def is_placebo(x):
    """Vote I if 'placebo' is a substring of the lowercased token, else abstain."""
    if 'placebo' in x.token.lower():
        return I
    return ABSTAIN

# Labeling function for stop words.
@labeling_function()
def is_stop_word(x):
    """Vote NOT_I if the lowercased token is one of `stop_words`, else abstain."""
    if x.token.lower() in stop_words:
        return NOT_I
    return ABSTAIN

# Labeling function for tokens that are punctuation.
@labeling_function()
def is_punctuation(x):
    """Vote NOT_I if the token is non-empty and made only of ASCII punctuation.

    The previous check, `token in string.punctuation`, was a substring test
    against the punctuation string: it matched the empty token and odd runs
    such as ',-' while missing common tokens like '--' or '...'. Checking
    every character against `string.punctuation` expresses the intent.
    """
    token = x.token
    if token and all(char in string.punctuation for char in token):
        return NOT_I
    return ABSTAIN


# Labeling function for FDA approved drugs.
@labeling_function()
def contains_fda_drug(x):
    """Vote I if the lowercased token is a substring of any name in `set_fda`.

    Tokens of five characters or fewer always abstain.
    """
    if len(x.token) <= 5:
        return ABSTAIN

    for drug in set_fda:
        if x.token.lower() in drug.lower():
            return I
    return ABSTAIN


# checks if the preceding token is 'of', 'with', 'receive' or 'and' (effect of... I, treat with... I)
@labeling_function()
def has_prev_word_as(x):
    """Vote I if the token right before this one is a trigger word, else abstain.

    The first token of the abstract has no predecessor and always abstains.
    """
    trigger_words = ['of', 'with', 'receive', 'and']
    position = x.token_index
    if position > 0 and x.abstract[position - 1].lower() in trigger_words:
        return I
    return ABSTAIN
    
# checks if the next token is 'group' or 'groups'
@labeling_function()
def has_next_word_as(x):
    """Vote NOT_I if the token right after this one is 'group'/'groups', else abstain.

    The last token of the abstract has no successor and always abstains.
    """
    group_words = ['group', 'groups']
    position = x.token_index
    if position < len(x.abstract) - 1 and x.abstract[position + 1].lower() in group_words:
        return NOT_I
    return ABSTAIN
    
# Labeling function for tokens: checks whether the token or the nine tokens before it
# in the abstract contain a keyword ('determine' or 'assess').
@labeling_function()
def left_span_contains(x):
    """Vote I if 'determine' or 'assess' occurs in the 10-token window ending at this token.

    The window is anchored at `x.token_index`, like the other positional
    labeling functions. The previous version searched the abstract for the
    first token equal to `x.token`, so a repeated word was always evaluated
    at its first occurrence, and a token missing from the abstract ran off
    the end of the list. Keyword matching stays exact (case-sensitive).
    """
    keywords = ('determine', 'assess')
    end = x.token_index
    # Window covers the current token plus up to nine tokens to its left.
    window = x.abstract[max(0, end - 9):end + 1]
    if any(token in keywords for token in window):
        return I
    return ABSTAIN
# look into spouse tutorial left spans, and using 'resources' in LFs


# checks if the next two tokens are tagged VBD then VBN (e.g. <token> was administered)
@labeling_function()
def right_span_vb_pos(x):
    """Vote I if the two following POS tags are 'VBD' and 'VBN', else abstain.

    Abstains when fewer than two tokens follow this one.
    """
    position = x.token_index
    if position >= len(x.abstract) - 2:
        return ABSTAIN
    if x.pos_abstract[position + 1] != 'VBD':
        return ABSTAIN
    if x.pos_abstract[position + 2] != 'VBN':
        return ABSTAIN
    return I
    
    
# checks if the preceding token's POS tag contains 'V' (e.g. VBD, VBN)
@labeling_function()
def left_span_vb_pos(x):
    """Vote I if the POS tag of the previous token contains 'V', else abstain.

    The first token of the abstract has no predecessor and always abstains.
    """
    position = x.token_index
    if position > 0 and 'V' in x.pos_abstract[position - 1]:
        return I
    return ABSTAIN
    

# Labeling function which labels a token as NOT_I if it is in the last 50% of the file tokens.
@labeling_function()
def has_high_idx(x):
    """Vote NOT_I when `token_index / file_len` exceeds 0.50, else abstain."""
    relative_position = x.token_index / x.file_len
    return NOT_I if relative_position > 0.50 else ABSTAIN

    
# Labeling function which labels a token as NOT_I as default.
@labeling_function()
def lf_not_i(x):
    """Vote NOT_I for every token, regardless of its content."""
    return NOT_I


    


## Apply labeling functions

In [65]:
# Apply LFs to dataframe.
# Only the uncommented entries are active; the commented-out names are LFs
# that are currently disabled.
# NOTE(review): every active LF votes NOT_I or abstains, so no LF can vote I
# in this configuration -- confirm that this is intended.
lfs = [
       #contains_psych_term, # accuracy = 0.131380
       # is_punctuation,
       #has_prev_word_as,
    
       # has_next_word_as_drug, low accuracy and coverage
    
       # left_span_contains,
       # right_span_vb_pos,
       # left_span_vb_pos,
    
       #negative LFs
       is_stop_word,
       has_next_word_as,
       has_high_idx,
#        lf_not_i,
        
    
       #positive LFs
#        is_generic,
#        contains_drug_suffix,
#        contains_surgical_suffix,
#        is_placebo,
#        contains_fda_drug,
    
#        in_title,
#         in_title2,
#         in_title3,
#         in_title4,
       # not_in_title,
#        surround_in_title,

      ]
# Run every active LF over each row of df_dev; the result is stored in L_dev.
applier = PandasLFApplier(lfs = lfs)
L_dev = applier.apply(df = df_dev)

100%|██████████| 19246/19246 [00:00<00:00, 20631.57it/s]


In [66]:
# Define Y_dev: the gold labels of the dev set as an integer NumPy array.
Y_dev = df_dev["gold"].to_numpy(dtype = int)


In [67]:
# Summarize coverage, conflicts, and empirical accuracy of the LFs against Y_dev.
LFAnalysis(L_dev, lfs).lf_summary(Y_dev)




Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
is_stop_word,0,[0],0.279435,0.142731,0.0,5170,208,0.961324
has_next_word_as,1,[0],0.009145,0.006599,0.0,131,45,0.744318
has_high_idx,2,[0],0.497454,0.147615,0.0,9183,391,0.95916


In [68]:
# Majority vote model.
majority_model = MajorityLabelVoter()
# NOTE: despite the name, `preds_train` holds predictions for the dev set (L_dev).
preds_train = majority_model.predict(L = L_dev)

# Label model (two classes: NOT_I and I), fitted on the dev label matrix
# with a fixed seed.
label_model = LabelModel(cardinality = 2, verbose = True)
label_model.fit(L_train = L_dev, n_epochs = 500, log_freq = 100, seed = 123)


In [62]:
# Generate error files: one markdown file per PMID in each of
# error/dev/gold, error/dev/incorrect and error/dev/missed, with the tokens of
# interest wrapped in ** ** so they render bold.


def _write_highlighted(path, header, tokens, flags):
    """Write `header`, then every token followed by a space, bolding flagged tokens."""
    with open(path, 'w') as output:
        output.write(header)
        for token, flagged in zip(tokens, flags):
            if flagged:
                output.write('**' + token + '** ')
            else:
                output.write(token + ' ')


df_dev['maj_pred']=pd.Series(preds_train.astype(str))

# Copy maj_pred onto df_dev_orig by (PMID, token_index). Rows that were dropped
# from df_dev (missing values) keep the placeholder '-2'.
# A dictionary lookup replaces the previous per-row boolean-mask search, which
# rescanned the whole dataframe for every row (quadratic in the row count).
# NOTE: if a (PMID, token_index) pair occurred more than once in df_dev_orig,
# every such row now receives the prediction (previously only the first did).
_pred_by_key = dict(zip(zip(df_dev['PMID'], df_dev['token_index']),
                        df_dev['maj_pred']))
df_dev_orig['maj_pred'] = [
    _pred_by_key.get(key, '-2')
    for key in zip(df_dev_orig['PMID'], df_dev_orig['token_index'])
]

df_dev_groups = df_dev_orig.groupby('PMID')

for name, group in df_dev_groups:
    tokens = group.token.tolist()
    golds = group.gold.tolist()
    preds = group.maj_pred.tolist()

    # Sample file with the tokens that are gold labeled as I highlighted.
    _write_highlighted('error/dev/gold/' + name + '.md', ' GOLD ', tokens,
                       [gold == '1' for gold in golds])

    # Tokens incorrectly labeled as intervention (gold 0, but predicted 1).
    _write_highlighted('error/dev/incorrect/' + name + '.md', ' INCORRECT ', tokens,
                       [gold == '0' and pred == '1'
                        for gold, pred in zip(golds, preds)])

    # Tokens that were missed (gold 1, but predicted 0 or -1).
    _write_highlighted('error/dev/missed/' + name + '.md', ' MISSED ', tokens,
                       [gold == '1' and pred in ('-1', '0')
                        for gold, pred in zip(golds, preds)])
        
    

In [69]:
# Compute model performance metrics for both models on the dev set.
# Both calls use the same metric list and break ties randomly.
_score_metrics = ["f1", "accuracy", "precision", 
                  "recall", "roc_auc", "coverage"]

majority_scores = majority_model.score(
    L = L_dev,
    Y = Y_dev,
    tie_break_policy = "random",
    metrics = list(_score_metrics),
)

label_scores = label_model.score(
    L = L_dev,
    Y = Y_dev,
    tie_break_policy = "random",
    metrics = list(_score_metrics),
)


INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.097]
INFO:root:[100 epochs]: TRAIN:[loss=0.000]
 35%|███▌      | 177/500 [00:00<00:00, 1765.82epoch/s]INFO:root:[200 epochs]: TRAIN:[loss=0.000]
INFO:root:[300 epochs]: TRAIN:[loss=0.000]
 72%|███████▏  | 360/500 [00:00<00:00, 1798.03epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.000]
100%|██████████| 500/500 [00:00<00:00, 1796.25epoch/s]
INFO:root:Finished Training


In [70]:
# Compare model performance metrics: print each score as a percentage with one
# decimal, majority-vote model first, then the label model.
majority_f1 = majority_scores.get("f1")
majority_acc = majority_scores.get("accuracy")
majority_prec = majority_scores.get("precision")
majority_rec = majority_scores.get("recall")
majority_roc = majority_scores.get("roc_auc")
majority_cov = majority_scores.get("coverage")
for _title, _score in (
    ("F1", majority_f1),
    ("Accuracy", majority_acc),
    ("Precision", majority_prec),
    ("Recall", majority_rec),
    ("AUC ROC", majority_roc),
    ("Coverage", majority_cov),
):
    _label = f"Majority Model {_title}:"
    print(f"{_label:<25} {_score * 100:.1f}%")
print("++++++++++++++++++++++++")

# add labeling model 

label_f1 = label_scores.get("f1")
label_acc = label_scores.get("accuracy")
label_prec = label_scores.get("precision")
label_rec = label_scores.get("recall")
label_roc = label_scores.get("roc_auc")
label_cov = label_scores.get("coverage")
for _title, _score in (
    ("F1", label_f1),
    ("Accuracy", label_acc),
    ("Precision", label_prec),
    ("Recall", label_rec),
    ("AUC ROC", label_roc),
    ("Coverage", label_cov),
):
    _label = f"Label Model {_title}:"
    print(f"{_label:<25} {_score * 100:.1f}%")

# goals
# look at acl, nlp conferences, ACL rolling review, umbrella rolling review process for nlp processes
# rolling review in June deadline, EM-NLP conference.
# Rolling deadline.


Majority Model F1:        20.8%
Majority Model Accuracy:  79.2%
Majority Model Precision: 15.4%
Majority Model Recall:    32.1%
Majority Model AUC ROC:   65.6%
Majority Model Coverage:  100.0%
++++++++++++++++++++++++
Label Model F1:           20.8%
Label Model Accuracy:     79.2%
Label Model Precision:    15.4%
Label Model Recall:       32.1%
Label Model AUC ROC:      67.0%
Label Model Coverage:     100.0%


In [46]:
# Baseline: accuracy obtained by always predicting the most frequent gold label.
_majority_share = df_dev["gold"].value_counts(normalize = True).max()
print("Accuracy if predicting majority class", _majority_share)

Accuracy if predicting majority class 0.9149433648550348
