# Data programming with Snorkel: Labeling the PICO dataset


In [2]:
# Importations.
import pandas as pd
import numpy as np
import snorkel
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter
from snorkel.labeling.model import LabelModel
from snorkel.analysis import get_label_buckets
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import re
import string # For punctuation.
import os
import random
import pickle

In [3]:
with open('first_sentence_file.pickle', 'rb') as handle:
    first_sentence_file = pickle.load(handle)
    

## Process keyword data

In [6]:
# Read in suffixes.
# https://druginfo.nlm.nih.gov/drugportal/jsp/drugportal/DrugNameGenericStems.jsp
df_drugs = pd.read_csv("suffixes/drug_suffixes.txt", header = None)
df_surgery = pd.read_csv("suffixes/surgical_suffixes.txt", header = None)
df_psych = pd.read_csv("suffixes/psychotherapy_keywords.txt", header = None)

df_psych[0] = df_psych[0].str.lower()
df_surgery[0] = df_surgery[0].str.lower()
df_drugs[0] = df_drugs[0].str.lower()


In [7]:
# Read in FDA data.

df_purple = pd.read_csv("fda_approved_drugs/products_purplebook.csv")
df_orange = pd.read_csv("fda_approved_drugs/products_orangebook.txt", 
                        sep = "~")
df_drugs_at_fda = pd.read_csv("fda_approved_drugs/products_drugs_at_fda.txt", 
                              sep = "\t", 
                              error_bad_lines = False)


b'Skipping line 35225: expected 8 fields, saw 9\nSkipping line 35226: expected 8 fields, saw 9\nSkipping line 35227: expected 8 fields, saw 9\n'


In [8]:
# Filter drug suffixes with three characters or fewer.
drug_suffixes = list(df_drugs[0])
drug_suffixes = [x.lower() for x in drug_suffixes if len(x) > 3]


In [9]:
# Concatenate FDA drug data.
set_proprietary = list(df_drugs_at_fda["DrugName"]) + list(df_purple["Proprietary Name"]) + list(df_orange["Trade_Name"])
set_proper = list(df_drugs_at_fda["ActiveIngredient"]) + list(df_purple["Proper Name"]) + list(df_orange["Ingredient"])

# Remove floats and integers.
set_proprietary = [item.lower() for item in set_proprietary if not isinstance(item, float)]
set_proprietary = [item for item in set_proprietary if not isinstance(item, int)]
set_proper = [item.lower() for item in set_proper if not isinstance(item, float)]
set_proper = [item for item in set_proper if not isinstance(item, int)]

# Cast as sets to remove duplicates.
set_proprietary = set(set_proprietary)
set_proper = set(set_proper)
set_fda = set.union(set_proprietary, set_proper)


## Read data

In [10]:

df_orig = pd.read_pickle('df_orig.pickle')


# Train-test split (80% / 20%, stratified by gold label value).
X_train, X_test, y_train, y_test = train_test_split(df_orig["Token"], 
                                                    df_orig["Gold"], 
                                                    test_size = 0.1, 
                                                    random_state = 42)
df_train = df_orig.iloc[X_train.index].reset_index(drop = True)
df_test = df_orig.iloc[X_test.index].reset_index(drop = True)



In [11]:
X_train

1770        cm
1888         P
6951         2
1642         a
7425        to
         ...  
5734     Heart
5191      with
5390       The
860     solute
7270         p
Name: Token, Length: 8016, dtype: object

## Labeling functions

Labeling functions will be written to cover the following intervention categories, as used by the manual annotators of this [dataset](https://github.com/bepnye/EBM-NLP):

- Surgical.
- Physical.
- Drug.
- Educational.
- Psychological.
- Other.
- Control.

In [12]:
# Label macros.
ABSTAIN = -1
NOT_I = 0
I = 1

# Data for labeling functions.
generic_interventions = ["therap", "treatment", "intervention",
                         "placebo", "dose", "control", "vaccin"]
nltk.download("stopwords")
stop_words = stopwords.words("english")
print("Total stop words =", len(stop_words))

Total stop words = 179


[nltk_data] Downloading package stopwords to /home/rcw258/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Labeling functions
**All labeling functions label tokens. The corresponding gold labels are from the "starting span" labels.**

In [1]:
# Labeling function for tokens, if token is present in first sentence (title)
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def in_title(x, first_sentence_file):
    return I if x.Token.lower() in first_sentence_file[x.File] else ABSTAIN
# abstain or not_i?

# Labeling function for tokens, if token is present in first sentence (title)
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def in_title2(x, first_sentence_file):
    return I if x.Token.lower() in first_sentence_file[x.File] else ABSTAIN
# abstain or not_i?

# Labeling function for tokens, if token is present in first sentence (title)
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def in_title3(x, first_sentence_file):
    return I if x.Token.lower() in first_sentence_file[x.File] else ABSTAIN
# abstain or not_i?

# Labeling function for tokens, if token is present in first sentence (title)
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def in_title4(x, first_sentence_file):
    return I if x.Token.lower() in first_sentence_file[x.File] else ABSTAIN
# abstain or not_i?


# Labeling function for tokens, if token is present in first sentence (title)
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def not_in_title(x, first_sentence_file):
    return NOT_I if x.Token.lower() not in first_sentence_file[x.File] else ABSTAIN
# abstain or not_i?


@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def surround_in_title(x, first_sentence_file):
    if((x.token_index>0) and (x.token_index<len(x.abstract)-1)):
        if (x.abstract[x.token_index-1].lower() in first_sentence_file[x.File]) and (x.abstract[x.token_index+1].lower() in first_sentence_file[x.File]):
            return I
    
    return ABSTAIN
# abstain or not_i?


# Labeling function for tokens that contain drug suffixes.
@labeling_function()
def contains_drug_suffix(x):
    return I if (any(suffix.lower() in x.Token.lower() for suffix in drug_suffixes)) else ABSTAIN
    
# Labeling function for tokens that contain surgical suffixes.
@labeling_function()
def contains_surgical_suffix(x):
    return I if (any(suffix.lower() in x.Token.lower() for suffix in df_surgery[0])) else ABSTAIN

# Labeling function for tokens that contain psychological / psychotherapeutic keywords.
@labeling_function()
def contains_psych_term(x):
    return I if (any(suffix.lower() in x.Token.lower() for suffix in df_psych[0])) else ABSTAIN

# Labeling function for tokens that contain generic intervention keywords.
@labeling_function()
def is_generic(x):
    return I if (any(term.lower() in x.Token.lower() for term in generic_interventions)) else ABSTAIN

# Labeling function for stop words.
@labeling_function()
def is_stop_word(x):
    return NOT_I if x.Token.lower() in stop_words else ABSTAIN

# Labeling function for tokens that are punctuation.
@labeling_function()
def is_punctuation(x):
    return NOT_I if x.Token.lower() in string.punctuation else ABSTAIN


# Labeling function for FDA approved drugs.
@labeling_function()
def contains_fda_drug(x):
    if (len(x.Token) <= 5):
        return ABSTAIN

    return I if (any(x.Token.lower() in drug.lower() for drug in set_fda)) else ABSTAIN


# checks if the preceding token is 'of' or 'with' (effect of... I, treat with... I)
@labeling_function()
def has_prev_word_as(x):
    words = ['of', 'with', 'receive', 'and']
    if ((x.token_index > 0) and (x.abstract[x.token_index-1].lower() in words)):
        return I 

    else:
        return ABSTAIN
    
# checks if the next token is 'group' or 'groups'
@labeling_function()
def has_next_word_as(x):
    words = ['group', 'groups']
    if ((x.token_index < len(x.abstract)-1) and (x.abstract[x.token_index+1].lower() in words)):
        return NOT_I

    else:
        return ABSTAIN
    
# Labeling function which labels a token as NOT_I if it is in the last 50% of the file tokens.
@labeling_function()
def has_high_idx(x):
    percent = x.token_index / x.file_len
    if percent > 0.50:
        return NOT_I
    else:
        return ABSTAIN
    
    
    
# Labeling function for tokens, sees if left span of token within sentence contains keyword
@labeling_function()
def left_span_contains(x):
    
    i = 0
    while(x.abstract[i] != x.Token):
        i+=1
        
    count = 0
    while(i >= 0 and count < 10):
        if((x.abstract[i] == 'determine') or (x.abstract[i] == 'assess')):
            return I
        i-=1
        count+=1
        
    return ABSTAIN
# look into spouse tutorial left spans, and using 'resources' in LFs


# checks if the preceding token is VBD, VBN (e.g. was administered)
@labeling_function()
def right_span_vb_pos(x):
    if (x.token_index < len(x.abstract) - 2) and (x.pos_abstract[x.tokem_index+1] == 'VBD') and (x.pos_abstract[x.token_index+2] == 'VBN'):
        return I 

    else:
        return ABSTAIN
    
    
# checks if the preceding token is VBD, VBN (e.g. was administered)
@labeling_function()
def left_span_vb_pos(x):
    if (x.token_index > 0) and ('V' in x.pos_abstract[x.token_index-1]):
        return I 

    else:
        return ABSTAIN
    


    


NameError: name 'labeling_function' is not defined

## Apply labeling functions

In [15]:
# Apply LFs to dataframe.
lfs = [
       #contains_psych_term, # accuracy = 0.131380
       # is_punctuation,
       #has_prev_word_as,
    
       # has_next_word_as_drug, low accuracy and coverage
    
       # left_span_contains,
       # right_span_vb_pos,
       # left_span_vb_pos,
    
       #negative LFs
       is_stop_word,
       has_next_word_as,
       has_high_idx,
    
       #positive LFs
       is_generic,
       contains_drug_suffix,
       contains_surgical_suffix,
       contains_fda_drug,
    
       in_title,
        in_title2,
        in_title3,
        in_title4,
       # not_in_title,
       surround_in_title,

      ]
applier = PandasLFApplier(lfs = lfs)
L_train = applier.apply(df = df_train)
L_test = applier.apply(df = df_test)
# L_dev = applier.apply(df = df_dev)

100%|██████████| 8016/8016 [03:01<00:00, 44.18it/s]
100%|██████████| 2005/2005 [00:42<00:00, 46.81it/s]


In [220]:
#L_train
# L_dev

In [221]:
# # %%capture

# coverage_check_out, coverage_check = (L_dev != ABSTAIN).mean(axis = 0)
# print(f"check_out coverage: {coverage_check_out * 100:.1f}%")
# print(f"check coverage: {coverage_check * 100:.1f}%")


In [222]:
#L_train.shape


In [16]:
# Define Y_train, Y_test.
Y_train = df_train["Gold"].to_numpy(dtype = int)
Y_test = df_test["Gold"].to_numpy(dtype = int)
# Y_dev = df_dev["Gold"].to_numpy(dtype = int)

In [17]:
# Summarive coverage, conflicts, empirical accurcacy of LFs.
LFAnalysis(L_train, lfs).lf_summary(Y_train)
# LFAnalysis(L_dev, lfs).lf_summary(Y_dev)




Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
is_stop_word,0,[0],0.272829,0.185629,0.086327,2097,90,0.958848
has_next_word_as,1,[0],0.005115,0.004491,0.002121,39,2,0.95122
has_high_idx,2,[0],0.49501,0.2127,0.112151,3813,155,0.960938
is_generic,3,[1],0.01522,0.011103,0.007735,41,81,0.336066
contains_drug_suffix,4,[1],0.021956,0.015344,0.008733,60,116,0.340909
contains_surgical_suffix,5,[1],0.002994,0.002869,0.001497,2,22,0.083333
contains_fda_drug,6,[1],0.044661,0.034681,0.020709,116,242,0.324022
in_title,7,[1],0.21769,0.21769,0.122505,252,1493,0.144413
in_title2,8,[1],0.21769,0.21769,0.122505,252,1493,0.144413
in_title3,9,[1],0.21769,0.21769,0.122505,252,1493,0.144413


In [18]:
LFAnalysis(L_test, lfs).lf_summary(Y_test)



Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
is_stop_word,0,[0],0.286284,0.20399,0.094763,539,35,0.939024
has_next_word_as,1,[0],0.002993,0.002494,0.001496,5,1,0.833333
has_high_idx,2,[0],0.505736,0.226434,0.116209,973,41,0.959566
is_generic,3,[1],0.017955,0.012968,0.007481,15,21,0.416667
contains_drug_suffix,4,[1],0.020948,0.014963,0.006983,13,29,0.309524
contains_surgical_suffix,5,[1],0.001995,0.001995,0.000499,0,4,0.0
contains_fda_drug,6,[1],0.045387,0.037905,0.021446,30,61,0.32967
in_title,7,[1],0.226434,0.226434,0.138155,62,392,0.136564
in_title2,8,[1],0.226434,0.226434,0.138155,62,392,0.136564
in_title3,9,[1],0.226434,0.226434,0.138155,62,392,0.136564


In [19]:
%%capture
'''
# Explore buckets for patterns in discordance.
buckets = get_label_buckets(L_train[:, 0], L_train[:, 1])
display(buckets)
display(df_train.iloc[buckets[(NOT_I, I)]].sample(10, random_state = 1))
'''

In [20]:
# Majority vote model.
majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L = L_train)


In [21]:
# Label model.
label_model = LabelModel(cardinality = 2, verbose = True)
label_model.fit(L_train = L_train, n_epochs = 500, log_freq = 100, seed = 123)


INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.746]
  0%|          | 1/500 [00:00<01:28,  5.63epoch/s]INFO:root:[100 epochs]: TRAIN:[loss=0.001]
 25%|██▌       | 127/500 [00:00<00:00, 564.38epoch/s]INFO:root:[200 epochs]: TRAIN:[loss=0.001]
 53%|█████▎    | 265/500 [00:00<00:00, 879.52epoch/s]INFO:root:[300 epochs]: TRAIN:[loss=0.001]
INFO:root:[400 epochs]: TRAIN:[loss=0.001]
100%|██████████| 500/500 [00:00<00:00, 905.56epoch/s] 
INFO:root:Finished Training


In [22]:
# Compute model performance metrics.
majority_scores = majority_model.score(L = L_test, Y = Y_test, 
                                       tie_break_policy = "random",
                                       metrics = ["f1", "accuracy", "precision", 
                                                  "recall", "roc_auc", "coverage"])
label_scores = label_model.score(L = L_test, Y = Y_test, 
                                 tie_break_policy = "random",
                                 metrics = ["f1", "accuracy", "precision", 
                                            "recall", "roc_auc", "coverage"])

In [23]:
# Compare model performance metrics.
majority_f1 = majority_scores.get("f1")
majority_acc = majority_scores.get("accuracy")
majority_prec = majority_scores.get("precision")
majority_rec = majority_scores.get("recall")
majority_roc = majority_scores.get("roc_auc")
majority_cov = majority_scores.get("coverage")
print(f"{'Majority Model F1:':<25} {majority_f1 * 100:.1f}%")
print(f"{'Majority Model Accuracy:':<25} {majority_acc * 100:.1f}%")
print(f"{'Majority Model Precision:':<25} {majority_prec * 100:.1f}%")
print(f"{'Majority Model Recall:':<25} {majority_rec * 100:.1f}%")
print(f"{'Majority Model AUC ROC:':<25} {majority_roc * 100:.1f}%")
print(f"{'Majority Model Coverage:':<25} {majority_cov * 100:.1f}%")
print("++++++++++++++++++++++++")

label_f1 = label_scores.get("f1")
label_acc = label_scores.get("accuracy")
label_prec = label_scores.get("precision")
label_rec = label_scores.get("recall")
label_roc = label_scores.get("roc_auc")
label_cov = label_scores.get("coverage")
print(f"{'Label Model F1:':<25} {label_f1 * 100:.1f}%")
print(f"{'Label Model Accuracy:':<25} {label_acc * 100:.1f}%")
print(f"{'Label Model Precision:':<25} {label_prec * 100:.1f}%")
print(f"{'Label Model Recall:':<25} {label_rec * 100:.1f}%")
print(f"{'Label Model AUC ROC:':<25} {label_roc * 100:.1f}%")
print(f"{'Label Model Coverage:':<25} {label_cov * 100:.1f}%")

Majority Model F1:        23.6%
Majority Model Accuracy:  62.8%
Majority Model Precision: 14.5%
Majority Model Recall:    63.2%
Majority Model AUC ROC:   65.7%
Majority Model Coverage:  100.0%
++++++++++++++++++++++++
Label Model F1:           20.7%
Label Model Accuracy:     45.2%
Label Model Precision:    11.9%
Label Model Recall:       78.6%
Label Model AUC ROC:      64.6%
Label Model Coverage:     100.0%


In [24]:
# View "dummy" accuracy if predicting majority class every time.
print("Accuracy if predicting majority class", 
      df_test["Gold"].value_counts(normalize = True).max())

INFO:numexpr.utils:Note: NumExpr detected 24 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


Accuracy if predicting majority class 0.9092269326683292


## Explore errors

In [277]:
@labeling_function(pre=[spacy])
def is_noun(x):
    if x.doc.pos[0] == 'noun':
        return I
    else:
        return ABSTAIN

In [278]:
df_error = df_train.copy()

In [279]:
df_error["LF"] = df_error.apply(lambda x: has_next_word_as_drug(x), axis=1)
df_error.head()

Unnamed: 0,Token,File,Gold,PMID,token_index,file_len,Spans,LF
0,.,20369616.tokens,0,20369616,120,121,"[drug, ., Does]",-1
1,and,21410033.tokens,0,21410033,91,195,"[total, and, conjugated]",-1
2,quality,15673894.tokens,0,15673894,25,308,"[the, quality, of]",-1
3,),1631861.tokens,0,1631861,172,218,"[9/14, ), .]",-1
4,on,20390261.tokens,0,20390261,278,293,"[effect, on, reversal]",-1


In [280]:
sel = (df_error.LF == 1) & (df_error.Gold == '0') 
# labeled as NOT_I when actually I => 0 instances
# labeled as I when actually NOT_I => non-zero
df_error[sel].head(50)
# df_orig[(df_orig.PMID=='3385217') & (df_orig.Gold=='1')]

Unnamed: 0,Token,File,Gold,PMID,token_index,file_len,Spans,LF
54,fentanyl,21935685.tokens,0,21935685,147,209,"[after, fentanyl, bolus]",1
125,standardized,15264973.tokens,0,15264973,65,156,"[in, standardized, body]",1
183,anesthesia,18672629.tokens,0,18672629,72,265,"[general, anesthesia, was]",1
236,triple,24684165.tokens,0,24684165,198,337,"[the, triple, combination]",1
288,groups,24691455.tokens,0,24691455,92,214,"[;, groups, B]",1
314,groups,18453793.tokens,0,18453793,206,208,"[patient, groups, .]",1
353,papillomavirus,17367324.tokens,0,17367324,83,283,"[human, papillomavirus, infection]",1
408,infant,1741218.tokens,0,1741218,221,307,"[natural, infant, suckling]",1
459,venous,7211918.tokens,0,7211918,12,239,"[deep, venous, thrombosis]",1
464,cellular,10607234.tokens,0,10607234,404,448,"[on, cellular, distribution]",1


In [None]:
df_test[df_test.Token == 'surgery']


Unnamed: 0,Token,File,Gold,PMID,token_index,file_len,Spans,Surgical
250,surgery,19092729.tokens,0,19092729,17,332,"[buckling, surgery, .]",1
834,surgery,24532106.tokens,0,24532106,219,280,"[after, surgery, ,]",1
1130,surgery,15616772.tokens,1,15616772,147,336,"[(, surgery, only]",1
2110,surgery,10078673.tokens,0,10078673,62,291,"[abdominal, surgery, were]",1
3141,surgery,9278836.tokens,0,9278836,254,434,"[of, surgery, .]",1
3172,surgery,9278836.tokens,0,9278836,266,434,"[of, surgery, ,]",1
5358,surgery,8604728.tokens,0,8604728,170,316,"[filtering, surgery, group]",1
5837,surgery,14567804.tokens,0,14567804,86,240,"[Before, surgery, ,]",1
7105,surgery,18779477.tokens,0,18779477,202,310,"[glaucoma, surgery, was]",1
7944,surgery,11214014.tokens,0,11214014,12,306,"[bypass, surgery, .]",1


In [93]:
df_test["Gold"] = df_test["Gold"].astype(int)
df_test["Surgical Concord"] = np.where((df_test["Gold"] == df_test["Surgical"]), 1, 0)
df_test.head()

Unnamed: 0,Token,File,Gold,PMID,Spans,Surgical,Surgical Concord
0,was,7562882.tokens,0,7562882,"[lisinopril, was, observed]",-1,0
1,Mortality,3139179.tokens,0,3139179,"[RESULTS, Mortality, from]",-1,0
2,.,24077211.tokens,0,24077211,"[trial, ., OBJECTIVE]",-1,0
3,compared,10356632.tokens,0,10356632,"[and, compared, to]",-1,0
4,a,25542620.tokens,0,25542620,"[:, a, randomised]",-1,0


In [94]:
surg_discord = df_test[df_test["Surgical Concord"] == 0]
surg_discord = surg_discord[surg_discord["Surgical"] != -1]
display(surg_discord.head(100))
display(surg_discord.tail(100))

Unnamed: 0,Token,File,Gold,PMID,Spans,Surgical,Surgical Concord
13,surgery,7956382.tokens,0,7956382,"[), surgery, with]",1,0
428,surgery,1670445.tokens,0,1670445,"[hip, surgery, .]",1,0
1066,neuropsychiatric,17513813.tokens,0,17513813,"[severe, neuropsychiatric, toxicity]",1,0
1954,ureteroscopy,17156222.tokens,0,17156222,"[(, ureteroscopy, )]",1,0
3570,endoscopy,12233894.tokens,0,12233894,"[gastric, endoscopy, scores]",1,0
4036,surgical,15523393.tokens,0,15523393,"[The, surgical, resection]",1,0
5204,surgical,11922398.tokens,0,11922398,"[the, surgical, procedure]",1,0
6842,arthroscopy,12882611.tokens,0,12882611,"[requiring, arthroscopy, were]",1,0
7177,surgery,24532106.tokens,0,24532106,"[before, surgery, .]",1,0
7250,surgery,21389925.tokens,0,21389925,"["", surgery, .]",1,0


Unnamed: 0,Token,File,Gold,PMID,Spans,Surgical,Surgical Concord
13,surgery,7956382.tokens,0,7956382,"[), surgery, with]",1,0
428,surgery,1670445.tokens,0,1670445,"[hip, surgery, .]",1,0
1066,neuropsychiatric,17513813.tokens,0,17513813,"[severe, neuropsychiatric, toxicity]",1,0
1954,ureteroscopy,17156222.tokens,0,17156222,"[(, ureteroscopy, )]",1,0
3570,endoscopy,12233894.tokens,0,12233894,"[gastric, endoscopy, scores]",1,0
4036,surgical,15523393.tokens,0,15523393,"[The, surgical, resection]",1,0
5204,surgical,11922398.tokens,0,11922398,"[the, surgical, procedure]",1,0
6842,arthroscopy,12882611.tokens,0,12882611,"[requiring, arthroscopy, were]",1,0
7177,surgery,24532106.tokens,0,24532106,"[before, surgery, .]",1,0
7250,surgery,21389925.tokens,0,21389925,"["", surgery, .]",1,0
