# Data programming with Snorkel: Error analysis of the PICO dataset

In [1]:
# Importations.
import pandas as pd
import numpy as np
import snorkel
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter
from snorkel.labeling.model import LabelModel
from snorkel.analysis import get_label_buckets
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import re
import string # For punctuation.
import os
import random
import pickle

In [2]:
# Load the pickled `first_sentence_file` object. The title-based labeling
# functions index it with `x.file` and test token membership in the result.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles that were produced by a trusted pipeline.
with open('first_sentence_file.pickle', 'rb') as handle:
    first_sentence_file = pickle.load(handle)
    

## Process keyword data

In [3]:
# Load the keyword/suffix files (no header row; column 0 holds the terms).
# Drug stem source:
# https://druginfo.nlm.nih.gov/drugportal/jsp/drugportal/DrugNameGenericStems.jsp
df_drugs = pd.read_csv("lf_datasets/suffixes/drug_suffixes.txt", header=None)
df_surgery = pd.read_csv("lf_datasets/suffixes/surgical_suffixes.txt", header=None)
df_psych = pd.read_csv("lf_datasets/suffixes/psychotherapy_keywords.txt", header=None)

# Lowercase column 0 of every frame, in the same order as before
# (psych, surgery, drugs).
for keyword_frame in (df_psych, df_surgery, df_drugs):
    keyword_frame[0] = keyword_frame[0].str.lower()



# Read in FDA data.

df_purple = pd.read_csv("lf_datasets/fda_approved_drugs/products_purplebook.csv")
df_orange = pd.read_csv("lf_datasets/fda_approved_drugs/products_orangebook.txt",
                        sep="~")
# `error_bad_lines=False` is deprecated since pandas 1.3 and removed in
# pandas 2.0. `on_bad_lines="warn"` is the replacement that keeps the same
# behaviour: rows with too many fields are skipped and reported.
df_drugs_at_fda = pd.read_csv("lf_datasets/fda_approved_drugs/products_drugs_at_fda.txt",
                              sep="\t",
                              on_bad_lines="warn")



# Keep only drug suffixes longer than three characters (lowercased).
drug_suffixes = [suffix.lower() for suffix in df_drugs[0] if len(suffix) > 3]

# Concatenate FDA drug data from the three sources.
proprietary_names = (list(df_drugs_at_fda["DrugName"])
                     + list(df_purple["Proprietary Name"])
                     + list(df_orange["Trade_Name"]))
proper_names = (list(df_drugs_at_fda["ActiveIngredient"])
                + list(df_purple["Proper Name"])
                + list(df_orange["Ingredient"]))

# Keep only string entries, lowercased. Filtering on `str` (instead of
# "not float" followed by "not int") drops floats and integers in one pass and
# avoids calling `.lower()` on an int, which raised AttributeError before the
# int filter could run. Building sets removes duplicates.
set_proprietary = {name.lower() for name in proprietary_names if isinstance(name, str)}
set_proper = {name.lower() for name in proper_names if isinstance(name, str)}
set_fda = set_proprietary | set_proper



  df_drugs_at_fda = pd.read_csv("lf_datasets/fda_approved_drugs/products_drugs_at_fda.txt",
b'Skipping line 35225: expected 8 fields, saw 9\nSkipping line 35226: expected 8 fields, saw 9\nSkipping line 35227: expected 8 fields, saw 9\n'


## Variables

In [4]:
# Read the pickled dataframe.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles that were produced by a trusted pipeline.
with open('df_orig.pickle', 'rb') as handle:
    df_orig = pickle.load(handle)

# Draw 30 distinct PMIDs at random.
# NOTE(review): the sample is unseeded, so every run selects different files;
# random.sample raises ValueError when fewer than 30 unique PMIDs exist.
files_sample = random.sample(df_orig.PMID.unique().tolist(), 30)
# Boolean row mask: vectorised membership test instead of a per-row lambda.
files_30_sel = df_orig['PMID'].isin(files_sample)

# Original sample files: every row belonging to a sampled PMID.
df_sample_orig = df_orig[files_30_sel].reset_index(drop=True)

# Remove None type labels. BERT does the same.
# dropna() discards any row that has a missing value in any column.
df_sample = df_sample_orig.dropna().reset_index(drop=True)


## Labeling functions

Labeling functions will be written to cover the following intervention categories, as used by the manual annotators of this [dataset](https://github.com/bepnye/EBM-NLP):

- Surgical.
- Physical.
- Drug.
- Educational.
- Psychological.
- Other.
- Control.

In [5]:
# Label values returned by the labeling functions.
I = 1          # token is voted as part of an intervention
NOT_I = 0      # token is voted as not part of an intervention
ABSTAIN = -1   # the labeling function does not vote

# Keyword fragments matched as substrings of a token by `is_generic`.
generic_interventions = [
    "therap",
    "treatment",
    "intervention",
    "placebo",
    "dose",
    "control",
    "vaccin",
]

# English stop words from NLTK (downloads the corpus when needed).
nltk.download("stopwords")
stop_words = stopwords.words("english")
print(f"Total stop words = {len(stop_words)}")

Total stop words = 179


[nltk_data] Downloading package stopwords to /home/rcw258/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Labeling functions
**All labeling functions label tokens. The corresponding gold labels are from the "starting span" labels.**

In [6]:
# Labeling function: votes I when the token appears in the first sentence
# (title) entry stored for the token's file.
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def in_title(x, first_sentence_file):
    """Return I if the lowercased token is in `first_sentence_file[x.file]`, else ABSTAIN."""
    lowered = x.token.lower()
    if lowered in first_sentence_file[x.file]:
        return I
    return ABSTAIN
# Open question: should a miss abstain or vote NOT_I?

# Labeling function: same rule as `in_title`, registered under a second name.
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def in_title2(x, first_sentence_file):
    """Return I if the lowercased token is in `first_sentence_file[x.file]`, else ABSTAIN."""
    lowered = x.token.lower()
    if lowered in first_sentence_file[x.file]:
        return I
    return ABSTAIN
# Open question: should a miss abstain or vote NOT_I?

# Labeling function: same rule as `in_title`, registered under a third name.
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def in_title3(x, first_sentence_file):
    """Return I if the lowercased token is in `first_sentence_file[x.file]`, else ABSTAIN."""
    lowered = x.token.lower()
    if lowered in first_sentence_file[x.file]:
        return I
    return ABSTAIN
# Open question: should a miss abstain or vote NOT_I?

# Labeling function: same rule as `in_title`, registered under a fourth name.
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def in_title4(x, first_sentence_file):
    """Return I if the lowercased token is in `first_sentence_file[x.file]`, else ABSTAIN."""
    lowered = x.token.lower()
    if lowered in first_sentence_file[x.file]:
        return I
    return ABSTAIN
# Open question: should a miss abstain or vote NOT_I?


# Labeling function: votes NOT_I when the token is absent from the first
# sentence (title) entry stored for the token's file.
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def not_in_title(x, first_sentence_file):
    """Return NOT_I if the lowercased token is not in `first_sentence_file[x.file]`, else ABSTAIN."""
    lowered = x.token.lower()
    if lowered not in first_sentence_file[x.file]:
        return NOT_I
    return ABSTAIN
# Open question: should a hit abstain or vote I?


# Labeling function: votes I when both neighbouring tokens appear in the first
# sentence (title) entry stored for the token's file.
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def surround_in_title(x, first_sentence_file):
    """Return I if the previous and the next token are both in the title entry.

    The first and the last token of `x.abstract` have only one neighbour and
    always yield ABSTAIN.
    """
    position = x.token_index
    if position <= 0 or position >= len(x.abstract) - 1:
        return ABSTAIN

    if x.abstract[position - 1].lower() not in first_sentence_file[x.file]:
        return ABSTAIN
    if x.abstract[position + 1].lower() not in first_sentence_file[x.file]:
        return ABSTAIN
    return I
# Open question: should a miss abstain or vote NOT_I?


# Labeling function: votes I when any entry of `drug_suffixes` occurs as a
# substring of the token (case-insensitive).
@labeling_function()
def contains_drug_suffix(x):
    """Return I if the token contains a drug suffix anywhere, else ABSTAIN."""
    lowered = x.token.lower()
    for suffix in drug_suffixes:
        if suffix.lower() in lowered:
            return I
    return ABSTAIN
    
# Labeling function: votes I when any entry of column 0 of `df_surgery`
# occurs as a substring of the token (case-insensitive).
@labeling_function()
def contains_surgical_suffix(x):
    """Return I if the token contains a surgical suffix anywhere, else ABSTAIN."""
    lowered = x.token.lower()
    for suffix in df_surgery[0]:
        if suffix.lower() in lowered:
            return I
    return ABSTAIN

# Labeling function: votes I when any psychological / psychotherapeutic
# keyword (column 0 of `df_psych`) occurs as a substring of the token.
@labeling_function()
def contains_psych_term(x):
    """Return I if the token contains a psychotherapy keyword, else ABSTAIN."""
    lowered = x.token.lower()
    for keyword in df_psych[0]:
        if keyword.lower() in lowered:
            return I
    return ABSTAIN

# Labeling function: votes I when any generic intervention keyword from
# `generic_interventions` occurs as a substring of the token.
@labeling_function()
def is_generic(x):
    """Return I if the token contains a generic intervention keyword, else ABSTAIN."""
    lowered = x.token.lower()
    for term in generic_interventions:
        if term.lower() in lowered:
            return I
    return ABSTAIN

# Labeling function: votes NOT_I for stop words.
@labeling_function()
def is_stop_word(x):
    """Return NOT_I if the lowercased token is in `stop_words`, else ABSTAIN."""
    if x.token.lower() in stop_words:
        return NOT_I
    return ABSTAIN

# Labeling function: votes NOT_I for tokens made up only of punctuation.
@labeling_function()
def is_punctuation(x):
    """Return NOT_I if the token is non-empty and all punctuation, else ABSTAIN.

    `token in string.punctuation` is a substring test on one long string: it
    accepts the empty token and rejects multi-character punctuation such as
    "..." or "--". Checking every character fixes both cases, and single
    punctuation characters still match as before.
    """
    token = x.token
    if token and all(char in string.punctuation for char in token):
        return NOT_I
    return ABSTAIN


# Labeling function: votes I when the token (longer than five characters)
# occurs inside any FDA drug name in `set_fda`.
@labeling_function()
def contains_fda_drug(x):
    """Return I if the lowercased token is a substring of an entry in `set_fda`.

    Tokens of five characters or fewer always yield ABSTAIN.
    """
    if len(x.token) <= 5:
        return ABSTAIN

    # Lowercase the token once rather than once per drug name. The entries of
    # `set_fda` are already lowercased where the set is built, so they are
    # compared as stored.
    lowered = x.token.lower()
    return I if any(lowered in drug for drug in set_fda) else ABSTAIN


# Labeling function: votes I when the preceding token is 'of', 'with',
# 'receive' or 'and' (e.g. "effect of <I>", "treated with <I>").
@labeling_function()
def has_prev_word_as(x):
    """Return I if the previous token is one of the trigger words, else ABSTAIN.

    The first token of `x.abstract` has no predecessor and yields ABSTAIN.
    """
    if x.token_index <= 0:
        return ABSTAIN

    trigger_words = ['of', 'with', 'receive', 'and']
    previous = x.abstract[x.token_index - 1].lower()
    return I if previous in trigger_words else ABSTAIN
    
# Labeling function: votes NOT_I when the next token is 'group' or 'groups'.
@labeling_function()
def has_next_word_as(x):
    """Return NOT_I if the following token is 'group' or 'groups', else ABSTAIN.

    The last token of `x.abstract` has no successor and yields ABSTAIN.
    """
    if x.token_index >= len(x.abstract) - 1:
        return ABSTAIN

    following = x.abstract[x.token_index + 1].lower()
    return NOT_I if following in ['group', 'groups'] else ABSTAIN
    
# Labeling function: votes NOT_I for tokens located in the last 50% of the
# file's tokens.
@labeling_function()
def has_high_idx(x):
    """Return NOT_I if `x.token_index / x.file_len` exceeds 0.50, else ABSTAIN."""
    relative_position = x.token_index / x.file_len
    return NOT_I if relative_position > 0.50 else ABSTAIN
    
    
    
# Labeling function: votes I when 'determine' or 'assess' occurs in the window
# made of the token itself and up to nine tokens to its left.
@labeling_function()
def left_span_contains(x):
    """Return I if the left window of the token contains a trigger verb.

    The window starts at `x.token_index`, like the other positional labeling
    functions. Searching for the first occurrence of the token text picked the
    wrong position for repeated tokens and ran off the end of `x.abstract`
    when the text was not found.
    """
    start = x.token_index
    # Visit the token and at most nine predecessors, stopping at index 0.
    for position in range(start, max(start - 10, -1), -1):
        # NOTE(review): this comparison is case-sensitive, unlike the other
        # labeling functions, which lowercase tokens first. Confirm intent.
        if x.abstract[position] in ('determine', 'assess'):
            return I

    return ABSTAIN
# TODO: look into the spouse tutorial's left spans, and using 'resources' in LFs.


# Labeling function: votes I when the next two POS tags are VBD followed by
# VBN (e.g. "<I> was administered").
@labeling_function()
def right_span_vb_pos(x):
    """Return I if the two tags after the token are 'VBD' then 'VBN', else ABSTAIN.

    Tokens with fewer than two successors in `x.abstract` yield ABSTAIN.
    """
    position = x.token_index
    if position >= len(x.abstract) - 2:
        return ABSTAIN
    if x.pos_abstract[position + 1] != 'VBD':
        return ABSTAIN
    return I if x.pos_abstract[position + 2] == 'VBN' else ABSTAIN
    
    
# Labeling function: votes I when the POS tag of the preceding token contains
# the letter 'V'.
@labeling_function()
def left_span_vb_pos(x):
    """Return I if the previous token's POS tag contains 'V', else ABSTAIN.

    The first token has no predecessor and yields ABSTAIN.
    """
    if x.token_index <= 0:
        return ABSTAIN

    previous_tag = x.pos_abstract[x.token_index - 1]
    return I if 'V' in previous_tag else ABSTAIN
    


In [7]:
# Look up the row label(s) of token 1 of PMID 15965311 in the sampled frame.
pmid_matches = df_sample_orig['PMID'] == '15965311'
token_matches = df_sample_orig['token_index'] == 1
df_sample_orig.index[pmid_matches & token_matches]





Int64Index([], dtype='int64')

## Generate Error Files

Incorrect tokens are surrounded by `**` and `**`. A token is incorrect when the majority model label is not -1 and differs from the gold label.

In [8]:
# Apply LFs to dataframe.
#
# Labeling functions left out of this run:
#   contains_psych_term      (accuracy = 0.131380)
#   is_punctuation
#   has_prev_word_as
#   has_next_word_as_drug    (low accuracy and coverage)
#   left_span_contains, right_span_vb_pos, left_span_vb_pos
#   in_title, in_title2, in_title3, in_title4
#   not_in_title, surround_in_title

# Negative LFs: vote NOT_I or abstain.
negative_lfs = [
    is_stop_word,
    has_next_word_as,
    has_high_idx,
]

# Positive LFs: vote I or abstain.
positive_lfs = [
    is_generic,
    contains_drug_suffix,
    contains_surgical_suffix,
    contains_fda_drug,
]

lfs = negative_lfs + positive_lfs

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_sample)
Y_train = df_sample["gold"].to_numpy(dtype=int)



100%|██████████| 8137/8137 [02:44<00:00, 49.57it/s] 


In [9]:
# Majority vote model.
majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L = L_train)

preds_train

array([ 0, -1, -1, ..., -1,  0,  0])

In [10]:
# Store the majority-vote prediction of every labelled row as a string.
df_sample['maj_pred'] = preds_train.astype(str)

# Rows of the original sample with no prediction keep the placeholder '-2'.
df_sample_orig['maj_pred'] = '-2'

# Map each (PMID, token_index) pair to the first row of df_sample_orig that
# carries it. One pass here replaces a full scan of df_sample_orig for every
# row of df_sample.
first_row_for_key = {}
orig_keys = zip(df_sample_orig['PMID'], df_sample_orig['token_index'])
for row_label, key in zip(df_sample_orig.index, orig_keys):
    first_row_for_key.setdefault(key, row_label)

# Copy each prediction onto its matching row of the original sample.
sample_keys = zip(df_sample['PMID'], df_sample['token_index'])
for key, maj_pred in zip(sample_keys, df_sample['maj_pred']):
    df_sample_orig.at[first_row_for_key[key], 'maj_pred'] = maj_pred
    

In [11]:

df_sample_groups = df_sample_orig.groupby('PMID')


def _write_highlighted_files(groups, out_dir, should_highlight):
    """Write one Markdown file per group with the selected tokens in bold.

    Args:
        groups: iterable of (name, group) pairs as yielded by a pandas
            groupby; each group provides `token`, `gold` and `maj_pred`.
        out_dir: directory that receives `<name>.md`; created when missing.
        should_highlight: callable `(gold, pred) -> bool` deciding whether a
            token is wrapped in `**`.
    """
    # Create the target directory so open() does not fail on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)
    for name, group in groups:
        rows = zip(group.token.tolist(), group.gold.tolist(), group.maj_pred.tolist())
        pieces = [
            '**' + token + '** ' if should_highlight(gold, pred) else token + ' '
            for token, gold, pred in rows
        ]
        # Explicit UTF-8 keeps the output independent of the platform default.
        with open(os.path.join(out_dir, name + '.md'), 'w', encoding='utf-8') as output:
            output.write(''.join(pieces))


# Gold and predicted labels are compared as strings ('1', '0', '-1').

# Sample files with the tokens that are gold labeled as I highlighted.
_write_highlighted_files(
    df_sample_groups, 'error/gold',
    lambda gold, pred: gold == '1',
)

# Sample files with missed tokens highlighted (gold label 1, but predicted as
# 0 or -1).
_write_highlighted_files(
    df_sample_groups, 'error/missed',
    lambda gold, pred: gold == '1' and pred in ('-1', '0'),
)

# Sample files with mislabeled tokens highlighted (the prediction is 0 or 1
# and disagrees with the gold label).
_write_highlighted_files(
    df_sample_groups, 'error/wrong',
    lambda gold, pred: (gold == '0' and pred == '1') or (gold == '1' and pred == '0'),
)