# Data programming with Snorkel: Labeling the PICO dataset

PICO data from [here](https://ebm-nlp.herokuapp.com/) and [here](https://github.com/bepnye/EBM-NLP). 

Code by Jacqueline R. M. A. Maasch and Ray Wang | November 2021

## Preamble

In [2]:
# Importations.
import pandas as pd
import numpy as np
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter
from snorkel.labeling.model import LabelModel
from snorkel.analysis import get_label_buckets
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import re
import string # For punctuation.
import os
import random

## Process keyword data

In [3]:
# Read in drug name suffixes (generic-name stems), one per row, no header row.
# Source: https://druginfo.nlm.nih.gov/drugportal/jsp/drugportal/DrugNameGenericStems.jsp
# Column 0 holds the suffix strings used later by contains_drug_suffix.
df_drugs = pd.read_csv("drug_suffixes.txt", header = None)
# print(df_drugs.info())
# display(df_drugs.head())

In [4]:
# Read in surgical suffixes, one per row, no header row.
# Source: https://en.wikipedia.org/wiki/List_of_surgical_procedures
# Column 0 is iterated later by contains_surgical_suffix.
df_surgery = pd.read_csv("surgical_suffixes.txt", header = None)
# print(df_surgery.info())
# display(df_surgery.head())

In [5]:
# Read in psychological / psychotherapy keywords, one per row, no header row.
# Source: https://www.ncbi.nlm.nih.gov/books/NBK385382/
# Column 0 is iterated later by contains_psych_term.
df_psych = pd.read_csv("psychotherapy_keywords.txt", header = None)
# print(df_psych.info())
# display(df_psych.head())

In [6]:
# df_surgery

In [7]:
# Read in FDA data.
# Sources: 
# https://purplebooksearch.fda.gov/downloads
# https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm
# https://www.fda.gov/drugs/drug-approvals-and-databases/drugsfda-data-files
# https://www.fda.gov/drugs/drug-approvals-and-databases/approved-drug-products-therapeutic-
#         equivalence-evaluations-orange-book

# Purple Book: comma-separated with a header row.
df_purple = pd.read_csv("fda_approved_drugs/products_purplebook.csv")

# Orange Book: fields are separated by "~".
df_orange = pd.read_csv("fda_approved_drugs/products_orangebook.txt", 
                        sep = "~")

# Drugs@FDA: tab-separated. A few rows carry an extra field; skip them with a
# warning instead of aborting the read. `on_bad_lines` (pandas >= 1.3) replaces
# the deprecated `error_bad_lines = False`, which is removed in pandas 2.0.
df_drugs_at_fda = pd.read_csv("fda_approved_drugs/products_drugs_at_fda.txt", 
                              sep = "\t", 
                              on_bad_lines = "warn")

# print(df_purple.info())
# display(df_purple.head())

# print(df_orange.info())
# display(df_orange.head())

# print(df_drugs_at_fda.info())
# display(df_drugs_at_fda.head())



  exec(code_obj, self.user_global_ns, self.user_ns)
b'Skipping line 35225: expected 8 fields, saw 9\nSkipping line 35226: expected 8 fields, saw 9\nSkipping line 35227: expected 8 fields, saw 9\n'


In [8]:
# Keep only the drug suffixes longer than three characters, lowercased;
# shorter stems are dropped.
drug_suffixes = [
    suffix.lower()
    for suffix in df_drugs[0].tolist()
    if len(suffix) > 3
]
# print(len(drug_suffixes), "drug suffixes of", len(df_drugs[0]), "are of adequate length.")

In [9]:
# Set all psychological keywords (column 0) to lowercase so that the
# case-insensitive substring checks in the labeling functions compare like with like.
df_psych[0] = df_psych[0].str.lower()
# display(df_psych.head())

In [10]:
# Concatenate FDA drug data: brand (proprietary) names and ingredient (proper)
# names from the three FDA tables.
all_proprietary = list(df_drugs_at_fda["DrugName"]) + list(df_purple["Proprietary Name"]) + list(df_orange["Trade_Name"])
all_proper = list(df_drugs_at_fda["ActiveIngredient"]) + list(df_purple["Proper Name"]) + list(df_orange["Ingredient"])

# Length prior to eliminating duplicates.
# print(len(all_proprietary))
# print(len(all_proper))

# Keep only string entries, lowercased, and collapse duplicates by building sets.
# Filtering on `str` drops missing values (float NaN) and any numeric cells in one
# pass; the previous version called .lower() before removing ints, so an int
# entry would have raised AttributeError.
set_proprietary = {item.lower() for item in all_proprietary if isinstance(item, str)}
set_proper = {item.lower() for item in all_proper if isinstance(item, str)}

# Every lowercased FDA name (brand or ingredient); used by contains_fda_drug.
set_fda = set_proprietary | set_proper

# Length after removing duplicates.
# print(len(set_proprietary))
# print(len(set_proper))

## Variables

In [11]:
# Module-level caches filled as a side effect of iter_token_dir / get_sentence_info.

# Maps a token file name (e.g. "<PMID>.tokens") to a pandas Series of its tokens.
files_as_series = {}
# Maps a token file name to the lowercased tokens of the sentence containing
# token 0 of that file (treated as the title by the *_in_title labeling functions).
first_sentence_file = {}
# Maps a POS file name ("<PMID>.pos") to a pandas Series of its part-of-speech tags.
pos_files_as_series = {}

## Define functions

In [12]:

def get_indexes_as_series(tokens):
    """Return the index labels of the `tokens` Series as a plain list.

    For a Series built by file_to_series these are the positions of each
    token within its source file.
    """
    positions = tokens.index
    return positions.to_list()
    
def get_len_as_series(tokens):
    """Return a list that repeats len(tokens) once per token.

    The result has the same length as `tokens`, so it can be used as a
    constant "file length" column alongside the tokens themselves.
    """
    total = len(tokens)
    return [total] * total


def file_to_series(file_name):
    """Read a text file and return its lines as a pandas Series.

    Leading and trailing whitespace (including the newline) is stripped
    from every line. The file is opened with the platform default encoding.
    """
    stripped_lines = []
    with open(file_name) as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return pd.Series(stripped_lines)

def strip_pmid(x):
    """Return the PubMed ID portion of a file name.

    The PMID is everything before the first "." in `x`
    (e.g. "12345.tokens" -> "12345"). A name without a dot is returned unchanged.
    """
    return x.split(".")[0]

def iter_token_dir(dir_name, df, label_dict, col_name = "Token", ext_name = ".tokens"):
    """Build a token-level dataframe from every token file in a directory.

    For each file in `dir_name` whose name ends with `ext_name`, the token file
    and its sibling "<PMID>.pos" file are read, cached in the module-level
    `files_as_series` / `pos_files_as_series` dicts, and turned into one row per
    token.

    Parameters:
        dir_name: directory holding the token and POS files (str, bytes or path-like).
        df: dataframe the new rows are appended to.
        label_dict: mapping PMID -> Series of gold labels (see iter_label_dir);
            PMIDs without an entry get a missing "Gold" value.
        col_name: name of the token column in the result.
        ext_name: file-name suffix that identifies token files.

    Returns:
        `df` followed by the rows of every matching file, with columns
        `col_name`, "File", "Gold", "PMID", "token_index" and "file_len".
        `df` itself is returned when no file matches.
    """
    # Accept str / bytes / PathLike and join paths properly, so the directory
    # does not need a trailing separator.
    dir_name = os.fsdecode(dir_name)

    # Collect per-file frames and concatenate once; concatenating inside the
    # loop re-copies all accumulated rows on every iteration.
    frames = [df]
    for file_name in os.listdir(dir_name):
        if not file_name.endswith(ext_name):
            continue

        pos_file_name = file_name.split(".")[0] + ".pos"
        series_file = file_to_series(os.path.join(dir_name, file_name))
        pos_series_file = file_to_series(os.path.join(dir_name, pos_file_name))

        # Cache the raw series; get_sentence_info looks them up by file name.
        files_as_series[file_name] = series_file
        pos_files_as_series[pos_file_name] = pos_series_file

        PMID = strip_pmid(file_name)
        frames.append(pd.DataFrame({col_name: series_file,
                                    "File": [file_name] * len(series_file),
                                    "Gold": label_dict.get(PMID),
                                    "PMID": [PMID] * len(series_file),
                                    "token_index": get_indexes_as_series(series_file),
                                    "file_len": get_len_as_series(series_file)
                                   }))

    if len(frames) == 1:
        return df
    return pd.concat(frames)

def iter_label_dir(dir_name, ext_name = ".AGGREGATED.ann"):
    """Read every annotation file in a directory into a PMID -> labels mapping.

    Parameters:
        dir_name: directory holding the annotation files (str, bytes or path-like).
        ext_name: file-name suffix that identifies annotation files.

    Returns:
        dict mapping each PMID (file name up to its first ".") to a pandas
        Series with one stripped line of the annotation file per entry.
    """
    # Accept str / bytes / PathLike and join paths properly, so the directory
    # does not need a trailing separator.
    dir_name = os.fsdecode(dir_name)

    label_dict = {}
    for file_name in os.listdir(dir_name):
        if file_name.endswith(ext_name):
            label_dict[strip_pmid(file_name)] = file_to_series(os.path.join(dir_name, file_name))
    return label_dict

def get_three_spans(tokens):
    """Return, for each token, the window of that token and its direct neighbours.

    Interior tokens get [previous, current, next]; the first and last tokens
    get a two-element window because one neighbour is missing. Inputs with
    fewer than two tokens are handled too: a single token yields [[token]]
    and an empty input yields [].

    Parameters:
        tokens: any iterable of tokens (list, pandas Series, ...).

    Returns:
        list of lists, one window per input token, in input order.
    """
    # Materialise once so positional slicing works for any iterable.
    items = list(tokens)
    # Slicing clips at the sequence ends, which produces the shorter edge windows.
    return [items[max(i - 1, 0):i + 2] for i in range(len(items))]


def get_sentence_info(token_index, file_name):
    """Locate the sentence surrounding a token and return its tokens and POS tags.

    Sentences are delimited by "." tokens. The token and POS series are looked
    up in the module-level `files_as_series` / `pos_files_as_series` caches.

    Side effect: when `token_index` is 0, the lowercased sentence is stored in
    `first_sentence_file[file_name]`.

    Returns:
        (sentence_index, sentence, pos_sentence) where `sentence_index` is the
        token's position inside `sentence`, and `sentence` / `pos_sentence` are
        parallel lists. If the token itself is ".", `sentence_index` is -1 and
        the lists cover the tokens that follow it up to the next ".".
    """
    token_series = files_as_series[file_name]
    pos_series = pos_files_as_series[file_name.split(".")[0] + ".pos"]

    # Walk left until a sentence delimiter (or the start of the file) is passed.
    left = token_index
    while left >= 0 and token_series[left] != '.':
        left -= 1
    start = left + 1

    # Walk right until the next sentence delimiter (or the end of the file).
    stop = token_index + 1
    while stop < len(token_series) and token_series[stop] != '.':
        stop += 1

    span = range(start, stop)
    sentence = [token_series[k] for k in span]
    pos_sentence = [pos_series[k] for k in span]

    # Position of the token relative to the start of its sentence.
    sentence_index = token_index - start

    if token_index == 0:
        first_sentence_file[file_name] = [word.lower() for word in sentence]

    return (sentence_index, sentence, pos_sentence)

def get_sentence_index(x):
    """Return the first item of a (sentence_index, sentence, pos_sentence) triple."""
    (position, _sentence, _pos_tags) = x
    return position
def get_sentence(x):
    """Return the second item of a (sentence_index, sentence, pos_sentence) triple."""
    (_position, words, _pos_tags) = x
    return words
def get_pos_sentence(x):
    """Return the third item of a (sentence_index, sentence, pos_sentence) triple."""
    (_position, _words, pos_tags) = x
    return pos_tags

def is_punctuation(x):
    """Row filter: return True for rows to KEEP, i.e. tokens that are not punctuation.

    Despite the name, the result is False when `x.Token` consists solely of
    punctuation characters, so the function can be used directly as a boolean
    mask that removes punctuation rows.

    A token counts as punctuation when every character is in
    `string.punctuation`. This also covers multi-character tokens such as
    "--" or "...", which a plain substring test against `string.punctuation`
    misses. An empty token is treated as punctuation (and therefore dropped).
    """
    token = x.Token.lower()
    return not all(char in string.punctuation for char in token)
    

## Read data

Construct dataframe to store instances and gold labels.

In [13]:
# Gold labels for every annotated abstract, keyed by PMID, read from the
# aggregated "starting span" intervention annotations.
label_dict = iter_label_dir("annotations/aggregated/starting_spans/interventions/train/")

# One row per original token for every .tokens file under documents/,
# joined with its gold label where one exists.
df_orig = iter_token_dir("documents/", pd.DataFrame(), label_dict)
# print(df_orig.info())
# display(df_orig.head())

In [14]:
# Draw a random sample of files (by PMID) for manual error inspection.
# files_5 = ['11369627', '9474450', '17365975', '20149953', '25162407']
# files_5_sel = df_orig['PMID'].apply(lambda x : x in files_5)

all_pmids = df_orig.PMID.unique().tolist()
# Cap the sample size: random.sample raises ValueError when asked for more
# items than the population holds (fewer than 30 files available).
files_sample = random.sample(all_pmids, min(30, len(all_pmids)))
# Boolean mask of the rows that belong to a sampled file.
files_30_sel = df_orig['PMID'].isin(files_sample)

# Original (unfiltered) tokens of the sampled files.
df_sample_orig = df_orig[files_30_sel]

In [16]:
def _add_sentence_columns(df):
    """Add "sentence_index", "sentence" and "pos_sentence" columns to `df` in place.

    Each row's values come from get_sentence_info(token_index, File), which is
    evaluated once per row and then split into the three columns.
    """
    info = df.apply(lambda row: get_sentence_info(row["token_index"], row["File"]), axis=1)
    df["sentence_index"] = info.apply(get_sentence_index)
    df["sentence"] = info.apply(get_sentence)
    df["pos_sentence"] = info.apply(get_pos_sentence)
    return df


# Remove rows with NA values (tokens without a gold label) and renumber.
df_orig = df_orig.dropna().reset_index(drop = True)
# Same filtering for the sampled files; df_sample_orig keeps every token.
df_sample = df_sample_orig.dropna().reset_index(drop = True)

# Sentence-related columns for each token. Processing a file's token 0 also
# records that file's first sentence in first_sentence_file.
# No temporary "sentence_info" column is created, so the deprecated positional
# `drop("sentence_info", 1)` call is no longer needed.
df_orig = _add_sentence_columns(df_orig)
df_sample = _add_sentence_columns(df_sample)

# Remove punctuation tokens (is_punctuation returns True for rows to keep).
df_orig = df_orig[df_orig.apply(is_punctuation, axis=1)].reset_index(drop = True)
df_sample = df_sample[df_sample.apply(is_punctuation, axis=1)].reset_index(drop = True)

# Development set: 1000 rows sampled from the last 10000 rows.
df_dev = df_orig.tail(10000).sample(n=1000).reset_index(drop = True)

# Random sample (from everything except the dev pool) for more manageable training.
df_dir = df_orig[:-10000].sample(n = 10000).reset_index(drop = True)

# Train-test split (80% / 20%, stratified by gold label value).
X_train, X_test, y_train, y_test = train_test_split(df_dir["Token"], 
                                                    df_dir["Gold"], 
                                                    test_size = 0.2, 
                                                    random_state = 42,
                                                    stratify = df_dir["Gold"])
# df_dir has a fresh 0..n-1 index, so the split's index labels are valid positions.
df_train = df_dir.iloc[X_train.index].reset_index(drop = True)
df_test = df_dir.iloc[X_test.index].reset_index(drop = True)

# print(df_train.info())
# display(df_train.head())
# print(df_test.info())
# display(df_test.head())
# print(df_dev.info())
# display(df_dev.head())


  df_orig = df_orig.drop("sentence_info", 1)
  df_sample = df_sample.drop("sentence_info", 1)


## Labeling functions

Labeling functions will be written to cover the following intervention categories, as used by the manual annotators of this [dataset](https://github.com/bepnye/EBM-NLP):

- Surgical.
- Physical.
- Drug.
- Educational.
- Psychological.
- Other.
- Control.

In [17]:
# Label values returned by the labeling functions:
# ABSTAIN = no vote, NOT_I = not an intervention token, I = intervention token.
ABSTAIN, NOT_I, I = -1, 0, 1

# Keyword fragments matched as substrings by the is_generic labeling function.
generic_interventions = [
    "therap",
    "treatment",
    "intervention",
    "placebo",
    "dose",
    "control",
    "vaccin",
]

# English stop words from NLTK (downloaded on first use), used by is_stop_word.
nltk.download("stopwords")
stop_words = stopwords.words("english")
print(f"Total stop words = {len(stop_words)}")

Total stop words = 179


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raywang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Labeling functions
**All labeling functions label tokens. The corresponding gold labels are from the "starting span" labels.**

In [27]:
# Labeling function: vote I when the lowercased token occurs in the first
# sentence (treated as the title) recorded for its file; abstain otherwise.
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def in_title(x, first_sentence_file):
    token = x.Token.lower()
    if token in first_sentence_file[x.File]:
        return I
    return ABSTAIN
# abstain or not_i?

# Labeling function: same rule as in_title, registered under a second name.
# NOTE(review): the duplicate presumably gives the title vote extra weight in
# the vote aggregation -- confirm that this is intended.
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def in_title2(x, first_sentence_file):
    token = x.Token.lower()
    if token in first_sentence_file[x.File]:
        return I
    return ABSTAIN
# abstain or not_i?

# Labeling function: same rule as in_title, registered under a third name.
# NOTE(review): the duplicate presumably gives the title vote extra weight in
# the vote aggregation -- confirm that this is intended.
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def in_title3(x, first_sentence_file):
    token = x.Token.lower()
    if token in first_sentence_file[x.File]:
        return I
    return ABSTAIN
# abstain or not_i?

# Labeling function: same rule as in_title, registered under a fourth name.
# NOTE(review): the duplicate presumably gives the title vote extra weight in
# the vote aggregation -- confirm that this is intended.
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def in_title4(x, first_sentence_file):
    token = x.Token.lower()
    if token in first_sentence_file[x.File]:
        return I
    return ABSTAIN
# abstain or not_i?


# Labeling function: vote NOT_I when the lowercased token does NOT occur in
# the first sentence (treated as the title) recorded for its file; abstain otherwise.
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def not_in_title(x, first_sentence_file):
    token = x.Token.lower()
    if token in first_sentence_file[x.File]:
        return ABSTAIN
    return NOT_I
# abstain or not_i?


# Labeling function: vote I when both the token before and the token after
# this one (within its sentence) occur in the file's first sentence (title).
# Tokens at either end of their sentence have only one neighbour and abstain.
@labeling_function(resources=dict(first_sentence_file=first_sentence_file))
def surround_in_title(x, first_sentence_file):
    pos = x.sentence_index
    if pos <= 0 or pos >= len(x.sentence) - 1:
        return ABSTAIN

    before = x.sentence[pos - 1].lower()
    if before in first_sentence_file[x.File]:
        after = x.sentence[pos + 1].lower()
        if after in first_sentence_file[x.File]:
            return I

    return ABSTAIN
# abstain or not_i?


# Labeling function: vote I when any entry of drug_suffixes occurs anywhere
# in the lowercased token (substring match, not only at the end); else abstain.
@labeling_function()
def contains_drug_suffix(x):
    token = x.Token.lower()
    for suffix in drug_suffixes:
        if suffix.lower() in token:
            return I
    return ABSTAIN
    
# Labeling function: vote I when any surgical suffix (column 0 of df_surgery)
# occurs anywhere in the lowercased token (substring match); else abstain.
@labeling_function()
def contains_surgical_suffix(x):
    token = x.Token.lower()
    for suffix in df_surgery[0]:
        if suffix.lower() in token:
            return I
    return ABSTAIN

# Labeling function: vote I when any psychological / psychotherapeutic keyword
# (column 0 of df_psych) occurs anywhere in the lowercased token; else abstain.
@labeling_function()
def contains_psych_term(x):
    token = x.Token.lower()
    for keyword in df_psych[0]:
        if keyword.lower() in token:
            return I
    return ABSTAIN

# Labeling function: vote I when any generic intervention keyword fragment
# (generic_interventions) occurs anywhere in the lowercased token; else abstain.
@labeling_function()
def is_generic(x):
    token = x.Token.lower()
    for term in generic_interventions:
        if term.lower() in token:
            return I
    return ABSTAIN

# Labeling function: vote NOT_I when the lowercased token is one of the
# stop words in stop_words; abstain otherwise.
@labeling_function()
def is_stop_word(x):
    if x.Token.lower() in stop_words:
        return NOT_I
    return ABSTAIN

# Labeling function: vote NOT_I for tokens made up solely of punctuation
# characters; abstain otherwise.
# A token counts as punctuation when every character is in string.punctuation,
# which also covers multi-character tokens such as "--" or "..." that a plain
# substring test against string.punctuation misses.
# NOTE: this definition rebinds the name `is_punctuation`, which earlier in the
# file refers to the boolean row filter used to drop punctuation tokens.
# Re-running that filtering code afterwards will call this labeling function.
@labeling_function()
def is_punctuation(x):
    token = x.Token.lower()
    if all(char in string.punctuation for char in token):
        return NOT_I
    return ABSTAIN


# Labeling function: vote I when the lowercased token (longer than five
# characters) occurs inside any name in set_fda; abstain otherwise.
# Short tokens abstain immediately without scanning the name set.
@labeling_function()
def contains_fda_drug(x):
    if len(x.Token) <= 5:
        return ABSTAIN

    token = x.Token.lower()
    for drug in set_fda:
        if token in drug.lower():
            return I
    return ABSTAIN


# Labeling function: vote I when the token directly before this one in its
# sentence is 'of', 'with', 'receive' or 'and' (e.g. "effect of <I>",
# "treated with <I>"); abstain otherwise or when the token starts the sentence.
@labeling_function()
def has_prev_word_as(x):
    cue_words = ('of', 'with', 'receive', 'and')
    pos = x.sentence_index
    if pos > 0 and x.sentence[pos - 1].lower() in cue_words:
        return I
    return ABSTAIN
    
# Labeling function: vote NOT_I when the token directly after this one in its
# sentence is 'group' or 'groups'; abstain otherwise or when the token ends
# the sentence.
@labeling_function()
def has_next_word_as(x):
    cue_words = ('group', 'groups')
    pos = x.sentence_index
    if pos < len(x.sentence) - 1 and x.sentence[pos + 1].lower() in cue_words:
        return NOT_I
    return ABSTAIN
    
# Labeling function: vote NOT_I when the token lies in the second half of its
# file (token_index / file_len greater than 0.50); abstain otherwise.
@labeling_function()
def has_high_idx(x):
    relative_position = x.token_index / x.file_len
    return NOT_I if relative_position > 0.50 else ABSTAIN
    
    
    
# Labeling function: vote I when 'determine' or 'assess' appears in the
# 10-token window that ends at this token (the token itself plus up to nine
# tokens to its left within the sentence); abstain otherwise.
# The window is anchored on x.sentence_index. Searching the sentence for the
# first token equal to x.Token picks the wrong position whenever the same
# word occurs more than once in the sentence.
@labeling_function()
def left_span_contains(x):
    keywords = ('determine', 'assess')
    end = x.sentence_index
    start = max(end - 9, 0)
    window = x.sentence[start:end + 1]
    # Comparison is case-sensitive, as the keywords are matched verbatim.
    if any(word in keywords for word in window):
        return I
    return ABSTAIN
# look into spouse tutorial left spans, and using 'resources' in LFs


# Labeling function: vote I when the two tokens FOLLOWING this one in its
# sentence are tagged VBD then VBN (e.g. "<I> was administered"); abstain
# otherwise or when fewer than two tokens follow.
@labeling_function()
def right_span_vb_pos(x):
    pos = x.sentence_index
    if pos >= len(x.sentence) - 2:
        return ABSTAIN
    if x.pos_sentence[pos + 1] == 'VBD' and x.pos_sentence[pos + 2] == 'VBN':
        return I
    return ABSTAIN
    
    
# Labeling function: vote I when the POS tag of the token directly before
# this one in its sentence contains the letter 'V'; abstain otherwise or when
# the token starts the sentence.
@labeling_function()
def left_span_vb_pos(x):
    pos = x.sentence_index
    if pos <= 0:
        return ABSTAIN
    previous_tag = x.pos_sentence[pos - 1]
    return I if 'V' in previous_tag else ABSTAIN
    


    


## Generate Error Files

Highlighted tokens are wrapped in `**` … `**` (Markdown bold) in the generated files. A token counts as incorrect when the majority-model label is not -1 and differs from the gold label.

In [26]:
# Apply LFs to the sample dataframe.
#
# Disabled LFs, kept for reference:
#   contains_psych_term    (accuracy = 0.131380)
#   is_punctuation
#   has_prev_word_as
#   has_next_word_as_drug  (low accuracy and coverage)
#   left_span_contains
#   right_span_vb_pos
#   left_span_vb_pos
#   not_in_title
lfs = [
    # Negative LFs.
    is_stop_word,
    has_next_word_as,
    has_high_idx,
    # Positive LFs.
    is_generic,
    contains_drug_suffix,
    contains_surgical_suffix,
    contains_fda_drug,
    in_title,
    in_title2,
    in_title3,
    in_title4,
    surround_in_title,
]
applier = PandasLFApplier(lfs=lfs)
# Label matrix for the sample: one row per token, one column per LF.
L_train = applier.apply(df=df_sample)
# Gold labels of the sample as integers.
Y_train = df_sample["Gold"].to_numpy(dtype=int)



  0%|                                          | 2/8152 [00:00<08:11, 16.58it/s]


AttributeError: 'list' object has no attribute 'length'

In [None]:
# Majority vote model: predict one label per row of the label matrix L_train.
majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L = L_train)

# Bare expression so the notebook displays the predictions.
preds_train

In [262]:
# Attach the majority-vote predictions (as strings) to the filtered sample rows.
df_sample['maj_pred'] = pd.Series(preds_train.astype(str))
df_sample_orig = df_sample_orig.reset_index(drop = True)

# Propagate the predictions to the unfiltered sample, matching rows on
# (PMID, token_index). Rows that were filtered out of df_sample keep the
# placeholder '-2'.
# A keyed left merge replaces the row-by-row boolean scan over df_sample_orig,
# which cost O(len(df_sample) * len(df_sample_orig)). `validate` asserts that
# each key occurs at most once in df_sample, so the merge keeps exactly one
# output row per df_sample_orig row, in the original order.
key_cols = ['PMID', 'token_index']
matched = df_sample_orig[key_cols].merge(df_sample[key_cols + ['maj_pred']],
                                         on = key_cols,
                                         how = 'left',
                                         validate = 'many_to_one')
df_sample_orig['maj_pred'] = matched['maj_pred'].fillna('-2').to_numpy()
    

In [263]:

df_sample_groups = df_sample_orig.groupby('PMID')


def _write_highlighted(out_dir, groups, should_highlight):
    """Write one markdown file per PMID group, bolding the selected tokens.

    Parameters:
        out_dir: output directory; created if it does not exist.
        groups: iterable of (PMID, dataframe) pairs, e.g. a groupby on 'PMID'.
        should_highlight: callable (gold, pred) -> bool deciding whether a
            token is wrapped in '**'. `pred` is None when the group has no
            'maj_pred' column.

    Each file is named '<PMID>.md' and contains the group's tokens separated
    by single spaces.
    """
    os.makedirs(out_dir, exist_ok = True)
    for name, group in groups:
        tokens = group.Token.tolist()
        golds = group.Gold.tolist()
        preds = group['maj_pred'].tolist() if 'maj_pred' in group else [None] * len(tokens)
        with open(os.path.join(out_dir, name + '.md'), 'w') as output:
            output.write(''.join(
                ('**' + token + '** ') if should_highlight(gold, pred) else (token + ' ')
                for token, gold, pred in zip(tokens, golds, preds)
            ))


# outputting sample files, with tokens that are gold labeled as I highlighted
_write_highlighted('gold', df_sample_groups,
                   lambda gold, pred: gold == '1')

# outputting sample files, with tokens that are missed highlighted
# (tokens gold labeled as 1, but predicted as 0 or -1)
_write_highlighted('missed', df_sample_groups,
                   lambda gold, pred: gold == '1' and pred in ('-1', '0'))

# outputting sample files, with tokens that are mislabeled highlighted
# (gold and prediction disagree, and the prediction is 0 or 1)
_write_highlighted('wrong', df_sample_groups,
                   lambda gold, pred: (gold, pred) in (('0', '1'), ('1', '0')))

## Apply labeling functions

In [28]:
# Apply LFs to the train and test dataframes.
#
# Disabled LFs, kept for reference:
#   contains_psych_term    (accuracy = 0.131380)
#   is_punctuation
#   has_prev_word_as
#   has_next_word_as_drug  (low accuracy and coverage)
#   left_span_contains
#   right_span_vb_pos
#   left_span_vb_pos
#   not_in_title
lfs = [
    # Negative LFs.
    is_stop_word,
    has_next_word_as,
    has_high_idx,
    # Positive LFs.
    is_generic,
    contains_drug_suffix,
    contains_surgical_suffix,
    contains_fda_drug,
    in_title,
    in_title2,
    in_title3,
    in_title4,
    surround_in_title,
]
applier = PandasLFApplier(lfs=lfs)
# Label matrices: one row per token, one column per LF.
L_train = applier.apply(df=df_train)
L_test = applier.apply(df=df_test)
# L_dev = applier.apply(df = df_dev)

100%|███████████████████████████████████████| 8000/8000 [05:40<00:00, 23.51it/s]
100%|███████████████████████████████████████| 2000/2000 [01:13<00:00, 27.39it/s]


In [220]:
#L_train
# L_dev

In [221]:
# # %%capture

# coverage_check_out, coverage_check = (L_dev != ABSTAIN).mean(axis = 0)
# print(f"check_out coverage: {coverage_check_out * 100:.1f}%")
# print(f"check coverage: {coverage_check * 100:.1f}%")


In [222]:
#L_train.shape


In [29]:
# Gold labels of the train and test splits as integer arrays.
Y_train, Y_test = (
    frame["Gold"].to_numpy(dtype=int)
    for frame in (df_train, df_test)
)
# Y_dev = df_dev["Gold"].to_numpy(dtype = int)

In [30]:
# Summarize coverage, overlaps, conflicts and empirical accuracy of each LF
# on the training split (accuracy is measured against the gold labels Y_train).
LFAnalysis(L_train, lfs).lf_summary(Y_train)
# LFAnalysis(L_dev, lfs).lf_summary(Y_dev)




Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
is_stop_word,0,[0],0.327,0.23025,0.118,2497,119,0.954511
has_next_word_as,1,[0],0.00925,0.008125,0.0045,57,17,0.77027
has_high_idx,2,[0],0.480625,0.2345,0.120125,3667,178,0.953706
is_generic,3,[1],0.0215,0.01525,0.009125,52,120,0.302326
contains_drug_suffix,4,[1],0.026125,0.0215,0.010375,88,121,0.421053
contains_surgical_suffix,5,[1],0.00275,0.00225,0.00125,8,14,0.363636
contains_fda_drug,6,[1],0.050875,0.039625,0.021375,141,266,0.346437
in_title,7,[1],0.258875,0.258875,0.15325,349,1722,0.168518
in_title2,8,[1],0.258875,0.258875,0.15325,349,1722,0.168518
in_title3,9,[1],0.258875,0.258875,0.15325,349,1722,0.168518


In [31]:
# Same per-LF summary on the test split, measured against Y_test.
LFAnalysis(L_test, lfs).lf_summary(Y_test)



Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
is_stop_word,0,[0],0.318,0.223,0.1215,592,44,0.930818
has_next_word_as,1,[0],0.0105,0.009,0.0045,15,6,0.714286
has_high_idx,2,[0],0.4805,0.2345,0.1315,913,48,0.950052
is_generic,3,[1],0.0155,0.012,0.0085,8,23,0.258065
contains_drug_suffix,4,[1],0.0355,0.028,0.017,24,47,0.338028
contains_surgical_suffix,5,[1],0.0015,0.0015,0.0015,0,3,0.0
contains_fda_drug,6,[1],0.0485,0.0415,0.025,37,60,0.381443
in_title,7,[1],0.2505,0.2505,0.1555,86,415,0.171657
in_title2,8,[1],0.2505,0.2505,0.1555,86,415,0.171657
in_title3,9,[1],0.2505,0.2505,0.1555,86,415,0.171657


In [32]:
%%capture
'''
# Explore buckets for patterns in discordance.
buckets = get_label_buckets(L_train[:, 0], L_train[:, 1])
display(buckets)
display(df_train.iloc[buckets[(NOT_I, I)]].sample(10, random_state = 1))
'''

In [33]:
# Majority vote model: predict one label per row of the training label matrix.
majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L = L_train)


In [34]:
# Label model: fit Snorkel's LabelModel for a two-class problem on the
# training label matrix (500 epochs, progress logged every 100, fixed seed).
label_model = LabelModel(cardinality = 2, verbose = True)
label_model.fit(L_train = L_train, n_epochs = 500, log_freq = 100, seed = 123)


In [35]:
# Compute model performance metrics on the test split.
# Both models are scored with identical settings, so the shared keyword
# arguments are defined once.
score_settings = dict(
    Y=Y_test,
    tie_break_policy="random",
    metrics=["f1", "accuracy", "precision",
             "recall", "roc_auc", "coverage"],
)
majority_scores = majority_model.score(L=L_test, **score_settings)
label_scores = label_model.score(L=L_test, **score_settings)

In [36]:
# Compare model performance metrics.
# (score key, printed label) pairs, in print order.
metric_labels = (
    ("f1", "F1"),
    ("accuracy", "Accuracy"),
    ("precision", "Precision"),
    ("recall", "Recall"),
    ("roc_auc", "AUC ROC"),
    ("coverage", "Coverage"),
)


def _report_scores(model_name, scores):
    """Print every metric in `scores` as a percentage, one aligned line each."""
    for key, label in metric_labels:
        heading = model_name + " " + label + ":"
        print(f"{heading:<25} {scores.get(key) * 100:.1f}%")


(majority_f1, majority_acc, majority_prec,
 majority_rec, majority_roc, majority_cov) = [majority_scores.get(key) for key, _ in metric_labels]
_report_scores("Majority Model", majority_scores)
print("++++++++++++++++++++++++")

(label_f1, label_acc, label_prec,
 label_rec, label_roc, label_cov) = [label_scores.get(key) for key, _ in metric_labels]
_report_scores("Label Model", label_scores)

Majority Model F1:        24.9%
Majority Model Accuracy:  61.4%
Majority Model Precision: 15.4%
Majority Model Recall:    65.6%
Majority Model AUC ROC:   66.3%
Majority Model Coverage:  100.0%
++++++++++++++++++++++++
Label Model F1:           24.2%
Label Model Accuracy:     51.9%
Label Model Precision:    14.3%
Label Model Recall:       78.5%
Label Model AUC ROC:      68.3%
Label Model Coverage:     100.0%


In [37]:
# Baseline ("dummy") accuracy: the share of the most frequent gold label in
# the test split, i.e. the accuracy of always predicting the majority class.
majority_class_share = df_test["Gold"].value_counts(normalize=True).max()
print("Accuracy if predicting majority class", majority_class_share)

Accuracy if predicting majority class 0.9025


## Explore errors

In [277]:
# Local import keeps this exploratory cell self-contained.
from snorkel.preprocess.nlp import SpacyPreprocessor

# Preprocessor that parses x.Token with spaCy and stores the parsed Doc on
# x.doc (memoized so each distinct token is parsed once).
# NOTE(review): requires spaCy and its English model to be installed; if a
# `spacy` preprocessor is already defined elsewhere in the notebook, drop this
# definition and reuse that one.
spacy = SpacyPreprocessor(text_field = "Token", doc_field = "doc", memoize = True)


# Labeling function: vote I when spaCy tags the token as a noun; abstain otherwise.
@labeling_function(pre=[spacy])
def is_noun(x):
    # A Doc is indexed by token; the coarse POS tag is the `pos_` attribute and
    # is upper-case ("NOUN"). Guard against an empty Doc (e.g. a blank token).
    if len(x.doc) > 0 and x.doc[0].pos_ == 'NOUN':
        return I
    return ABSTAIN

In [278]:
# Work on a copy so that error-analysis columns do not modify df_train.
df_error = df_train.copy()

In [279]:
# Record, for every training row, the vote of the labeling function under inspection.
# NOTE(review): has_next_word_as_drug is not defined in the visible part of this
# notebook -- confirm it exists before running this cell.
lf_votes = df_error.apply(has_next_word_as_drug, axis=1)
df_error["LF"] = lf_votes
df_error.head()

Unnamed: 0,Token,File,Gold,PMID,token_index,file_len,Spans,LF
0,.,20369616.tokens,0,20369616,120,121,"[drug, ., Does]",-1
1,and,21410033.tokens,0,21410033,91,195,"[total, and, conjugated]",-1
2,quality,15673894.tokens,0,15673894,25,308,"[the, quality, of]",-1
3,),1631861.tokens,0,1631861,172,218,"[9/14, ), .]",-1
4,on,20390261.tokens,0,20390261,278,293,"[effect, on, reversal]",-1


In [280]:
# False positives of the inspected LF: it voted I (1) but the gold label is '0'.
sel = df_error.LF.eq(1) & df_error.Gold.eq('0')
# labeled as NOT_I when actually I => 0 instances
# labeled as I when actually NOT_I => non-zero
df_error[sel].head(50)
# df_orig[(df_orig.PMID=='3385217') & (df_orig.Gold=='1')]

Unnamed: 0,Token,File,Gold,PMID,token_index,file_len,Spans,LF
54,fentanyl,21935685.tokens,0,21935685,147,209,"[after, fentanyl, bolus]",1
125,standardized,15264973.tokens,0,15264973,65,156,"[in, standardized, body]",1
183,anesthesia,18672629.tokens,0,18672629,72,265,"[general, anesthesia, was]",1
236,triple,24684165.tokens,0,24684165,198,337,"[the, triple, combination]",1
288,groups,24691455.tokens,0,24691455,92,214,"[;, groups, B]",1
314,groups,18453793.tokens,0,18453793,206,208,"[patient, groups, .]",1
353,papillomavirus,17367324.tokens,0,17367324,83,283,"[human, papillomavirus, infection]",1
408,infant,1741218.tokens,0,1741218,221,307,"[natural, infant, suckling]",1
459,venous,7211918.tokens,0,7211918,12,239,"[deep, venous, thrombosis]",1
464,cellular,10607234.tokens,0,10607234,404,448,"[on, cellular, distribution]",1


In [None]:
# Show every test row whose token is exactly 'surgery'.
df_test[df_test.Token == 'surgery']


Unnamed: 0,Token,File,Gold,PMID,token_index,file_len,Spans,Surgical
250,surgery,19092729.tokens,0,19092729,17,332,"[buckling, surgery, .]",1
834,surgery,24532106.tokens,0,24532106,219,280,"[after, surgery, ,]",1
1130,surgery,15616772.tokens,1,15616772,147,336,"[(, surgery, only]",1
2110,surgery,10078673.tokens,0,10078673,62,291,"[abdominal, surgery, were]",1
3141,surgery,9278836.tokens,0,9278836,254,434,"[of, surgery, .]",1
3172,surgery,9278836.tokens,0,9278836,266,434,"[of, surgery, ,]",1
5358,surgery,8604728.tokens,0,8604728,170,316,"[filtering, surgery, group]",1
5837,surgery,14567804.tokens,0,14567804,86,240,"[Before, surgery, ,]",1
7105,surgery,18779477.tokens,0,18779477,202,310,"[glaucoma, surgery, was]",1
7944,surgery,11214014.tokens,0,11214014,12,306,"[bypass, surgery, .]",1


In [93]:
# Compare the gold label with the "Surgical" vote: 1 where they agree, 0 otherwise.
# NOTE(review): the "Surgical" column is not created in the visible part of this
# notebook -- presumably the output of the surgical LF; confirm.
df_test["Gold"] = df_test["Gold"].astype(int)
agrees_with_gold = df_test["Gold"] == df_test["Surgical"]
df_test["Surgical Concord"] = agrees_with_gold.astype(int)
df_test.head()

Unnamed: 0,Token,File,Gold,PMID,Spans,Surgical,Surgical Concord
0,was,7562882.tokens,0,7562882,"[lisinopril, was, observed]",-1,0
1,Mortality,3139179.tokens,0,3139179,"[RESULTS, Mortality, from]",-1,0
2,.,24077211.tokens,0,24077211,"[trial, ., OBJECTIVE]",-1,0
3,compared,10356632.tokens,0,10356632,"[and, compared, to]",-1,0
4,a,25542620.tokens,0,25542620,"[:, a, randomised]",-1,0


In [94]:
# Rows where the "Surgical" vote disagrees with the gold label, excluding
# rows where it abstained (-1).
disagrees = df_test["Surgical Concord"] == 0
voted = df_test["Surgical"] != -1
surg_discord = df_test[disagrees & voted]
display(surg_discord.head(100))
display(surg_discord.tail(100))

Unnamed: 0,Token,File,Gold,PMID,Spans,Surgical,Surgical Concord
13,surgery,7956382.tokens,0,7956382,"[), surgery, with]",1,0
428,surgery,1670445.tokens,0,1670445,"[hip, surgery, .]",1,0
1066,neuropsychiatric,17513813.tokens,0,17513813,"[severe, neuropsychiatric, toxicity]",1,0
1954,ureteroscopy,17156222.tokens,0,17156222,"[(, ureteroscopy, )]",1,0
3570,endoscopy,12233894.tokens,0,12233894,"[gastric, endoscopy, scores]",1,0
4036,surgical,15523393.tokens,0,15523393,"[The, surgical, resection]",1,0
5204,surgical,11922398.tokens,0,11922398,"[the, surgical, procedure]",1,0
6842,arthroscopy,12882611.tokens,0,12882611,"[requiring, arthroscopy, were]",1,0
7177,surgery,24532106.tokens,0,24532106,"[before, surgery, .]",1,0
7250,surgery,21389925.tokens,0,21389925,"["", surgery, .]",1,0


Unnamed: 0,Token,File,Gold,PMID,Spans,Surgical,Surgical Concord
13,surgery,7956382.tokens,0,7956382,"[), surgery, with]",1,0
428,surgery,1670445.tokens,0,1670445,"[hip, surgery, .]",1,0
1066,neuropsychiatric,17513813.tokens,0,17513813,"[severe, neuropsychiatric, toxicity]",1,0
1954,ureteroscopy,17156222.tokens,0,17156222,"[(, ureteroscopy, )]",1,0
3570,endoscopy,12233894.tokens,0,12233894,"[gastric, endoscopy, scores]",1,0
4036,surgical,15523393.tokens,0,15523393,"[The, surgical, resection]",1,0
5204,surgical,11922398.tokens,0,11922398,"[the, surgical, procedure]",1,0
6842,arthroscopy,12882611.tokens,0,12882611,"[requiring, arthroscopy, were]",1,0
7177,surgery,24532106.tokens,0,24532106,"[before, surgery, .]",1,0
7250,surgery,21389925.tokens,0,21389925,"["", surgery, .]",1,0
