## iGEM labeling functions

## imports

In [46]:
import pandas as pd
import numpy as np
import os
import nltk
import regex as re
import ast
from collections import defaultdict
from snorkel.labeling import labeling_function

### setting up abstain

In [47]:
ABSTAIN = -1

In [48]:
all_lfs = []

### labeling functions + small tests written

In [49]:
# includes_solution_words
# If any of the words is in a list of solution terms, we label FALSE
# https://www.hach.com/chemGlossary   
solution_terms = ["buffer", "diluent", "solute", "solvent", "saturated", "unsaturated", "saturating",
                    "saturable"]
@labeling_function()
def includes_solution_words(x):
    for word in solution_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_solution_words)

In [50]:
# includes_mixture_words
# If any of the words is in a list of mixture terms, we label FALSE
# https://www.hach.com/chemGlossary   
mixture_terms = ["suspended", "mixture", "heterogenous", "homogeneous"]
@labeling_function()
def includes_mixture_words(x):
    for word in mixture_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_mixture_words)

In [51]:
# includes_physical_words
# If there are common physical terms, we label FALSE
physical_terms = ["detection", "composed", "weight", "characteristic", "metal", "express",
    "characterization", "color", "metalic", "consists", "pure", "compose", "assay", "mm", "bound",
    "permeable", "signal", "bind", "property", "stored", "released", "capacity", "resistance", "mol"]
@labeling_function()
def includes_physical_words(x):
    for word in physical_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_physical_words)

In [52]:
# includes_genetic_words
# If one of the chemicals is in a list of genetic terms such as reductase, dna, gene, we label FALSE
genetic_terms = ["reductase", "dna", "gene", "allele", "locus", "genotype", "phenotype", "dominant", "recessive", "additive", "phenoset",
    "diallelic", "multiallelic", "polyallelic", "monomorphic", "monoallelic", "polymorphism", "mutation", "complex", "trait", "multifactorial",
    "polygenic", "monogenic", "mixed model", "transmission probability", "transition probability", "epistasis", "interaction", "pleiotropy",
    "quantitative trait locus", "probit", "logit", "penetrance", "transformation", "scale of measurement", "identity by descent", "identity in state",
    "haplotype", "phase", "multilocus", "genotype", "allelic", "association", "linkage", "disequilibrium", "gametic", "rna"]
@labeling_function()
def includes_genetic_words(x):
    for word in genetic_terms:
        if (word in x[0]):
            return False
    return ABSTAIN

all_lfs.append(includes_genetic_words)
    

In [53]:
# includes_structural_words
# If there are common structural terms, we label FALSE
structural_terms = ["loop", "sequence", "encodes", "code", "codon", "dna", "rna", "pair", "group", "active site",
    "bond", "chain", "gene", "structure", "structural", "encoding", "cdna", "cluster"]
@labeling_function()
def includes_structural_words(x):
    for word in structural_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_structural_words)

In [54]:
# includes_general_chemical_words
# If any of the words is in a list of common chemical terms such as amino acid, sugar, adenosine, amide, adenine, etc..., we label FALSE
# https://www.hach.com/chemGlossary   
chemical_terms = ["amino acid", "sugar", "adenosine", "amide", "adenine", "algaecide", "amines", "base", "biocides",
    "clarifier", "hydrocarbon", "molecule", "nutrients", "polymer", "peptide", "polypeptide",
    "tag", "functional", "activity", "electron", "cofactor", "gas"]
@labeling_function()
def includes_general_chemical_words(x):
    for word in chemical_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_general_chemical_words)

In [55]:
# includes_functional_group
# If there is a close mention of a functional chemical group, we label FALSE (as it is more likely descriptive of a structure than of a reaction)
# https://www.masterorganicchemistry.com/2010/10/06/functional-groups-organic-chemistry/
common_functional_groups = ["alkane", "alkene", "alkyne", "benzene ring", "phenyl", "amine", "alcohol", "ether", "alkyl halide", "thiol",
    "aldehyde", "ketone", "ester", "carboxylic acid", "amide", "nitrile", "epoxide", "disulfide", "imine", "acid chloride", "anhydride", "nitro",
    "sulfide", "thioether", "group", "functional"]
@labeling_function()
def includes_functional_group(x):
    for group in common_functional_groups:
        if (group in x[0]):
            return False
    return ABSTAIN

all_lfs.append(includes_functional_group)


In [56]:
# includes_amino_acid
# If there is an amino acid mentioned, we label FALSE
amino_acids = ['val', 'ile', 'leu', 'glu', 'gln', \
    'asp', 'asn' 'his' 'trp', 'phe', 'tyr',    \
    'arg', 'lys', 'ser', 'thr', 'met', 'ala',    \
    'gly', 'pro', 'cys', "amino"]
@labeling_function()
def includes_amino_acid(x):
    for group in amino_acids:
        if (group in x[0]):
            return False
    return ABSTAIN

all_lfs.append(includes_amino_acid)


In [57]:
# includes_paper_artifacts
# If there are common words from paper headers/footers, we label FALSE
common_terms = ["university", "univ", "pharma", "avenue", "street", "road", "department", "usa", "reference", "ref",
    "keyword", "article", "http", "png", "jpg", "journal", "(20", "(19"]
@labeling_function()
def includes_paper_artifacts(x):
    for word in common_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_paper_artifacts)

In [58]:
# includes_no_terms
# If there are words stopping or with no change, we label FALSE
common_no_terms = ["inactivated", "unaffected", "inactive", "inactivates", "stops", "prevent", "inhibit",
    "denature", "block"]
@labeling_function()
def includes_no_terms(x):
    for word in common_no_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_no_terms)

In [59]:
def helper_sep_chems_with_or(chemicals):
    final = ""
    for chem in chemicals:
        if (final == ""):
            final += re.escape(chem)
        else:
            final += "|" + re.escape(chem)
    return final

In [60]:
# structure_adjacent_mentions
# If the chemicals are adjacent, we label FALSE
@labeling_function()
def structure_adjacent_mentions(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + ") (" + chemicals + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_adjacent_mentions)

In [61]:
# structure_sep_or
# If the chemicals are separated by or, we label FALSE
@labeling_function()
def structure_sep_or(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + ") or (" + chemicals + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_sep_or)

In [62]:
# structure_sep_and
# If the chemicals are separated by and, we label FALSE
@labeling_function()
def structure_sep_and(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + ") and (" + chemicals + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_sep_and)

In [63]:
# structure_sep_comma
# If the chemicals are separated by a comma, we label FALSE
@labeling_function()
def structure_sep_comma(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + "), (" + chemicals + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_sep_comma)

In [64]:
# LF_sep_via
# If the chemicals are separated by via, in, etc, we label FALSE
via_terms = "via|in"
@labeling_function()
def structure_sep_via(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + ") (" + via_terms + ") (" + chemicals + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_sep_via)

In [65]:
# structure_sep_sym
# If the chemicals are separated by a single character, we label FALSE  
@labeling_function()
def structure_sep_sym(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + ").(" + chemicals + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_sep_via)

In [66]:
# structure_followed_by_ase
# If one of the chemicals is followed by a word that ends with -ase, we label FALSE
@labeling_function()
def structure_followed_by_ase(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + r") \w*ase\b"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_followed_by_ase)

In [67]:
# structure_followed_by_noun
# If one of the chemicals is followed by a noun, we label FALSE
@labeling_function()
def structure_followed_by_noun(x):
    sentence = x[0].replace(',', '')
    chemicals = helper_sep_chems_with_or(x[1])
    if (chemicals == ""):
        return ABSTAIN
    structure = "(" + chemicals + r") \w+"
    for match in re.finditer(structure, sentence):
        if (nltk.pos_tag([match.group(0).split()[1]])[0][1] == "NN"):
            return False
    return ABSTAIN

all_lfs.append(structure_followed_by_noun)

In [68]:
# structure_sep_verb
# If the chemicals are separated by a verb, we label TRUE
@labeling_function()
def structure_sep_verb(x):
    sentence = x[0].replace(',', '')
    chemicals = helper_sep_chems_with_or(x[1])
    if (chemicals == ""):
        return ABSTAIN
    structure = "(" + chemicals + r") \w+ (" + chemicals + ")"
    for match in re.finditer(structure, sentence):
        if (re.match(r"(\bVB|NNS)", nltk.pos_tag([match.group(0).split()[1]])[0][1])):
            return True
    return ABSTAIN

all_lfs.append(structure_sep_verb)

In [69]:
# structure_sep_adverb
# If the chemicals are separated by a adverb, we label FALSE
@labeling_function()
def structure_sep_adverb(x):
    sentence = x[0].replace(',', '')
    chemicals = helper_sep_chems_with_or(x[1])
    if (chemicals == ""):
        return ABSTAIN
    structure = "(" + chemicals + r") \w+ (" + chemicals + ")"
    for match in re.finditer(structure, sentence):
        if (re.match(r"(\bRB)", nltk.pos_tag([match.group(0).split()[1]])[0][1])):
            return False
    return ABSTAIN

all_lfs.append(structure_sep_adverb)

In [70]:
structure_sep_adverb(["cassie crazily night away", ["cassie", "night"]])

False

In [71]:
# includes_oxidation_words
# If the sentence contains oxidation words, we label True
oxidation_terms = "oxidiz|oxidis|redox|reduc|rust|corrod|oxygen"
@labeling_function()
def includes_oxidation_words(x):
    structure = "(" + oxidation_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_oxidation_words)

In [72]:
# includes_combustion_words
# If the sentence contains combustion words, we label True
combustion_terms = "combust|burn|explod|gas-form"
@labeling_function()
def includes_combustion_words(x):
    structure = "(" + combustion_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_combustion_words)

In [73]:
# includes_neutralization_words
# If the sentence contains neutralization words, we label True
neutralization_terms = "neutraliz|titrat|buffer|gas-form"
@labeling_function()
def includes_neutralization_words(x):
    structure = "(" + neutralization_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_neutralization_words)

In [74]:
# LF_includes_catalyze_words
# If the sentence contains catalyze words, we label True
# catalyze_words = ["catalyze", "catalyst", "catalyse", "catalysing", "catalyzing"]
# @labeling_function()
# def includes_catalyze_words(x):
#     for word in catalyze_words:
#         if(word in x[0]):
#             return True
#     return ABSTAIN

In [75]:
# includes_combination_words
# If the sentence contains combination words, we label True
combination_terms = "combin"
@labeling_function()
def includes_combination_words(x):
    structure = "(" + combination_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_combination_words)

In [76]:
# includes_decomposition_words
# If the sentence contains decomposition words, we label True
decomposition_terms = "decompos"
@labeling_function()
def includes_decomposition_words(x):
    structure = "(" + decomposition_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_decomposition_words)

In [77]:
# includes_replacement_words
# If the sentence contains decomposition words, we label True
replacement_terms = "replac"
@labeling_function()
def includes_replacement_words(x):
    structure = "(" + replacement_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_replacement_words)

In [291]:
# includes_reaction_words
# If the sentence contains reactions words, we label True
reaction_terms = "conver|yield|produc|mak|creat|synthesiz|synthesis|transform|ferment|break|displac|exchang" + \
                 "|precipit|transfer|through|produc|activat|revers|form|ation|metaboliz|metabolis|generat|hydroly" + \
                 "|lead|caus|methyl|result"
@labeling_function()
def includes_reaction_words(x):
    structure = "(" + reaction_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_reaction_words)

In [79]:
# includes_react
# If the sentence contains react, we label True
@labeling_function()
def includes_react(x):
    if("react" in x[0]):
        return True
    return ABSTAIN

all_lfs.append(includes_react)

In [292]:
# includes_react_sym
# If the sentence contains react, we label True
@labeling_function()
def includes_react_sym(x):
    if("-->" in x[0]):
        return True
    return ABSTAIN

all_lfs.append(includes_react_sym)

In [80]:
# includes_reaction_component_words
# If the sentence contains reaction components words, we label True
reaction_component_terms = ["substrate", "product", "reactant", "step"]
@labeling_function()
def includes_reaction_component_words(x):
    for word in reaction_component_terms:
        if(word in x[0]):
            return True
    return ABSTAIN

all_lfs.append(includes_reaction_component_words)

In [81]:
# includes_comparison_words
# If the sentence contains comparison words, we label False
comparison_terms = "similar|more|greater|less|increas|decreas|compar|differ|relativ|better|time|than"
@labeling_function()
def includes_comparison_words(x):
    structure = "(" + comparison_terms + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(includes_comparison_words)

In [82]:
# includes_concentration
# If the sentence contains react, we label True
@labeling_function()
def includes_concentration(x):
    if("concentration" in x[0]):
        return True
    return ABSTAIN

all_lfs.append(includes_concentration)

In [83]:
# includes_measure_words
# If the sentence contains measure words, we label False
measure_terms = ["high", "low", "ph", "stability", "corelated", "more", "less", "level", "degree", "time",
                "measure"]
@labeling_function()
def includes_measure_words(x):
    for word in measure_terms:
        if(word in x[0]):
            return False
    return ABSTAIN

all_lfs.append(includes_measure_words)

In [84]:
# includes_experiment_words
# If the sentence contains experiement words, we label False
experiment_terms = ["mice", "cell", "mouse", "ovary", "male", "female", "animal", "study", "method",
    "test", "treat", "protection", "brain", "nerve", "human", "tissue", "fetal", "vitro", "studies",
    "membrane", "strain", "mutant", "regulate", "dependent", "drug", "therapy", "oral", "test", "autoantigen"]
@labeling_function()
def includes_experiment_words(x):
    for word in experiment_terms:
        if(word in x[0]):
            return False
    return ABSTAIN

all_lfs.append(includes_experiment_words)

In [299]:
# structure_next_to_conversion_words
# If the sentence contains to, from, into, etc., we label True
conversion_terms_next = "to|from|into|becom|became|by"
@labeling_function()
def structure_next_to_conversion_words(x):
    sentence = x[0].replace(',', '')
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "((" + chemicals + r") (" + conversion_terms_next + r")\b|((" + conversion_terms_next + r")\b (" + chemicals + ")))"
    if (re.search(structure, sentence)):
            return True
    return ABSTAIN

all_lfs.append(structure_next_to_conversion_words)

In [300]:
# structure_sep_conversion_words
# If the sentence contains to, from, into, etc., we label True
conversion_terms_sep = "to|into"
@labeling_function()
def structure_sep_conversion_words(x):
    sentence = x[0].replace(',', '')
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + r") (" + conversion_terms_sep + r") (" + chemicals + ")"
    if (re.search(structure, sentence)):
            return True
    return ABSTAIN

all_lfs.append(structure_sep_conversion_words)

In [302]:
# structure_conversion_by
# If the sentence contains to, from, into, etc., we label True
conversion_terms_by = "to|from"
@labeling_function()
def structure_conversion_by(x):
    sentence = x[0].replace(',', '')
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + conversion_terms_by + ") (" + chemicals + ") by"
    if (re.search(structure, sentence)):
            return True
    return ABSTAIN

all_lfs.append(structure_conversion_by)

In [303]:
# structure_conversion_of
# If the sentence contains to, from, into, etc., we label True
conversion_terms_of = "ation of|sion of|ism of"
@labeling_function()
def structure_conversion_of(x):
    sentence = x[0].replace(',', '')
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "((" + conversion_terms_of + ") (" + chemicals + ")|of (" + chemicals + ") (" + conversion_terms_sep + "))"
    if (re.search(structure, sentence)):
            return True
    return ABSTAIN

all_lfs.append(structure_conversion_of)

In [304]:
# includes_one_chem
# If the sentence contains one identified chemical, we label False
@labeling_function()
def includes_one_chem(x):
    if(len(x[1]) == 1):
            return False
    return ABSTAIN

all_lfs.append(includes_one_chem)

In [205]:
# # includes_more_than_one_chem
# # If the sentence contains more than one identified chemical, we label True
# @labeling_function()
# def includes_more_than_one_chem(x):
#     if(len(x[1]) > 1):
#             return True
#     return ABSTAIN

# all_lfs.append(includes_more_than_one_chem)

### data cleaning stuff

In [206]:
greek_alphabet = {
    u'\u0393': 'Gamma',
    u'\u0394': 'Delta',
    u'\u0398': 'Theta',
    u'\u039B': 'Lamda',
    u'\u039E': 'Xi',
    u'\u03A0': 'Pi',
    u'\u03A3': 'Sigma',
    u'\u03A6': 'Phi',
    u'\u03A7': 'Chi',
    u'\u03A8': 'Psi',
    u'\u03A9': 'Omega',
    u'\u03B1': 'alpha',
    u'\u03B2': 'beta',
    u'\u03B3': 'gamma',
    u'\u03B4': 'delta',
    u'\u03B5': 'epsilon',
    u'\u03B6': 'zeta',
    u'\u03B7': 'eta',
    u'\u03B8': 'theta',
    u'\u03B9': 'iota',
    u'\u03BA': 'kappa',
    u'\u03BB': 'lamda',
    u'\u03BC': 'mu',
    u'\u03BD': 'nu',
    u'\u03BE': 'xi',
    u'\u03BF': 'omicron',
    u'\u03C0': 'pi',
    u'\u03C1': 'rho',
    u'\u03C3': 'sigma',
    u'\u03C4': 'tau',
    u'\u03C5': 'upsilon',
    u'\u03C6': 'phi',
    u'\u03C7': 'chi',
    u'\u03C8': 'psi',
    u'\u03C9': 'omega'
}

In [207]:
def remove_greek(words):
    for letter in greek_alphabet.keys():
        words = words.replace(letter, greek_alphabet[letter])
    return words

In [208]:
def chem_into_array(chemicals):
    if (chemicals == "0"):
        return []
    chemicals = remove_greek(chemicals)
    chemicals = chemicals.lower()
    sample_chems = chemicals.replace("%20", " ")
    sample_chems_list = sample_chems.split(", ")
    for index in range(len(sample_chems_list)):
        while(sample_chems_list[index][-1] == ","):
            sample_chems_list[index] = sample_chems_list[index][:-1]
    sample_chems_list = list(set(sample_chems_list))
    return sample_chems_list

In [209]:
def sentence_cleaned(sentence):
    sentence = remove_greek(sentence)
    sentence = sentence.lower()
    sentence = sentence.replace(".", "")
    sentence = sentence.replace(")", "")
    sentence = sentence.replace("(", "")
    sentence = sentence.replace(";", "")
    sentence = sentence.replace(":", "")
    return sentence

### importing data in and cleaning it (this data has truth values)

In [245]:
# have this csv file in the same folder
sentence_df = pd.read_csv("updated_data_cassie.csv")

In [246]:
sentence_chem_df = sentence_df[["sentence", "chemicals", "truth", "substrates", "products"]]
sentence_chem_df.head()

Unnamed: 0,sentence,chemicals,truth,substrates,products
0,The enzyme cyclo-oxygenase catalyses the oxyge...,prostaglandins,1.0,"arachidonic, acid",prostaglandins
1,Recently two forms of cyclo-oxygenase have bee...,,0.0,,
2,Constitutive and inducible forms of human cycl...,,0.0,,
3,hCOX-1 had a specific activity of 18.8 mumol o...,arachidonate,0.0,arachidonate,
4,"of 1500 nmol of O2/nmol of enzyme, whereas hCO...",arachidonate,0.0,arachidonate,


In [247]:
import math
sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].replace(np.NaN, "0")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].replace(np.NaN, "0")


In [248]:
sentence_chem_df["sentence"] = sentence_chem_df["sentence"].apply(sentence_cleaned)
sentence_chem_df["truth"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["sentence"] = sentence_chem_df["sentence"].apply(sentence_cleaned)


0.0    462
1.0     33
Name: truth, dtype: int64

In [249]:
sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].apply(chem_into_array)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].apply(chem_into_array)


In [250]:
sentence_chem_df["text"] = sentence_chem_df[["sentence", "chemicals"]].values.tolist()
sentence_chem_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["text"] = sentence_chem_df[["sentence", "chemicals"]].values.tolist()


Unnamed: 0,sentence,chemicals,truth,substrates,products,text
0,the enzyme cyclo-oxygenase catalyses the oxyge...,[prostaglandins],1.0,"arachidonic, acid",prostaglandins,[the enzyme cyclo-oxygenase catalyses the oxyg...
1,recently two forms of cyclo-oxygenase have bee...,[],0.0,,,[recently two forms of cyclo-oxygenase have be...
2,constitutive and inducible forms of human cycl...,[],0.0,,,[constitutive and inducible forms of human cyc...
3,hcox-1 had a specific activity of 188 mumol of...,[arachidonate],0.0,arachidonate,,[hcox-1 had a specific activity of 188 mumol o...
4,"of 1500 nmol of o2/nmol of enzyme, whereas hco...",[arachidonate],0.0,arachidonate,,"[of 1500 nmol of o2/nmol of enzyme, whereas hc..."
...,...,...,...,...,...,...
1409,water uptake of the polymer was only 28 and 02...,[pec],0.0,,,[water uptake of the polymer was only 28 and 0...
1410,degradation of less hydrophilic pec41 with hig...,[],0.0,,,[degradation of less hydrophilic pec41 with hi...
1411,"by this mechanism, ce-responsive drug in vitro...",[pec],0.0,,,"[by this mechanism, ce-responsive drug in vitr..."
1412,"as expected, less bovine serum albumin bsa was...",[],0.0,,,"[as expected, less bovine serum albumin bsa wa..."


### data with no truth values

In [251]:
# have this data one folder up
uncleaned_df = pd.read_csv("../sentence_annotations_elsevier_pmid_split6.csv")

In [252]:
uncleaned_no_na_df = uncleaned_df.dropna()
uncleaned_several_chem_df = uncleaned_no_na_df.loc[uncleaned_no_na_df["chemical_names"].str.contains(",")]
uncleaned_several_chem_df = uncleaned_several_chem_df[["sentence", "chemical_names"]]

In [253]:
uncleaned_several_chem_df["sentence"] = uncleaned_several_chem_df["sentence"].apply(sentence_cleaned)
uncleaned_several_chem_df["chemical_names"] = uncleaned_several_chem_df["chemical_names"].apply(chem_into_array)

In [254]:
# need to get rid of this line if this data should be used
re_check_several_chem_uncleaned_df = uncleaned_several_chem_df

In [255]:
re_check_several_chem_uncleaned_df["text"] = re_check_several_chem_uncleaned_df[["sentence", "chemical_names"]].values.tolist()
brenda_clean_df = re_check_several_chem_uncleaned_df

### snorkel code for model (running on data with truth values)

In [256]:
from snorkel.labeling import PandasLFApplier

In [257]:
# how to find the location
# sentence_chem_df[sentence_chem_df["sentence"] == "even though the activities of mat and gnmt were elevated, the concentration of liver s-adenosylmethionine was decreased 24%, p<0001 and s-adenosylhomocysteine increased 113%, p<0001 in the dwarf mice"]

In [258]:
sentence_chem_df.head()

Unnamed: 0,sentence,chemicals,truth,substrates,products,text
0,the enzyme cyclo-oxygenase catalyses the oxyge...,[prostaglandins],1.0,"arachidonic, acid",prostaglandins,[the enzyme cyclo-oxygenase catalyses the oxyg...
1,recently two forms of cyclo-oxygenase have bee...,[],0.0,,,[recently two forms of cyclo-oxygenase have be...
2,constitutive and inducible forms of human cycl...,[],0.0,,,[constitutive and inducible forms of human cyc...
3,hcox-1 had a specific activity of 188 mumol of...,[arachidonate],0.0,arachidonate,,[hcox-1 had a specific activity of 188 mumol o...
4,"of 1500 nmol of o2/nmol of enzyme, whereas hco...",[arachidonate],0.0,arachidonate,,"[of 1500 nmol of o2/nmol of enzyme, whereas hc..."


In [259]:
brenda_clean_df.head()

Unnamed: 0,sentence,chemical_names,text
150,"we selected diltiazem, cyclosporin a, and dexa...","[diltiazem, calcein-am, cyclosporin a, dexamet...","[we selected diltiazem, cyclosporin a, and dex..."
271,"croteau r washington state univ, inst biol che...","[sodium dodecyl sulfate, limonene, geranyl pyr...","[croteau r washington state univ, inst biol ch..."
280,storage tissue h+/substrate stoichiometries fo...,[h+],[storage tissue h+/substrate stoichiometries f...
281,"briskin dp univ illinois, dept agron, 1201 w g...",[h+],"[briskin dp univ illinois, dept agron, 1201 w ..."
286,from these results and the estimated level of ...,[h+],[from these results and the estimated level of...


In [344]:
# Define the set of labeling functions (LFs)
# currently excluding amino_acid and followed_ase and followed_by_noun
lfs = [includes_solution_words, includes_mixture_words, includes_physical_words, includes_genetic_words, includes_structural_words,
      includes_general_chemical_words, includes_functional_group, includes_paper_artifacts, includes_no_terms, structure_adjacent_mentions,
      structure_sep_or, structure_sep_comma, structure_sep_via, structure_sep_sym, structure_sep_adverb, includes_oxidation_words,
      structure_sep_verb, structure_sep_conversion_words, includes_combustion_words, includes_neutralization_words, includes_combination_words,
      includes_decomposition_words, includes_replacement_words, includes_reaction_words, includes_reaction_component_words, includes_comparison_words,
      includes_one_chem, includes_react, includes_measure_words, includes_experiment_words, includes_concentration, structure_sep_and,
      structure_next_to_conversion_words, structure_conversion_by, structure_conversion_of, includes_react_sym]
      # includes_amino_acid, structure_followed_by_ase, structure_followed_by_noun]

# removing physical_words increases recall but causes large drop in precision
# sep_conversion_word and sep_verb removal increase precision to 0.71 with recall at 0.38
# Apply the LFs to the unlabeled training data
df_train = brenda_clean_df
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)

100%|██████████| 50212/50212 [06:55<00:00, 120.97it/s]


In [345]:
L_train[1]

array([-1, -1, -1,  0,  0, -1, -1,  0,  0, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  0, -1, -1, -1,  1, -1,
        1, -1])

In [346]:
from snorkel.labeling import LFAnalysis

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
includes_solution_words,0,[0],0.053254,0.053214,0.051601
includes_mixture_words,1,[0],0.026826,0.026786,0.024237
includes_physical_words,2,[0],0.456126,0.453019,0.37979
includes_genetic_words,3,[0],0.288716,0.288377,0.248486
includes_structural_words,4,[0],0.312834,0.31122,0.255078
includes_general_chemical_words,5,[0],0.387517,0.384808,0.316
includes_functional_group,6,[0],0.250199,0.249602,0.211822
includes_paper_artifacts,7,[0],0.132717,0.132438,0.118418
includes_no_terms,8,[0],0.13198,0.130766,0.110093
structure_adjacent_mentions,9,[0],0.092906,0.092906,0.078567


In [347]:
LFAnalysis(L_train).label_coverage()

0.9932884569425635

In [348]:
# gets the average amount of labels (goal is to get this to 15)
sum = 0
total = 0
for item in L_train:
    sum += 31 - np.count_nonzero(item == -1)
    total += 1
sum/total


0.27308213176133195

In [349]:
from snorkel.labeling.model import MajorityLabelVoter
from snorkel.labeling.model import LabelModel

In [350]:
# Define the set of labeling functions (LFs)
# currently excluding amino_acid and followed_ase and followed_by_noun

# removing physical_words increases recall but causes large drop in precision
# sep_conversion_word and sep_verb removal increase precision to 0.71 with recall at 0.38
# Apply the LFs to the unlabeled training data
df_test = sentence_chem_df
applier_test = PandasLFApplier(lfs=lfs)
L_test = applier_test.apply(df=df_test)

100%|██████████| 1414/1414 [00:03<00:00, 439.67it/s]


In [351]:
LFAnalysis(L=L_test, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
includes_solution_words,0,[0],0.012023,0.012023,0.010608
includes_mixture_words,1,[0],0.001414,0.001414,0.001414
includes_physical_words,2,[0],0.304809,0.304809,0.218529
includes_genetic_words,3,[0],0.212871,0.212871,0.166195
includes_structural_words,4,[0],0.194484,0.194484,0.149222
includes_general_chemical_words,5,[0],0.251061,0.251061,0.188826
includes_functional_group,6,[0],0.173975,0.173975,0.139321
includes_paper_artifacts,7,[0],0.041726,0.041726,0.028996
includes_no_terms,8,[0],0.192362,0.191655,0.14215
structure_adjacent_mentions,9,[0],0.39604,0.39604,0.269448


In [352]:
LFAnalysis(L_test).label_coverage()

0.9985855728429985

In [353]:
majority_model = MajorityLabelVoter()
df_test["label_voter"] = majority_model.predict(L=L_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["label_voter"] = majority_model.predict(L=L_test)


In [354]:
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123, class_balance=[0.95, 0.05])
df_test["label_model"] = label_model.predict(L=L_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["label_model"] = label_model.predict(L=L_test)


In [355]:
def grab_second(probs):
    return (probs[1])

In [356]:
# df_train["label_probs"] = np.apply_along_axis(grab_second, 1, majority_model.predict_proba(L=L_train))
# df_train["label_probs"].value_counts()

In [357]:
df_test.to_csv(r'../labeled.csv')

In [358]:
df_test["truth"].value_counts()

0.0    462
1.0     33
Name: truth, dtype: int64

In [359]:
only_truth_df = df_test.dropna(subset=['truth'])
only_truth_df

Unnamed: 0,sentence,chemicals,truth,substrates,products,text,label_voter,label_model
0,the enzyme cyclo-oxygenase catalyses the oxyge...,[prostaglandins],1.0,"arachidonic, acid",prostaglandins,[the enzyme cyclo-oxygenase catalyses the oxyg...,1,0
1,recently two forms of cyclo-oxygenase have bee...,[],0.0,,,[recently two forms of cyclo-oxygenase have be...,0,0
2,constitutive and inducible forms of human cycl...,[],0.0,,,[constitutive and inducible forms of human cyc...,0,0
3,hcox-1 had a specific activity of 188 mumol of...,[arachidonate],0.0,arachidonate,,[hcox-1 had a specific activity of 188 mumol o...,0,0
4,"of 1500 nmol of o2/nmol of enzyme, whereas hco...",[arachidonate],0.0,arachidonate,,"[of 1500 nmol of o2/nmol of enzyme, whereas hc...",0,0
...,...,...,...,...,...,...,...,...
1409,water uptake of the polymer was only 28 and 02...,[pec],0.0,,,[water uptake of the polymer was only 28 and 0...,0,0
1410,degradation of less hydrophilic pec41 with hig...,[],0.0,,,[degradation of less hydrophilic pec41 with hi...,0,0
1411,"by this mechanism, ce-responsive drug in vitro...",[pec],0.0,,,"[by this mechanism, ce-responsive drug in vitr...",-1,0
1412,"as expected, less bovine serum albumin bsa was...",[],0.0,,,"[as expected, less bovine serum albumin bsa wa...",0,0


In [360]:
import sklearn as sk
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score


In [361]:
no_abstain_df = only_truth_df[only_truth_df["label_voter"] != -1]
print("f1:", sk.metrics.f1_score(no_abstain_df["truth"], no_abstain_df["label_voter"]))
print("recall:", recall_score(no_abstain_df["truth"], no_abstain_df["label_voter"]))
print("precision:", precision_score(no_abstain_df["truth"], no_abstain_df["label_voter"]))
print("accuracy:", accuracy_score(no_abstain_df["truth"], no_abstain_df["label_voter"]))
print("balanced accuracy:", balanced_accuracy_score(no_abstain_df["truth"], no_abstain_df["label_voter"]))
tn, fp, fn, tp = confusion_matrix(no_abstain_df["truth"], no_abstain_df["label_voter"]).ravel()
print("matrix:", (tn, fp, fn, tp))

f1: 0.7096774193548386
recall: 0.7586206896551724
precision: 0.6666666666666666
accuracy: 0.9611231101511879
balanced accuracy: 0.8666375337676784
matrix: (423, 11, 7, 22)


In [None]:
# no physical structure lf
# f1: 0.7272727272727273
# recall: 0.8275862068965517
# precision: 0.6486486486486487
# accuracy: 0.9602649006622517
# balanced accuracy: 0.8984629147690306
# matrix: (411, 13, 5, 24)

In [362]:
no_abstain_df = only_truth_df[only_truth_df["label_model"] != -1]
print("f1:", sk.metrics.f1_score(no_abstain_df["truth"], no_abstain_df["label_model"]))
print("recall:", recall_score(no_abstain_df["truth"], no_abstain_df["label_model"]))
print("precision:", precision_score(no_abstain_df["truth"], no_abstain_df["label_model"]))
print("accuracy:", accuracy_score(no_abstain_df["truth"], no_abstain_df["label_model"]))
print("balanced accuracy:", balanced_accuracy_score(no_abstain_df["truth"], no_abstain_df["label_model"]))
tn, fp, fn, tp = confusion_matrix(no_abstain_df["truth"], no_abstain_df["label_model"]).ravel()
print("matrix:", (tn, fp, fn, tp))

f1: 0.2631578947368421
recall: 0.15151515151515152
precision: 1.0
accuracy: 0.9434343434343434
balanced accuracy: 0.5757575757575758
matrix: (462, 0, 28, 5)


In [None]:
# no physical structure lf
# f1: 0.3902439024390244
# recall: 0.24242424242424243
# precision: 1.0
# accuracy: 0.9494949494949495
# balanced accuracy: 0.6212121212121212
# matrix: (462, 0, 25, 8)

In [212]:
df_train[df_train["label_voter"] != -1]["truth"].value_counts()

0.0    325
1.0     11
Name: truth, dtype: int64

In [213]:
df_train[df_train["label_voter"] != -1]["label"].value_counts()

0    1285
1      45
Name: label, dtype: int64

In [214]:
matched_df = df_train[df_train["label_voter"] == df_train["truth"]]
matched_df["label"].value_counts()

0    323
1      7
Name: label, dtype: int64