## iGEM labeling functions

## example

In [60]:
import pandas as pd
import numpy as np
import os
import nltk
import regex as re
import ast
from collections import defaultdict
# from snorkel.labeling import labeling_function

In [61]:
# @labeling_function()
def oxidation_with_sub_prod(sentence):
    for word in oxidationWords:
        if word in sentence.lower:
            return TRUE
    return ABSTAIN

### my functions

In [101]:
test_data = [
    [["carbon", "oxygen"], "carbon was oxidized by the oxygen"],
    [["carbon", "oxygen", "amino acid", "cassie"], "the cassie amino acid is connected to the carbon which was oxidixed by the oxygen inside the dna"],
    
    [["L-GalDH"], "The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity."],
    [["L-GalDH"], "Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy."],
    [["L-GalL dehydrogenase"], "We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000)."],
    [["L-GalDH", "dehydro-AsA", "L-GalL"], "The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL."]
]

In [81]:
ABSTAIN = -1

In [88]:
# LF_general_chemical
# If one of the chemicals is in a list of common chemical terms such as amino acid, sugar, adenosine, amide, adenine, etc..., we label FALSE
# https://www.hach.com/chemGlossary   
common_chemical_terms = ["amino acid", "sugar", "adenosine", "amide", "adenine", "acid", "algaecide", "amines", "base", "biocides",
    "buffer", "clarifier", "diluent", "hydrocarbon", "hydrogenation", "molecule", "nutrients", "polymer", "peptide", "polypeptide", "solute",
    "solvent", "suspended", "mixture", "saturated", "unsaturated"]
def general_chemical(chemicals):
    for chem in chemicals:
        if (chem in common_chemical_terms):
            return (False)
    return ABSTAIN

In [64]:
# LF_adjacent_mentions
# If the chemicals are adjacent, we label FALSE
def adjacent_mentions(chemicals, sentence):
    for index in range(len(chemicals)):
        for index_2 in range(index + 1, len(chemicals)):
            if (chemicals[index] + " " + chemicals[index_2] in sentence):
                return False
    return ABSTAIN

In [89]:
# LF_gene_words
# If one of the chemicals is in a list of genetic terms such as reductase, dna, gene, we label FALSE
# 
common_genetic_terms = ["reductase", "dna", "gene", "allele", "locus", "genotype", "phenotype", "dominant", "recessive", "additive", "phenoset",
    "diallelic", "multiallelic", "polyallelic", "monomorphic", "monoallelic", "polymorphism", "mutation", "complex", "trait", "multifactorial",
    "polygenic", "monogenic", "mixed model", "transmission probability", "transition probability", "epistasis", "interaction", "pleiotropy",
    "quantitative trait locus", "probit", "logit", "penetrance", "transformation", "scale of measurement", "identity by descent", "identity in state",
    "haplotype", "phase", "multilocus", "genotype", "allelic", "association", "linkage", "disequilibrium", "gametic"]
def gene_words(chemicals):
    for chem in chemicals:
        if (chem in common_genetic_terms):
            return False
    return ABSTAIN
    

In [66]:
# LF_sep_or
# If the chemicals are separated by or, we label FALSE
def sep_or(chemicals, sentence):
    for index in range(len(chemicals)):
        for index_2 in range(index + 1, len(chemicals)):
            if (chemicals[index] + " or " + chemicals[index_2] in sentence):
                return False
    return ABSTAIN

In [67]:
# LF_sep_and
# If the chemicals are separated by and, we label FALSE
def sep_and(chemicals, sentence):
    for index in range(len(chemicals)):
        for index_2 in range(index + 1, len(chemicals)):
            if (chemicals[index] + " and " + chemicals[index_2] in sentence):
                return False
    return ABSTAIN

In [68]:
# LF_sep_comma
# If the chemicals are separated by a comma, we label FALSE
def sep_comma(chemicals, sentence):
    for index in range(len(chemicals)):
        for index_2 in range(index + 1, len(chemicals)):
            if (chemicals[index] + ", " + chemicals[index_2] in sentence):
                return False
    return ABSTAIN

In [69]:
# LF_sep_via
# If the chemicals are separated by via, in, etc, we label FALSE
common_via_words = ["via", "in", "within", "through", "inside", "by"]
def sep_via(chemicals, sentence):
    for index in range(len(chemicals)):
        for index_2 in range(index + 1, len(chemicals)):
            for term in common_via_words:
                if (chemicals[index] + " " + term + " " + chemicals[index_2] in sentence):
                    return False
    return ABSTAIN

In [70]:
# LF_sep_sym
# If the chemicals are separated by a single character, we label FALSE  
def sep_sym(chemicals, sentence):
    for index in range(len(chemicals)):
        chem_1_index = sentence.find(chemicals[index])
        if (chem_1_index == -1):
            continue
        chem_1_len = len(chemicals[index])
        for index_2 in range(index + 1, len(chemicals)):
            chem_2_index = sentence.find(chemicals[index_2])
            if (chem_2_index == -1):
                continue
            if (chem_1_index + chem_1_len + 1 == chem_2_index):
                return False
    return ABSTAIN

In [97]:
# LF_followed_ase
# If one of the chemicals is followed by a word that ends with -ase, we label FALSE
def followed_ase(chemicals, sentence):
    sentence = sentence.replace('.', '')
    sentence = sentence.replace(',', '')
    words = sentence.split(" ")
    for chem in chemicals:
        if (" " in chem):
            chem_split = chem.split(" ")
            index = words.index(chem_split[-1])
        else:
            index = words.index(chem)
        if (index + 1 < len(words) and words[index + 1][-3:] == "ase"):
            return False
    return ABSTAIN

In [90]:
# LF_group
# If there is a close mention of a functional chemical group, we label FALSE (as it is more likely descriptive of a structure than of a reaction)
# https://www.masterorganicchemistry.com/2010/10/06/functional-groups-organic-chemistry/
common_functional_groups = ["alkane", "alkene", "alkyne", "benzene ring", "phenyl", "amine", "alcohol", "ether", "alkyl halide", "thiol",
    "aldehyde", "ketone", "ester", "carboxylic acid", "amide", "nitrile", "epoxide", "disulfide", "imine", "acid chloride", "anhydride", "nitro",
    "sulfide", "thioether"]
def group(sentence):
    for group in common_functional_groups:
        if (group in sentence):
            return False
    return ABSTAIN


In [96]:
# LF_followed_by_noun
# If one of the chemicals is followed by a noun, we label FALSE
def followed_by_noun(chemicals, sentence):
    sentence = sentence.replace('.', '')
    sentence = sentence.replace(',', '')
    words = sentence.split(" ")
    words = sentence.split(" ")
    tagged = nltk.pos_tag(words)
    for chem in chemicals:
        if (" " in chem):
            chem_split = chem.split(" ")
            index = words.index(chem_split[-1])
        else:
            index = words.index(chem)
        if (index + 1 < len(words)):
            if (tagged[index + 1][1] == "NN"):
                return False
    return ABSTAIN

In [74]:
# test cases
print(followed_by_noun(["L-GalDH"], "The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity."))
print(followed_by_noun(["L-GalDH"], "Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy."))
print(followed_by_noun(["L-GalL dehydrogenase"], "We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000)."))
print(followed_by_noun(["L-GalDH", "dehydro-AsA", "L-GalL"], "The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL."))

False
False
-1
False


In [98]:
# LF_sep_verb
# If the chemicals are separated by a verb, we label TRUE
def sep_verb(chemicals, sentence):
    sentence = sentence.replace('.', '')
    sentence = sentence.replace(',', '')
    words = sentence.split(" ")
    indexes = []
    for chem in chemicals:
        if (" " in chem):
            chem_split = chem.split(" ")
            index = words.index(chem_split[-1])
        else:
            index = words.index(chem)
        indexes.append(index)
    min_index = min(indexes)
    max_index = max(indexes)
    tagged = nltk.pos_tag(words)
    for index in range(min_index + 1, max_index):
        if (tagged[index][1][:2] == "VB"):
            return True
    return ABSTAIN

In [76]:
# test cases
print(sep_verb(["L-GalDH"], "The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity."))
print(sep_verb(["L-GalDH"], "Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy."))
print(sep_verb(["L-GalL dehydrogenase"], "We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000)."))
print(sep_verb(["L-GalDH", "dehydro-AsA", "L-GalL"], "The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL."))

-1
-1
-1
True


In [99]:
# LF_sep_adverb
# If the chemicals are separated by a adverb, we label FALSE
def sep_adverb(chemicals, sentence):
    sentence = sentence.replace('.', '')
    sentence = sentence.replace(',', '')
    words = sentence.split(" ")
    words = sentence.split(" ")
    indexes = []
    for chem in chemicals:
        if (" " in chem):
            chem_split = chem.split(" ")
            index = words.index(chem_split[-1])
        else:
            index = words.index(chem)
        indexes.append(index)
    min_index = min(indexes)
    max_index = max(indexes)
    tagged = nltk.pos_tag(words)
    for index in range(min_index + 1, max_index):
        if (tagged[index][1][:2] == "RB"):
            return False
    return ABSTAIN

In [78]:
# test cases
print(sep_adverb(["L-GalDH"], "The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity."))
print(sep_adverb(["L-GalDH"], "Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy."))
print(sep_adverb(["L-GalL dehydrogenase"], "We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000)."))
print(sep_adverb(["L-GalDH", "dehydro-AsA", "L-GalL"], "The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL."))

-1
-1
-1
False


In [79]:
# things to fix
# if the chem is several words
# creating more test cases
# current test sentences from https://academic.oup.com/pcp/article/45/9/1271/1857717

In [102]:
for data in test_data:
    print(data)
    print("general chemicals")
    print(general_chemical(data[0]))
    print("adjacent mentions")
    print(adjacent_mentions(data[0], data[1]))
    print("sep or")
    print(sep_or(data[0], data[1]))
    print("sep and")
    print(sep_and(data[0], data[1]))
    print("sep comma")
    print(sep_comma(data[0], data[1]))
    print("sep via")
    print(sep_via(data[0], data[1]))
    print("sep sym")
    print(sep_sym(data[0], data[1]))
    print("followed ase")
    print(followed_ase(data[0], data[1]))
    print("group")
    print(group(data[1]))
    print("followed by noun")
    print(followed_by_noun(data[0], data[1]))
    print("sep verb")
    print(sep_verb(data[0], data[1]))
    print("sep adverb")
    print(sep_adverb(data[0], data[1]))
    print()

[['carbon', 'oxygen'], 'carbon was oxidized by the oxygen']
general chemicals
-1
adjacent mentions
-1
sep or
-1
sep and
-1
sep comma
-1
sep via
-1
sep sym
-1
followed ase
-1
group
-1
followed by noun
-1
sep verb
True
sep adverb
-1

[['carbon', 'oxygen', 'amino acid', 'cassie'], 'the cassie amino acid is connected to the carbon which was oxidixed by the oxygen inside the dna']
general chemicals
False
adjacent mentions
-1
sep or
-1
sep and
-1
sep comma
-1
sep via
-1
sep sym
-1
followed ase
-1
group
-1
followed by noun
False
sep verb
True
sep adverb
-1

[['L-GalDH'], 'The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity.']
general chemicals
-1
adjacent mentions
-1
sep or
-1
sep and
-1
sep comma
-1
sep via
-1
sep sym
-1
followed ase
-1
group
-1
followed by noun
False
sep verb
-1
sep adverb
-1

[['L-GalDH'], 'Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy.']
general chemica