## iGEM labeling functions

## example

In [1]:
import pandas as pd
import numpy as np
import os
import nltk
import regex as re
import ast
from collections import defaultdict
# from snorkel.labeling import labeling_function

In [61]:
# @labeling_function()
def oxidation_with_sub_prod(sentence):
    for word in oxidationWords:
        if word in sentence.lower:
            return TRUE
    return ABSTAIN

### my functions

In [101]:
test_data = [
    [["carbon", "oxygen"], "carbon was oxidized by the oxygen"],
    [["carbon", "oxygen", "amino acid", "cassie"], "the cassie amino acid is connected to the carbon which was oxidixed by the oxygen inside the dna"],
    
    [["L-GalDH"], "The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity."],
    [["L-GalDH"], "Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy."],
    [["L-GalL dehydrogenase"], "We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000)."],
    [["L-GalDH", "dehydro-AsA", "L-GalL"], "The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL."]
]

In [50]:
ABSTAIN = -1

In [51]:
# LF_general_chemical
# If one of the chemicals is in a list of common chemical terms such as amino acid, sugar, adenosine, amide, adenine, etc..., we label FALSE
# https://www.hach.com/chemGlossary   
common_chemical_terms = ["amino acid", "sugar", "adenosine", "amide", "adenine", "acid", "algaecide", "amines", "base", "biocides",
    "buffer", "clarifier", "diluent", "hydrocarbon", "hydrogenation", "molecule", "nutrients", "polymer", "peptide", "polypeptide", "solute",
    "solvent", "suspended", "mixture", "saturated", "unsaturated"]
def general_chemical(chemicals):
    for chem in chemicals:
        if (chem in common_chemical_terms):
            return (False)
    return ABSTAIN

In [52]:
# LF_adjacent_mentions
# If the chemicals are adjacent, we label FALSE
def adjacent_mentions(chemicals, sentence):
    for index in range(len(chemicals)):
        for index_2 in range(0, len(chemicals)):
            if ((chemicals[index] + " " + chemicals[index_2]) in sentence):
                return False
    return ABSTAIN

In [53]:
# LF_gene_words
# If one of the chemicals is in a list of genetic terms such as reductase, dna, gene, we label FALSE
# 
common_genetic_terms = ["reductase", "dna", "gene", "allele", "locus", "genotype", "phenotype", "dominant", "recessive", "additive", "phenoset",
    "diallelic", "multiallelic", "polyallelic", "monomorphic", "monoallelic", "polymorphism", "mutation", "complex", "trait", "multifactorial",
    "polygenic", "monogenic", "mixed model", "transmission probability", "transition probability", "epistasis", "interaction", "pleiotropy",
    "quantitative trait locus", "probit", "logit", "penetrance", "transformation", "scale of measurement", "identity by descent", "identity in state",
    "haplotype", "phase", "multilocus", "genotype", "allelic", "association", "linkage", "disequilibrium", "gametic"]
def gene_words(chemicals):
    for chem in chemicals:
        if (chem in common_genetic_terms):
            return False
    return ABSTAIN
    

In [54]:
# LF_sep_or
# If the chemicals are separated by or, we label FALSE
def sep_or(chemicals, sentence):
    for index in range(len(chemicals)):
        for index_2 in range(index + 1, len(chemicals)):
            if (chemicals[index] + " or " + chemicals[index_2] in sentence):
                return False
    return ABSTAIN

In [55]:
# LF_sep_and
# If the chemicals are separated by and, we label FALSE
def sep_and(chemicals, sentence):
    for index in range(len(chemicals)):
        for index_2 in range(index + 1, len(chemicals)):
            if (chemicals[index] + " and " + chemicals[index_2] in sentence):
                return False
    return ABSTAIN

In [56]:
# LF_sep_comma
# If the chemicals are separated by a comma, we label FALSE
def sep_comma(chemicals, sentence):
    for index in range(len(chemicals)):
        for index_2 in range(index + 1, len(chemicals)):
            if (chemicals[index] + ", " + chemicals[index_2] in sentence):
                return False
    return ABSTAIN

In [57]:
# LF_sep_via
# If the chemicals are separated by via, in, etc, we label FALSE
common_via_words = ["via", "in", "within", "through", "inside", "by"]
def sep_via(chemicals, sentence):
    for index in range(len(chemicals)):
        for index_2 in range(index + 1, len(chemicals)):
            for term in common_via_words:
                if (chemicals[index] + " " + term + " " + chemicals[index_2] in sentence):
                    return False
    return ABSTAIN

In [58]:
# LF_sep_sym
# If the chemicals are separated by a single character, we label FALSE  
def sep_sym(chemicals, sentence):
    for index in range(len(chemicals)):
        chem_1_index = sentence.find(chemicals[index])
        if (chem_1_index == -1):
            continue
        chem_1_len = len(chemicals[index])
        for index_2 in range(index + 1, len(chemicals)):
            chem_2_index = sentence.find(chemicals[index_2])
            if (chem_2_index == -1):
                continue
            if (chem_1_index + chem_1_len + 1 == chem_2_index):
                return False
    return ABSTAIN

In [59]:
# LF_followed_ase
# If one of the chemicals is followed by a word that ends with -ase, we label FALSE
def followed_ase(chemicals, sentence):
    sentence = sentence.replace('.', '')
    sentence = sentence.replace(',', '')
    words = sentence.split(" ")
    for chem in chemicals:
        if (" " in chem):
            chem_split = chem.split(" ")
            index = words.index(chem_split[-1])
        else:
            index = words.index(chem)
        if (index + 1 < len(words) and words[index + 1][-3:] == "ase"):
            return False
    return ABSTAIN

In [60]:
# LF_group
# If there is a close mention of a functional chemical group, we label FALSE (as it is more likely descriptive of a structure than of a reaction)
# https://www.masterorganicchemistry.com/2010/10/06/functional-groups-organic-chemistry/
common_functional_groups = ["alkane", "alkene", "alkyne", "benzene ring", "phenyl", "amine", "alcohol", "ether", "alkyl halide", "thiol",
    "aldehyde", "ketone", "ester", "carboxylic acid", "amide", "nitrile", "epoxide", "disulfide", "imine", "acid chloride", "anhydride", "nitro",
    "sulfide", "thioether"]
def group(sentence):
    for group in common_functional_groups:
        if (group in sentence):
            return False
    return ABSTAIN


In [61]:
# LF_followed_by_noun
# If one of the chemicals is followed by a noun, we label FALSE
def followed_by_noun(chemicals, sentence):
    sentence = sentence.replace('.', '')
    sentence = sentence.replace(',', '')
    words = sentence.split(" ")
    words = sentence.split(" ")
    tagged = nltk.pos_tag(words)
    for chem in chemicals:
        if (" " in chem):
            chem_split = chem.split(" ")
            index = words.index(chem_split[-1])
        else:
            index = words.index(chem)
        if (index + 1 < len(words)):
            if (tagged[index + 1][1] == "NN"):
                return False
    return ABSTAIN

In [62]:
# test cases
print(followed_by_noun(["L-GalDH"], "The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity."))
print(followed_by_noun(["L-GalDH"], "Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy."))
print(followed_by_noun(["L-GalL dehydrogenase"], "We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000)."))
print(followed_by_noun(["L-GalDH", "dehydro-AsA", "L-GalL"], "The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL."))

False
False
-1
False


In [63]:
# LF_sep_verb
# If the chemicals are separated by a verb, we label TRUE
def sep_verb(chemicals, sentence):
    sentence = sentence.replace('.', '')
    sentence = sentence.replace(',', '')
    words = sentence.split(" ")
    indexes = []
    for chem in chemicals:
        if (" " in chem):
            chem_split = chem.split(" ")
            index = words.index(chem_split[-1])
        else:
            index = words.index(chem)
        indexes.append(index)
    min_index = min(indexes)
    max_index = max(indexes)
    tagged = nltk.pos_tag(words)
    for index in range(min_index + 1, max_index):
        if (tagged[index][1][:2] == "VB"):
            return True
    return ABSTAIN

In [64]:
# test cases
print(sep_verb(["L-GalDH"], "The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity."))
print(sep_verb(["L-GalDH"], "Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy."))
print(sep_verb(["L-GalL dehydrogenase"], "We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000)."))
print(sep_verb(["L-GalDH", "dehydro-AsA", "L-GalL"], "The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL."))

-1
-1
-1
True


In [65]:
# LF_sep_adverb
# If the chemicals are separated by a adverb, we label FALSE
def sep_adverb(chemicals, sentence):
    sentence = sentence.replace('.', '')
    sentence = sentence.replace(',', '')
    words = sentence.split(" ")
    words = sentence.split(" ")
    indexes = []
    for chem in chemicals:
        if (" " in chem):
            chem_split = chem.split(" ")
            index = words.index(chem_split[-1])
        else:
            index = words.index(chem)
        indexes.append(index)
    min_index = min(indexes)
    max_index = max(indexes)
    tagged = nltk.pos_tag(words)
    for index in range(min_index + 1, max_index):
        if (tagged[index][1][:2] == "RB"):
            return False
    return ABSTAIN

In [66]:
# test cases
print(sep_adverb(["L-GalDH"], "The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity."))
print(sep_adverb(["L-GalDH"], "Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy."))
print(sep_adverb(["L-GalL dehydrogenase"], "We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000)."))
print(sep_adverb(["L-GalDH", "dehydro-AsA", "L-GalL"], "The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL."))

-1
-1
-1
False


In [67]:
# things to fix
# if the chem is several words
# creating more test cases
# current test sentences from https://academic.oup.com/pcp/article/45/9/1271/1857717

In [110]:
for data in test_data:
    print(data)
    print("general chemicals")
    print(general_chemical(data[0]))
    print("adjacent mentions")
    print(adjacent_mentions(data[0], data[1]))
    print("sep or")
    print(sep_or(data[0], data[1]))
    print("sep and")
    print(sep_and(data[0], data[1]))
    print("sep comma")
    print(sep_comma(data[0], data[1]))
    print("sep via")
    print(sep_via(data[0], data[1]))
    print("sep sym")
    print(sep_sym(data[0], data[1]))
    print("followed ase")
    print(followed_ase(data[0], data[1]))
    print("group")
    print(group(data[1]))
    print("followed by noun")
    print(followed_by_noun(data[0], data[1]))
    print("sep verb")
    print(sep_verb(data[0], data[1]))
    print("sep adverb")
    print(sep_adverb(data[0], data[1]))
    print()

[['carbon', 'oxygen'], 'carbon was oxidized by the oxygen']
general chemicals
-1
adjacent mentions
carbon carbon
carbon oxygen
oxygen carbon
oxygen oxygen
-1
sep or
-1
sep and
-1
sep comma
-1
sep via
-1
sep sym
-1
followed ase
-1
group
-1
followed by noun
-1
sep verb
True
sep adverb
-1

[['carbon', 'oxygen', 'amino acid', 'cassie'], 'the cassie amino acid is connected to the carbon which was oxidixed by the oxygen inside the dna']
general chemicals
False
adjacent mentions
carbon carbon
carbon oxygen
carbon amino acid
carbon cassie
oxygen carbon
oxygen oxygen
oxygen amino acid
oxygen cassie
amino acid carbon
amino acid oxygen
amino acid amino acid
amino acid cassie
cassie carbon
cassie oxygen
cassie amino acid
False
sep or
-1
sep and
-1
sep comma
-1
sep via
-1
sep sym
-1
followed ase
-1
group
-1
followed by noun
False
sep verb
True
sep adverb
-1

[['L-GalDH'], 'The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH act

In [10]:
sentence_df = pd.read_csv("sentence_data_cleaned_jacob.csv")
sentence_with_chem_df = sentence_df.dropna()
several_chem_df = sentence_with_chem_df.loc[sentence_with_chem_df["chemicals"].str.contains(",")]

In [107]:
sentence_chem_df = several_chem_df[["sentence", "chemicals"]]

In [108]:
greek_alphabet = {
    u'\u0393': 'Gamma',
    u'\u0394': 'Delta',
    u'\u0398': 'Theta',
    u'\u039B': 'Lamda',
    u'\u039E': 'Xi',
    u'\u03A0': 'Pi',
    u'\u03A3': 'Sigma',
    u'\u03A6': 'Phi',
    u'\u03A7': 'Chi',
    u'\u03A8': 'Psi',
    u'\u03A9': 'Omega',
    u'\u03B1': 'alpha',
    u'\u03B2': 'beta',
    u'\u03B3': 'gamma',
    u'\u03B4': 'delta',
    u'\u03B5': 'epsilon',
    u'\u03B6': 'zeta',
    u'\u03B7': 'eta',
    u'\u03B8': 'theta',
    u'\u03B9': 'iota',
    u'\u03BA': 'kappa',
    u'\u03BB': 'lamda',
    u'\u03BC': 'mu',
    u'\u03BD': 'nu',
    u'\u03BE': 'xi',
    u'\u03BF': 'omicron',
    u'\u03C0': 'pi',
    u'\u03C1': 'rho',
    u'\u03C3': 'sigma',
    u'\u03C4': 'tau',
    u'\u03C5': 'upsilon',
    u'\u03C6': 'phi',
    u'\u03C7': 'chi',
    u'\u03C8': 'psi',
    u'\u03C9': 'omega'
}
greek_alphabet

{'Γ': 'Gamma',
 'Δ': 'Delta',
 'Θ': 'Theta',
 'Λ': 'Lamda',
 'Ξ': 'Xi',
 'Π': 'Pi',
 'Σ': 'Sigma',
 'Φ': 'Phi',
 'Χ': 'Chi',
 'Ψ': 'Psi',
 'Ω': 'Omega',
 'α': 'alpha',
 'β': 'beta',
 'γ': 'gamma',
 'δ': 'delta',
 'ε': 'epsilon',
 'ζ': 'zeta',
 'η': 'eta',
 'θ': 'theta',
 'ι': 'iota',
 'κ': 'kappa',
 'λ': 'lamda',
 'μ': 'mu',
 'ν': 'nu',
 'ξ': 'xi',
 'ο': 'omicron',
 'π': 'pi',
 'ρ': 'rho',
 'σ': 'sigma',
 'τ': 'tau',
 'υ': 'upsilon',
 'φ': 'phi',
 'χ': 'chi',
 'ψ': 'psi',
 'ω': 'omega'}

In [109]:
'β' in greek_alphabet.keys()

True

In [110]:
def remove_greek(words):
    for letter in greek_alphabet.keys():
        words = words.replace(letter, greek_alphabet[letter])
    return words


In [111]:
# greek test
remove_greek("11β-hydroxysteroid")

'11beta-hydroxysteroid'

In [112]:
def chem_into_array(chemicals):
    chemicals = remove_greek(chemicals)
    sample_chems = chemicals.replace(",", "")
    sample_chems_list = sample_chems.split(" ")
    return sample_chems_list

In [113]:
def sentence_cleaned(sentence):
    sentence = remove_greek(sentence)
    sentence = sentence.replace(".", "")
    sentence = sentence.replace(")", "")
    sentence = sentence.replace("(", "")
    sentence = sentence.replace(";", "")
    return sentence

In [114]:
sentence_chem_df["sentence"] = sentence_chem_df["sentence"].apply(sentence_cleaned)
sentence_chem_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["sentence"] = sentence_chem_df["sentence"].apply(sentence_cleaned)


Unnamed: 0,sentence,chemicals
197,BACKGROUND AND AIMS: Glutamic acid decarboxyla...,"glutamate, GABA"
252,"Interestingly, the allele of PRO1 was shown to...","gamma-glutamyl, gamma-glutamyl, L-proline, L-g..."
282,We concluded that FDH has no direct role in th...,"Ser, formate"
318,Even though the activities of MAT and GNMT wer...,"S-adenosylmethionine, S-adenosylhomocysteine"
323,One of the enzymes responsible for the product...,"KA, kynurenine, glutamine"
439,PURPOSE: The fluoropyrimidine carbamate capeci...,"capecitabine, 5-fluorouracil, 5-FU, thymidine"
465,Carnitine acetyltransferases CrAT catalyze the...,"Carnitine, acetyl-CoA, carnitine, acetylcarnitine"
484,"L-serine dehydratase SDH, a member of the beta...","L-serine, L-serine, L-threonine, pyruvate, 2-o..."
545,Spermidine/spermine N1-acetyltransferase SSAT ...,"polyamine, spermidine, spermine"
561,Astrocytes may play a role in these manifestat...,"glutamate, glutamine, glutamine"


In [115]:
sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].apply(chem_into_array)
sentence_chem_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].apply(chem_into_array)


Unnamed: 0,sentence,chemicals
197,BACKGROUND AND AIMS: Glutamic acid decarboxyla...,"[glutamate, GABA]"
252,"Interestingly, the allele of PRO1 was shown to...","[gamma-glutamyl, gamma-glutamyl, L-proline, L-..."
282,We concluded that FDH has no direct role in th...,"[Ser, formate]"
318,Even though the activities of MAT and GNMT wer...,"[S-adenosylmethionine, S-adenosylhomocysteine]"
323,One of the enzymes responsible for the product...,"[KA, kynurenine, glutamine]"
439,PURPOSE: The fluoropyrimidine carbamate capeci...,"[capecitabine, 5-fluorouracil, 5-FU, thymidine]"
465,Carnitine acetyltransferases CrAT catalyze the...,"[Carnitine, acetyl-CoA, carnitine, acetylcarni..."
484,"L-serine dehydratase SDH, a member of the beta...","[L-serine, L-serine, L-threonine, pyruvate, 2-..."
545,Spermidine/spermine N1-acetyltransferase SSAT ...,"[polyamine, spermidine, spermine]"
561,Astrocytes may play a role in these manifestat...,"[glutamate, glutamine, glutamine]"


In [119]:
for index in range(len(sentence_chem_df["sentence"])):
    sentence = sentence_chem_df["sentence"].iloc[index]
    chems = sentence_chem_df["chemicals"].iloc[index]
    print(chems)
    print(sentence)
    print("general chemicals: " + str(general_chemical(chems)))
    print("adjacent mentions: " + str(adjacent_mentions(chems, sentence)))
    print("sep or: " + str(sep_or(chems, sentence)))
    print("sep and: " + str(sep_and(chems, sentence)))
    print("sep comma: " + str(sep_comma(chems, sentence)))
    print("sep via: " + str(sep_via(chems, sentence)))
    print("sep sym: " + str(sep_sym(chems, sentence)))
    print("followed ase: " + str(followed_ase(chems, sentence)))
    print("group: " + str(group(sentence)))
    print("followed by noun: " + str(followed_by_noun(chems, sentence)))
    print("sep verb: " + str(sep_verb(chems, sentence)))
    print("sep adverb: " + str(sep_adverb(chems, sentence)))
    print()

['glutamate', 'GABA']
BACKGROUND AND AIMS: Glutamic acid decarboxylase GAD, EC 41115 catalyses the conversion of glutamate to gamma-aminobutyric acid GABA
general chemicals: -1
adjacent mentions: -1
sep or: -1
sep and: -1
sep comma: -1
sep via: -1
sep sym: -1
followed ase: -1
group: -1
followed by noun: -1
sep verb: -1
sep adverb: -1

['gamma-glutamyl', 'gamma-glutamyl', 'L-proline', 'L-glutamate']
Interestingly, the allele of PRO1 was shown to enhance the activities of gamma-glutamyl kinase and gamma-glutamyl phosphate reductase, both of which catalyze the first two steps of L-proline synthesis from L-glutamate and which together may form a complex in vivo
general chemicals: -1
adjacent mentions: -1
sep or: -1
sep and: -1
sep comma: -1
sep via: -1
sep sym: -1
followed ase: False
group: False
followed by noun: False
sep verb: True
sep adverb: -1

['Ser', 'formate']
We concluded that FDH has no direct role in the regulation of the above two pathways of Ser synthesis the breakdown of for