## iGEM labeling functions

## example

In [293]:
import pandas as pd
import numpy as np
import os
import nltk
import regex as re
import ast
from collections import defaultdict
# from snorkel.labeling import labeling_function

In [294]:
# @labeling_function()
def oxidation_with_sub_prod(sentence):
    for word in oxidationWords:
        if word in sentence.lower:
            return TRUE
    return ABSTAIN

### test data written by hand

In [295]:
test_data = [
    [["carbon", "oxygen"], "carbon was oxidized by the oxygen"],
    [["carbon", "oxygen", "amino acid", "cassie"], "the cassie amino acid is connected to the carbon which was oxidixed by the oxygen inside the dna"],
    
    [["L-GalDH"], "The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity."],
    [["L-GalDH"], "Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy."],
    [["L-GalL dehydrogenase"], "We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000)."],
    [["L-GalDH", "dehydro-AsA", "L-GalL"], "The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL."]
]

### setting up abstain

In [296]:
ABSTAIN = -1

### labeling functions + small tests written

In [297]:
# LF_general_chemical
# If one of the chemicals is in a list of common chemical terms such as amino acid, sugar, adenosine, amide, adenine, etc..., we label FALSE
# https://www.hach.com/chemGlossary   
common_chemical_terms = ["amino acid", "sugar", "adenosine", "amide", "adenine", "acid", "algaecide", "amines", "base", "biocides",
    "buffer", "clarifier", "diluent", "hydrocarbon", "hydrogenation", "molecule", "nutrients", "polymer", "peptide", "polypeptide", "solute",
    "solvent", "suspended", "mixture", "saturated", "unsaturated"]
def general_chemical(chemicals):
    for chem in chemicals:
        if (chem in common_chemical_terms):
            return (False)
    return ABSTAIN

In [360]:
# LF_adjacent_mentions
# If the chemicals are adjacent, we label FALSE
def adjacent_mentions(chemicals, sentence):
    for index in range(len(chemicals)):
        for index_2 in range(len(chemicals)):
            if ((chemicals[index] + " " + chemicals[index_2]) in sentence):
                return False
    return ABSTAIN

In [299]:
# LF_gene_words
# If one of the chemicals is in a list of genetic terms such as reductase, dna, gene, we label FALSE
# 
common_genetic_terms = ["reductase", "dna", "gene", "allele", "locus", "genotype", "phenotype", "dominant", "recessive", "additive", "phenoset",
    "diallelic", "multiallelic", "polyallelic", "monomorphic", "monoallelic", "polymorphism", "mutation", "complex", "trait", "multifactorial",
    "polygenic", "monogenic", "mixed model", "transmission probability", "transition probability", "epistasis", "interaction", "pleiotropy",
    "quantitative trait locus", "probit", "logit", "penetrance", "transformation", "scale of measurement", "identity by descent", "identity in state",
    "haplotype", "phase", "multilocus", "genotype", "allelic", "association", "linkage", "disequilibrium", "gametic"]
def gene_words(chemicals):
    for chem in chemicals:
        if (chem in common_genetic_terms):
            return False
    return ABSTAIN
    

In [359]:
# LF_sep_or
# If the chemicals are separated by or, we label FALSE
def sep_or(chemicals, sentence):
    for index in range(len(chemicals)):
        for index_2 in range(len(chemicals)):
            if (chemicals[index] + " or " + chemicals[index_2] in sentence):
                return False
    return ABSTAIN

In [358]:
# LF_sep_and
# If the chemicals are separated by and, we label FALSE
def sep_and(chemicals, sentence):
    for index in range(len(chemicals)):
        for index_2 in range(len(chemicals)):
            if (chemicals[index] + " and " + chemicals[index_2] in sentence):
                return False
    return ABSTAIN

In [357]:
# LF_sep_comma
# If the chemicals are separated by a comma, we label FALSE
def sep_comma(chemicals, sentence):
    for index in range(len(chemicals)):
        for index_2 in range(len(chemicals)):
            if (chemicals[index] + ", " + chemicals[index_2] in sentence):
                return False
    return ABSTAIN

In [356]:
# LF_sep_via
# If the chemicals are separated by via, in, etc, we label FALSE
common_via_words = ["via", "in"]
def sep_via(chemicals, sentence):
    for index in range(len(chemicals)):
        for index_2 in range(len(chemicals)):
            for term in common_via_words:
                if (chemicals[index] + " " + term + " " + chemicals[index_2] in sentence):
                    return False
    return ABSTAIN

In [304]:
# LF_sep_sym
# If the chemicals are separated by a single character, we label FALSE  
def sep_sym(chemicals, sentence):
    for index in range(len(chemicals)):
        chem_1_index = sentence.find(chemicals[index])
        chem_1_len = len(chemicals[index])
        while (0 <= chem_1_index < len(sentence)):
            for index_2 in range(len(chemicals)):
                chem_2_index = sentence.find(chemicals[index_2])
                while (0 <= chem_2_index < len(sentence)):
                    if (sep_sym_helper(chem_1_index, chem_1_len, chem_2_index)):
                        return False
                    chem_2_index = sentence.find(chemicals[index_2], chem_2_index + 1)
            chem_1_index = sentence.find(chemicals[index], chem_1_index + 1)
    return ABSTAIN

def sep_sym_helper(index_1, length, index_2):
    if (index_1 + length + 1== index_2):
        return True
    return False

In [305]:
# testing separated by a single character (ensuring it works with repeated chemicals)
sep_sym(["cassie", "cas"], "cassie is here and her name cassie cas is cas cassie")

False

In [306]:
# LF_followed_ase
# If one of the chemicals is followed by a word that ends with -ase, we label FALSE
def followed_ase(chemicals, sentence):
    sentence = sentence.replace('.', '')
    sentence = sentence.replace(',', '')
    words = sentence.split(" ")
    for chem in chemicals:
        index = -1
        while (index < len(words)):
            if not(chem in words[index + 1:]):
                index = len(words)
                break
            if (" " in chem):
                chem_split = chem.split(" ")
                index = words.index(chem_split[-1], index + 1)
            else:
                index = words.index(chem, index + 1)
            if (0 < index + 1 < len(words) and words[index + 1][-3:] == "ase"):
                return False
    return ABSTAIN

In [307]:
# test followed_ase (making sure it works if the chem is repeated)
followed_ase(["carbon", "oxygen"], "carbon fiber and carbon lactase with oxygen")

False

In [308]:
# LF_group
# If there is a close mention of a functional chemical group, we label FALSE (as it is more likely descriptive of a structure than of a reaction)
# https://www.masterorganicchemistry.com/2010/10/06/functional-groups-organic-chemistry/
common_functional_groups = ["alkane", "alkene", "alkyne", "benzene ring", "phenyl", "amine", "alcohol", "ether", "alkyl halide", "thiol",
    "aldehyde", "ketone", "ester", "carboxylic acid", "amide", "nitrile", "epoxide", "disulfide", "imine", "acid chloride", "anhydride", "nitro",
    "sulfide", "thioether"]
def group(sentence):
    for group in common_functional_groups:
        if (group in sentence):
            return False
    return ABSTAIN


In [309]:
# LF_followed_by_noun
# If one of the chemicals is followed by a noun, we label FALSE
def followed_by_noun(chemicals, sentence):
    sentence = sentence.replace(',', '')
    words = sentence.split(" ")
    tagged = nltk.pos_tag(words)
    for chem in chemicals:
        index = -1
        while (index < len(words)):
            if not(chem in words[index + 1:]):
                index = len(words)
                break
            if (" " in chem):
                chem_split = chem.split(" ")
                index = words.index(chem_split[-1], index + 1)
            else:
                index = words.index(chem, index + 1)
            if (index + 1 < len(words)):
                if (tagged[index + 1][1] == "NN"):
                    return False
    return ABSTAIN

In [310]:
# testing followed_by_noun (ensuring it works with repeats)
followed_by_noun(["cassie", "apple"], "cassie eating apple, eating for cassie table")

False

In [311]:
# test cases
print(followed_by_noun(["L-GalDH"], "The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity."))
print(followed_by_noun(["L-GalDH"], "Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy."))
print(followed_by_noun(["L-GalL dehydrogenase"], "We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000)."))
print(followed_by_noun(["L-GalDH", "dehydro-AsA", "L-GalL"], "The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL."))

False
False
-1
False


In [312]:
# LF_sep_verb
# If the chemicals are separated by a verb, we label TRUE
def sep_verb(chemicals, sentence):
    sentence = sentence.replace(',', '')
    words = sentence.split(" ")
    indexes = []
    for chem in chemicals:
        index = -1
        while (index < len(words)):
            if (" " in chem):
                chem_split = chem.split(" ")
                if not(chem_split[-1] in words[index + 1:]):
                    index = len(words)
                    break
                index = words.index(chem_split[-1], index + 1)
            else:
                if not(chem in words[index + 1:]):
                    index = len(words)
                    break
                index = words.index(chem, index + 1)
            indexes.append(index)
    min_index = min(indexes)
    max_index = max(indexes)
    tagged = nltk.pos_tag(words)
    for index in range(min_index + 1, max_index):
        if (tagged[index][1][:2] == "VB"):
            return True
    return ABSTAIN

In [313]:
# testing sep_verb (works when there are several instances)
sep_verb(["cassie", "cas"], "cassie cas cassie hi there oxidized cas")

True

In [314]:
# test cases
print(sep_verb(["L-GalDH"], "The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity."))
print(sep_verb(["L-GalDH"], "Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy."))
print(sep_verb(["L-GalL dehydrogenase"], "We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000)."))
print(sep_verb(["L-GalDH", "dehydro-AsA", "L-GalL"], "The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL."))

-1
-1
-1
True


In [315]:
# LF_sep_adverb
# If the chemicals are separated by a adverb, we label FALSE
def sep_adverb(chemicals, sentence):
    sentence = sentence.replace(',', '')
    words = sentence.split(" ")
    indexes = []
    for chem in chemicals:
        index = -1
        while (index < len(words)):
            if (" " in chem):
                chem_split = chem.split(" ")
                if not(chem_split[-1] in words[index + 1:]):
                    index = len(words)
                    break
                index = words.index(chem_split[-1], index + 1)
            else:
                if not(chem in words[index + 1:]):
                    index = len(words)
                    break
                index = words.index(chem, index + 1)
            indexes.append(index)
    min_index = min(indexes)
    max_index = max(indexes)
    tagged = nltk.pos_tag(words)
    for index in range(min_index + 1, max_index):
        if (tagged[index][1][:2] == "RB"):
            return False
    return ABSTAIN

In [316]:
# testing sep_adverb (for repeated chemicals)
sep_adverb(["cassie", "cas"], "cassie happily danced to the moon with cas")

False

In [317]:
# test cases
print(sep_adverb(["L-GalDH"], "The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity."))
print(sep_adverb(["L-GalDH"], "Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy."))
print(sep_adverb(["L-GalL dehydrogenase"], "We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000)."))
print(sep_adverb(["L-GalDH", "dehydro-AsA", "L-GalL"], "The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL."))

-1
-1
-1
False


In [340]:
# LF_includes_reaction_words
# If the sentence contains reactions words, we label True
reaction_words = ["convert", "yeild", "produce", "make", "react", "create", "synthesize", "conversion",
    "transformation", "transform", "synthesise", "burn", "rust", "ferment", "explode", "agent", "catalyze"
    "combust", "corrode", "decompose", "oxidize", "neutralize", "oxidization", "neutralization", "combustion",
    "redox", "methylaion", "gas-forming", "displacement", "combination", "exchange", "precipitate",
    "precipitation", "transfer", "through", "by"]
def includes_reaction_words(sentence):
    for word in reaction_words:
        if(word in sentence):
            return True
    return ABSTAIN

In [361]:
# LF_sep_conversion_words
# If the sentence contains to, from, into, etc., we label True
conversion_words = ["to", "from", "into", "becomes", "became"]
def sep_converstion_words(chemicals, sentence):
    sentence = sentence.replace(',', '')
    words = sentence.split(" ")
    indexes = []
    for chem in chemicals:
        index = -1
        while (index < len(words)):
            if (" " in chem):
                chem_split = chem.split(" ")
                if not(chem_split[-1] in words[index + 1:]):
                    index = len(words)
                    break
                index = words.index(chem_split[-1], index + 1)
            else:
                if not(chem in words[index + 1:]):
                    index = len(words)
                    break
                index = words.index(chem, index + 1)
            indexes.append(index)
    min_index = min(indexes)
    max_index = max(indexes)
    for index in range(min_index + 1, max_index):
        if (words[index] in conversion_words):
            return True
    return ABSTAIN

In [365]:
# test sep_conversion_words
sep_converstion_words(["cassie", "cas"], "cassie truns into cas")

True

### my todo + notes

In [318]:
# things to fix
    # create specific reation verb labeling (complete)
        # oxidation
        # reduction
        # combustion
        # composition
        # decomposition
        # etc
    # creating more test cases
    # current test sentences from
        # jacob's csv
        # https://academic.oup.com/pcp/article/45/9/1271/1857717
    # ways to clean data
        # ensure there are at least two chemicals (complete)
        # get rid of the greek alphabet (complete)
        # get rid of periods and () and ; (complete)
        # get rid of repeat chems (complete)
    # need to figure out how to deal with things in the sentence twice (complete)

### testing on data written in this file

In [319]:
for data in test_data:
    print(data)
    print("general chemicals")
    print(general_chemical(data[0]))
    print("adjacent mentions")
    print(adjacent_mentions(data[0], data[1]))
    print("sep or")
    print(sep_or(data[0], data[1]))
    print("sep and")
    print(sep_and(data[0], data[1]))
    print("sep comma")
    print(sep_comma(data[0], data[1]))
    print("sep via")
    print(sep_via(data[0], data[1]))
    print("sep sym")
    print(sep_sym(data[0], data[1]))
    print("followed ase")
    print(followed_ase(data[0], data[1]))
    print("group")
    print(group(data[1]))
    print("followed by noun")
    print(followed_by_noun(data[0], data[1]))
    print("sep verb")
    print(sep_verb(data[0], data[1]))
    print("sep adverb")
    print(sep_adverb(data[0], data[1]))
    print()

[['carbon', 'oxygen'], 'carbon was oxidized by the oxygen']
general chemicals
-1
adjacent mentions
-1
sep or
-1
sep and
-1
sep comma
-1
sep via
-1
sep sym
-1
followed ase
-1
group
-1
followed by noun
-1
sep verb
True
sep adverb
-1

[['carbon', 'oxygen', 'amino acid', 'cassie'], 'the cassie amino acid is connected to the carbon which was oxidixed by the oxygen inside the dna']
general chemicals
False
adjacent mentions
False
sep or
-1
sep and
-1
sep comma
-1
sep via
-1
sep sym
False
followed ase
-1
group
-1
followed by noun
False
sep verb
True
sep adverb
-1

[['L-GalDH'], 'The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity.']
general chemicals
-1
adjacent mentions
-1
sep or
-1
sep and
-1
sep comma
-1
sep via
-1
sep sym
-1
followed ase
-1
group
-1
followed by noun
False
sep verb
-1
sep adverb
-1

[['L-GalDH'], 'Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy.']
general c

### importing data in and cleaning it

In [397]:
sentence_df = pd.read_csv("sentence_data_cleaned_jacob.csv")
sentence_with_chem_df = sentence_df.dropna()
several_chem_df = sentence_with_chem_df.loc[sentence_with_chem_df["chemicals"].str.contains(",")]
several_chem_df

Unnamed: 0,doc,sentence_index,sentence,entity_type,label,chemicals,genes,substrates,products,enzymes,label index dict,abstract_clean,abstract_expand,abstract_ordered
197,12067524,0,BACKGROUND AND AIMS: Glutamic acid decarboxyla...,"null, null, null, null, null, null, null, null...","O, O, O, O, B-enzyme, I-enzyme, O, O, B-enzyme...","glutamate, GABA",GAD,glutamate,"gamma-aminobutyric, acid, GABA","Glutamic, acid, GAD, EC, 4.1.1.15","{4: 'B-enzyme', 5: 'I-enzyme', 8: 'B-enzyme', ...",BACKGROUND AND AIMS: Glutamic acid decarboxyla...,['These included a specific enzyme activity of...,['BACKGROUND AND AIMS: Glutamic acid decarboxy...
252,12513997,3,"Interestingly, the allele of PRO1 was shown to...","null, null, null, null, null, GENE-Y, null, nu...","O, O, O, O, O, O, O, O, O, O, O, O, O, B-enzym...","gamma-glutamyl, gamma-glutamyl, L-proline, L-g...",PRO1,L-glutamate,L-proline,"gamma-glutamyl, kinase, gamma-glutamyl, phosphate","{13: 'B-enzyme', 14: 'I-enzyme', 16: 'B-enzyme...",We previously isolated a mutant which showed a...,['The approach described in this paper could b...,['We previously isolated a mutant which showed...
282,12668769,8,We concluded that FDH has no direct role in th...,"null, null, null, GENE-Y, null, null, null, nu...","O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O...","Ser, formate","FDH, FDH",formate,CO,FDH,"{23: 'B-SUBSTRATE', 25: 'B-PRODUCT-OF', 31: 'B...",Serine (Ser) biosynthesis in C(3) plants can o...,"['In shoots, therefore, the pathway from forma...",['Serine (Ser) biosynthesis in C(3) plants can...
318,12742526,6,Even though the activities of MAT and GNMT wer...,"null, null, null, null, null, GENE-N, null, GE...","O, O, O, O, O, B-enzyme, O, B-enzyme, O, O, O,...","S-adenosylmethionine, S-adenosylhomocysteine","MAT, GNMT",S-adenosylmethionine,S-adenosylhomocysteine,"MAT, GNMT","{5: 'B-enzyme', 7: 'B-enzyme', 15: 'B-SUBSTRAT...",Ames dwarf mice (df/df) are deficient in growt...,"['Taken together, the data suggest that methio...",['Ames dwarf mice (df/df) are deficient in gro...
323,12850267,1,One of the enzymes responsible for the product...,"null, null, null, null, null, null, null, null...","O, O, O, O, O, O, O, O, O, B-PRODUCT-OF, O, B-...","KA, kynurenine, glutamine","KATI, GTK",glutamine,"KA, oxoglutaramic, acid","kynurenine, aminotransferase, KATI, GTK, EC, 2...","{9: 'B-PRODUCT-OF', 11: 'B-enzyme', 12: 'I-enz...",Kynurenic acid (KA) is an endogenous glutamate...,['Kynurenic acid (KA) is an endogenous glutama...,['Kynurenic acid (KA) is an endogenous glutama...
439,15132128,0,PURPOSE: The fluoropyrimidine carbamate (capec...,"null, null, null, null, null, null, CHEMICAL, ...","O, O, O, B-SUBSTRATE, I-SUBSTRATE, O, B-SUBSTR...","capecitabine, 5-fluorouracil, 5-FU, thymidine",TP,"fluoropyrimidine, carbamate, capecitabine","5-fluorouracil, 5-FU","thymidine, phosphorylase, TP","{3: 'B-SUBSTRATE', 4: 'I-SUBSTRATE', 6: 'B-SUB...",PURPOSE: The fluoropyrimidine carbamate (capec...,['Favorable enzyme profiles (high TP and low D...,['PURPOSE: The fluoropyrimidine carbamate (cap...
465,15155769,1,Carnitine acetyltransferases (CrAT) catalyze t...,"CHEMICAL, null, null, GENE-Y, null, null, null...","B-enzyme, I-enzyme, O, B-enzyme, O, O, O, O, O...","Carnitine, acetyl-CoA, carnitine, acetylcarnitine",CrAT,"acetyl-CoA, carnitine",acetylcarnitine,"Carnitine, acetyltransferases, CrAT","{0: 'B-enzyme', 1: 'I-enzyme', 3: 'B-enzyme', ...","In eukaryotes, L-carnitine is involved in ener...","['In eukaryotes, L-carnitine is involved in en...","['In eukaryotes, L-carnitine is involved in en..."
484,15689518,0,"L-serine dehydratase (SDH), a member of the be...","CHEMICAL, null, null, GENE-Y, null, null, null...","B-enzyme, I-enzyme, O, B-enzyme, O, O, O, O, O...","L-serine, L-serine, L-threonine, pyruvate, 2-o...",SDH,"L-serine, L-threonine","pyruvate, 2-oxobutyrate","L-serine, dehydratase, SDH","{0: 'B-enzyme', 1: 'I-enzyme', 3: 'B-enzyme', ...","L-serine dehydratase (SDH), a member of the be...","['Furthermore, the activity of hSDH-PLP was as...","['L-serine dehydratase (SDH), a member of the ..."
545,16455797,0,Spermidine/spermine N1-acetyltransferase (SSAT...,"null, null, null, GENE-Y, null, null, null, nu...","B-enzyme, I-enzyme, O, O, O, O, O, O, O, O, O,...","polyamine, spermidine, spermine",SSAT,"spermidine, spermine",polyamine,"Spermidine/spermine, N1-acetyltransferase","{0: 'B-enzyme', 1: 'I-enzyme', 13: 'B-PRODUCT-...",Spermidine/spermine N1-acetyltransferase (SSAT...,['Spermidine/spermine N1-acetyltransferase (SS...,['Spermidine/spermine N1-acetyltransferase (SS...
561,16484281,1,Astrocytes may play a role in these manifestat...,"null, null, null, null, null, null, null, null...","O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O...","glutamate, glutamine, glutamine",GS,glutamate,glutamine,"glutamine, synthetase, GS","{17: 'B-SUBSTRATE', 22: 'B-PRODUCT-OF', 26: 'B...",Excess activation of glutamatergic neurotransm...,['Packing density of GS and GFAP-immunoreactiv...,['Excess activation of glutamatergic neurotran...


In [460]:
sentence_chem_df = several_chem_df[["sentence", "chemicals"]]

In [461]:
greek_alphabet = {
    u'\u0393': 'Gamma',
    u'\u0394': 'Delta',
    u'\u0398': 'Theta',
    u'\u039B': 'Lamda',
    u'\u039E': 'Xi',
    u'\u03A0': 'Pi',
    u'\u03A3': 'Sigma',
    u'\u03A6': 'Phi',
    u'\u03A7': 'Chi',
    u'\u03A8': 'Psi',
    u'\u03A9': 'Omega',
    u'\u03B1': 'alpha',
    u'\u03B2': 'beta',
    u'\u03B3': 'gamma',
    u'\u03B4': 'delta',
    u'\u03B5': 'epsilon',
    u'\u03B6': 'zeta',
    u'\u03B7': 'eta',
    u'\u03B8': 'theta',
    u'\u03B9': 'iota',
    u'\u03BA': 'kappa',
    u'\u03BB': 'lamda',
    u'\u03BC': 'mu',
    u'\u03BD': 'nu',
    u'\u03BE': 'xi',
    u'\u03BF': 'omicron',
    u'\u03C0': 'pi',
    u'\u03C1': 'rho',
    u'\u03C3': 'sigma',
    u'\u03C4': 'tau',
    u'\u03C5': 'upsilon',
    u'\u03C6': 'phi',
    u'\u03C7': 'chi',
    u'\u03C8': 'psi',
    u'\u03C9': 'omega'
}

In [462]:
'β' in greek_alphabet.keys()

True

In [463]:
def remove_greek(words):
    for letter in greek_alphabet.keys():
        words = words.replace(letter, greek_alphabet[letter])
    return words


In [464]:
# greek test
remove_greek("11β-hydroxysteroid")

'11beta-hydroxysteroid'

In [510]:
def chem_into_array(chemicals):
    chemicals = remove_greek(chemicals)
    chemicals = chemicals.lower()
    sample_chems = chemicals.replace("%20", " ")
    sample_chems_list = sample_chems.split(", ")
    for index in range(len(sample_chems_list)):
        while(sample_chems_list[index][-1] == ","):
            sample_chems_list[index] = sample_chems_list[index][:-1]
    sample_chems_list = list(set(sample_chems_list))
    return sample_chems_list

In [512]:
# removing excess commas from the end
chem_into_array("cassie,, cassie, cas")

['cassie', 'cas']

In [488]:
def sentence_cleaned(sentence):
    sentence = remove_greek(sentence)
    sentence = sentence.lower()
    sentence = sentence.replace(".", "")
    sentence = sentence.replace(")", "")
    sentence = sentence.replace("(", "")
    sentence = sentence.replace(";", "")
    sentence = sentence.replace(":", "")
    return sentence

In [467]:
sentence_chem_df["sentence"] = sentence_chem_df["sentence"].apply(sentence_cleaned)
sentence_chem_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["sentence"] = sentence_chem_df["sentence"].apply(sentence_cleaned)


Unnamed: 0,sentence,chemicals
197,background and aims glutamic acid decarboxylas...,"glutamate, GABA"
252,"interestingly, the allele of pro1 was shown to...","gamma-glutamyl, gamma-glutamyl, L-proline, L-g..."
282,we concluded that fdh has no direct role in th...,"Ser, formate"
318,even though the activities of mat and gnmt wer...,"S-adenosylmethionine, S-adenosylhomocysteine"
323,one of the enzymes responsible for the product...,"KA, kynurenine, glutamine"
439,purpose the fluoropyrimidine carbamate capecit...,"capecitabine, 5-fluorouracil, 5-FU, thymidine"
465,carnitine acetyltransferases crat catalyze the...,"Carnitine, acetyl-CoA, carnitine, acetylcarnitine"
484,"l-serine dehydratase sdh, a member of the beta...","L-serine, L-serine, L-threonine, pyruvate, 2-o..."
545,spermidine/spermine n1-acetyltransferase ssat ...,"polyamine, spermidine, spermine"
561,astrocytes may play a role in these manifestat...,"glutamate, glutamine, glutamine"


In [468]:
sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].apply(chem_into_array)
sentence_chem_df = sentence_chem_df.loc[sentence_chem_df["chemicals"].str.len() > 1]
sentence_chem_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].apply(chem_into_array)


Unnamed: 0,sentence,chemicals
197,background and aims glutamic acid decarboxylas...,"[glutamate, gaba]"
252,"interestingly, the allele of pro1 was shown to...","[l-glutamate, gamma-glutamyl, l-proline]"
282,we concluded that fdh has no direct role in th...,"[formate, ser]"
318,even though the activities of mat and gnmt wer...,"[s-adenosylmethionine, s-adenosylhomocysteine]"
323,one of the enzymes responsible for the product...,"[glutamine, kynurenine, ka]"
439,purpose the fluoropyrimidine carbamate capecit...,"[capecitabine, thymidine, 5-fu, 5-fluorouracil]"
465,carnitine acetyltransferases crat catalyze the...,"[carnitine, acetyl-coa, acetylcarnitine]"
484,"l-serine dehydratase sdh, a member of the beta...","[l-serine, 2-oxobutyrate, pyruvate, l-threonine]"
545,spermidine/spermine n1-acetyltransferase ssat ...,"[spermidine, spermine, polyamine]"
561,astrocytes may play a role in these manifestat...,"[glutamate, glutamine]"


### running test on the imported data

In [470]:
for index in range(len(sentence_chem_df["sentence"])):
    sentence = sentence_chem_df["sentence"].iloc[index]
    chems = sentence_chem_df["chemicals"].iloc[index]
    print(chems)
    print(sentence)
    print("general chemicals: " + str(general_chemical(chems)))
    print("adjacent mentions: " + str(adjacent_mentions(chems, sentence)))
    print("sep or: " + str(sep_or(chems, sentence)))
    print("sep and: " + str(sep_and(chems, sentence)))
    print("sep comma: " + str(sep_comma(chems, sentence)))
    print("sep via: " + str(sep_via(chems, sentence)))
    print("sep sym: " + str(sep_sym(chems, sentence)))
    print("followed ase: " + str(followed_ase(chems, sentence)))
    print("group: " + str(group(sentence)))
    print("followed by noun: " + str(followed_by_noun(chems, sentence)))
    print("sep verb: " + str(sep_verb(chems, sentence)))
    print("sep adverb: " + str(sep_adverb(chems, sentence)))
    print("includes reaction words: " + str(includes_reaction_words(sentence)))
    print("sep conversion words: " + str(sep_converstion_words(chems, sentence)))
    print()

['glutamate', 'gaba']
background and aims glutamic acid decarboxylase gad, ec 41115 catalyses the conversion of glutamate to gamma-aminobutyric acid gaba
general chemicals: -1
adjacent mentions: -1
sep or: -1
sep and: -1
sep comma: -1
sep via: -1
sep sym: -1
followed ase: -1
group: -1
followed by noun: -1
sep verb: -1
sep adverb: -1
includes reaction words: True
sep conversion words: True

['l-glutamate', 'gamma-glutamyl', 'l-proline']
interestingly, the allele of pro1 was shown to enhance the activities of gamma-glutamyl kinase and gamma-glutamyl phosphate reductase, both of which catalyze the first two steps of l-proline synthesis from l-glutamate and which together may form a complex in vivo
general chemicals: -1
adjacent mentions: -1
sep or: -1
sep and: -1
sep comma: -1
sep via: -1
sep sym: -1
followed ase: False
group: False
followed by noun: False
sep verb: True
sep adverb: -1
includes reaction words: -1
sep conversion words: True

['formate', 'ser']
we concluded that fdh has no 

### new csv uncleaned data

In [489]:
uncleaned_df = pd.read_csv("sentence_annotations_elsevier_pmid_split6.csv")

In [513]:
uncleaned_no_na_df = uncleaned_df.dropna()
uncleaned_several_chem_df = uncleaned_no_na_df.loc[uncleaned_no_na_df["chemical_names"].str.contains(",")]
uncleaned_several_chem_df = uncleaned_several_chem_df[["sentence", "chemical_names"]]

In [514]:
uncleaned_several_chem_df["sentence"] = uncleaned_several_chem_df["sentence"].apply(sentence_cleaned)
uncleaned_several_chem_df["chemical_names"] = uncleaned_several_chem_df["chemical_names"].apply(chem_into_array)

In [515]:
uncleaned_several_chem_df["chemical_names"]

150       [diltiazem, cyclosporin a, dexamethasone, calc...
271       [sodium dodecyl sulfate, limonene, geranyl pyr...
280                                                    [h+]
281                                                    [h+]
286                                                    [h+]
                                ...                        
952414                    [vancomycin, d-alanine-d-alanine]
952450                         [methanol, tetrahydrofurane]
952517                                      [kato, sucrose]
952526                     [vancomycin, d-alanyl-d-alanine]
952541                                [amp, atp, adenylate]
Name: chemical_names, Length: 50212, dtype: object

In [518]:
re_check_several_chem_uncleaned_df = uncleaned_several_chem_df.loc[uncleaned_several_chem_df["chemical_names"].str.len() > 1]

In [521]:
re_check_several_chem_uncleaned_df

Unnamed: 0,sentence,chemical_names
150,"we selected diltiazem, cyclosporin a, and dexa...","[diltiazem, cyclosporin a, dexamethasone, calc..."
271,"croteau r washington state univ, inst biol che...","[sodium dodecyl sulfate, limonene, geranyl pyr..."
295,chemicalmodificationschickenliverpyruvatecarbo...,"[pyruvate, cysteine, lysine]"
296,"ash de temple univ, hlth sci ctr, sch med, dep...","[cysteine, n-(7-dimethylamino-4-methyl-3-couma..."
298,at a one- to two-fold molar excess over active...,"[pyruvate, adp, oxaloacetate]"
...,...,...
952414,key words d-alanine-d-alanine ligase d-amino a...,"[vancomycin, d-alanine-d-alanine]"
952450,the mobile phase consisted of a linear gradien...,"[methanol, tetrahydrofurane]"
952517,weber v falkenhagen d subpol a novel sucrose-b...,"[kato, sucrose]"
952526,active-site mutants of the vanc2 d-alanyl-d-se...,"[vancomycin, d-alanyl-d-alanine]"


In [522]:
for index in range(0, 20):
    sentence = re_check_several_chem_uncleaned_df["sentence"].iloc[index]
    chems = re_check_several_chem_uncleaned_df["chemical_names"].iloc[index]
    print(chems)
    print(sentence)
    print("general chemicals: " + str(general_chemical(chems)))
    print("adjacent mentions: " + str(adjacent_mentions(chems, sentence)))
    print("sep or: " + str(sep_or(chems, sentence)))
    print("sep and: " + str(sep_and(chems, sentence)))
    print("sep comma: " + str(sep_comma(chems, sentence)))
    print("sep via: " + str(sep_via(chems, sentence)))
    print("sep sym: " + str(sep_sym(chems, sentence)))
    print("followed ase: " + str(followed_ase(chems, sentence)))
    print("group: " + str(group(sentence)))
    print("followed by noun: " + str(followed_by_noun(chems, sentence)))
    print("sep verb: " + str(sep_verb(chems, sentence)))
    print("sep adverb: " + str(sep_adverb(chems, sentence)))
    print("includes reaction words: " + str(includes_reaction_words(sentence)))
    print("sep conversion words: " + str(sep_converstion_words(chems, sentence)))
    print()

['diltiazem', 'cyclosporin a', 'dexamethasone', 'calcein-am']
we selected diltiazem, cyclosporin a, and dexamethasone as typical p-gp substrates, which were investigated using a monolayer efflux, atpase, and calcein-am assays by polli et al 15 concerning the diltiazem transport, both the apparent k m,raw and k m,ad values of mdr1 transfected cells showed 03–46-fold differences compared to hmdr1, suggesting that the affinity to diltiazem would differ among species
general chemicals: -1
adjacent mentions: -1
sep or: -1
sep and: -1
sep comma: False
sep via: -1
sep sym: -1
followed ase: -1
group: -1
followed by noun: False
sep verb: True
sep adverb: False
includes reaction words: True
sep conversion words: True

['sodium dodecyl sulfate', 'limonene', 'geranyl pyrophosphate']
croteau r washington state univ, inst biol chem, pullman, wa 99164, usa and washington state univ, dept biochem & biophys, pullman, wa 99164, usa abstract limonene synthase, a monoterpene cyclase from the oil glands of