In [292]:
import csv, re

In [293]:
def loadSentences(filename):
    """Read a tab-separated file with a header row into a list of row dicts.

    Parameters
    ----------
    filename : str or path-like
        Path of the UTF-8 encoded, tab-delimited file to read.

    Returns
    -------
    list of dict
        One mapping per data row, keyed by the column names taken from the
        first line of the file (as produced by ``csv.DictReader``).

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # The context manager closes the handle even if parsing fails; the
    # original left the file open for the garbage collector to clean up.
    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation recommends for file objects passed to a reader.
    with open(filename, encoding="utf8", newline='') as f:
        # Materialise all rows before the file is closed.
        return list(csv.DictReader(f, delimiter='\t'))

In [294]:
def tokeniseBEL(bs):
    """Split a BEL statement string into a list of normalised tokens.

    The statement is reduced step by step to a template in which every term
    is replaced by '@', every relation keyword by '&' and every function name
    by '$'. The template and the three lists of extracted pieces are then
    handed to ``stringToTokens``, which produces the returned token list.

    Raises ``AssertionError`` if the number of substitutions made in a step
    differs from the number of matches found for the same pattern.
    """
    termPattern = re.compile(r'(a|bp|path|g|m|r|p)\(([A-Z]+)\:([^"]*?|".*?")(,pmod\(P.*?\))?\)')
    relationPattern = re.compile(r'\s((?:increases)|(?:decreases)|(?:directlyIncreases)|(?:directlyDecreases))\s')
    functionPattern = re.compile(r'((?:act)|(?:complex)|(?:tloc)|(?:deg)|(?:kin)|(?:tscript)|(?:cat)|(?:sec)|(?:chap)|(?:gtp)|(?:pep)|(?:phos)|(?:ribo)|(?:tport))')

    # Drop every ',GOCCID:<digits>' argument (the extra parameters of tloc).
    withoutGoccid, _ = re.subn(r',GOCCID:\d+', '', bs)

    # Terms such as p(HGNC:FOS), optionally carrying a pmod(P...) suffix -> '@'.
    terms = termPattern.findall(withoutGoccid)
    withoutTerms, termCount = termPattern.subn('@', withoutGoccid)
    assert termCount == len(terms)

    # Whitespace-delimited relation keywords (spaces included) -> '&'.
    relations = relationPattern.findall(withoutTerms)
    withoutRelations, relationCount = relationPattern.subn('&', withoutTerms)
    assert relationCount == len(relations)

    # Function names (act, complex, tloc, ...) -> '$'.
    functions = functionPattern.findall(withoutRelations)
    template, functionCount = functionPattern.subn('$', withoutRelations)
    assert functionCount == len(functions)

    # At this point: Term = @, Function = $, Relation = &; everything else
    # left in the template is passed through character by character.
    return stringToTokens(template, terms, relations, functions)

In [295]:
def normaliseBEL(bs):
    """Return the normalised form of a BEL statement as a single string.

    The statement is tokenised with ``tokeniseBEL`` and the resulting tokens
    are concatenated without any separator.
    """
    tokens = tokeniseBEL(bs)
    return ''.join(tokens)

In [296]:
def stringToTokens(template, terms, relations, functions):
    """Expand a placeholder template back into a list of normalised tokens.

    Parameters
    ----------
    template : str
        String in which '@' stands for a term, '&' for a relation and '$' for
        a function; every other character is emitted unchanged as a token.
    terms : sequence
        One entry per '@', in order. Each entry is indexed at positions 0-3:
        term type, namespace, name, and modification suffix. An empty string
        at position 3 means "no modification"; any other value is emitted as
        the fixed token 'pmod(P)'.
    relations : sequence of str
        One relation keyword per '&', in order.
    functions : sequence of str
        One function name per '$', in order.

    Returns
    -------
    list of str
        The token list. Relations are mapped to ' -> ' or ' -| ', and function
        names are collapsed onto 'act', 'complex', 'tloc' or 'deg'.

    Raises
    ------
    IndexError
        If the template contains more placeholders of a kind than there are
        entries in the matching sequence.
    KeyError
        If a relation or function name is not in the lookup tables.

    Notes
    -----
    The input sequences are read with index cursors and are NOT modified.
    The previous implementation consumed them with ``pop(0)``, which emptied
    the caller's lists and cost O(n) per pop.
    """
    aList = []
    relationDict = {'increases':' -> ', 'decreases':' -| ', 'directlyIncreases':' -> ', 'directlyDecreases':' -| '}
    functionDict = {'act':'act', 'kin':'act', 'tscript':'act', 'cat':'act', 'chap':'act', 'gtp':'act', 'pep':'act', 'phos':'act', 'ribo':'act', 'tport':'act',
                    'complex':'complex',
                    'tloc': 'tloc', 'sec':'tloc',
                    'deg': 'deg'}
    # Read cursors into the three input sequences.
    termIdx = relationIdx = functionIdx = 0
    for s in template:
        if s == '@': # Term
            termTuple = terms[termIdx]
            termIdx += 1
            if termTuple[3] == '':
                aList.extend([termTuple[0], '(', termTuple[1]+':'+termTuple[2], ')'])
            else:
                # Any modification detail is normalised to plain pmod(P).
                aList.extend([termTuple[0], '(', termTuple[1]+':'+termTuple[2], ',', 'pmod(P)', ')'])
        elif s == '&': # Relation
            aList.append(relationDict[relations[relationIdx]])
            relationIdx += 1
        elif s == '$': # Function
            aList.append(functionDict[functions[functionIdx]])
            functionIdx += 1
        else: # brackets and comma
            aList.append(s)
    return aList

In [297]:
sentences = loadSentences('dataset/Training.BEL')
# Group the stripped 'BEL original' statements by sentence id. The id is the
# 'Sentence-ID' column with its first four characters removed (presumably a
# fixed prefix -- confirm against the data file).
BELofSentence = {}
for s in sentences:
    sid = s['Sentence-ID'][4:]
    BELofSentence.setdefault(sid, []).append(s['BEL original'].strip())
# Spot-check one sentence and report how many distinct ids were seen.
print(BELofSentence['10000092'])
print(len(BELofSentence))

['cat(p(HGNC:UBE2L3)) increases deg(p(HGNC:FOS))', 'cat(p(HGNC:UBE2L3)) increases deg(p(HGNC:MYC))', 'cat(p(HGNC:UBE2L3)) increases deg(p(HGNC:TP53))']
6353


In [303]:
text = loadSentences('dataset/Training.sentence')
# Map each sentence id to its stripped sentence text; a repeated id keeps the
# last row seen. The id is the 'Sentence-ID' column with its first four
# characters removed (presumably a fixed prefix -- confirm against the data).
TrainingSentence = {}
for s in text:
    sid = s['Sentence-ID'][4:]
    TrainingSentence[sid] = s['Sentence'].strip()
# Spot-check one sentence and report how many ids were loaded.
print(TrainingSentence['10000092'])
print(len(TrainingSentence))

UbcH7 is a ubiquitin-conjugating enzyme mediating c-fos degradation, transcription factor NF-kappaB maturation, human papilloma virus-mediated p53 and Myc protein degradation, in vitro.
6353


In [304]:
# For every sentence id that carries more than one BEL statement, print the
# id with its sentence text, then each statement in normalised form, then a
# separator line. Finally print how many such sentences there were.
# (Swap normaliseBEL for tokeniseBEL below to inspect the raw token lists.)
cnt = 0
for sid, sentenceList in BELofSentence.items():
    if len(sentenceList) <= 1:
        continue
    print(sid, ':', TrainingSentence[sid])
    cnt += 1
    for s in sentenceList:
        print(normaliseBEL(s))
    print('----------------------')
print(cnt)

10000006 : Induction of C/EBP beta DNA-binding activity in NIH-3T3 beta 2 cells exposed to dexamethasone in the presence of insulin and fetal bovine serum activates the expression of an adipocyte-specific nuclear hormone receptor, PPAR gamma, that stimulates the conversion of these fibroblasts into committed preadipocytes
act(p(MGI:Cebpb)) -> p(MGI:Pparg)
a(CHEBI:dexamethasone) -> act(p(MGI:Cebpb))
act(p(MGI:Cebpb)) -> r(MGI:Pparg)
act(p(MGI:Nr3c1)) -> act(p(MGI:Cebpb))
act(p(MGI:Pparg)) -> bp(GOBP:"fat cell differentiation")
----------------------
10000010 : Pulse-chase biosynthetic labeling studies showed that AtT-20 cells expressed much less RESP18 than the endogenous prohormone, POMC, but that glucocorticoid treatment lowered POMC and raised RESP18 biosynthetic rates so that they were nearly equimolar.
a(CHEBI:glucocorticoid) -| p(MGI:Pomc)
a(CHEBI:glucocorticoid) -> p(MGI:Resp18)
----------------------
10000046 : Thrombin receptor mediated signal transduction could induce the expr

p(MGI:Tgfb1) -> p(MGI:Ctla4)
----------------------
10002858 : Both SMAD2 and SMAD1/5 were activated upon exposure to TGF-beta1
p(HGNC:TGFB1) -> act(p(HGNC:SMAD1))
p(HGNC:TGFB1) -> act(p(HGNC:SMAD2))
p(HGNC:TGFB1) -> act(p(HGNC:SMAD5))
----------------------
10002864 : Primary astrocytes cultured from SBE-luc mice showed specific activation of the reporter in response to Smad2/3-activating TGF-beta family members. Treatment of mice with the endotoxin LPS resulted in a fast and vigorous, but transient activation of the reporter in the intestine
p(HGNC:TGFB1) -> act(p(HGNC:SMAD2))
p(HGNC:TGFB1) -> act(p(HGNC:SMAD3))
a(CHEBI:lipopolysaccharide) -> act(p(MGI:Smad2))
a(CHEBI:lipopolysaccharide) -> act(p(MGI:Smad3))
p(MGI:Tgfb1) -> act(p(MGI:Smad2))
p(MGI:Tgfb1) -> act(p(MGI:Smad3))
----------------------
10002872 : Neutralizing Ab to TGF-beta1 eliminated TMEV-induced IL-23 production and SMAD-3 activation in RAW264.7 cells, BMM, and SPM.
p(HGNC:TGFB1) -> act(p(HGNC:SMAD3))
p(MGI:Tgfb1) -> p

p(HGNC:CCL4) -> act(p(HGNC:CCR5))
----------------------
10000958 : Following exposure to either CD40-specific mAbs or the soluble trimeric ligand (sCD40L), high responder (HR) lines showed rapid aggregation, activation of NF-kappa B, up-regulation of cell surface markers ICAM-1/CD54 and Fas/CD95, and growth inhibition
act(p(HGNC:CD40)) -| bp(GOBP:"cell growth")
act(p(HGNC:CD40)) -> r(HGNC:FAS)
act(p(HGNC:CD40)) -> r(HGNC:ICAM1)
----------------------
10018458 : Importantly, CD40 ligation enhanced the sensitivity of airway epithelial cells to the effects of TNF-alpha and/or IL-1beta on expression of RANTES, MCP-1, IL-8, and VCAM-1. In contrast, neither IL-4 nor IL-13 modified the effects of CD40 engagement on the expression of RANTES, MCP-1, IL-8, or VCAM-1; however, both IL-4 and IL-13 attenuated the effects of CD40 cross-linking on ICAM-1 expression
act(p(HGNC:CD40)) -> p(HGNC:CCL2)
act(p(HGNC:CD40)) -> p(HGNC:CCL5)
act(p(HGNC:CD40)) -> p(HGNC:ICAM1)
act(p(HGNC:CD40)) -> p(HGNC:IL8)


act(p(HGNC:MAPK14)) -| act(p(HGNC:RB1))
----------------------
10005948 : Optimal induction of cyclin D1 by pp60(v-src) involved the extracellular signal-regulated kinase, p38, and c-Jun N-terminal kinase members of the mitogen-activated protein kinase family.
act(p(HGNC:MAPK14)) -> r(HGNC:CCND1)
act(p(HGNC:MAPK8)) -> r(HGNC:CCND1)
act(p(HGNC:MAPK9)) -> r(HGNC:CCND1)
----------------------
10021330 : Its (RSK2) activation requires phosphorylation of the linker region at Ser(369), catalyzed by extracellular signal-regulated kinase (ERK)
act(p(HGNC:MAPK1)) -> act(p(HGNC:RPS6KA3))
act(p(HGNC:MAPK1)) -> p(HGNC:RPS6KA3,pmod(P))
p(HGNC:RPS6KA3,pmod(P)) -> act(p(HGNC:RPS6KA3))
----------------------
10021334 : Furthermore, Erk-2 phosphorylated threonine 1179 and serine 1185 (and to a lesser extent, serine 395) in vitro, suggesting the importance of this pathway for SRC-1 regulation.
act(p(HGNC:MAPK1)) -> p(HGNC:NCOA1,pmod(P))
act(p(HGNC:MAPK1)) -> p(HGNC:NCOA1,pmod(P))
act(p(HGNC:MAPK1)) -> p

10024302 : <s16> DNA-PK can bind to RPA (Shao et al., 1999). <s17> Phosphorylation of RPA2 by DNA-PK impairs the binding of RPA to DNA-PK (Shao et al., 1999).
p(HGNC:RPA2,pmod(P)) -| complex(p(HGNC:RPA2),p(HGNC:RPA3),p(HGNC:RPA1),p(HGNC:PRKDC))
p(HGNC:RPA2,pmod(P)) -| complex(p(HGNC:RPA2),p(HGNC:RPA3),p(HGNC:RPA1),p(HGNC:PRKDC))
----------------------
10024304 : Here, we identify Ser386 in the hydrophobic motif of RSK2 as a phosphorylation-dependent docking site and activator of PDK1.
p(HGNC:RPS6KA3,pmod(P)) -> complex(p(HGNC:RPS6KA3),p(HGNC:PDPK1))
p(HGNC:RPS6KA3,pmod(P)) -> act(p(HGNC:PDPK1))
p(HGNC:RPS6KA3,pmod(P)) -> act(p(HGNC:RPS6KA3))
----------------------
10024312 : Human peripheral blood monocytes stimulated with sE-selectin showed a time-dependent increase in the tyrosine phosphorylation of a broad range of cellular proteins, predominantly in the molecular size range of Src family kinases (50-60 kDa) and mitogen-activated protein kinases (MAPKs). Western blot analysis of Src

p(MGI:Shh) -> p(MGI:Kdr)
p(MGI:Shh) -> p(MGI:Pecam1)
----------------------
10027236 : In contrast, Tbx2 and Tbx3 expressions are significantly repressed in Shh-/- and enhanced in Shh-/-;Gli3-/- lungs (Figs. 5B-e,h,f,i).
p(MGI:Gli3) -| r(MGI:Tbx2)
p(MGI:Gli3) -| r(MGI:Tbx3)
p(MGI:Shh) -> r(MGI:Tbx2)
p(MGI:Shh) -> r(MGI:Tbx3)
----------------------
10008860 : In contrast, Glp1r-/- thymocytes exhibited a hypoproliferative response, whilst peripheral Glp1r-/- lymphocytes were hyperproliferative in response to mitogenic stimulation
p(MGI:Glp1r) -> bp(GOBP:"cell proliferation")
p(MGI:Glp1r) -| bp(GOBP:"cell proliferation")
----------------------
10027240 : In the Grx1 KO, the cigarette smoke-induced upregulation of these cytokines was significantly impaired, compared to the wild type controls. Moreover, the baseline levels of IL12(p40), GCSF, RANTES, MIP-1a, TNFa (data not shown) and IFNg (data not shown) were found to be lower in the BAL fluid of Grx1 KO mice compared to wild type mice
p(M

----------------------
10029384 : By using neutralizing antibody against TLR4, the releases of CCL3 and CXCL2 was decreased (Fig. 3A and B). MyD88 is a critical adapter molecule for the transduction of TLRs signals. Therefore, cDCs lacking MyD88 were investigated. CSE did not induce the release of CCL3 and CXCL2 from cDCs obtained from MyD88 knockout mice (Fig. 3C and D).
p(MGI:Myd88) -> p(MGI:Ccl3)
p(MGI:Myd88) -> p(MGI:Cxcl2)
----------------------
10029386 : In response to hepatocyte cell death, MyD88 signalling was responsible for the activation of NF-?B and for the production of factors such as IL-6 (ReF. 122).
p(MGI:Myd88) -> p(MGI:Il6)
p(HGNC:MYD88) -> p(HGNC:IL6)
----------------------
10029392 : Deficiency of MyD88, a downstream signaling molecule of TLR4, also abolished the lipopolysaccharide response in WT and Abcg1-deficient macrophages (Figure IA and IB in the online-only Data Supplement).
p(MGI:Myd88) -> r(MGI:Ccl3)
p(MGI:Myd88) -> r(MGI:Cxcl2)
p(MGI:Myd88) -> r(MGI:Il1b)

10032432 : The mechanism for the increased protein synthesis and growth appeared to be a transcriptional upregulation of the eIF-2alpha and eIF-2beta genes.
act(p(HGNC:NRF1)) -> r(HGNC:EIF2S1)
act(p(HGNC:NRF1)) -> r(HGNC:EIF2S2)
----------------------
10032450 : Transfection of expression plasmids for Brn3a, Brn3b, and Brn3c (Pou4f1, 2, and 3) transactivated the Brn3b promoter by 3- to 10-fold in 293T cells or ND7 cells.
act(p(HGNC:POU4F1)) -> r(HGNC:POU3F2)
act(p(HGNC:POU4F2)) -> r(HGNC:POU3F2)
act(p(HGNC:POU4F3)) -> r(HGNC:POU3F2)
----------------------
10010946 : PPARalpha activation increases plasma HDL cholesterol via the induction of hepatic apolipoprotein A-I and apolipoprotein A-II expression in humans
act(p(HGNC:PPARA)) -> r(HGNC:APOA1)
act(p(HGNC:PPARA)) -> r(HGNC:APOA2)
----------------------
10010948 : PPARalpha down-regulates hepatic apolipoprotein C-III and increases lipoprotein lipase gene expression
act(p(HGNC:PPARA)) -| r(HGNC:APOC3)
act(p(HGNC:PPARA)) -> r(HGNC:LPL)
-

10035688 : Figure 2. Inhibition by doxycycline (DOXY) of digestion of type II collagen by collagenases 1, 2, or 3.
act(p(HGNC:MMP1)) -> deg(p(HGNC:COL2A1))
act(p(HGNC:MMP8)) -> deg(p(HGNC:COL2A1))
----------------------
10035696 : This gene encodes an enzyme which degrades type IV collagen, the major structural component of basement membranes. The enzyme plays a role in endometrial menstrual breakdown, regulation of vascularization and the inflammatory response.
act(p(HGNC:MMP2)) -> deg(p(HGNC:COL4A1))
act(p(HGNC:MMP2)) -> deg(p(HGNC:COL4A2))
----------------------
10035706 : TPO to c-Mpl (a receptor tyrosine kinase) activates both Janus Kinase 2 (JAK2) and Tyk2
act(p(HGNC:MPL)) -> act(p(HGNC:JAK2))
act(p(HGNC:MPL)) -> act(p(HGNC:TYK2))
----------------------
10035712 : MyD88 promotes association with IRAK4 and IRAK1.  During the formation of this complex, IRAK4 is activated, leading to the hyperphosphorylation of IRAK1, which then induces interaction of TRAF6 with the complex.
act(p(H

In [300]:
import unittest

class TestDemo(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.s1 = 'cat(p(HGNC:UBE2L3)) increases deg(p(HGNC:FOS))'
        cls.s2 = 'a(CHEBI:"(2Z,3Z)-bis\{amino[(2-aminophenyl)sulfanyl]methylidene\}butanedinitrile") decreases (path(MESHD:Hyperoxia) increases tscript(p(MGI:Nfe2l2)))'
        cls.s3 = 'path(MESHD:Hyperoxia) increases p(MGI:Nfe2l2,pmod(P))'
        cls.s4 = 'cat(p(HGNC:HIF1AN)) directlyDecreases complex(p(HGNC:HIF1A),p(HGNC:EP300))'
        cls.s5 = 'p(MGI:Nfe2l2,pmod(P,Y,576)) directlyIncreases tloc(p(MGI:Nfe2l2),GOCCID:0005634,GOCCID:0005737)'
    
    def test_normaliseBEL(self):
        self.assertEqual(normaliseBEL(self.s1), 'act(p(HGNC:UBE2L3)) -> deg(p(HGNC:FOS))')
        self.assertEqual(normaliseBEL(self.s2), 'a(CHEBI:"(2Z,3Z)-bis\{amino[(2-aminophenyl)sulfanyl]methylidene\}butanedinitrile") -| (path(MESHD:Hyperoxia) -> act(p(MGI:Nfe2l2)))')
        self.assertEqual(normaliseBEL(self.s3), 'path(MESHD:Hyperoxia) -> p(MGI:Nfe2l2,pmod(P))')
        self.assertEqual(normaliseBEL(self.s4), 'act(p(HGNC:HIF1AN)) -| complex(p(HGNC:HIF1A),p(HGNC:EP300))')
        self.assertEqual(normaliseBEL(self.s5), 'p(MGI:Nfe2l2,pmod(P)) -> tloc(p(MGI:Nfe2l2))')
    
    def test_tokeniseBEL(self):
        self.assertEqual(tokeniseBEL(self.s1), ['act','(', 'p', '(', 'HGNC:UBE2L3', ')', ')', ' -> ', 'deg', '(', 'p', '(', 'HGNC:FOS', ')', ')'])
        self.assertEqual(tokeniseBEL(self.s2), ['a', '(', 'CHEBI:"(2Z,3Z)-bis\\{amino[(2-aminophenyl)sulfanyl]methylidene\\}butanedinitrile"', ')', ' -| ', '(', 'path', '(', 'MESHD:Hyperoxia', ')', ' -> ', 'act', '(', 'p', '(', 'MGI:Nfe2l2', ')', ')', ')'])
        self.assertEqual(tokeniseBEL(self.s3), ['path', '(', 'MESHD:Hyperoxia', ')', ' -> ', 'p', '(', 'MGI:Nfe2l2', ',', 'pmod(P)', ')'])
        self.assertEqual(tokeniseBEL(self.s4), ['act', '(', 'p', '(', 'HGNC:HIF1AN', ')', ')', ' -| ', 'complex', '(', 'p', '(', 'HGNC:HIF1A', ')', ',', 'p', '(', 'HGNC:EP300', ')', ')'])
        self.assertEqual(tokeniseBEL(self.s5), ['p', '(', 'MGI:Nfe2l2', ',', 'pmod(P)', ')', ' -> ', 'tloc', '(', 'p', '(', 'MGI:Nfe2l2', ')', ')'])

In [301]:
# Run the tests defined above in-process.
# argv is overridden so unittest does not parse the host process's own
# command-line arguments (presumably the notebook kernel's -- the first
# element is only a placeholder program name). exit=False stops
# unittest.main from calling sys.exit() once the run finishes.
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

..
----------------------------------------------------------------------
Ran 2 tests in 0.002s

OK
