In [219]:
import csv, re

In [220]:
def loadSentences(filename):
    """Read a tab-separated file with a header row into a list of row dicts.

    Args:
        filename: Path of the tab-delimited file. Its first line supplies
            the column names.

    Returns:
        A list with one mapping per data row, keyed by column name, exactly
        as produced by ``csv.DictReader``.
    """
    # The context manager closes the file even if parsing raises.
    # newline='' lets the csv module do its own line-ending handling, as
    # its documentation recommends.
    with open(filename, newline='') as f:
        return list(csv.DictReader(f, delimiter='\t'))

In [221]:
def tokeniseBEL(bs):
    """Split a BEL statement string into a flat list of normalised tokens.

    The statement is reduced to a placeholder template in three passes and
    the template is then expanded by ``stringToTokens``:

    1. every namespaced term such as ``p(HGNC:FOS)`` (optionally carrying
       ``,pmod(P)``) is replaced by ``@``;
    2. every relation keyword, together with the whitespace character on
       each side of it, is replaced by ``&``;
    3. every function name is replaced by ``$``.

    Args:
        bs: The BEL statement as a string.

    Returns:
        The token list built by ``stringToTokens`` from the template and the
        extracted terms, relations and functions.
    """
    termPattern = r'(a|bp|path|g|m|r|p)\(([A-Z]+)\:([^"]*?|".*?")(,pmod\(P\))?\)'
    relationPattern = r'\s((?:increases)|(?:decreases)|(?:directlyIncreases)|(?:directlyDecreases))\s'
    # NOTE(review): there are no word boundaries here, so these names are
    # matched anywhere in what is left of the statement after passes 1 and 2.
    functionPattern = r'((?:act)|(?:complex)|(?:tloc)|(?:deg)|(?:kin)|(?:tscript)|(?:cat))'

    # Pass 1: terms -> '@'
    terms = re.findall(termPattern, bs)
    template, termCount = re.subn(termPattern, '@', bs)
    assert termCount == len(terms)

    # Pass 2: relations -> '&'
    relations = re.findall(relationPattern, template)
    template, relationCount = re.subn(relationPattern, '&', template)
    assert relationCount == len(relations)

    # Pass 3: functions -> '$'
    functions = re.findall(functionPattern, template)
    template, functionCount = re.subn(functionPattern, '$', template)
    assert functionCount == len(functions)

    return stringToTokens(template, terms, relations, functions)

In [222]:
def normaliseBEL(bs):
    """Return the normalised form of a BEL statement as one string.

    The statement is tokenised with ``tokeniseBEL`` and the resulting tokens
    are concatenated without any separator.
    """
    tokens = tokeniseBEL(bs)
    normalised = ''.join(tokens)
    return normalised

In [223]:
def stringToTokens(template, terms, relations, functions):
    """Expand a placeholder template into a list of normalised tokens.

    The template is walked one character at a time. ``@`` is replaced by the
    tokens of the next term, ``&`` by the arrow for the next relation and
    ``$`` by the normalised name of the next function. Any other character
    is emitted unchanged as a token of its own.

    Args:
        template: String containing the ``@`` / ``&`` / ``$`` placeholders.
        terms: Term tuples in order of appearance. Index 0 is the term
            function, 1 the namespace, 2 the value and 3 the optional
            modification text (``''`` when absent).
        relations: Relation keywords in order of appearance. Each must be a
            key of the relation table below.
        functions: Function names in order of appearance. Each must be a key
            of the function table below.

    Returns:
        A list of string tokens.

    Raises:
        IndexError: If the template holds more placeholders of one kind than
            there are items supplied for it.
        KeyError: If a relation or function name is not in its table.
    """
    relationDict = {'increases':' -> ', 'decreases':' -| ', 'directlyIncreases':' -> ', 'directlyDecreases':' -| '}
    functionDict = {'act':'act', 'complex':'complex', 'tloc': 'tloc', 'deg': 'deg', 'kin':'act', 'tscript':'act', 'cat':'act'}

    # Consume reversed private copies: pop() from the end is O(1), it still
    # raises IndexError when an input runs dry, and the caller's sequences
    # are left untouched (the previous pop(0) emptied them).
    pendingTerms = list(terms)[::-1]
    pendingRelations = list(relations)[::-1]
    pendingFunctions = list(functions)[::-1]

    aList = []
    for s in template:
        if s == '@': # Term
            termTuple = pendingTerms.pop()
            aList.extend([termTuple[0], '(', termTuple[1]+':'+termTuple[2]])
            if termTuple[3] != '':
                # A matched modification is always emitted as ',' + 'pmod(P)'.
                aList.extend([',', 'pmod(P)'])
            aList.append(')')
        elif s == '&': # Relation
            aList.append(relationDict[pendingRelations.pop()])
        elif s == '$': # Function
            aList.append(functionDict[pendingFunctions.pop()])
        else: # brackets and comma
            aList.append(s)
    return aList

In [224]:
# Group the BEL statements of the training set by sentence id.
sentences = loadSentences('dataset/Training.BEL')
BELofSentence = {}
for s in sentences:
    # The first four characters of the id column are dropped
    # (presumably a fixed prefix; TODO confirm against the dataset).
    sid = s['Sentence-ID'][4:]
    statement = s['BEL original'].strip()
    BELofSentence.setdefault(sid, []).append(statement)
print(BELofSentence['10000092'])
print(len(BELofSentence))

['cat(p(HGNC:UBE2L3)) increases deg(p(HGNC:FOS))', 'cat(p(HGNC:UBE2L3)) increases deg(p(HGNC:MYC))', 'cat(p(HGNC:UBE2L3)) increases deg(p(HGNC:TP53))']
6353


In [225]:
# Show the token lists for every statement of the sentence first, then the
# joined (normalised) strings for the same statements.
for render in (tokeniseBEL, normaliseBEL):
    for s in BELofSentence['10000092']:
        print(render(s))

['act', '(', 'p', '(', 'HGNC:UBE2L3', ')', ')', ' -> ', 'deg', '(', 'p', '(', 'HGNC:FOS', ')', ')']
['act', '(', 'p', '(', 'HGNC:UBE2L3', ')', ')', ' -> ', 'deg', '(', 'p', '(', 'HGNC:MYC', ')', ')']
['act', '(', 'p', '(', 'HGNC:UBE2L3', ')', ')', ' -> ', 'deg', '(', 'p', '(', 'HGNC:TP53', ')', ')']
act(p(HGNC:UBE2L3)) -> deg(p(HGNC:FOS))
act(p(HGNC:UBE2L3)) -> deg(p(HGNC:MYC))
act(p(HGNC:UBE2L3)) -> deg(p(HGNC:TP53))


In [226]:
import unittest

class TestDemo(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.s1 = 'cat(p(HGNC:UBE2L3)) increases deg(p(HGNC:FOS))'
        cls.s2 = 'a(CHEBI:"(2Z,3Z)-bis\{amino[(2-aminophenyl)sulfanyl]methylidene\}butanedinitrile") decreases (path(MESHD:Hyperoxia) increases tscript(p(MGI:Nfe2l2)))'
        cls.s3 = 'path(MESHD:Hyperoxia) increases p(MGI:Nfe2l2,pmod(P))'
        cls.s4 = 'cat(p(HGNC:HIF1AN)) directlyDecreases complex(p(HGNC:HIF1A),p(HGNC:EP300))'
    
    def test_normaliseBEL(self):
        self.assertEqual(normaliseBEL(self.s1), 'act(p(HGNC:UBE2L3)) -> deg(p(HGNC:FOS))')
        self.assertEqual(normaliseBEL(self.s2), 'a(CHEBI:"(2Z,3Z)-bis\{amino[(2-aminophenyl)sulfanyl]methylidene\}butanedinitrile") -| (path(MESHD:Hyperoxia) -> act(p(MGI:Nfe2l2)))')
        self.assertEqual(normaliseBEL(self.s3), 'path(MESHD:Hyperoxia) -> p(MGI:Nfe2l2,pmod(P))')
        self.assertEqual(normaliseBEL(self.s4), 'act(p(HGNC:HIF1AN)) -| complex(p(HGNC:HIF1A),p(HGNC:EP300))')
    
    def test_tokeniseBEL(self):
        self.assertEqual(tokeniseBEL(self.s1), ['act','(', 'p', '(', 'HGNC:UBE2L3', ')', ')', ' -> ', 'deg', '(', 'p', '(', 'HGNC:FOS', ')', ')'])
        self.assertEqual(tokeniseBEL(self.s2), ['a', '(', 'CHEBI:"(2Z,3Z)-bis\\{amino[(2-aminophenyl)sulfanyl]methylidene\\}butanedinitrile"', ')', ' -| ', '(', 'path', '(', 'MESHD:Hyperoxia', ')', ' -> ', 'act', '(', 'p', '(', 'MGI:Nfe2l2', ')', ')', ')'])
        self.assertEqual(tokeniseBEL(self.s3), ['path', '(', 'MESHD:Hyperoxia', ')', ' -> ', 'p', '(', 'MGI:Nfe2l2', ',', 'pmod(P)', ')'])
        self.assertEqual(tokeniseBEL(self.s4), ['act', '(', 'p', '(', 'HGNC:HIF1AN', ')', ')', ' -| ', 'complex', '(', 'p', '(', 'HGNC:HIF1A', ')', ',', 'p', '(', 'HGNC:EP300', ')', ')'])

In [227]:
if __name__ == '__main__':
    # An explicit argv is used instead of sys.argv; its first element only
    # stands in for the program name. exit=False stops unittest.main from
    # calling sys.exit() once the tests have run.
    unittest.main(
        exit=False,
        argv=['first-arg-is-ignored'],
    )

..
----------------------------------------------------------------------
Ran 2 tests in 0.002s

OK
