In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width: 95% !important; }</style>"))

In [2]:
import numpy, pandas, re
debug = False

In [3]:
with open('./tf-tfbs.txt', 'r') as infile:
    data = pandas.read_csv(infile, delimiter = '\t', header = None, comment = '#')
data

Unnamed: 0,0,1
0,"[crp, SMALL-CAMP, crp, SMALL-CAMP]",BS-83-104
1,"[crp, SMALL-CAMP, crp, SMALL-CAMP]","[BS-83-104, BS-araB-pro1]"
2,araC,BS-35-51
3,araC,BS-56-72
4,araC,BS-109-125
5,araC,BS-130-146
6,araC,BS-267-283
7,"[araC, BS-56-72]","[araC, BS-267-283, BS-araC-pro1]"
8,"[araC, SMALL-alpha-L-arabinopyranose]",BS-35-51
9,"[araC, BS-56-72]",SMALL-alpha-L-arabinopyranose


In [4]:
description = []
RULE_LHS = []
for i in data.index:
    # data
    agents = (data.iloc[i, 0] + ', ' + data.iloc[i, 1])
    names = agents.split(', ')
    
    if debug:
        print(data.iloc[i, 0] + ' interacts with ' + data.iloc[i, 1])
    
    ## form the LHS
    LHS = []
    next_in_complex = False
    for name in names:
        if name[0] == '[': # we are dealing with the first monomer of a complex
            molecule = name[1:]
            next_in_complex = True
        elif name[-1] == ']': # we are dealing with the last monomer of a complex
            molecule = name[:-1]
            next_in_complex = False
        elif next_in_complex: # we are dealing with a monomer part of a complex
            molecule = name
        else:
            molecule = name
            linked = 'None'
            
        if 'BS' in name:
            if 'pro' in name:
                molecule = '{:s}\', type = \'{:s}'.format(molecule.split('-')[-2], molecule.split('-')[-1])
            LHS.append('dna(name = \'{:s}\', prot = dna_link, free = \'True\', up = bs_link, dw = bs_link)' \
                       .format(molecule))
        elif 'SMALL' in name:
            LHS.append('met(name = \'{:s}\', prot = met_link)' \
                       .format(molecule.replace('SMALL-', '')))
        else:
            LHS.append('prot(name = \'{:s}\', dna = dna_link, met = met_link, up = prot_link, dw = prot_link)' \
                       .format(molecule))
            
    ## look for where starts and ends a complex in the LHS    
    complexes = [(m.start()+1, m.end()-1) for m in re.finditer(r'\[[A-Za-z0-9-_, ]+\]', agents)]
    monomers = [(m.start(), m.end()) for m in re.finditer(r'[A-Za-z0-9-_]+', agents)]
    
    positions = []
    for cplx_pos in reversed(complexes):
        pos_i = None
        pos_f = None
        for index, kmer_pos in enumerate(monomers):
            if cplx_pos[0] == kmer_pos[0]:
                pos_i = index
            if cplx_pos[1] == kmer_pos[1]:
                pos_f = index
                positions.append((pos_i, pos_f))
                break

    ## join complexes following start and end positions
    for position in positions:
        ## join agents and remove from LHS list because they were joined into one position
        LHS[position[0]] = ' %\n    '.join(LHS[position[0]:position[1]+1])
        for index in reversed(range(position[0]+1, position[1]+1)):
            LHS.pop(index)
            
    ## create numbered links
    starter_link = 1
    for index, agent in enumerate(LHS):
        count_monomers = len(agent.split('%'))
        count_small = agent.count('met(')
        count_prots = agent.count('prot(')
        count_dnas = agent.count('dna(')

        if count_prots > 1:
            dw = [None] * count_prots
            for prot in range(count_prots-1):
                dw[prot] = starter_link
                starter_link += 1
            up = dw[-1:] + dw[:-1]
            ## and replace indexes
            c = list(zip(up, dw))
            c = [elt for sublist in c for elt in sublist]
            LHS[index] = LHS[index].replace('prot_link', '{}').format(*c)
                   
        if count_small >= 1 and count_prots >= 1:
            dw = [None] * (count_small + count_prots)
            for met in numpy.arange(0, count_small + count_prots, 2):
                dw[met] = starter_link
                dw[met-1] = starter_link
                starter_link += 1
            ## and replace indexes
            LHS[index] = LHS[index].replace('met_link', '{}').format(*tuple(dw))
        
        if count_dnas > 1:
            dw = ['WILD'] * count_dnas
#             for dna in range(count_dnas-1):
#                 dw[dna] = starter_link
#                 starter_link += 1
            up = dw[-1:] + dw[:-1]
            ## and replace indexes
            c = list(zip(up, dw))
            c = [elt for sublist in c for elt in sublist]
            LHS[index] = LHS[index].replace('bs_link', '{}').format(*c)
            
        if count_dnas >= 1 and count_prots >= 1: # a protein is complexed with the dna
            dw = [None] * (count_prots + count_dnas)
            for dna in range(count_prots + count_dnas):
                if dna == count_prots:
                    dw[dna] = starter_link
                    dw[dna-1] = starter_link
                    starter_link += 1
            ## and replace indexes
            LHS[index] = LHS[index].replace('True', 'False').replace('dna_link', '{}').format(*dw)
        
        ## final replace
        LHS[index] = LHS[index].replace('prot_link', 'None')
        LHS[index] = LHS[index].replace('met_link', 'None')
        LHS[index] = LHS[index].replace('bs_link', 'WILD')
        LHS[index] = LHS[index].replace('dna_link', 'None')
        
    ## LHS final join
    LHS = ' +\n    '.join(LHS)
    RULE_LHS.append(LHS)
    
    description.append('# ' + data.iloc[i, 0] + ' interacts with ' + data.iloc[i, 1])
    
#     print(LHS)
#     print()

In [5]:
RULE_RHS = []
for i in data.index:
    ## data
    agents = (data.iloc[i, 0] + ', ' + data.iloc[i, 1]).replace('[', '').replace(']', '')
    names = agents.split(', ')

    ## write the RHS
    RHS = []
    for index, name in enumerate(names):
        if name[0] == '[': # we are dealing with the first monomer of a complex
            molecule = name[1:]
            next_in_complex = True
        elif name[-1] == ']': # we are dealing with the last monomer of a complex
            molecule = name[:-1]
            next_in_complex = False
        elif next_in_complex: # we are dealing with a monomer part of a complex
            molecule = name
        else:
            molecule = name
          
        if 'BS' in name:
            if 'pro' in name:
                molecule = '{:s}\', type = \'{:s}'.format(molecule.split('-')[-2], molecule.split('-')[-1])
            RHS.append('dna(name = \'{:s}\', prot = dna_link, free = \'False\', up = bs_link, dw = bs_link)' \
                       .format(molecule))
        elif 'SMALL' in name:
            RHS.append('met(name = \'{:s}\', prot = met_link)' \
                       .format(molecule.replace('SMALL-', '')))
        else:
            RHS.append('prot(name = \'{:s}\', dna = dna_link, met = met_link, up = prot_link, dw = prot_link)' \
                       .format(molecule))

    ## join complexes
    RHS = ' %\n    '.join(RHS)
            
    ## create numbered links
    agent = RHS
    count_monomers = len(agent.split('%'))
    count_small = agent.count('met(')
    count_prots = agent.count('prot(')
    count_dnas = agent.count('dna(')

    starter_link = 1
    if count_prots > 1:
        dw = [None] * count_prots
        for prot in range(count_prots-1):
            dw[prot] = starter_link
            starter_link += 1
        up = dw[-1:] + dw[:-1]
        ## and replace indexes
        c = list(zip(up, dw))
        c = [elt for sublist in c for elt in sublist]
        RHS = RHS.replace('prot_link', '{}').format(*c)

    if count_small >= 1:
        dw = [None] * (count_small + count_prots)
        for met in numpy.arange(0, count_small + count_prots, 2):
            dw[met] = starter_link
            dw[met-1] = starter_link
            starter_link += 1
        ## and replace indexes
        RHS = RHS.replace('met_link', '{}').format(*tuple(dw))

    if count_dnas > 1:
        dw = ['WILD'] * count_dnas
#         for dna in range(count_dnas-1):
#             dw[dna] = starter_link
#             starter_link += 1
        up = dw[-1:] + dw[:-1]
        ## and replace indexes
        c = list(zip(up, dw))
        c = [elt for sublist in c for elt in sublist]
        RHS = RHS.replace('bs_link', '{}').format(*c)
    
    ## always
    dw = [None] * (count_prots + count_dnas)
    for dna in range(count_prots + count_dnas):
        if dna == count_prots:
            dw[dna] = starter_link
            dw[dna-1] = starter_link
            starter_link += 1
    up = dw[-1:] + dw[:-1]
    ## and replace indexes
    RHS = RHS.replace('dna_link', '{}').format(*dw)
    
    ## final replace
    RHS = RHS.replace('prot_link', 'None')
    RHS = RHS.replace('met_link', 'None')
    RHS = RHS.replace('bs_link', 'WILD')
    RHS = RHS.replace('dna_link', 'None')
    
    RULE_RHS.append(RHS)
    
#     print(RHS)
#     print()

In [6]:
for index, _ in enumerate(data.index):
    ## complete rule
    name = 'TranscriptionFactorMet_AssemblyRule_' + str(index+1)
    print('{:s}\n' \
          'Rule(\'{:s}\', \n' \
          '    {:s} | \n' \
          '    {:s}, \n' \
          '    Parameter(\'fwd_{:s}\', 0),\n' \
          '    Parameter(\'rvs_{:s}\', 0))' \
          .format(description[index], name, RULE_LHS[index], RULE_RHS[index], name, name).replace('-', '_'))
    print()

# [crp, SMALL_CAMP, crp, SMALL_CAMP] interacts with BS_83_104
Rule('TranscriptionFactorMet_AssemblyRule_1', 
    prot(name = 'crp', dna = None, met = 2, up = None, dw = 1) %
    met(name = 'CAMP', prot = 3) %
    prot(name = 'crp', dna = None, met = 3, up = 1, dw = None) %
    met(name = 'CAMP', prot = 2) +
    dna(name = 'BS_83_104', prot = None, free = 'True', up = WILD, dw = WILD) | 
    prot(name = 'crp', dna = None, met = 2, up = None, dw = 1) %
    met(name = 'CAMP', prot = 3) %
    prot(name = 'crp', dna = 4, met = 3, up = 1, dw = None) %
    met(name = 'CAMP', prot = 2) %
    dna(name = 'BS_83_104', prot = 4, free = 'False', up = WILD, dw = WILD), 
    Parameter('fwd_TranscriptionFactorMet_AssemblyRule_1', 0),
    Parameter('rvs_TranscriptionFactorMet_AssemblyRule_1', 0))

# [crp, SMALL_CAMP, crp, SMALL_CAMP] interacts with [BS_83_104, BS_araB_pro1]
Rule('TranscriptionFactorMet_AssemblyRule_2', 
    prot(name = 'crp', dna = None, met = 2, up = None, dw = 1) %
    met(name = 'CA

In [7]:
## names for the dna agent
BS = []
for bs in ', '.join(data.iloc[:,1]).replace('[','').replace(']','').split(', '):
    if bs.startswith('BS') and 'pro' not in bs:
        BS.append(bs.replace('-','_'))
list(set(BS))

['BS_158_174',
 'BS_35_51',
 'BS_130_146',
 'BS_109_125',
 'BS_83_99',
 'BS_36_52',
 'BS_56_72',
 'BS_83_104',
 'BS_62_78',
 'BS_137_153',
 'BS_267_283',
 'BS_57_73']