In [1]:
import glob
import re
import sys
import os
import itertools
import numpy as np
import math
import subprocess
import pyrosetta
pyrosetta.init( extra_options = "-mute all -corrections::beta_nov16" )

PyRosetta-4 2021 [Rosetta PyRosetta4.conda.linux.cxx11thread.serialization.CentOS.python37.Release 2021.31+release.c7009b3115c22daa9efe2805d9d1ebba08426a54 2021-08-07T10:04:12] retrieved from: http://www.pyrosetta.org
(C) Copyright Rosetta Commons Member Institutions. Created in JHU by Sergey Lyskov and PyRosetta Team.


In [2]:
def load_repeat_pose(pdb):
    pose = pyrosetta.pose_from_pdb(pdb)
    return (pose)

In [3]:
def sort_into_repeated_postions(repeat, residues):
    repeat_positions = {}
    residues = sorted(residues)
    #print (repeat, residues)
    for res in residues:
        if res <= repeat:
            repeat_positions[res] = [res]
        else:
            key = res
            while key > repeat:
                key -= repeat
            repeat_positions[key].append(res)
    return (repeat_positions)
#repeat_position_dict = sort_into_repeated_postions(repeat, select_a)

In [4]:
################################################################################
'RESIDUE SELECTORS'
################################################################################
def generate_residue_selector(position_dict, mute=1):
    residue_selectors = []
    selector_names = []
    selector_template = '''        <Index name="{0}" resnums="{1}" />'''     
    
    master_res = []
    for ur_res in position_dict:
        all_res = position_dict[ur_res]
        resnums_name = 'position{0}'.format(ur_res)
        resnums_string = ','.join([str(x) for x in all_res])
        residue_selectors.append(selector_template.format(resnums_name, resnums_string))
        selector_names.append(resnums_name)
        master_res.append(ur_res)
    
    residue_selectors.append(selector_template.format('ref_res', ','.join([str(x) for x in master_res])))
    residue_selectors.append( '        <Not name="not_ref_res" selector="ref_res"/>' )

    residue_selectors.append( '        <Or name="designable" selectors="{0}"/>'.format(','.join(selector_names) ) )
    residue_selectors.append( '        <Not name="not_designable" selector="designable"/>' )

    if not mute: print ('\n'.join(residue_selectors))
    return residue_selectors

# generate_residue_selector(repeat_position_dict) 

In [5]:
################################################################################
'TASK OPERATORS'
################################################################################
def generate_task_operators(selectors, identities):
    task_operators = []

    # RestrictToRepackingRLT
    task_operators.append('''        <OperateOnResidueSubset name="not_designable" selector="not_designable">
        <RestrictToRepackingRLT />
    </OperateOnResidueSubset>''')

    task_operators.append('''        <OperateOnResidueSubset name="not_ref_res" selector="not_ref_res">
        <RestrictToRepackingRLT />
    </OperateOnResidueSubset>''')
    
    for i, selector in enumerate(selectors):
        task_operators.append('''        <OperateOnResidueSubset name="{0}" selector="{0}">
        <RestrictAbsentCanonicalAASRLT aas="{1}" />
    </OperateOnResidueSubset>'''.format(selector, identities[i]))
    
    return (task_operators)
# generate_task_operators(['designable'],['EDKRQNH'])

In [6]:
################################################################################
'NCS BLOCK'
################################################################################
def generate_NCS_block(position_dict, mute=1):
    NCS_block_list = ['        <SetupNCS name="NCS" bb="0" chi="0" wt="1.00" symmetric_sequence="1" >']
    NCS_string = '            <NCSgroup source="{0}" target="{1}"/>'
    
    for ur_res in position_dict:
        all_res = position_dict[ur_res]
        target_res = [res for res in all_res if res != ur_res]
        
        for target in target_res:
            NCS_block_list.append(NCS_string.format(ur_res, target))

    NCS_block_list.append('        </SetupNCS>')
    NCS_block = '\n'.join(NCS_block_list)
    return NCS_block


In [7]:
################################################################################
'AA COMPOSITION CONSTRAINTS'
################################################################################
def write_composition_constraint(tsv_line, delta=0.05, path='./constraints', mute=1):
    OneToThree = {'G':'GLY','A':'ALA','V':'VAL','L':'LEU','I':'ILE','M':'MET','P':'PRO','F':'PHE','W':'TRP','S':'SER','T':'THR','N':'ASN','Q':'GLN','Y':'TYR','C':'CYS','K':'LYS','R':'ARG','H':'HIS','D':'ASP','E':'GLU'}
    constraint_template = '''PENALTY_DEFINITION
TYPE {0}
FRACT_DELTA_START -{1}
FRACT_DELTA_END {1}
PENALTIES 500 50 0 0 0 50 500
FRACTION {2}
BEFORE_FUNCTION LINEAR
AFTER_FUNCTION LINEAR
END_PENALTY_DEFINITION
'''
    if not len(tsv_line.strip('\n')): return
    line_list = tsv_line.strip('\n').split('\t')
    line_list = [element for element in line_list if len(element)] 
    print (line_list)
    assert len(line_list) >= 2, 'Assertion 1 failed. Cannot parse line:\n{0}'.format(tsv_line)
    assert '=' not in line_list[0], 'Assertion 2 failed. Cannot parse line:\n{0}'.format(tsv_line)
    for item in line_list[1:]: assert re.match(r'[A-Za-z]+\=\d+',item), 'Assertion 3 failed. Cannot parse line:\n{0}'.format(tsv_line)
    
    name = line_list[0]
    compositions = [item.split('=') for item in line_list[1:]]
    compositions = [(comp[0],float(comp[1])/100.0) for comp in compositions]
    
    filename = '{0}/{1}.comp'.format(path, name)
    with open(filename, 'w') as output_constraint:
        for residues, fraction in compositions:
            three_letter = ' '.join([OneToThree[aa] for aa in residues])
            formated_constraint = constraint_template.format(three_letter, delta, fraction)
            if not mute: print(formated_constraint)
            print (formated_constraint, file=output_constraint)
    
    return filename

#write_composition_constraint('Glu75	E=75	KRQ=0', mute=0)
################################################################################
'ANTI-REPEAT COMPOSITION CONSTRAINTS'
################################################################################

path='./constraints'

OneToThree = {'G':'GLY','A':'ALA','V':'VAL','L':'LEU','I':'ILE','M':'MET','P':'PRO','F':'PHE','W':'TRP','S':'SER','T':'THR','N':'ASN','Q':'GLN','Y':'TYR','C':'CYS','K':'LYS','R':'ARG','H':'HIS','D':'ASP','E':'GLU'}
constraint_template = '''PENALTY_DEFINITION
TYPE {0}
DELTA_START 0
DELTA_END 2
PENALTIES 0 0 10
ABSOLUTE 1
BEFORE_FUNCTION LINEAR
AFTER_FUNCTION LINEAR
END_PENALTY_DEFINITION
'''

with open(path+'/AntiRepeat.comp', 'w') as output_constraint:
    for aa in OneToThree:
        print (constraint_template.format(OneToThree[aa]), file=output_constraint)

In [8]:
##############################################################################################
#
# Continue with (processed) output from 01_check_in_dhrs.ipynb
#         inspected, edited, and renamed output should be renamed as 'dhr_surface_cooked_selections.tsv'
#
#############################################################################################

In [2]:
cd ./scaffolds

/home/pylesh/PATH/protein_mineral_library/scaffolds


In [3]:
ls

2H_15_cap.pdb
2H_26_cap.pdb
2H_28_cap.pdb
DHR10_5CWG_XtalFit.pdb
DHR14_5CWH_XtalFit.pdb
DHR49_5CWJ_XtalFit.pdb
DHR53_5CWK_XtalFit.pdb
EIQAQFQGDTQVQNG_seq16_0002.pdb
EVQNVNKF_seq19_0001.pdb
FD31_rep3.pdb
FI2000161.pdb
FI998143.pdb
FI998252.pdb
FI_AP_6MRR.pdb
FI_AP_6MRS.pdb
FQIGSSGQ_seq86_0004.pdb
KVSSNQVQQV_seq146_0006.pdb
PDL_0_4.pdb
PPR_c3a145_N2SLT.pdb
PPR_c3a145_OG.pdb
QAEGGQLQVQAQGNSQIEVGSNG_seq58_0004.pdb
QAQAQLQLQAQGGGDT_seq5_0001.pdb
QAQLQIQASGT_seq131_0002.pdb
QAQLQIQASGT_seq2_0002.pdb
QAQLQIQSSGSS_seq95_0002.pdb
QAQLQVQGSSV_seq131_0001.pdb
QFQVQLQAGSGEIQLSNSQLQIQAQIGTG_seq116_0008.pdb
QIQQGT_seq23_0008.pdb
QIQVQAQGSNT_seq85_0005.pdb
QIQVQIQSSGGS_seq106_0005.pdb
QNQVQLQGGS_seq134_0007.pdb
QVQAQLQVQSTG_seq137_0004.pdb
QVQAQLQVQSTG_seq144_0007.pdb
QVQIQVQAQAQG_seq49_0004.pdb
QVQVQIQSSGAS_seq131_0003.pdb
RiAFP_4DT5.pdb
THR_8_NSR_XtalFit.pdb
THR_DN_T6_XtalFit.pdb
abr_10.pdb
abr_3.pdb
[0m[01;34minput_dhrs[0m/


In [13]:
with open('./constraints/AA_compositions.tsv', 'r') as input_compositions:
    lines = input_compositions.readlines()
lines = [line for line in lines if not line.startswith('#')]

AA_comp_files = []

for line in lines:
    file = write_composition_constraint(line)
    if file: AA_comp_files.append(file)

['Thr90', 'T=90']


In [5]:
cd ..

/home/pylesh/PATH/protein_mineral_library


In [6]:
ls ./constraints/

AA_compositions.tsv  Neg90.comp     PosGre45.comp    Val10_Al50.comp
AntiRepeat.comp      NegDEqQN.comp  PosMidGr.comp    greNegDEY.comp
DEAroMidGr.comp      NegGre45.comp  PosMidGrDv.comp  lsNeGreDiv.comp
H40E20D20.comp       PolyAsp.comp   Q65N25.comp      posGreDiv.comp
H40LIM40.comp        PolyLys.comp   S35T35.comp
His20Cys20.comp      Pos60.comp     Thr60.comp
Neg60.comp           Pos90.comp     Thr90.comp


In [16]:
with open('surface_cooked_selections.tsv', 'r') as cooked_selections:
# with open('dhr_surface_cooked_selections.tsv', 'r') as cooked_selections:
    selection_lines = cooked_selections.readlines()

dhr_surfaces = []
for i, line in enumerate(selection_lines):
    try: pdb, repeat, rep_n, select_a, select_b = line.split()
    except ValueError: print ('ERROR: could not parse line #{0}: {1}'.format(i+1,line)); continue
    repeat = int(repeat)
    select_a = [int(a) for a in select_a.split('+')]
    select_b = [int(b) for b in select_b.split('+')]
    try:
        if not pdb.endswith('.pdb'): pdb =pdb+'.pdb'
        dhr_surfaces.append((load_repeat_pose(pdb), repeat, rep_n, select_a, select_b))
        subprocess.check_output(['cp','./scaffolds/'+pdb,'./sampling/'])
    except RuntimeError:
        print ('WARNING! could not load pdb:', pdb)

In [17]:
import xml_templates
import importlib
importlib.reload(xml_templates)
print (xml_templates.__file__)
print (xml_templates.xml_string)

/mnt/home/pylesh/designs/repeats/library_scripts/xml_templates.py

<ROSETTASCRIPTS>
    <SCOREFXNS>
        <ScoreFunction name="BetaNov16" weights="beta_nov16"/>

        <ScoreFunction name="DesignBetaNov16Cst" weights="beta_nov16">
            <Reweight scoretype="atom_pair_constraint" weight="1.0"/>
            <Reweight scoretype="dihedral_constraint" weight="1.0"/>                
            <Reweight scoretype="coordinate_constraint" weight="1.0"/>
            <Reweight scoretype="aa_composition" weight="1.0"/>
            <Reweight scoretype="arg_cation_pi" weight="3" />
            <Reweight scoretype="approximate_buried_unsat_penalty" weight="5.0" />
            <Set approximate_buried_unsat_penalty_assume_const_backbone="true" />
            <Set approximate_buried_unsat_penalty_natural_corrections1="true" />
            <Set approximate_buried_unsat_penalty_hbond_energy_threshold="-0.5" />
            <Set approximate_buried_unsat_penalty_hbond_bonus_cross_chain="0.0" />
 

In [7]:
cd ./sampling

/home/pylesh/PATH/protein_mineral_library/sampling


In [21]:
##############################################################################################
'''GENERATE XMLS AND GET READY FOR SAMPLING'''
#############################################################################################

sampling_path = './'

rosetta_scripts_executable = '/software/rosetta/latest/bin/rosetta_scripts.hdf5.linuxgccrelease'
rosetta_database = '/software/rosetta/main/database'

nstruct = 100
threads = 2

tasks = []
all_designs = []
all_pdb_stems = []
all_xmls = []
all_suffix_ids = []
all_aa_comps = []

pdb_dirs = []

for pose, repeat, rep_n, select_a, select_b in dhr_surfaces:
    pdb = pose.pdb_info().name()
    print (pdb)
    stem = re.sub(r'.*?/?([^//]*).pdb', r'\1', pdb)
    pdb_dirs.append(stem)
    
    if not os.path.isdir(stem):
        os.mkdir(stem)
        
    print (pdb, repeat, select_a, select_b)
    for surface, selection in zip(['surfA', 'surfB'], [select_a, select_b]):
        # print (surface, selection)
        repeat_position_dict = sort_into_repeated_postions(repeat, selection)
        # print (repeat_position_dict, 'repeat_position_dict')
        residue_selectors = '\n'.join(generate_residue_selector(repeat_position_dict))
        task_operators = '\n'.join(generate_task_operators(['designable'], ['EDKRQNHSATLIVWYPGCMF'])) #NOT excluding any
        # task_operators = '\n'.join(generate_task_operators(['designable'], ['EDKRQNHSATLIV'])) #excluding: WYPGCMF
        NCS = generate_NCS_block(repeat_position_dict) 
        
        XML_NAMES = []
        PROTOCOL_SUFFIX = []
        
        # REPEAT ENFORCED XML
        xml_name = '{0}/{1}/{1}_{2}_repeat.xml'.format(sampling_path, stem, surface)
        repeat_protocol = '''        <Add mover_name="NCS" />
        <Add mover_name="load_aa_comp_cst" />
        <Add mover_name="fast_design" />
        <Add mover_name="NCS" />
        <Add mover_name="monte_carlo_seq_design" />
        <Add mover_name="full_repack" />'''
        formated_xml = xml_templates.xml_string.format(residue_selectors, task_operators, NCS, '', repeat_protocol, rep_n)
        with open(xml_name, 'w') as xml_file:
            print(formated_xml, file=xml_file)
        XML_NAMES.append(xml_name)
        PROTOCOL_SUFFIX.append('repeat')
        
        # ANTI-REPEAT  XML
        xml_name = '{0}/{1}/{1}_{2}_antirep.xml'.format(sampling_path, stem, surface)
        anti_repeat_protocol = '''        <Add mover_name="load_aa_comp_cst" />
{0}
        <Add mover_name="fast_design" />
        <Add mover_name="full_repack" />'''
        repeat_selector_names = ['position{0}'.format(res) for res in sorted([res for res in repeat_position_dict])]
        anti_repeat_constraints = '\n'.join(['            <AddCompositionConstraintMover name="{0}" filename="/home/pylesh/designs/repeats/library/constraints/AntiRepeat.comp" selector="{0}" />'.format(selector) for selector in repeat_selector_names ])
        anti_repeat_cst_loaders = '\n'.join(['        <Add mover_name="{0}" />'.format(selector) for selector in repeat_selector_names ])
        formated_xml = xml_templates.xml_string.format(residue_selectors, task_operators, NCS, anti_repeat_constraints, anti_repeat_protocol.format(anti_repeat_cst_loaders), rep_n)
        with open(xml_name, 'w') as xml_file:
            print(formated_xml, file=xml_file)
        XML_NAMES.append(xml_name)
        PROTOCOL_SUFFIX.append('antirep')

        for aa_comp in AA_comp_files:
            for i, xml_file in enumerate(XML_NAMES):
                suffix_id = '{1}_{0}_{2}'.format(surface, aa_comp.strip('.comp').split('/')[-1], PROTOCOL_SUFFIX[i])
                if not os.path.isdir(stem+'/'+suffix_id):
                    os.mkdir(stem+'/'+suffix_id)
                design_name = stem+'_'+suffix_id
                all_designs.append(design_name)
                all_pdb_stems.append(stem)
                all_xmls.append(xml_file)
                all_suffix_ids.append(suffix_id)
                all_aa_comps.append(aa_comp)

for thread in range(1, threads+1):
    for design_name, pdb_stem, xml_file, suffix, aa_comp in zip(all_designs, all_pdb_stems, all_xmls, all_suffix_ids, all_aa_comps):
#         thread_name = '{0}_thread{1}'.format(design_name, thread)
        # xml_command = f'{rosetta_scripts_executable} -database {rosetta_database} @./rosetta_scripts.flag -parser:protocol {0} -s {1}.pdb -nstruct {2} -scorefile {5}/{6}/{4}_{7}_score.sc -parser:script_vars aa_comp={3} -out:file:silent {5}/{6}/{4}_{7}.silent -out:suffix _{6}_{7} -mute all'.format(xml_file, pdb_stem, nstruct, aa_comp, design_name, pdb_stem, suffix, 'thread{0}'.format(thread) )
        xml_command = f'{rosetta_scripts_executable} -database {rosetta_database} @./rosetta_scripts.flag -parser:protocol {xml_file} -s {pdb_stem}.pdb -nstruct {nstruct} -scorefile {pdb_stem}/{suffix}/{design_name}_{thread}_score.sc -parser:script_vars aa_comp=../{aa_comp} -out:file:silent {pdb_stem}/{suffix}/{design_name}_thread{thread}.silent -out:suffix _{suffix}_thread{thread} -mute all'
        tasks.append(xml_command)

                
with open(f'{sampling_path}/surface_design.list', 'w') as task_list:
    print('\n'.join(tasks), file=task_list)

with open(f'{sampling_path}/design_names.list', 'w') as name_list:
    print ('\n'.join(all_designs), file=name_list)

with open(f'{sampling_path}/clean_up.sh', 'w') as cleanup:
    for stem in pdb_dirs:
        print ('#rm -r '+stem, file=cleanup)

copied = subprocess.check_output(['cp', '../surface_design.sh', sampling_path])
copied = subprocess.check_output(['cp', '../rosetta_scripts.flag', sampling_path])

EIQAQFQGDTQVQNG_seq105_0008.pdb 15 [3, 18, 33, 48, 63, 78, 93, 108, 123, 138, 5, 20, 35, 50, 65, 80, 95, 110, 125, 140, 7, 22, 37, 52, 67, 82, 97, 112, 127, 142] [9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 14, 29, 44, 59, 74, 89, 104, 119, 134, 149]
surfA [3, 18, 33, 48, 63, 78, 93, 108, 123, 138, 5, 20, 35, 50, 65, 80, 95, 110, 125, 140, 7, 22, 37, 52, 67, 82, 97, 112, 127, 142]
{3: [3, 18, 33, 48, 63, 78, 93, 108, 123, 138], 5: [5, 20, 35, 50, 65, 80, 95, 110, 125, 140], 7: [7, 22, 37, 52, 67, 82, 97, 112, 127, 142]} repeat_position_dict
surfB [9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 14, 29, 44, 59, 74, 89, 104, 119, 134, 149]
{9: [9, 24, 39, 54, 69, 84, 99, 114, 129, 144], 11: [11, 26, 41, 56, 71, 86, 101, 116, 131, 146], 13: [13, 28, 43, 58, 73, 88, 103, 118, 133, 148], 14: [14, 29, 44, 59, 74, 89, 104, 119,

In [22]:
ls ./sampling/

[0m[01;34mEIQAQFQGDTQVQNG_seq105_0008[0m/
EIQAQFQGDTQVQNG_seq105_0008.pdb
[01;34mEIQAQFQGDTQVQNG_seq108_0008[0m/
EIQAQFQGDTQVQNG_seq108_0008.pdb
[01;34mEIQAQFQGDTQVQNG_seq16_0002[0m/
EIQAQFQGDTQVQNG_seq16_0002.pdb
[01;34mEIQAQFQGDTQVQNG_seq16_0004[0m/
EIQAQFQGDTQVQNG_seq16_0004.pdb
[01;34mEIQAQFQGDTQVQNG_seq27_0006[0m/
EIQAQFQGDTQVQNG_seq27_0006.pdb
[01;34mEIQAQFQGDTQVQNG_seq31_0002[0m/
EIQAQFQGDTQVQNG_seq31_0002.pdb
[01;34mEIQAQFQGDTQVQNG_seq35_0005[0m/
EIQAQFQGDTQVQNG_seq35_0005.pdb
[01;34mEIQAQFQGDTQVQNG_seq54_0001[0m/
EIQAQFQGDTQVQNG_seq54_0001.pdb
[01;34mEIQAQFQGDTQVQNG_seq84_0008[0m/
EIQAQFQGDTQVQNG_seq84_0008.pdb
[01;34mEIQAQFQGDTQVQNG_seq85_0002[0m/
EIQAQFQGDTQVQNG_seq85_0002.pdb
[01;34mEIQAQFQGDTQVQNG_seq9_0006[0m/
EIQAQFQGDTQVQNG_seq9_0006.pdb
[01;34mEVQNVNKF_seq104_0003[0m/
EVQNVNKF_seq104_0003.pdb
[01;34mEVQNVNKF_seq112_0004[0m/
EVQNVNKF_seq112_0004.pdb
[01;34mEVQNVNKF_seq125_0001[0m/
EVQNVNKF_seq125_0001.pdb
[01;34mEVQNVNKF_seq125_0005[0m/
EV