In [1]:
import glob
import re
import sys
import itertools
import numpy as np
import math

In [2]:
# Initialise PyRosetta once for the whole notebook; every helper below relies on it.
# The option string names the same "beta_nov16" energy function that repack_pose()
# requests from the ScoreFunctionFactory.
# NOTE(review): "-mute all" presumably silences Rosetta's tracer output - confirm.
import pyrosetta
pyrosetta.init( extra_options = "-mute all -corrections::beta_nov16" )

PyRosetta-4 2021 [Rosetta PyRosetta4.conda.linux.cxx11thread.serialization.CentOS.python37.Release 2021.26+release.b308454c455dd04f6824cc8b23e54bbb9be2cdd7 2021-07-02T13:01:54] retrieved from: http://www.pyrosetta.org
(C) Copyright Rosetta Commons Member Institutions. Created in JHU by Sergey Lyskov and PyRosetta Team.


In [3]:
def load_repeat_pose(pdb):
    """Read the PDB file at path ``pdb`` with PyRosetta and return the resulting pose."""
    return pyrosetta.pose_from_pdb(pdb)

In [4]:
def determine_repeats(pose, mute=1, max_repeat_length=0):
    """Guess the repeat length of a repeat protein from sequence self-similarity.

    For every candidate length ``rep`` in ``range(10, max_repeat_length)`` the
    sequence is cut into consecutive fragments of ``rep`` residues, every pair
    of fragments is compared position by position, and the fraction of
    identical positions is recorded.  The candidate with the highest fraction
    is returned; on a tie the shortest candidate wins.

    Args:
        pose: object whose ``sequence()`` method returns the one-letter sequence.
        mute: falsy to print the fragments and match fractions while scanning.
        max_repeat_length: exclusive upper bound of the candidate lengths.
            A falsy value means half of the sequence length.

    Returns:
        int: the best-scoring candidate repeat length.

    Raises:
        ValueError: if there is no candidate length to test, i.e. the upper
            bound is 10 or less.
    """
    sequence = pose.sequence()
    seq_len = len(sequence)
    if not max_repeat_length:
        max_repeat_length = int(seq_len/2)

    repeat_ranges = list(range(10, max_repeat_length))
    if not repeat_ranges:
        raise ValueError(
            'No candidate repeat length to test: sequence length is {0}, '
            'max_repeat_length is {1}'.format(seq_len, max_repeat_length))

    match_fraction = []
    for rep in repeat_ranges:
        # Slicing past the end of the string is clamped, so the last fragment
        # may be shorter than `rep`.
        n_frags = int((seq_len+1)/rep)
        seq_frags = [sequence[r*rep: r*rep+rep] for r in range(n_frags)]
        if not mute: print ('\n'.join(seq_frags))

        total = 0
        match = 0
        for fragA, fragB in itertools.combinations(seq_frags, 2):
            # zip stops at the shorter fragment, so only shared positions count.
            for resA, resB in zip(fragA, fragB):
                total += 1
                if resA == resB:
                    match += 1

        # A candidate longer than about half the sequence yields fewer than two
        # fragments and therefore nothing to compare; score it 0.0 instead of
        # dividing by zero.
        fraction = match/total if total else 0.0
        if not mute: print(fraction)
        if not mute: print()
        match_fraction.append(fraction)

    maximum = max(match_fraction)
    best_index = match_fraction.index(maximum)
    if not mute: print (maximum)
    if not mute: print (best_index)
    return repeat_ranges[best_index]

In [5]:
def select_surface(pose, supplement=['E','R','K','H'], exclude=['G', 'P', 'C', 'A', 'S'], mute=1):
    """Return 1-based residue numbers treated as surface positions of ``pose``.

    The list is built in two passes:
      1. residues picked by a ``LayerSelector`` (side-chain-neighbour mode,
         cutoffs 5.2 / 3.5, only the third layer flag enabled) whose one-letter
         code is not in ``exclude``;
      2. every residue whose one-letter code is in ``supplement``, regardless
         of what the selector said.
    A residue that satisfies both appears twice in the result.

    Args:
        pose: PyRosetta pose to analyse.
        supplement: one-letter codes that are always added.
        exclude: one-letter codes dropped from the selector's picks.
        mute: falsy to print a PyMOL-style ``color`` command for the selection.

    Returns:
        list[int]: 1-based residue numbers (may contain duplicates).
    """
    # Use layer selector to generate base list of surface residues
    surface_selector = pyrosetta.rosetta.core.select.residue_selector.LayerSelector()
    surface_selector.set_use_sc_neighbors(1)
    surface_selector.set_cutoffs(5.2,3.5)
    surface_selector.set_layers(False, False, True)
    seq = pose.sequence()
    surface_resi = [
        i+1
        for i, selected in enumerate(surface_selector.apply(pose))
        if selected and seq[i] not in exclude
    ]

    # Supplement surface residues by adding certain residue types
    surface_resi.extend(i+1 for i, s in enumerate(seq) if s in supplement)

    if not mute:
        # Use the pose that was passed in (not a notebook-level global) and
        # remove the '.pdb' suffix explicitly: str.strip('.pdb') would strip
        # any leading/trailing '.', 'p', 'd' or 'b' characters instead.
        pdb_namestem = pose.pdb_info().name().split('/')[-1]
        if pdb_namestem.endswith('.pdb'):
            pdb_namestem = pdb_namestem[:-len('.pdb')]
        print ('#SURFACE:\ncolor magenta, resi {0} and {1}'.format('+'.join([str(r) for r in surface_resi]), pdb_namestem))
    return surface_resi
#select_surface(p, mute=0)

In [6]:
def select_helices(pose, mute=1):
    """Return 1-based residue numbers of ``pose`` that are treated as helical.

    A residue is a helix candidate when both its phi and psi angles are
    negative.  Candidates that the ``SecondaryStructureSelector`` (set to
    ``'L'``) marks as loop are discarded unless the residue is Q or R.

    Args:
        pose: PyRosetta pose to analyse.
        mute: falsy to print a PyMOL-style ``color`` command for the selection.

    Returns:
        list[int]: 1-based residue numbers, in ascending order.
    """
    seq = pose.sequence()

    loop_selector = pyrosetta.rosetta.core.select.residue_selector.SecondaryStructureSelector()
    loop_selector.set_selected_ss('L')
    loops = loop_selector.apply(pose)  # indexed from 1, like residue numbers

    helix_resi = []
    for r in range(1, pose.size()+1):
        helical = pose.phi(r) < 0 and pose.psi(r) < 0
        # Correcting for DSSP pathology with DHRs: a loop call overrides the
        # phi/psi test except for Q and R residues.
        if loops[r] and seq[r-1] not in ['Q','R']:
            helical = False
        if helical:
            helix_resi.append(r)

    if not mute:
        # Use the pose that was passed in (not a notebook-level global) and
        # remove the '.pdb' suffix explicitly: str.strip('.pdb') would strip
        # any leading/trailing '.', 'p', 'd' or 'b' characters instead.
        pdb_namestem = pose.pdb_info().name().split('/')[-1]
        if pdb_namestem.endswith('.pdb'):
            pdb_namestem = pdb_namestem[:-len('.pdb')]
        print ('#HELICES:\ncolor magenta, resi {0} and {1}'.format('+'.join([str(r) for r in helix_resi]), pdb_namestem))
    return helix_resi
# select_helices(p)

In [7]:
def select_repeat_surfaces(pose, mute=1, repeat=0):
    """Select helix-surface positions that recur across the repeats of ``pose``.

    The helix surface is the intersection of ``select_surface`` and
    ``select_helices``.  For each position 1..``repeat`` within a repeat unit,
    the equivalent residues in every repeat (position + k * repeat, as long as
    they lie inside the pose) are collected.  If more than 60 % of them belong
    to the helix surface, all of them are added to the result - including the
    copies that are not themselves helix-surface residues.

    Args:
        pose: PyRosetta pose to analyse.
        mute: falsy to print PyMOL-style ``color`` commands for the selections.
        repeat: repeat length in residues; a falsy value means it is computed
            with ``determine_repeats``.

    Returns:
        list[int]: 1-based residue numbers, grouped by position within the repeat.
    """
    if not repeat:
        repeat = determine_repeats(pose, mute=1)
    surface = select_surface(pose, mute=1)
    helices = select_helices(pose, mute=1)
    helix_surface = list(set(surface) & set(helices))

    pdb_namestem = None
    if not mute:
        # Use the pose that was passed in (not a notebook-level global) and
        # remove the '.pdb' suffix explicitly: str.strip('.pdb') would strip
        # any leading/trailing '.', 'p', 'd' or 'b' characters instead.
        pdb_namestem = pose.pdb_info().name().split('/')[-1]
        if pdb_namestem.endswith('.pdb'):
            pdb_namestem = pdb_namestem[:-len('.pdb')]
        print ('#HELIX_SURFACES:\ncolor cyan, resi {0} and {1}'.format('+'.join([str(r) for r in helix_surface]), pdb_namestem))

    n_residues = pose.size()
    # One more than the rounded repeat count; out-of-range copies are filtered below.
    Nrepeat_to_try = round(n_residues / repeat) + 1
    helix_surface_lookup = set(helix_surface)

    helix_surface_repeats = []
    for residue in range(1, repeat+1):
        # Residue numbers are 1-based, so the last residue (== n_residues) is valid.
        shift_list = [
            Nrepeat*repeat + residue
            for Nrepeat in range(Nrepeat_to_try)
            if Nrepeat*repeat + residue <= n_residues
        ]
        if not shift_list:
            continue
        hits = sum(1 for shifted in shift_list if shifted in helix_surface_lookup)
        # Covers both "present in every repeat" and "present in most repeats".
        if hits / len(shift_list) > 0.6:
            helix_surface_repeats.extend(shift_list)

    if not mute:
        print ('#HELIX_SURFACE_REPEATS:\ncolor magenta, resi {0} and {1}'.format('+'.join([str(r) for r in helix_surface_repeats]), pdb_namestem))
        print()

    return helix_surface_repeats
#select_repeat_surfaces(p, mute=0)

In [8]:
def select_and_split_residues(pose, mute=1, repeat=0):
    """Split the repeat helix-surface selection into two lists by repeat half.

    Residues returned by ``select_repeat_surfaces`` are assigned to ``a`` when
    their 1-based position within their repeat unit is below ``repeat / 2`` and
    to ``b`` otherwise.

    Args:
        pose: PyRosetta pose to analyse.
        mute: falsy to print PyMOL-style ``color`` commands; the flag is also
            forwarded to ``select_repeat_surfaces``.
        repeat: repeat length in residues; a falsy value means it is computed
            with ``determine_repeats``.

    Returns:
        tuple[list[int], list[int]]: ``(a, b)``, 1-based residue numbers in the
        order produced by ``select_repeat_surfaces``.
    """
    if not repeat:
        repeat = determine_repeats(pose)
    selected_residues = select_repeat_surfaces(pose, mute=mute, repeat=repeat)

    a = []
    b = []
    for residue in selected_residues:
        # Residue numbers are 1-based: subtract 1 before dividing so that the
        # last residue of a repeat (an exact multiple of `repeat`) stays in its
        # own repeat instead of being counted as the start of the next one.
        start_of_repeat = ((residue - 1) // repeat) * repeat
        middle_of_repeat = start_of_repeat + repeat/2.0
        if residue < middle_of_repeat:
            a.append(residue)
        else:
            b.append(residue)

    if not mute:
        # Remove the '.pdb' suffix explicitly: str.strip('.pdb') would strip any
        # leading/trailing '.', 'p', 'd' or 'b' characters instead.
        pdb_namestem = pose.pdb_info().name().split('/')[-1]
        if pdb_namestem.endswith('.pdb'):
            pdb_namestem = pdb_namestem[:-len('.pdb')]
        print ('#a:\ncolor orange, resi {0} and {1}'.format('+'.join([str(r) for r in a]), pdb_namestem))
        print ('#b:\ncolor tv_blue, resi {0} and {1}'.format('+'.join([str(r) for r in b]), pdb_namestem))
    return (a, b)
#select_and_split_residues(p)

In [9]:
def repack_pose(pose):
    """Run FastRelax on ``pose`` in place with only side-chain (chi) moves enabled.

    Backbone and jump degrees of freedom are switched off in the move map, and
    the "beta_nov16" score function is used.  Nothing is returned.
    """
    scorefxn = pyrosetta.rosetta.core.scoring.ScoreFunctionFactory.create_score_function("beta_nov16")

    # Side chains only: backbone and jumps stay fixed.
    sidechain_only = pyrosetta.MoveMap()
    sidechain_only.set_jump(False)
    sidechain_only.set_bb(False)
    sidechain_only.set_chi(1)

    relaxer = pyrosetta.rosetta.protocols.relax.FastRelax(scorefxn)
    relaxer.set_movemap(sidechain_only)
    relaxer.apply(pose)

In [10]:
def trim_and_recap_dhr(pose, max_length, repack=1, mute=1, recap=1, max_repeat_length=65):
    """Shorten a repeat protein to a whole number of repeats within ``max_length``.

    The first ``floor(max_length / repeat)`` repeats are kept.  With ``recap``
    set, the residues of the last kept repeat are then replaced by the residues
    of the original pose's last repeat.  With ``repack`` set, the result is
    passed through ``repack_pose``.

    Args:
        pose: PyRosetta pose whose length is a whole multiple of its repeat length.
        max_length: maximum number of residues allowed in the result.
        repack: truthy to repack the trimmed pose.
        mute: falsy to print the old and new cap residue ranges.
        recap: truthy to copy the original last repeat onto the new last repeat.
        max_repeat_length: forwarded to ``determine_repeats``.

    Returns:
        The input ``pose`` itself when it is shorter than ``max_length``,
        otherwise a new trimmed pose.

    Raises:
        AssertionError: if the pose length is not a multiple of the repeat length.
        ValueError: if not even one repeat fits into ``max_length``.
    """
    repeat = determine_repeats(pose, mute=1, max_repeat_length=max_repeat_length)
    rep_n = pose.size()/repeat
    assert rep_n == int(rep_n), 'Pose not integer number of repeat'

    if pose.size() < max_length:
        return pose

    new_repeat_number = math.floor(max_length / repeat)
    if new_repeat_number < 1:
        # Without this check the region 1..0 would be requested below.
        raise ValueError(
            'max_length ({0}) is shorter than one repeat ({1} residues)'.format(max_length, repeat))
    length = repeat * new_repeat_number
    trimmed_pose = pyrosetta.rosetta.protocols.grafting.return_region(pose, 1, length)

    # Residue numbers of the last repeat in the original and in the trimmed pose.
    original_cap_positions = list(range(pose.size()-repeat+1, pose.size()+1))
    new_cap_positions = list(range(trimmed_pose.size()-repeat+1, trimmed_pose.size()+1))
    if not mute:
        print (original_cap_positions,len(original_cap_positions), 'original_cap_positions')
        print (new_cap_positions,len(new_cap_positions), 'new_cap_positions')

    if recap:
        for old, new in zip(original_cap_positions, new_cap_positions):
            trimmed_pose.replace_residue( new, pose.residue(old), 1 )

    if repack:
        print('Repacking trimmed pose...')
        repack_pose(trimmed_pose)
        print('Packed!')

    return trimmed_pose

# print (p)
# print (trim_and_recap_dhr(p, 121, 0))

In [11]:
#######################################################################################
# DIRECTORY CONTAINING INPUTS
#######################################################################################

# Sorted so that poses are processed, and TSV rows written, in the same order on
# every run; the order returned by glob.glob() depends on the file system.
input_pdbs = sorted(glob.glob('./scaffolds/input_dhrs/*pdb'))
input_poses = [load_repeat_pose(pdb) for pdb in input_pdbs]

# Maximum length in residues handed to trim_and_recap_dhr; a falsy value disables trimming.
trim = 156

# Trimmed PDB files and the selection table are written into this directory.
collection_path = './scaffolds/'

tsv_rows = []
# NOTE(review): keep the loop variable named `p` unless every helper above is
# confirmed to use only its own `pose` argument.
for p in input_poses:
    pdb = p.pdb_info().name().split('/')[-1]
    repeat = determine_repeats(p, mute=1, max_repeat_length=60)

    rep_n = p.size()/repeat
    assert rep_n == int(rep_n), '{0} pose is not integer number of repeat'.format(pdb)

    if trim:
        # Not even a single repeat fits into the length limit.
        if repeat > trim:
            print ('#Skipping: ', pdb)
            continue
        p = trim_and_recap_dhr(p, trim, repack=0, mute=1, max_repeat_length=60)

    # Name the (possibly new) pose after the input file before writing it out.
    p.pdb_info().name(pdb)
    p.dump_pdb(collection_path+'/'+pdb)

    a_select, b_select = select_and_split_residues(p, mute=0, repeat=repeat)

    # Columns: file name, repeat length, repeat count, '+'-joined a list, '+'-joined b list.
    new_rep_n = p.size()/repeat
    row = '\t'.join([pdb, str(repeat), str(int(new_rep_n)), '+'.join([str(a) for a in a_select]), '+'.join([str(b) for b in b_select])])

    tsv_rows.append(row)

with open('{0}/dhr_surface_raw_selections.tsv'.format(collection_path), 'w') as selection_output:
    print('\n'.join(tsv_rows), file=selection_output)

#HELIX_SURFACES:
color cyan, resi 4+5+7+8+10+11+12+14+15+18+24+25+27+28+31+32+35+38+45+48+51+52+55+56+59+65+68+69+72+76+79+86+88+89+92+93+95+96+97+99+100+106+108+109+110+112+113+114+116+117+119+120 and 2H_15_ca
#HELIX_SURFACE_REPEATS:
color magenta, resi 4+45+86+7+48+89+10+51+92+11+52+93+14+55+96+15+56+97+18+59+100+24+65+106+27+68+109+28+69+110+31+72+113+32+73+114+35+76+117+38+79+120 and 2H_15_ca

#a:
color orange, resi 4+45+86+7+48+89+10+51+92+11+52+93+14+55+96+15+56+97+18+59+100 and 2H_15_ca
#b:
color tv_blue, resi 24+65+106+27+68+109+28+69+110+31+72+113+32+73+114+35+76+117+38+79+120 and 2H_15_ca
#HELIX_SURFACES:
color cyan, resi 2+3+4+5+6+7+9+10+13+14+16+17+21+22+23+25+26+29+30+33+36+37+42+44+45+46+49+53+56+57+61+62+65+66+69+73+76+77+82+84+85+86+88+89+92+93+96+97+101+102+104+105+106+108+109+111+112+113+115+116+117 and DHR14_5CWH_XtalFit
#HELIX_SURFACE_REPEATS:
color magenta, resi 2+42+82+4+44+84+5+45+85+6+46+86+9+49+89+13+53+93+16+56+96+17+57+97+21+61+101+22+62+102+25+65+105+26+66+1

In [None]:


##############################################################################################
#
# Look at selections and edit dhr_surface_raw_selections.tsv accordingly, save as:
#     dhr_surface_cooked_selections.tsv
#
# Continue with 02_setup_for_sequence_sampling.ipynb
#
#############################################################################################

