In [1]:
import itertools
import glob
import Bio
import re

In [2]:
import warnings
# Installs a catch-all "ignore" filter: every warning category is silenced for
# the rest of the session, not just the ones emitted while parsing structures.
# NOTE(review): this also hides DeprecationWarnings from the code below;
# consider narrowing the filter to the specific category that is noisy.
warnings.filterwarnings("ignore")

In [3]:
from Bio.PDB.PDBParser import PDBParser
# Single module-level parser instance; bio_get_atoms() below reuses it for
# every structure file it reads.
# NOTE(review): PERMISSIVE=1 presumably asks Biopython to tolerate recoverable
# problems in a PDB file instead of raising -- confirm against the Biopython docs.
parser = PDBParser(PERMISSIVE=1)

def bio_get_atoms(pdb, start, end, atom_names=['CA']):
    """Collect the named atoms of every residue numbered ``start``..``end``.

    The file at ``pdb`` is parsed with the module-level ``parser`` and only the
    first model of the resulting structure is used. All chains of that model
    are scanned; a residue is selected when its sequence number
    (``res.get_id()[1]``) lies in the inclusive range ``[start, end]``.

    Parameters
    ----------
    pdb : str
        Path to the structure file. The structure id handed to the parser is
        the file name without its ``.pdb`` part.
    start, end : int
        Inclusive residue-number bounds.
    atom_names : list of str
        Atom names looked up on each selected residue (default: ``'CA'`` only).
        Looking up a name the residue does not have raises ``KeyError``.

    Returns
    -------
    (list, model)
        The selected atom objects (in chain, residue, atom-name order) and the
        model they belong to.
    """
    structure_id = pdb.split('/')[-1].replace('.pdb', '')
    model = parser.get_structure(structure_id, pdb)[0]

    # Despite the name, this holds whatever res[atom] returns (atom objects),
    # not bare coordinates.
    atom_coords = [
        res[atom]
        for chain in model
        for res in chain
        if start <= res.get_id()[1] <= end
        for atom in atom_names
    ]
    return atom_coords, model


In [4]:
'From PdbSeq.py'
def GetFasta(PdbName):
    """Read the one-letter amino-acid sequence from the ATOM records of a PDB file.

    ATOM records are read by fixed column position (residue name in columns
    18-20, residue number in columns 23-26), so records whose fields touch
    (e.g. chain ``A`` followed by residue ``1000``) or whose chain ID is blank
    are still parsed correctly.

    A residue is appended only when its number is greater than the highest
    residue number seen so far. Consequently each residue is counted once no
    matter how many atoms it has, and residues whose numbering restarts or
    goes backwards (e.g. a second chain starting again at 1) are skipped.

    Parameters
    ----------
    PdbName : str
        Path to the PDB file.

    Returns
    -------
    (FastaName, Seq) : tuple of str
        ``FastaName`` is ``PdbName`` with ``.pdb`` removed and the characters
        ``. - : , ;`` replaced by ``_``; ``Seq`` is the one-letter sequence.

    Raises
    ------
    KeyError
        If a residue name is not in the three-letter lookup table.
    ValueError
        If the residue-number columns of an ATOM record are not an integer.
    """
    ThreeToOne = {'GLY':'G','ALA':'A','VAL':'V','LEU':'L','ILE':'I','MET':'M','PRO':'P','PHE':'F','TRP':'W','SER':'S','THR':'T','ASN':'N','GLN':'Q','TYR':'Y','CYS':'C','CYD':'C','LYS':'K','ARG':'R','HIS':'H','HIP':'H','ASP':'D','GLU':'E'}
    FastaName = PdbName.replace('.pdb','')
    FastaName = re.sub(r'[\.\-\:\,\;]',r'_',FastaName)
    AminoAcids = []
    Res = 0
    # Plain 'r' mode: the legacy 'U' flag was removed in Python 3.11 and text
    # mode already applies universal newlines. The context manager closes the
    # handle even if parsing raises.
    with open(PdbName, 'r') as Pdb:
        for Line in Pdb:
            if not Line.startswith('ATOM'):
                continue
            ResNum = int(Line[22:26])
            if ResNum > Res:
                Res = ResNum
                AminoAcids.append(ThreeToOne[Line[17:20].strip()])
    Seq = ''.join(AminoAcids)
    return FastaName, Seq

def determine_repeat_length(sequence, mute=1, max_repeat_length=0):
    """Estimate the repeat length of a repetitive sequence by self-comparison.

    For each candidate length ``rep`` in ``range(5, max_repeat_length)`` the
    sequence is cut into consecutive fragments of ``rep`` residues, every pair
    of fragments is compared position by position, and the fraction of
    identical positions is recorded. The candidate with the highest fraction
    is returned; ties go to the shortest candidate.

    Parameters
    ----------
    sequence : str
        Sequence to analyse.
    mute : bool or int
        When falsy, the fragments and scores of every candidate are printed.
    max_repeat_length : int
        Exclusive upper bound on the candidate lengths. ``0`` (the default)
        means half the sequence length.

    Returns
    -------
    int
        The best-scoring candidate repeat length.

    Raises
    ------
    ValueError
        If there is no candidate length to try, i.e. ``max_repeat_length``
        (or half the sequence length) is not greater than 5.
    """
    seq_len = len(sequence)
    if not max_repeat_length:
        max_repeat_length = seq_len // 2
    repeat_ranges = list(range(5, max_repeat_length))
    if not repeat_ranges:
        raise ValueError(
            'cannot determine repeat length: no candidate lengths in '
            'range(5, %d) for a sequence of length %d'
            % (max_repeat_length, seq_len)
        )

    match_fraction = []
    for rep in repeat_ranges:
        # The "+1" keeps a trailing fragment that is one residue short of a
        # full repeat; slicing clamps it to the end of the sequence.
        n_frags = (seq_len + 1) // rep
        seq_frags = [sequence[i * rep:(i + 1) * rep] for i in range(n_frags)]
        if not mute: print ('\n'.join(seq_frags))

        total = 0
        match = 0
        for fragA, fragB in itertools.combinations(seq_frags, 2):
            # zip stops at the shorter fragment, like the original index loop.
            for resA, resB in zip(fragA, fragB):
                total += 1
                if resA == resB:
                    match += 1

        # Fewer than two fragments means nothing was compared; score the
        # candidate as 0.0 instead of dividing by zero.
        fraction = match / total if total else 0.0
        if not mute: print(fraction)
        if not mute: print()
        match_fraction.append(fraction)

    maximum = max(match_fraction)
    best_index = match_fraction.index(maximum)  # first hit => shortest length
    if not mute: print (maximum)
    if not mute: print (best_index)
    return repeat_ranges[best_index]


In [5]:
# Locate the predicted structures written by each method.
alpha_fold_pdbs = glob.glob('your_path_to_alphafold_output/*pdb')
rosetta_fold_pdbs = glob.glob('your_path_to_rosettafold_output/*1.pdb')

# File-name endings stripped to recover the bare target name. They are
# concatenated into a regular expression below, so they act as regex fragments.
alpha_fold_suffix = r'.pdb'
rosetta_fold_suffix = r'_model_1.pdb'

# Targets predicted by both methods / by AlphaFold only.
both = []
missing = []

# Target name -> file path, per method. Names are the file's base name with
# the method-specific suffix removed.
alpha_fold_names = [
    re.sub(r'(.*)' + alpha_fold_suffix, r'\1', pdb.split('/')[-1])
    for pdb in alpha_fold_pdbs
]
alpha_fold_dict = dict(zip(alpha_fold_names, alpha_fold_pdbs))
rosetta_fold_names = [
    re.sub(r'(.*)' + rosetta_fold_suffix, r'\1', pdb.split('/')[-1])
    for pdb in rosetta_fold_pdbs
]
rosetta_fold_dict = dict(zip(rosetta_fold_names, rosetta_fold_pdbs))

# Sequence of every AlphaFold model, read from its own file.
sequences = {
    name: GetFasta(alpha_fold_dict[name])[1]
    for name in alpha_fold_names
}

# Sort each AlphaFold target by whether a RoseTTAFold model with the same
# name exists.
for name in alpha_fold_names:
    (both if name in rosetta_fold_names else missing).append(name)


In [6]:
superimposer = Bio.PDB.Superimposer()

# Results keyed by target name: RMS over the whole sequence, and the lowest
# RMS found over any window of `align_rep_num` consecutive repeats.
full_rms = {}
best_region_rms = {}

# Number of repeats spanned by each alignment window.
align_rep_num = 4

for name in both:
    seq = sequences[name]
    seq_len = len(seq)
    rep_len = determine_repeat_length(seq)
    align_len = rep_len * align_rep_num

    # Whole-sequence superposition of the two models (residues 1..seq_len).
    alpha_atoms, alpha_model = bio_get_atoms(alpha_fold_dict[name], 1, seq_len)
    rosetta_atoms, rosetta_model = bio_get_atoms(rosetta_fold_dict[name], 1, seq_len)
    superimposer.set_atoms(alpha_atoms, rosetta_atoms)
    superimposer.apply(rosetta_model.get_atoms())
    full_rms[name] = superimposer.rms

    # Slide a window of `align_len` residues along the sequence one repeat at
    # a time and keep the lowest RMS. 999 is the starting sentinel and is what
    # gets stored when the sequence is too short for even one window.
    min_rms = 999
    for rep in range(int((seq_len - align_len + rep_len) / rep_len)):
        start = rep * rep_len + 1
        end = start + align_len - 1
        # Both files are re-read for every window.
        alpha_atoms, alpha_model = bio_get_atoms(alpha_fold_dict[name], start, end)
        rosetta_atoms, rosetta_model = bio_get_atoms(rosetta_fold_dict[name], start, end)
        superimposer.set_atoms(alpha_atoms, rosetta_atoms)
        superimposer.apply(rosetta_model.get_atoms())
        min_rms = min(min_rms, superimposer.rms)

    best_region_rms[name] = min_rms


In [1]:
# One line per target: name, whole-sequence RMS, best-window RMS.
for name in both:
    print(' '.join(map(str, (name, full_rms[name], best_region_rms[name]))))