In [None]:
# Mount Google Drive at /content/drive so that later cells can read from and
# write to paths underneath it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import shutil

# TODO: point `directory` at the folder that holds the zipped AlphaFold result files
directory = '/path/to/alphafold/results'

# Extract every .zip archive found directly inside `directory`, unpacking each
# one back into that same folder.
zip_names = [entry for entry in os.listdir(directory) if entry.endswith(".zip")]
for zip_name in zip_names:
    shutil.unpack_archive(os.path.join(directory, zip_name), directory)

In [None]:
!pip install biopython

In [None]:
import os
import numpy as np
from Bio import pairwise2
from Bio.PDB import PDBParser, Superimposer, PDBIO

#reference pdb file - DOES NOT CHANGE
reference_pdb_file = "/content/drive/MyDrive/ML_Project/pdbs/avg_2beg.pdb"
#results are written to the folder that contains the reference pdb file
output_dir = os.path.dirname(reference_pdb_file)

#collect every .pdb file under `directory` (searched recursively) whose file
#name contains "rank_001". os.walk yields entries in an arbitrary,
#filesystem-dependent order, so the paths are sorted to keep the processing
#order - and therefore the order of the reported RMSD rows - reproducible.
pdb_files = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(directory)
    for file in files
    if "rank_001" in file and file.endswith(".pdb")
)

#create the PDB parser (QUIET=True suppresses parser warnings)
parser = PDBParser(QUIET=True)

#parse the reference structure once up front
reference_structure = parser.get_structure("reference", reference_pdb_file)

#reference CA atoms: atoms named "CA" in residues numbered 17-42 (inclusive),
#gathered across every model and chain in iteration order
reference_atoms = [
    atom
    for model in reference_structure
    for chain in model
    for residue in chain
    if 17 <= residue.id[1] <= 42
    for atom in residue
    if atom.name == "CA"
]

#loop over the pdb files and calculate the RMSD against the reference CA atoms
rmsd_values = []
for pdb_file in pdb_files:
    #load the structure
    structure = parser.get_structure("structure", pdb_file)

    #select the CA atoms of residues whose number, taken modulo 48, is in 17-42
    #NOTE(review): the modulo presumably folds a residue numbering that
    #continues across chains back onto a per-chain range - confirm that 48 is
    #the right period for these files
    atoms = [
        atom
        for model in structure
        for chain in model
        for residue in chain
        if 17 <= residue.id[1] % 48 <= 42
        for atom in residue
        if atom.name == "CA"
    ]

    #the superimposer needs equally long atom lists; check up front so that a
    #mismatch reports which file is at fault instead of a generic library error
    if len(atoms) != len(reference_atoms):
        raise ValueError(
            "Cannot superimpose {}: selected {} CA atoms but the reference has {}".format(
                pdb_file, len(atoms), len(reference_atoms)
            )
        )

    #superimpose the structure onto the reference structure
    superimposer = Superimposer()
    superimposer.set_atoms(reference_atoms, atoms)
    #move the selected atoms onto the reference frame (in memory only)
    superimposer.apply(atoms)

    #RMSD reported by the superimposer for this fit
    rmsd = superimposer.rms
    #label the result with the first six characters of the file name
    pdb_name = os.path.basename(pdb_file)[:6]
    rmsd_values.append((pdb_name, rmsd))

#save one tab-separated "name<TAB>rmsd" line per entry of rmsd_values, in the
#same directory as the reference PDB file
# TODO: CHANGE FILE OUTPUT NAME
output_file = os.path.join(output_dir, 'output_file_name.txt')
row_template = '{}\t{}\n'
with open(output_file, 'w') as f:
    f.writelines(row_template.format(name, value) for name, value in rmsd_values)

In [None]:
#sanity check: rmsd_values holds (name, rmsd) pairs, so a non-empty run should
#report a shape of (number of structures processed, 2)
rmsd_shape = np.shape(rmsd_values)
print(rmsd_shape)