Here we use the cyclic peptides from the paper "Crystal Structures of Protein-Bound Cyclic Peptides".
We evaluate several dimensionality-reduction algorithms (PCA, t-SNE and UMAP).

In [2]:
# Report which Python interpreter this kernel is running on.
from platform import python_version
print(python_version())

# Upgrade the `notebook` package into the per-user site-packages (--user).
# pip's output notes that the kernel may need a restart before updated packages are used.
%pip install --upgrade --user notebook

3.10.7
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Install third-party packages into the per-user site-packages (--user);
# biopython, peptides, numpy and pandas are imported by later cells.
%pip install --user scikit-learn peptides biopython numpy pandas

Note: you may need to restart the kernel to use updated packages.


In [4]:
### Download structures from the PDB and collect the residue names of every short chain

import os
import time
import urllib
# `import urllib` alone does not guarantee that the `request` submodule is loaded;
# import it explicitly so `urllib.request.urlretrieve` works on a fresh kernel.
import urllib.request
import itertools
import tempfile

import numpy as np
from Bio.PDB.PDBParser import PDBParser

from typing import List, Set, Dict
from collections import defaultdict

# One entry per short chain: its residue names joined with "-" (e.g. "GLY-ALA-CYS").
sequences: List[str] = []

downloadurl: str = "https://files.rcsb.org/download/{0}.pdb"
parser = PDBParser()

# Only chains with fewer residues than this are recorded.
MAX_CHAIN_LENGTH: int = 30
# Residue names dropped from a chain before it is measured and recorded
# (HOH = water molecules, NH2 = amino group).
EXCLUDED_RESNAMES = ("HOH", "NH2")

# data/pdbs.txt is read one PDB id per line; blank lines are ignored.
with open(os.path.join(os.getcwd(), "data/pdbs.txt"), "r") as pdbs:
    # Download into a throw-away directory: it and the file inside it are removed
    # when the `with` block exits, so no temporary files are left behind.
    with tempfile.TemporaryDirectory() as download_dir:
        # The same path is reused (overwritten) for every entry.
        pdb_path: str = os.path.join(download_dir, "structure.pdb")

        for pdb in pdbs:
            pdb = pdb.strip()
            if pdb == "":
                continue

            url: str = downloadurl.format(pdb)
            urllib.request.urlretrieve(url, pdb_path)

            structure = parser.get_structure('XXXX', pdb_path)
            for model in structure:
                # NOTE(review): every model and every chain is recorded, so identical
                # peptides may appear more than once — confirm whether de-duplication
                # is wanted.
                for chain in model:
                    resnames: List[str] = [
                        residue.resname
                        for residue in chain.get_residues()
                        if residue.resname not in EXCLUDED_RESNAMES
                    ]
                    # Measure the chain on the residues that are actually kept, and
                    # skip chains that are empty once the excluded residues are removed.
                    if 0 < len(resnames) < MAX_CHAIN_LENGTH:
                        sequences.append("-".join(resnames))




In [None]:
# Lookup table: three-letter residue name -> one-letter amino-acid code
# (20 entries; any residue name not listed here is absent from the table).
d3to1: Dict[str, str] = {
    'CYS': 'C',
    'ASP': 'D',
    'SER': 'S',
    'GLN': 'Q',
    'LYS': 'K',
    'ILE': 'I',
    'PRO': 'P',
    'THR': 'T',
    'PHE': 'F',
    'ASN': 'N',
    'GLY': 'G',
    'HIS': 'H',
    'LEU': 'L',
    'ARG': 'R',
    'TRP': 'W',
    'ALA': 'A',
    'VAL': 'V',
    'GLU': 'E',
    'TYR': 'Y',
    'MET': 'M',
}

In [None]:

import random
import peptides
import pandas as pd

# Seed the global RNG so the placeholder letters picked below are reproducible.
random.seed(42)

# One-letter-code string for every usable entry of `sequences`.
seqs: List[str] = []

for sequence in sequences:
    letters: List[str] = []
    for resname in sequence.split('-'):
        if not resname:
            # "".split('-') yields [""]: an empty token is not a residue and must
            # not be turned into a placeholder letter.
            continue
        if resname in d3to1:
            letters.append(d3to1[resname])
        else:
            # Residue names missing from `d3to1` are replaced by a randomly
            # chosen placeholder letter, "O" or "U".
            letters.append(random.choice(["O", "U"]))

    # Join once at the end instead of rebuilding the string for every residue.
    seq: str = "".join(letters)
    if seq:
        # Entries that contained no residues at all are skipped.
        seqs.append(seq)

In [None]:
# One row per peptide in `seqs`, built from the result of `Peptide.descriptors()`.
df = pd.DataFrame([peptides.Peptide(s).descriptors() for s in seqs])
# A bare last expression lets the notebook render the preview as a rich table
# instead of the plain-text dump produced by print().
df.head(10)
