In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from Bio import SeqIO
from rdkit import Chem
from rdkit.Chem import AllChem
from IPython.display import display

# When the kernel is started from the `notebook/PG` folder, move two levels up before
# the project-local `src...` import that follows (presumably the repository root, so
# that `src` is importable - confirm against the repo layout).
# Trailing path *components* are compared instead of a raw string suffix so the check
# is independent of the OS path separator and does not match e.g. `my_notebook/PG`.
cwd = os.getcwd()
if Path(cwd).parts[-2:] == ('notebook', 'PG'):
    os.chdir('../..')
    cwd = os.getcwd()

from src.pg_modelling.ligand_utils import generate_ccd_from_smiles, sanitize_ligand_name, generate_conformation

In [2]:
# Root folder for every input/output file used in this notebook, located under the
# current user's home directory.
data_folder = Path.home() / 'Documents' / 'PG_modelling'
# Fail early, and say which path was expected, if the data folder is not available.
assert data_folder.is_dir(), f'Data folder not found: {data_folder}'

## Loading PG ligands

In [3]:
pg_df = pd.read_csv(data_folder / 'S_zooepidemicus' / 'S_zooepidemicus_PG_1mer.csv')

# Fragments to keep, spelled exactly as they appear in the 'Name' column.
pg_names_of_interest = [
    '(NAG)(NAM)-AqKAA[3-NH2-AAA]',
    '(NAG)(NAM)-AqKAA[3-NH2-AA]',
    '(NAG)(NAM)-AqKAA[3-NH2-A]',
    '(NAG)(NAM)-AqKAA',
]

# Select the fragments, derive a sanitized name for each one and use it as the index.
# Rows keep the order in which they appear in the CSV, not the order of the list above.
pg_subset = (
    pg_df[pg_df['Name'].isin(pg_names_of_interest)]
    .reset_index(drop=True)
    .assign(ligand_name=lambda df: df['Name'].apply(sanitize_ligand_name))
    .set_index('ligand_name')
)

# Guard against a silently incomplete subset (typo in a name, changed CSV).
missing_names = sorted(set(pg_names_of_interest) - set(pg_subset['Name']))
assert not missing_names, f'Fragments not found in CSV: {missing_names}'
# `ligand_name` is the index and identifies each fragment, so it must be unique.
assert pg_subset.index.is_unique, 'Sanitized ligand names are not unique'

print(f'Number of fragments in subset: {len(pg_subset):,}')
pg_subset.to_csv(data_folder / 'S_zooepidemicus' / 'S_zooepidemicus_PG_subset.csv')
pg_subset

Number of fragments in subset: 4


Unnamed: 0_level_0,Name,Synonym,Formula,Monoisotopic Mass,Modifications,Degree Amidation,Degree Acetylation,Ontology,PGN Units,Glycan Units,...,Peptide,SMILES,INCHIKEY,clogP,RT,[M+H]+,[M+Na]+,[M+K]+,[M+2H]2+,[M+3H]3+
ligand_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NAG-NAM-AqKAA-3-NH2-AAA,(NAG)(NAM)-AqKAA[3-NH2-AAA],GlcNAc.MurNAc--Ala.γ-isoGln.Lys.Ala.Ala[3--NH2...,C48H82N12O22,1178.56666,,1,2,G2--P5--S3,1,2,...,AqKAA[3-NH2-AAA],CC(=O)NC1C(OC2C(CO)OC(O)C(NC(C)=O)C2OC(C)C(=O)...,ZOSAGBPSZUYFTI-UHFFFAOYSA-N,-9.2199,0,1179.57394,1201.55588,1217.52982,590.29061,393.86283
NAG-NAM-AqKAA-3-NH2-AA,(NAG)(NAM)-AqKAA[3-NH2-AA],GlcNAc.MurNAc--Ala.γ-isoGln.Lys.Ala.Ala[3--NH2...,C45H77N11O21,1107.52955,,1,2,G2--P5--S2,1,2,...,AqKAA[3-NH2-AA],CC(=O)NC1C(OC2C(CO)OC(O)C(NC(C)=O)C2OC(C)C(=O)...,DSWIWDCFYFDFQG-UHFFFAOYSA-N,-8.7246,0,1108.53683,1130.51877,1146.49271,554.77206,370.1838
NAG-NAM-AqKAA-3-NH2-A,(NAG)(NAM)-AqKAA[3-NH2-A],GlcNAc.MurNAc--Ala.γ-isoGln.Lys.Ala.Ala[3--NH2...,C42H72N10O20,1036.49243,,1,2,G2--P5--S1,1,2,...,AqKAA[3-NH2-A],CC(=O)NC1C(OC2C(CO)OC(O)C(NC(C)=O)C2OC(C)C(=O)...,RFFQXYNMAAKXPS-UHFFFAOYSA-N,-8.2293,0,1037.49971,1059.48165,1075.45559,519.2535,346.50476
NAG-NAM-AqKAA,(NAG)(NAM)-AqKAA,GlcNAc.MurNAc--Ala.γ-isoGln.Lys.Ala.Ala,C39H67N9O19,965.45532,,1,2,G2--P5,1,2,...,AqKAA,CC(=O)NC1C(OC2C(CO)OC(O)C(NC(C)=O)C2OC(C)C(=O)...,MYHLLWFYGIKMKM-UHFFFAOYSA-N,-7.734,0,966.4626,988.44454,1004.41848,483.73494,322.82572


In [6]:
# Output folder receiving one PDB file per ligand of the subset.
ligand_folder = data_folder / 'S_zooepidemicus' / 'PG_ligands_pdb'
ligand_folder.mkdir(exist_ok=True)

# NOTE(review): RDKit logs "Molecule does not have explicit Hs. Consider calling AddHs()"
# while this loop runs; presumably generate_conformation works on the molecule without
# explicit hydrogens - confirm whether Hs should be added before generating coordinates.
for ligand_name, row in pg_subset.iterrows():
    raw_name = row['Name']
    smiles = row['SMILES']

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parsing failure by returning None instead of raising;
        # stop here with the offending ligand named rather than failing further down.
        raise ValueError(f'Could not parse SMILES for ligand: {raw_name}')

    try:
        mol = generate_conformation(mol)
    except ValueError:
        # Report which ligand failed, then propagate the original error.
        print(f'Error for ligand: {raw_name}')
        raise

    # The file is named after the sanitized ligand name (the DataFrame index).
    output_path = ligand_folder / f'{ligand_name}.pdb'
    Chem.MolToPDBFile(mol, output_path.as_posix())

[16:14:23] Molecule does not have explicit Hs. Consider calling AddHs()
[16:14:23] Molecule does not have explicit Hs. Consider calling AddHs()
[16:14:23] Molecule does not have explicit Hs. Consider calling AddHs()
[16:14:24] Molecule does not have explicit Hs. Consider calling AddHs()
[16:14:24] Molecule does not have explicit Hs. Consider calling AddHs()
[16:14:24] Molecule does not have explicit Hs. Consider calling AddHs()
[16:14:24] Molecule does not have explicit Hs. Consider calling AddHs()
[16:14:24] Molecule does not have explicit Hs. Consider calling AddHs()
