In [15]:
import json
import os
from pathlib import Path

import pandas as pd
from Bio import SeqIO
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from IPython.display import display

# Make relative paths behave the same wherever the kernel was started: when the
# working directory is the notebook's own folder (<...>/notebook/PG), move two
# levels up (presumably to the project root -- confirm against the repo layout).
cwd = os.getcwd()
# Compare path components instead of the raw string so the check also works
# with Windows separators and does not match look-alike folders such as
# 'old_notebook/PG'.
if Path(cwd).parts[-2:] == ('notebook', 'PG'):
    os.chdir('../..')
    cwd = os.getcwd()

In [2]:
# Root folder (under the user's home directory) that the rest of the notebook
# reads its inputs and results from.
data_folder = Path(os.path.expanduser('~'), 'Documents', 'PG_modelling')
# Stop right away if the folder is not available on this machine.
assert data_folder.is_dir()

## Load data

In [5]:
# One results folder per lysostaphin domain (SH3b and M23), both under S_aureus.
sh3b_results_folder = data_folder.joinpath('S_aureus', 'Lysostaphin_SH3b__PG')
m23_results_folder = data_folder.joinpath('S_aureus', 'Lysostaphin_M23__PG')

# Both folders have to exist before any result is read.
assert all(
    folder.is_dir()
    for folder in (sh3b_results_folder, m23_results_folder)
)

In [10]:
# Table of S. aureus peptidoglycan fragments, one row per fragment; its
# `ligand_name` column is used further down to label the results.
pg_subset = pd.read_csv(data_folder.joinpath('S_aureus', 'S_aureus_pg_subset.csv'))
print(f'Number of fragments: {len(pg_subset):,}')
# Preview the first rows as the cell output.
pg_subset.head()

Number of fragments: 61


Unnamed: 0,Name,Synonym,Formula,Monoisotopic Mass,Modifications,Degree Amidation,Degree Acetylation,Ontology,PGN Units,Glycan Units,...,SMILES,INCHIKEY,clogP,RT,[M+H]+,[M+Na]+,[M+K]+,[M+2H]2+,[M+3H]3+,ligand_name
0,None-K[3-NH2-GGGGA],None--Lys[3--NH2--Gly.Gly.Gly.Gly.Ala],C17H31N7O7,445.2285,EPase P2,0,0,P3--S5,1,0,...,CC(N)C(=O)NCC(=O)NCC(=O)NCC(=O)NCC(=O)NCCCCC(N...,SZYLKPZPDXMNKU-UHFFFAOYSA-N,-4.5032,0,446.23578,468.21772,484.19166,223.62153,149.41678,K-3-NH2-GGGGA
1,None-K[3-NH2-GGGAG],None--Lys[3--NH2--Gly.Gly.Gly.Ala.Gly],C17H31N7O7,445.2285,EPase P2,0,0,P3--S5,1,0,...,CC(NC(=O)CN)C(=O)NCC(=O)NCC(=O)NCC(=O)NCCCCC(N...,SOOQFNIYIPGCLP-UHFFFAOYSA-N,-4.5032,0,446.23578,468.21772,484.19166,223.62153,149.41678,K-3-NH2-GGGAG
2,None-K[3-NH2-GGAGG],None--Lys[3--NH2--Gly.Gly.Ala.Gly.Gly],C17H31N7O7,445.2285,EPase P2,0,0,P3--S5,1,0,...,CC(NC(=O)CNC(=O)CN)C(=O)NCC(=O)NCC(=O)NCCCCC(N...,QTPFCZZQBOJGEK-UHFFFAOYSA-N,-4.5032,0,446.23578,468.21772,484.19166,223.62153,149.41678,K-3-NH2-GGAGG
3,None-K[3-NH2-GAGGG],None--Lys[3--NH2--Gly.Ala.Gly.Gly.Gly],C17H31N7O7,445.2285,EPase P2,0,0,P3--S5,1,0,...,CC(NC(=O)CNC(=O)CNC(=O)CN)C(=O)NCC(=O)NCCCCC(N...,TVCZZGTWYXQCDJ-UHFFFAOYSA-N,-4.5032,0,446.23578,468.21772,484.19166,223.62153,149.41678,K-3-NH2-GAGGG
4,None-K[3-NH2-AGGGG],None--Lys[3--NH2--Ala.Gly.Gly.Gly.Gly],C17H31N7O7,445.2285,EPase P2,0,0,P3--S5,1,0,...,CC(NC(=O)CNC(=O)CNC(=O)CNC(=O)CN)C(=O)NCCCCC(N...,QZVBVXIUIJHCOE-UHFFFAOYSA-N,-4.5032,0,446.23578,468.21772,484.19166,223.62153,149.41678,K-3-NH2-AGGGG


In [17]:
# Every fragment must have its own ligand name: the names are used as keys below.
ligand_names = set(pg_subset['ligand_name'])
assert len(ligand_names) == len(pg_subset), 'duplicate ligand names in pg_subset'

# Lower-cased name -> name with its original casing. The result folders are
# looked up by lower-cased ligand name, so this map restores the casing.
lower_case_name_map = {
    name.lower(): name
    for name in ligand_names
}
# Two names that differ only by case would silently overwrite each other in the
# map and make the reverse lookup ambiguous -- fail loudly instead.
assert len(lower_case_name_map) == len(ligand_names), (
    'ligand names are not unique once lower-cased'
)

In [38]:
# Gather the pTM / ipTM scores of every predicted complex into one table.
# `data` is column-oriented: each key becomes a column of `results_df`.
data = {
    'protein_domain': [],
    'ligand_name': [],
    'folder': [],
    'ptm': [],
    'iptm': [],
    'confidence': [],
}
# (results folder, prefix shared by the per-ligand sub-folders of that domain)
domains_data = [
    (sh3b_results_folder, 'lysostaphin_sh3b'),
    (m23_results_folder, 'lysostaphin_m23'),
]
for result_folder, protein_domain in domains_data:
    # Per-ligand folders are named '<protein_domain>__<ligand name, lower case>'.
    folder_prefix = f'{protein_domain}__'
    # Sorted listing: the row order must not depend on the file system.
    for ligand_folder in sorted(result_folder.iterdir()):
        ligand_folder_name = ligand_folder.name
        # Skip stray files and entries that do not follow the naming scheme;
        # without this guard they fail on the split or on opening the JSON.
        if not (ligand_folder.is_dir() and ligand_folder_name.startswith(folder_prefix)):
            continue

        ligand_name_lower = ligand_folder_name.split('__')[1]
        # Restore the original casing; ligands absent from the subset table
        # fall back to their upper-cased folder name.
        ligand_name = lower_case_name_map.get(ligand_name_lower, ligand_name_lower.upper())

        # NOTE(review): file layout and keys look like AlphaFold 3 summary
        # confidences -- confirm against the prediction pipeline.
        with (ligand_folder / f'{ligand_folder_name}_summary_confidences.json').open() as f:
            scores = json.load(f)

        ptm = scores['ptm']
        iptm = scores['iptm']
        # Ranking score: interface confidence weighted 4:1 over global confidence.
        confidence = 0.8 * iptm + 0.2 * ptm

        data['protein_domain'].append(protein_domain)
        data['ligand_name'].append(ligand_name)
        data['folder'].append(ligand_folder_name)
        data['ptm'].append(ptm)
        data['iptm'].append(iptm)
        data['confidence'].append(confidence)


# Best-scoring complexes first. A stable sort keeps tied scores in the
# (sorted) collection order, so the displayed ranking is reproducible.
results_df = pd.DataFrame.from_dict(data).sort_values(
    'confidence',
    ascending=False,
    kind='mergesort',
).set_index([
    'protein_domain',
    'ligand_name',
])
results_df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,folder,ptm,iptm,confidence
protein_domain,ligand_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
lysostaphin_sh3b,AqKA,lysostaphin_sh3b__aqka,0.92,0.91,0.912
lysostaphin_sh3b,AqKA-3-NH2-GGG,lysostaphin_sh3b__aqka-3-nh2-ggg,0.91,0.91,0.91
lysostaphin_sh3b,AqKAA,lysostaphin_sh3b__aqkaa,0.91,0.91,0.91
lysostaphin_sh3b,AqKAA-3-NH2-GG,lysostaphin_sh3b__aqkaa-3-nh2-gg,0.91,0.9,0.902
lysostaphin_sh3b,AqKA-3-NH2-GG,lysostaphin_sh3b__aqka-3-nh2-gg,0.9,0.9,0.9
lysostaphin_sh3b,AqKAA-3-NH2-GGG,lysostaphin_sh3b__aqkaa-3-nh2-ggg,0.9,0.88,0.884
lysostaphin_sh3b,AqKAA-3-NH2-G,lysostaphin_sh3b__aqkaa-3-nh2-g,0.9,0.88,0.884
lysostaphin_m23,G,lysostaphin_m23__g,0.85,0.89,0.882
lysostaphin_sh3b,AqKA-3-NH2-SGGGG,lysostaphin_sh3b__aqka-3-nh2-sgggg,0.9,0.87,0.876
lysostaphin_sh3b,AqKA-3-NH2-GGGGG,lysostaphin_sh3b__aqka-3-nh2-ggggg,0.9,0.87,0.876


In [39]:
# Full score row of one (protein_domain, ligand_name) pair; the row key and the
# column selection are spelled out separately.
results_df.loc[('lysostaphin_sh3b', 'AqKA-3-NH2-GGGGG'), :]

folder        lysostaphin_sh3b__aqka-3-nh2-ggggg
ptm                                          0.9
iptm                                        0.87
confidence                                 0.876
Name: (lysostaphin_sh3b, AqKA-3-NH2-GGGGG), dtype: object

In [36]:
# Confidence of one (protein_domain, ligand_name) pair, rounded to two decimals;
# row key and column are selected in a single .loc call.
results_df.loc[('lysostaphin_sh3b', 'AqKA-3-NH2-GGSGG'), 'confidence'].round(2)

0.87