In [6]:
import json
import os
from pathlib import Path

import pandas as pd
from Bio import SeqIO
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from IPython.display import display

# When the kernel is started from <repo>/notebook/PG, move two levels up so the
# `src.*` imports in this cell resolve.
# Path components are compared (rather than a '/'-joined string suffix) so the check
# does not depend on the OS path separator and cannot match e.g. 'old_notebook/PG'.
cwd = os.getcwd()
if Path(cwd).parts[-2:] == ('notebook', 'PG'):
    os.chdir('../..')
    cwd = os.getcwd()

from src.pg_modelling.af3.scoring import process_af3_ligand_pulldown_results
from src.pg_modelling.protenix.scoring import process_protenix_ligand_pulldown_results

In [2]:
# Root folder of the modelling data read by the cells below.
data_folder = Path(os.path.expanduser('~')) / 'Documents' / 'PG_modelling'
# Fail early, and say which path was looked up, instead of a bare AssertionError.
assert data_folder.is_dir(), f'Data folder not found: {data_folder}'

# AF3 results

## Load data

In [3]:
# AF3 output folders, one per lysostaphin domain (SH3b and M23).
sh3b_results_folder = data_folder.joinpath('S_aureus', 'Lysostaphin_SH3b__PG')
m23_results_folder = data_folder.joinpath('S_aureus', 'Lysostaphin_M23__PG')

# Both folders must exist before any result is parsed.
assert all(folder.is_dir() for folder in (sh3b_results_folder, m23_results_folder))

In [4]:
# PG fragment table: one row per fragment (the row count is reported below).
pg_subset_path = data_folder.joinpath('S_aureus', 'S_aureus_pg_subset.csv')
pg_subset = pd.read_csv(pg_subset_path)
print(f'Number of fragments: {len(pg_subset):,}')
pg_subset.head()

Number of fragments: 76


Unnamed: 0,Name,Synonym,Formula,Monoisotopic Mass,Modifications,Degree Amidation,Degree Acetylation,Ontology,PGN Units,Glycan Units,...,SMILES,INCHIKEY,clogP,RT,[M+H]+,[M+Na]+,[M+K]+,[M+2H]2+,[M+3H]3+,ligand_name
0,None-K[3-NH2-GGGGA],None--Lys[3--NH2--Gly.Gly.Gly.Gly.Ala],C17H31N7O7,445.2285,EPase P2,0,0,P3--S5,1,0,...,CC(N)C(=O)NCC(=O)NCC(=O)NCC(=O)NCC(=O)NCCCCC(N...,SZYLKPZPDXMNKU-UHFFFAOYSA-N,-4.5032,0.0,446.23578,468.21772,484.19166,223.62153,149.41678,K-3-NH2-GGGGA
1,None-K[3-NH2-GGGAG],None--Lys[3--NH2--Gly.Gly.Gly.Ala.Gly],C17H31N7O7,445.2285,EPase P2,0,0,P3--S5,1,0,...,CC(NC(=O)CN)C(=O)NCC(=O)NCC(=O)NCC(=O)NCCCCC(N...,SOOQFNIYIPGCLP-UHFFFAOYSA-N,-4.5032,0.0,446.23578,468.21772,484.19166,223.62153,149.41678,K-3-NH2-GGGAG
2,None-K[3-NH2-GGAGG],None--Lys[3--NH2--Gly.Gly.Ala.Gly.Gly],C17H31N7O7,445.2285,EPase P2,0,0,P3--S5,1,0,...,CC(NC(=O)CNC(=O)CN)C(=O)NCC(=O)NCC(=O)NCCCCC(N...,QTPFCZZQBOJGEK-UHFFFAOYSA-N,-4.5032,0.0,446.23578,468.21772,484.19166,223.62153,149.41678,K-3-NH2-GGAGG
3,None-K[3-NH2-GAGGG],None--Lys[3--NH2--Gly.Ala.Gly.Gly.Gly],C17H31N7O7,445.2285,EPase P2,0,0,P3--S5,1,0,...,CC(NC(=O)CNC(=O)CNC(=O)CN)C(=O)NCC(=O)NCCCCC(N...,TVCZZGTWYXQCDJ-UHFFFAOYSA-N,-4.5032,0.0,446.23578,468.21772,484.19166,223.62153,149.41678,K-3-NH2-GAGGG
4,None-K[3-NH2-AGGGG],None--Lys[3--NH2--Ala.Gly.Gly.Gly.Gly],C17H31N7O7,445.2285,EPase P2,0,0,P3--S5,1,0,...,CC(NC(=O)CNC(=O)CNC(=O)CNC(=O)CN)C(=O)NCCCCC(N...,QZVBVXIUIJHCOE-UHFFFAOYSA-N,-4.5032,0.0,446.23578,468.21772,484.19166,223.62153,149.41678,K-3-NH2-AGGGG


In [5]:
# (protein name, AF3 results folder) pairs to score; the per-domain tables are
# stacked into a single frame in the order listed here.
af3_runs = [
    ('lysostaphin_sh3b', sh3b_results_folder),
    ('lysostaphin_m23', m23_results_folder),
]
results_df = pd.concat([
    process_af3_ligand_pulldown_results(protein_name, results_folder, run_posebusters=True)
    for protein_name, results_folder in af3_runs
])
results_df

Unnamed: 0_level_0,Unnamed: 1_level_0,folder,structure_file,ptm,iptm,confidence,posebusters_score,posebusters_errors
protein_name,ligand_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
lysostaphin_sh3b,LIG-PG-AqKA,lysostaphin_sh3b__aqka,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.92,0.91,0.912,20,
lysostaphin_sh3b,LIG-PG-AqKA-3-NH2-GGG,lysostaphin_sh3b__aqka-3-nh2-ggg,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.91,0.91,0.910,20,
lysostaphin_sh3b,LIG-PG-AqKAA,lysostaphin_sh3b__aqkaa,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.91,0.91,0.910,20,
lysostaphin_sh3b,LIG-PG-AqKAA-3-NH2-GG,lysostaphin_sh3b__aqkaa-3-nh2-gg,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.91,0.90,0.902,20,
lysostaphin_sh3b,LIG-PG-AqKA-3-NH2-GG,lysostaphin_sh3b__aqka-3-nh2-gg,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.90,0.90,0.900,20,
...,...,...,...,...,...,...,...,...
lysostaphin_m23,LIG-PG-AqKAA-3-NH2-GGGGA,lysostaphin_m23__aqkaa-3-nh2-gggga,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.71,0.52,0.558,20,
lysostaphin_m23,LIG-PG-AqKAA-3-NH2-SGGGG,lysostaphin_m23__aqkaa-3-nh2-sgggg,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.79,0.65,0.678,19,minimum_distance_to_protein
lysostaphin_m23,LIG-PG-GSGGG,lysostaphin_m23__gsggg,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.76,0.61,0.640,19,minimum_distance_to_protein
lysostaphin_m23,LIG-PG-AqKAA-3-NH2-GGG,lysostaphin_m23__aqkaa-3-nh2-ggg,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.76,0.60,0.632,19,minimum_distance_to_protein


In [9]:
# First 20 rows of the SH3b slice of the combined AF3 results table.
results_df.loc['lysostaphin_sh3b'].iloc[:20]

Unnamed: 0_level_0,folder,structure_file,ptm,iptm,confidence,posebusters_score,posebusters_errors
ligand_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LIG-PG-AqKA,lysostaphin_sh3b__aqka,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.92,0.91,0.912,20,
LIG-PG-AqKA-3-NH2-GGG,lysostaphin_sh3b__aqka-3-nh2-ggg,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.91,0.91,0.91,20,
LIG-PG-AqKAA,lysostaphin_sh3b__aqkaa,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.91,0.91,0.91,20,
LIG-PG-AqKAA-3-NH2-GG,lysostaphin_sh3b__aqkaa-3-nh2-gg,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.91,0.9,0.902,20,
LIG-PG-AqKA-3-NH2-GG,lysostaphin_sh3b__aqka-3-nh2-gg,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.9,0.9,0.9,20,
LIG-PG-AqKAA-3-NH2-GGG,lysostaphin_sh3b__aqkaa-3-nh2-ggg,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.9,0.88,0.884,20,
LIG-PG-AqKAA-3-NH2-G,lysostaphin_sh3b__aqkaa-3-nh2-g,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.9,0.88,0.884,20,
LIG-PG-AqKA-3-NH2-GGGGG,lysostaphin_sh3b__aqka-3-nh2-ggggg,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.9,0.87,0.876,20,
LIG-PG-AqKA-3-NH2-SGGGG,lysostaphin_sh3b__aqka-3-nh2-sgggg,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.9,0.87,0.876,20,
LIG-PG-AqKA-3-NH2-G,lysostaphin_sh3b__aqka-3-nh2-g,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.89,0.87,0.874,20,


### Subset

Only the handful of instances we care most about:
- naked stem
- stem + GGGGG bridge
- stem + GGAGG bridge
- stem + GGSGG bridge
- naked bridge GGGGG
- naked bridge GGAGG
- naked bridge GGSGG

In [4]:
# AF3 run restricted to the subset of ligands listed above.
# NOTE(review): `score_all_sample` is forwarded unchanged; presumably it makes the
# helper score every sample instead of only the top-ranked one — confirm in
# src.pg_modelling.af3.scoring.
sh3b_results_folder_subset = data_folder.joinpath('S_aureus', 'Lysostaphin_SH3b__PG_subset')
results_subset = process_af3_ligand_pulldown_results(
    'lysostaphin_sh3b',
    sh3b_results_folder_subset,
    run_posebusters=True,
    score_all_sample=True,
)
results_subset

Unnamed: 0_level_0,Unnamed: 1_level_0,structure_file,ptm,iptm,confidence,posebusters_score,posebusters_errors
protein_name,ligand_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
lysostaphin_sh3b,LIG-PG-AqKAA,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.91,0.91,0.91,20,
lysostaphin_sh3b,LIG-PG-AqKAA-3-NH2-GGAGG,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.88,0.84,0.848,20,
lysostaphin_sh3b,LIG-PG-AqKAA-3-NH2-GGGGG,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.88,0.84,0.848,20,
lysostaphin_sh3b,LIG-PG-GGAGG,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.87,0.83,0.838,20,
lysostaphin_sh3b,LIG-PG-AqKAA-3-NH2-GGSGG,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.87,0.83,0.838,20,
lysostaphin_sh3b,LIG-PG-GGSGG,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.87,0.8,0.814,20,
lysostaphin_sh3b,LIG-PG-GGGGG,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.86,0.78,0.796,20,


# Protenix results

In [63]:
# Protenix output folder for the SH3b domain; must exist before it is walked.
protenix_sh3b_folder = data_folder.joinpath('S_aureus', 'protenix__Lysostaphin_SH3b__PG')
assert protenix_sh3b_folder.is_dir()

In [67]:
# Weights combining the two confidence metrics into one ranking score:
# confidence = 0.8 * ipTM + 0.2 * pTM.
IPTM_WEIGHT = 0.8
PTM_WEIGHT = 0.2

# Column order of the per-sample score table (also used when no sample is found,
# so that the sort / de-duplication below still work on an empty frame).
PROTENIX_SCORE_COLUMNS = [
    'protein_domain',
    'ligand_name',
    'folder',
    'seed',
    'sample',
    'ptm',
    'iptm',
    'confidence',
]


def collect_protenix_sample_scores(
    result_folder: Path,
    protein_domain: str,
    folder_prefix: str = 'Lysostaphin',
) -> list:
    """Read every per-sample score JSON found under ``result_folder``.

    The folder is walked as
    ``<result_folder>/<...>__<ligand_name>[__...]/<seed>/predictions/*_sample_<k>.json``.

    Args:
        result_folder: Folder containing one sub-folder per protein/ligand pair.
        protein_domain: Label stored in the ``protein_domain`` field of every record.
        folder_prefix: Only sub-folders whose name starts with this prefix are read.

    Returns:
        A list of dicts, one per JSON file, with the keys listed in
        ``PROTENIX_SCORE_COLUMNS``. ``seed`` is the seed folder name and ``sample``
        the text after the last underscore of the file stem (both kept as strings).

    Raises:
        ValueError: If a matching sub-folder name has no ``__`` separator, so no
            ligand name can be extracted from it.
        KeyError: If a JSON file lacks the ``ptm`` or ``iptm`` entry.
    """
    records = []
    # Directory listings come back in arbitrary order; sorting makes the record order
    # (and therefore tie-breaking between equal confidences) reproducible.
    for ligand_folder in sorted(result_folder.iterdir()):
        # Skip stray files as well as folders belonging to other proteins.
        if not (ligand_folder.is_dir() and ligand_folder.name.startswith(folder_prefix)):
            continue

        name_parts = ligand_folder.name.split('__')
        if len(name_parts) < 2:
            raise ValueError(
                f"Cannot extract a ligand name from folder '{ligand_folder.name}': "
                "expected '<protein>__<ligand>'"
            )
        ligand_name = name_parts[1]

        for seed_folder in sorted(ligand_folder.iterdir()):
            # Ignore plain files (e.g. OS metadata files) sitting next to the seed folders.
            if not seed_folder.is_dir():
                continue

            for scores_file in sorted((seed_folder / 'predictions').glob('*_sample_*.json')):
                with scores_file.open() as f:
                    scores = json.load(f)

                ptm = scores['ptm']
                iptm = scores['iptm']
                records.append({
                    'protein_domain': protein_domain,
                    'ligand_name': ligand_name,
                    'folder': ligand_folder.name,
                    'seed': seed_folder.name,
                    'sample': scores_file.stem.split('_')[-1],
                    'ptm': ptm,
                    'iptm': iptm,
                    'confidence': IPTM_WEIGHT * iptm + PTM_WEIGHT * ptm,
                })
    return records


domains_data = [
    (protenix_sh3b_folder, 'lysostaphin_sh3b'),
]
protenix_records = [
    record
    for result_folder, protein_domain in domains_data
    for record in collect_protenix_sample_scores(result_folder, protein_domain)
]

# Keep, for every (domain, ligand) pair, the sample with the highest confidence:
# sort best-first, then drop_duplicates keeps the first row of each pair.
# A stable sort ('mergesort') keeps ties in the deterministic record order.
protenix_results_df = (
    pd.DataFrame(protenix_records, columns=PROTENIX_SCORE_COLUMNS)
    .sort_values('confidence', ascending=False, kind='mergesort')
    .drop_duplicates(['protein_domain', 'ligand_name'])
    .set_index(['protein_domain', 'ligand_name'])
)
protenix_results_df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,folder,seed,sample,ptm,iptm,confidence
protein_domain,ligand_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
lysostaphin_sh3b,AqKAA,Lysostaphin_P10547_SH3b_413_481__AqKAA,seed_448,0,0.97766,0.980395,0.979848
lysostaphin_sh3b,AqKAA-3-NH2-G,Lysostaphin_P10547_SH3b_413_481__AqKAA-3-NH2-G,seed_7,0,0.975524,0.977005,0.976709
lysostaphin_sh3b,GG,Lysostaphin_P10547_SH3b_413_481__GG,seed_448,0,0.978202,0.974952,0.975602
lysostaphin_sh3b,GGG,Lysostaphin_P10547_SH3b_413_481__GGG,seed_408,0,0.978032,0.973466,0.97438
lysostaphin_sh3b,G,Lysostaphin_P10547_SH3b_413_481__G,seed_7,0,0.975487,0.96718,0.968841
lysostaphin_sh3b,GGGG,Lysostaphin_P10547_SH3b_413_481__GGGG,seed_7,0,0.976831,0.966736,0.968755
lysostaphin_sh3b,GSGGG,Lysostaphin_P10547_SH3b_413_481__GSGGG,seed_448,0,0.975734,0.965184,0.967294
lysostaphin_sh3b,AqKA-3-NH2-G,Lysostaphin_P10547_SH3b_413_481__AqKA-3-NH2-G,seed_7,0,0.969498,0.965232,0.966086
lysostaphin_sh3b,AqKAA-3-NH2-GG,Lysostaphin_P10547_SH3b_413_481__AqKAA-3-NH2-GG,seed_408,0,0.970119,0.964997,0.966021
lysostaphin_sh3b,AGGGG,Lysostaphin_P10547_SH3b_413_481__AGGGG,seed_7,0,0.975324,0.962996,0.965462


### Protenix results subset

In [7]:
# Protenix run restricted to the ligand subset.
# NOTE(review): this rebinds `protenix_sh3b_folder`, which an earlier cell sets to the
# full-run folder; re-running that earlier cell's consumers after this one would read
# the subset folder instead. Consider a dedicated name once downstream use is checked.
protenix_sh3b_folder = data_folder.joinpath('S_aureus', 'protenix__Lysostaphin_SH3b__PG_subset')
protenix_results_subset = process_protenix_ligand_pulldown_results(
    'Lysostaphin_P10547_SH3b',
    protenix_sh3b_folder,
    run_posebusters=True,
    score_all_sample=True,
)
protenix_results_subset

Unnamed: 0_level_0,Unnamed: 1_level_0,structure_file,ptm,iptm,confidence,posebusters_score,posebusters_errors
protein_name,ligand_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Lysostaphin_P10547_SH3b,AqKAA,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.97766,0.980395,0.979848,20,
Lysostaphin_P10547_SH3b,GGGGG,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.975178,0.962263,0.964846,20,
Lysostaphin_P10547_SH3b,GGSGG,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.973175,0.960355,0.962919,20,
Lysostaphin_P10547_SH3b,GGAGG,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.973456,0.959573,0.962349,20,
Lysostaphin_P10547_SH3b,AqKAA-3-NH2-GGSGG,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.963391,0.93994,0.94463,20,
Lysostaphin_P10547_SH3b,AqKAA-3-NH2-GGGGG,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.953554,0.916454,0.923874,20,
Lysostaphin_P10547_SH3b,AqKAA-3-NH2-GGAGG,/Users/rs1521/Documents/PG_modelling/S_aureus/...,0.943727,0.895338,0.905016,20,
