In [1]:
import json
import os
from pathlib import Path

import pandas as pd
from Bio import SeqIO
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from IPython.display import display

# If the kernel was started inside the notebook's own 'notebook/PG' folder,
# move two directories up (presumably the project root -- confirm) so the
# rest of the notebook runs from there.
if os.getcwd().endswith('notebook/PG'):
    os.chdir('../..')
# Record the (possibly updated) working directory.
cwd = os.getcwd()

In [4]:
# Root data folder, resolved under the current user's home directory.
data_folder = Path(os.path.expanduser('~')) / 'Documents' / 'PG_modelling'
# Fail early, and report the resolved path so a missing folder is easy to diagnose.
assert data_folder.is_dir(), f'Data folder not found: {data_folder}'

# Sub-folder named for the Protenix TRD pulldown run; it must already exist.
protenix_trd_folder = data_folder / 'S_zooepidemicus' / 'protenix__TRD_pulldown'
assert protenix_trd_folder.is_dir(), (
    f'Protenix results folder not found: {protenix_trd_folder}'
)

In [5]:
# Load the peptidoglycan fragment subset table and report how many rows it has.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests the
# CSV was written with its index; consider `index_col=0` if no later cell needs it.
pg_subset_csv_path = data_folder.joinpath(
    'S_zooepidemicus',
    'S_zooepidemicus_pg_subset.csv',
)
pg_subset = pd.read_csv(pg_subset_csv_path)
print(f'Number of fragments: {len(pg_subset):,}')
pg_subset.head()

Number of fragments: 4


Unnamed: 0,ligand_name,Name,Synonym,Formula,Monoisotopic Mass,Modifications,Degree Amidation,Degree Acetylation,Ontology,PGN Units,...,Peptide,SMILES,INCHIKEY,clogP,RT,[M+H]+,[M+Na]+,[M+K]+,[M+2H]2+,[M+3H]3+
0,NAG-NAM-AqKAA-3-NH2-AAA,(NAG)(NAM)-AqKAA[3-NH2-AAA],GlcNAc.MurNAc--Ala.γ-isoGln.Lys.Ala.Ala[3--NH2...,C48H82N12O22,1178.56666,,1,2,G2--P5--S3,1,...,AqKAA[3-NH2-AAA],CC(=O)NC1C(OC2C(CO)OC(O)C(NC(C)=O)C2OC(C)C(=O)...,ZOSAGBPSZUYFTI-UHFFFAOYSA-N,-9.2199,0,1179.57394,1201.55588,1217.52982,590.29061,393.86283
1,NAG-NAM-AqKAA-3-NH2-AA,(NAG)(NAM)-AqKAA[3-NH2-AA],GlcNAc.MurNAc--Ala.γ-isoGln.Lys.Ala.Ala[3--NH2...,C45H77N11O21,1107.52955,,1,2,G2--P5--S2,1,...,AqKAA[3-NH2-AA],CC(=O)NC1C(OC2C(CO)OC(O)C(NC(C)=O)C2OC(C)C(=O)...,DSWIWDCFYFDFQG-UHFFFAOYSA-N,-8.7246,0,1108.53683,1130.51877,1146.49271,554.77206,370.1838
2,NAG-NAM-AqKAA-3-NH2-A,(NAG)(NAM)-AqKAA[3-NH2-A],GlcNAc.MurNAc--Ala.γ-isoGln.Lys.Ala.Ala[3--NH2...,C42H72N10O20,1036.49243,,1,2,G2--P5--S1,1,...,AqKAA[3-NH2-A],CC(=O)NC1C(OC2C(CO)OC(O)C(NC(C)=O)C2OC(C)C(=O)...,RFFQXYNMAAKXPS-UHFFFAOYSA-N,-8.2293,0,1037.49971,1059.48165,1075.45559,519.2535,346.50476
3,NAG-NAM-AqKAA,(NAG)(NAM)-AqKAA,GlcNAc.MurNAc--Ala.γ-isoGln.Lys.Ala.Ala,C39H67N9O19,965.45532,,1,2,G2--P5,1,...,AqKAA,CC(=O)NC1C(OC2C(CO)OC(O)C(NC(C)=O)C2OC(C)C(=O)...,MYHLLWFYGIKMKM-UHFFFAOYSA-N,-7.734,0,966.4626,988.44454,1004.41848,483.73494,322.82572


In [6]:
# Collect the Protenix confidence scores for every (ligand folder, seed) pair and
# keep, for each protein domain / ligand combination, the single highest-scoring row.
#
# Layout read by the loop below:
#   <result_folder>/<ligand_folder>/<seed_folder>/predictions/*_sample_0.json
# where <ligand_folder> is named '<prefix>__<ligand_name>' and each JSON file
# provides the 'ptm' and 'iptm' values.

# Weights used to blend the two scores into one ranking value:
#   confidence = IPTM_WEIGHT * iptm + PTM_WEIGHT * ptm
IPTM_WEIGHT = 0.8
PTM_WEIGHT = 0.2

# Column-oriented accumulator: one list per output column, appended in lockstep.
data = {
    'protein_domain': [],
    'ligand_name': [],
    'folder': [],
    'seed': [],
    'ptm': [],
    'iptm': [],
    'confidence': [],
}
# (results folder, protein domain label) pairs to scan.
domains_data = [
    (protenix_trd_folder, 'ZoocinA_TRD'),
]
for result_folder, protein_domain in domains_data:
    # iterdir() yields entries in arbitrary order; sorting makes the traversal
    # (and therefore the tie-breaking further down) reproducible.
    for ligand_folder in sorted(result_folder.iterdir()):
        # NOTE(review): the 'ZoocinA' prefix is hard-coded, so any other domain
        # added to `domains_data` would be filtered out here -- confirm intent.
        if not ligand_folder.name.startswith('ZoocinA'):
            continue
        # Skip stray files that merely share the prefix; iterdir() on a file raises.
        if not ligand_folder.is_dir():
            continue

        ligand_folder_name = ligand_folder.name
        try:
            # The ligand name is the part right after the first '__' separator.
            ligand_name = ligand_folder_name.split('__')[1]
        except IndexError:
            # Show the offending folder name before propagating the error.
            print(ligand_folder_name)
            raise

        for seed_folder in sorted(ligand_folder.iterdir()):
            # Ignore non-directory entries (e.g. OS metadata files).
            if not seed_folder.is_dir():
                continue

            seed = seed_folder.name
            score_files = sorted((seed_folder / 'predictions').glob('*_sample_0.json'))
            for results_json_file in score_files:
                with results_json_file.open(encoding='utf-8') as f:
                    scores = json.load(f)

                ptm = scores['ptm']
                iptm = scores['iptm']
                confidence = IPTM_WEIGHT * iptm + PTM_WEIGHT * ptm

                data['protein_domain'].append(protein_domain)
                data['ligand_name'].append(ligand_name)
                data['folder'].append(ligand_folder_name)
                data['seed'].append(seed)
                data['ptm'].append(ptm)
                data['iptm'].append(iptm)
                data['confidence'].append(confidence)

# Rank by confidence (best first) and keep the top row per domain/ligand pair.
# A stable sort keeps equal-confidence rows in traversal order, so the row that
# survives drop_duplicates() is the same on every run.
protenix_results_df = pd.DataFrame.from_dict(data).sort_values(
    'confidence',
    ascending=False,
    kind='stable',
).drop_duplicates([
    'protein_domain',
    'ligand_name',
]).set_index([
    'protein_domain',
    'ligand_name',
])
protenix_results_df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,folder,seed,ptm,iptm,confidence
protein_domain,ligand_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ZoocinA_TRD,NAG-NAM-AqKAA-3-NH2-A,ZoocinA_TRD_O54308_179_283__NAG-NAM-AqKAA-3-NH2-A,seed_822,0.973315,0.957624,0.960762
ZoocinA_TRD,NAG-NAM-AqKAA-3-NH2-AA,ZoocinA_TRD_O54308_179_283__NAG-NAM-AqKAA-3-NH...,seed_49,0.965084,0.936944,0.942572
ZoocinA_TRD,NAG-NAM-AqKAA-3-NH2-AAA,ZoocinA_TRD_O54308_179_283__NAG-NAM-AqKAA-3-NH...,seed_85,0.955667,0.917386,0.925043
ZoocinA_TRD,NAG-NAM-AqKAA,ZoocinA_TRD_O54308_179_283__NAG-NAM-AqKAA,seed_49,0.955739,0.911375,0.920248
