In [1]:
import os
import pickle
import re
import sys; sys.path.append('../..')

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns; sns.set_theme()
from matplotlib.patches import Rectangle

import bin.params as p
import bin.utils as u

In [3]:
DATASET = 'test'  # DO NOT FORGET TO CHANGE IF NECESSARY

# Which generation of data files this notebook reads/writes.
# Both sets of constants used to be assigned back to back, so the "july2024"
# values were dead code, silently overwritten by the "old" ones. The choice is
# now explicit; the default (False) keeps the "old" paths active, which is
# what the notebook effectively used before.
USE_JULY2024_FILES = False

if USE_JULY2024_FILES:
    # july2024
    FASTA_ALIGNED_CLEANED_DIR_PATH = f'{p.DATA_DIR}/csv/fasta_aligned_cleanedJuly2024'
    STRUCTURE_UPLOADED_PER_YEAR_PNG_PATH = f'{p.DATA_DIR}/png/structureUploadedPerYearJuly2024.png'
    TEMPORAL_SPLIT_CUMULATIVE_WITH_SETS = f'{p.DATA_DIR}/png/temporal_splitJuly2024.png'
    METADATA_DIR_PATH = f'{p.DATA_DIR}/csv/metadataJuly2024'
else:
    # old
    FASTA_ALIGNED_CLEANED_DIR_PATH = f'{p.DATA_DIR}/csv/fasta_aligned_cleaned'
    STRUCTURE_UPLOADED_PER_YEAR_PNG_PATH = f'{p.DATA_DIR}/png/numberOfStructuresUploadedPerYear.png'
    TEMPORAL_SPLIT_CUMULATIVE_WITH_SETS = f'{p.DATA_DIR}/png/temporal_split.png'
    METADATA_DIR_PATH = f'{p.DATA_DIR}/csv/metadata'

# NOTE(review): PDB_DIR, TRAIN_CUM_PERCENTAGE and VALID_CUM_PERCENTAGE are not
# assigned in any cell visible above this one; they presumably come from a
# cell that is missing from the notebook. Confirm the notebook survives
# "Restart Kernel -> Run All" and define them here if it does not.

# show
(FASTA_ALIGNED_CLEANED_DIR_PATH,
 STRUCTURE_UPLOADED_PER_YEAR_PNG_PATH,
 TEMPORAL_SPLIT_CUMULATIVE_WITH_SETS,
 METADATA_DIR_PATH,
 PDB_DIR, TRAIN_CUM_PERCENTAGE, VALID_CUM_PERCENTAGE)

('../../data/csv/fasta_aligned_cleaned',
 '../../data/png/numberOfStructuresUploadedPerYear.png',
 '../../data/png/temporal_split.png',
 '../../data/csv/metadata',
 '../../data/pdb',
 70,
 83)

In [13]:
# Load the train+val frames for chain 'H' through the project helper.
fasta_df, sasa_df = u.load_dataset(['train', 'val'], chains='H')

# Re-key both frames by their 'Id' column; the column itself is removed.
fasta_df = fasta_df.set_index('Id')
sasa_df = sasa_df.set_index('Id')

# Lower-cased first four characters of every sample id.
fasta_keys = fasta_df.index.map(lambda sample_id: sample_id.lower()[:4])

load_dataset: ['train', 'val'], metadata file path: ../../data/csv/metadata/metadata_H.csv, chains: H, shape: (2643, 19)
load_dataset: ['train', 'val'], X file path: ../../data/csv/fasta_aligned_cleaned/fasta_aho_H.csv, chains: H, shape: (2643, 165)
load_dataset: ['train', 'val'], Y file path: ../../data/csv/sasa_aligned/sasa_H.csv, chains: H, shape: (2643, 165)


In [21]:
def get_summary_file_path(dataset, pdb_dir=None):
    """Return the path of the sabdab summary TSV to use for ``dataset``.

    Parameters
    ----------
    dataset : str
        Dataset name. ``'test_new_234'`` maps to a fixed file
        (temporary fix for July2024); any other value selects the single
        ``*_summary.tsv`` file found directly inside the PDB directory.
    pdb_dir : str, optional
        Directory to search. Defaults to the notebook-level ``PDB_DIR``.

    Returns
    -------
    str
        ``f'{pdb_dir}/{file name}'``. The path is also printed.

    Raises
    ------
    AssertionError
        If the directory does not contain exactly one ``*_summary.tsv`` file.
    """
    if pdb_dir is None:
        pdb_dir = PDB_DIR

    # temporary fix for July2024
    if dataset == 'test_new_234':
        summary_file_path = 'all_structures_2024/sabdab_summary_all.tsv'
    else:
        # get sabdab tsv file; sorted so the error message is deterministic
        summary_file_paths = sorted(
            fn for fn in os.listdir(pdb_dir) if fn.endswith('_summary.tsv')
        )
        # The old message always said "multiple", even when nothing was found.
        assert len(summary_file_paths) == 1, (
            f'expected exactly one *_summary.tsv file in {pdb_dir}, '
            f'found {len(summary_file_paths)}: {summary_file_paths}'
        )
        summary_file_path = summary_file_paths[0]

    summary_file_path_complete = f'{pdb_dir}/{summary_file_path}'
    print('complete summary file path:', summary_file_path_complete)
    return summary_file_path_complete

def load_metadata(dataset, whitelist=None):
    """Load the summary table for ``dataset``, optionally filtered by PDB id.

    Parameters
    ----------
    dataset : str
        Dataset name, forwarded to ``get_summary_file_path``.
    whitelist : iterable of str, optional
        Identifiers to keep. Each one is reduced to its lower-cased first
        four characters and matched against the ``'pdb'`` column.
        ``None`` or an empty iterable disables filtering.

    Returns
    -------
    pandas.DataFrame
        The tab-separated summary file, restricted to the whitelisted rows
        when a whitelist is given. Shape and columns are printed.
    """
    path = get_summary_file_path(dataset)
    summaries_df = pd.read_csv(path, sep='\t')

    # Materialise the normalised ids once. A set works for any iterable
    # (list, Index, ndarray, generator), whereas `if whitelist:` raises for
    # array-likes and a `map` object can only be consumed a single time.
    pdb_ids = set() if whitelist is None else {k.lower()[:4] for k in whitelist}
    if pdb_ids:
        summaries_df = summaries_df[summaries_df['pdb'].isin(pdb_ids)]

    print(f'loaded {dataset} | shape: {summaries_df.shape} | columns: {list(summaries_df.columns)}')
    return summaries_df

def generate_short_metadata_df(summaries_df, interesting_columns=None):
    """Build a per-entry metadata frame indexed by ``pdb`` with a ``year`` column.

    Rows are de-duplicated on ``('pdb', 'date')``, reduced to the requested
    columns, indexed and sorted by ``pdb``; ``date`` is converted to
    ``datetime64[ns]`` and its year is stored in a new ``year`` column.
    The input frame is not modified.

    Parameters
    ----------
    summaries_df : pandas.DataFrame
        Must contain the columns ``'pdb'`` and ``'date'``.
    interesting_columns : list of str, optional
        Columns to keep. ``'pdb'`` and ``'date'`` are added automatically
        when missing because the function needs them. ``None`` or an empty
        list keeps every column (the previous ``[]`` default always failed
        with ``KeyError: 'pdb'``).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``pdb`` (the ``pdb`` column itself is dropped), with the
        selected columns plus ``year``. Shape and columns are printed.
    """
    if interesting_columns:
        columns = list(interesting_columns)
        # Both columns are required below; append whichever is missing.
        columns += [c for c in ('pdb', 'date') if c not in columns]
    else:
        columns = list(summaries_df.columns)

    # set_index returns a fresh frame, so the assignments below never write
    # into a slice of the caller's data (no SettingWithCopyWarning).
    date_uploaded_df = (
        summaries_df
        .drop_duplicates(subset=['pdb', 'date'])[columns]
        .set_index('pdb')
        .sort_index()
    )
    date_uploaded_df['date'] = date_uploaded_df['date'].astype('datetime64[ns]')
    date_uploaded_df['year'] = date_uploaded_df['date'].dt.year
    print(f'short metadata df | shape: {date_uploaded_df.shape} | columns: {list(date_uploaded_df.columns)}')
    return date_uploaded_df

In [22]:
# Use the DATASET constant from the config cell instead of repeating the
# literal 'test', so changing the dataset there is enough.
# Keep only summary rows whose pdb id matches a loaded fasta sample.
m_df = load_metadata(DATASET, whitelist=list(fasta_df.index))

complete summary file path: ../../data/pdb/20220601_0621156_summary.tsv
loaded test | shape: (4298, 30) | columns: ['pdb', 'Hchain', 'Lchain', 'model', 'antigen_chain', 'antigen_type', 'antigen_het_name', 'antigen_name', 'short_header', 'date', 'compound', 'organism', 'heavy_species', 'light_species', 'antigen_species', 'authors', 'resolution', 'method', 'r_free', 'r_factor', 'scfv', 'engineered', 'heavy_subclass', 'light_subclass', 'light_ctype', 'affinity', 'delta_g', 'affinity_method', 'temperature', 'pmid']


In [37]:
# Number of pdb entries whose rows report differing 'resolution' values
# (a per-entry standard deviation above zero; NaN compares as False).
resolution_std_per_pdb = m_df.groupby('pdb')['resolution'].std()
print((resolution_std_per_pdb > 0).sum())

0


In [54]:
# pdb ids that occur in more than two summary rows.
# `(sizes > 2).index` returned the index of the whole boolean Series, i.e.
# every pdb id; the mask has to be applied before taking the index.
rows_per_pdb = m_df.groupby('pdb').size()
i = rows_per_pdb[rows_per_pdb > 2].index
# To inspect those rows, filter on the column (m_df is indexed by integers,
# so `.loc[list(i)]` would not work): m_df[m_df['pdb'].isin(i)]
m_df

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,...,scfv,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid
0,1mhp,H,L,0,A,protein,,"integrin alpha 1, (residues 169-360)",IMMUNE SYSTEM,08/20/02,...,False,True,IGHV3,IGKV1D,Kappa,,,,,
1,1mhp,X,Y,0,B,protein,,"integrin alpha 1, (residues 169-360)",IMMUNE SYSTEM,08/20/02,...,False,True,IGHV3,IGKV1D,Kappa,,,,,
2,2hh0,H,L,0,P,peptide,,prion protein,IMMUNE SYSTEM,06/27/06,...,True,True,IGHV14,IGKV9,Kappa,,,,,
9,1mhh,D,C,0,,,,,IMMUNE SYSTEM,08/20/02,...,False,False,IGHV9,IGKV8,Kappa,1e-09,-12.27819680469229,Unknown,,TBD
10,1mhh,B,A,0,,,,,IMMUNE SYSTEM,08/20/02,...,False,False,IGHV9,IGKV8,Kappa,1e-09,-12.27819680469229,Unknown,,TBD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7575,6azm,B,A,0,E,peptide,,circumsporozoite protein nanp 5-mer,IMMUNE SYSTEM,09/11/17,...,False,True,IGHV3,IGKV4,Kappa,,,,,
7576,6azm,D,C,0,F,peptide,,circumsporozoite protein nanp 5-mer,IMMUNE SYSTEM,09/11/17,...,False,True,IGHV3,IGKV4,Kappa,,,,,
7577,1sbs,H,L,0,,,,,MONOCLONAL ANTIBODY,04/08/98,...,False,False,IGHV6,IGKV8,Kappa,,,,,
7578,6azk,B,A,0,,,,,IMMUNE SYSTEM,09/11/17,...,False,True,IGHV2,IGKV5,Kappa,2.2999999999999996e-06,-7.691979474199363,Unknown,,TBD


In [16]:
# Columns carried over into the short per-entry metadata table.
interesting_columns = [
    'pdb',
    'resolution',
    'method',
    'r_factor',
    'date',
]
date_df = generate_short_metadata_df(m_df, interesting_columns=interesting_columns)
date_df

short metadata df | shape: (4325, 5) | columns: ['resolution', 'method', 'r_factor', 'date', 'year']


Unnamed: 0_level_0,resolution,method,r_factor,date,year
pdb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12e8,1.90,X-RAY DIFFRACTION,0.221,1998-03-14,1998
15c8,2.50,X-RAY DIFFRACTION,0.190,1998-03-18,1998
1a0q,2.30,X-RAY DIFFRACTION,0.203,1997-12-05,1997
1a14,2.50,X-RAY DIFFRACTION,0.200,1997-12-21,1997
1a2y,1.50,X-RAY DIFFRACTION,0.203,1998-01-13,1998
...,...,...,...,...,...
7z1b,2.30,X-RAY DIFFRACTION,0.198,2022-03-23,2022
7z1c,1.90,X-RAY DIFFRACTION,0.175,2022-03-23,2022
7z1d,1.55,X-RAY DIFFRACTION,0.167,2009-01-08,2009
7z1e,1.59,X-RAY DIFFRACTION,0.161,2009-01-08,2009
