# 4. Generate Relative SASA (RSA) data

Generate SASA sequences for chains contained in PDB files.

For this, we will use `freesasa` tool (https://freesasa.github.io)

If you wish to start generating SASA sequences anew, make sure you delete `data/csv/freesasa_status.csv` file first (given it exists).

---

## Installing FreeSASA

In [12]:
import sys; sys.path.append('../..')
import bin.params as p

Altough the Python `freesasa` module is out there, it does not provide residue-depth freesasa values, which we need. 

Therefore we will install the package and use the tool it from `bash`.

**Note**: check the freesasa.github.io, there may be newer version of freesasa available and the current one may give error 404.

In [None]:
command = f"""
echo 'Installing prerequisite: JSON-C ...'
brew install json-c

echo 'Downloading freesasa ...'
cd {p.BIN_DIR}
wget https://freesasa.github.io/freesasa-2.1.1.zip

echo 'Unzipping ...'
unzip -qq freesasa-2.1.1.zip

echo 'Installing ...'
cd freesasa-2.1.1
./configure 
make 
make install
"""
print(command)

---

## Setup

In [61]:
import collections
import os
import pickle
import re
import subprocess
import sys; sys.path.append('../..')
import time

import numpy as np
import pandas as pd
from Bio.Data.IUPACData import protein_letters_3to1
from tqdm.notebook import tqdm

import bin.params as p
import bin.utils as u

In [53]:
# old
FASTA_UNALIGNED_CSV_PATH = f'{p.DATA_DIR}/csv/fasta_unaligned/fasta_all.csv'
FREESASA_STATUS_PATH = f'{p.DATA_DIR}/csv/freesasa_status.csv'
SASA_RELATIVE_UNALIGNED_DIR = f'{p.DATA_DIR}/csv/sasa_relative_unaligned'
CLEANED_PDB_DIR = f'{p.DATA_DIR}/pdb/incremental' 

# july2024
CLEANED_PDB_DIR = f'{p.DATA_DIR}/pdb/incrementalJuly2024'
SASA_RELATIVE_UNALIGNED_DIR = f'{p.DATA_DIR}/csv/sasa_relative_unalignedJuly2024'
FREESASA_STATUS_PATH = f'{p.DATA_DIR}/csv/freesasa_statusJuly2024.csv'
FASTA_UNALIGNED_CSV_PATH = f'{p.DATA_DIR}/csv/fasta_unalignedJuly2024/fasta_all.csv'

# IB3 july2024
FASTA_UNALIGNED_CSV_PATH = f'{p.DATA_DIR}/csv/fasta_unalignedIB3July2024/fasta_all.csv'
FREESASA_STATUS_PATH = f'{p.DATA_DIR}/csv/freesasa_statusIB3July2024.csv'
SASA_RELATIVE_UNALIGNED_DIR = f'{p.DATA_DIR}/csv/sasa_relative_unalignedIB3July2024'
CLEANED_PDB_DIR = f'{p.DATA_DIR}/pdb/immunobuilder3July2024_raw'

# FIXBUG
FASTA_UNALIGNED_CSV_PATH = f'{p.DATA_DIR}/csv/fasta_unaligned_fixbug/fasta_all.csv'
FREESASA_STATUS_PATH = f'{p.DATA_DIR}/csv/freesasa_status.csv'
SASA_RELATIVE_UNALIGNED_DIR = f'{p.DATA_DIR}/csv/sasa_relative_unaligned'
CLEANED_PDB_DIR = f'{p.DATA_DIR}/pdb/incremental' 

# IB 2 july2024
FASTA_UNALIGNED_CSV_PATH = f'{p.DATA_DIR}/csv/fasta_unalignedIB2July2024/fasta_all.csv'
FREESASA_STATUS_PATH = f'{p.DATA_DIR}/csv/freesasa_statusIBJuly2024.csv'
SASA_RELATIVE_UNALIGNED_DIR = f'{p.DATA_DIR}/csv/sasa_relative_unalignedIB2July2024'
CLEANED_PDB_DIR = f'{p.DATA_DIR}/pdb/immunobuilderJuly2024_{p.FINAL_NUMBERING_SCHEME}'
FASTA_ALIGNED_DIR = f"{p.DATA_DIR}/csv/fasta_alignedIB2July2024" # used only if AHO PDB files were employed
SASA_ALIGNED_DIR_PATH = f'{p.DATA_DIR}/csv/sasa_alignedIB2July2024' # used only if AHO PDB files were employed

(FASTA_UNALIGNED_CSV_PATH, FREESASA_STATUS_PATH, SASA_RELATIVE_UNALIGNED_DIR, 
 CLEANED_PDB_DIR, FASTA_ALIGNED_DIR, SASA_ALIGNED_DIR_PATH)

('../../data/csv/fasta_unalignedIB2July2024/fasta_all.csv',
 '../../data/csv/freesasa_statusIBJuly2024.csv',
 '../../data/csv/sasa_relative_unalignedIB2July2024',
 '../../data/pdb/immunobuilderJuly2024_aho',
 '../../data/csv/fasta_alignedIB2July2024',
 '../../data/csv/sasa_alignedIB2July2024')

In [54]:
if os.path.exists(FREESASA_STATUS_PATH):
    freesasa_status_df = pd.read_csv(FREESASA_STATUS_PATH, index_col=0)
    print('loading freesasa status from file', FREESASA_STATUS_PATH)
else:
    freesasa_status_df = pd.DataFrame(columns=['structure_code', 'status'])
    print('creating freesasa status anew', FREESASA_STATUS_PATH)

creating freesasa status anew ../../data/csv/freesasa_statusIBJuly2024.csv


In [55]:
command = f'mkdir -p {SASA_RELATIVE_UNALIGNED_DIR} {FASTA_UNALIGNED_CSV_PATH}'
! $command

mkdir: ../../data/csv/fasta_unalignedIB2July2024/fasta_all.csv: File exists


In [56]:
sasa_path = f'{SASA_RELATIVE_UNALIGNED_DIR}/sasa_all.csv'
if os.path.exists(sasa_path):
    old_sasa_df = pd.read_csv(sasa_path, index_col=0) 
    print('loading sasa from file:', sasa_path)
else:
    print('no sasa loaded yet')
    old_sasa_df = None

no sasa loaded yet


----

## Generate SASA 

**Experimental: join the FASTAS of new test sequences with the old ones. Do the same for FASTA (to be removed)**

In [57]:
! mkdir -p ../../data/csv/fasta_unaligned_fixbug ../../data/csv/sasa_relative_unaligned_fixbug

In [53]:
# load old-test and the new test
fasta_unaligned_old_df = pd.read_csv(f'{p.DATA_DIR}/csv/fasta_unaligned/fasta_all.csv', index_col=0)
fasta_unaligned_new_df = pd.read_csv(f'{p.DATA_DIR}/csv/fasta_unalignedJuly2024/fasta_all.csv', index_col=0)
print('fasta_unaligned_old_df.shape', fasta_unaligned_old_df.shape, 'fasta_unaligned_new_df.shape', fasta_unaligned_new_df.shape)

# find common sequences
common_sequences = list(fasta_unaligned_old_df.index.intersection(fasta_unaligned_new_df.index))
print('len(common_sequences)', len(common_sequences), 'common sequences:', common_sequences)

#common_old_df, common_new_df = fasta_unaligned_old_df.loc[common_sequences], fasta_unaligned_new_df.loc[common_sequences]
# common_old_df.equals(common_new_df) -> False
#c_old, c_new = u.nondash_counts_columns(common_old_df), u.nondash_counts_columns(common_new_df)
#(c_old-c_new).describe()
#for k, v in c_old.items():
#    print(k, v)

# remove common sequences from fasta_unaligned
fasta_unaligned_new_df_duprems = fasta_unaligned_new_df.drop(index=common_sequences)
# join the old and new. The common (duplicate) sequences were removed and the one copy of them remains there
fasta_unaligned_fixbug_df = pd.concat([fasta_unaligned_new_df_duprems, fasta_unaligned_old_df], axis=0).sort_index().replace(np.nan, '-')
assert len(fasta_unaligned_fixbug_df.columns) == max(len(fasta_unaligned_old_df.columns), len(fasta_unaligned_new_df_duprems.columns))
print(fasta_unaligned_fixbug_df.shape[0], fasta_unaligned_new_df_duprems.shape[0], fasta_unaligned_old_df.shape[0], len(common_sequences))
assert fasta_unaligned_fixbug_df.shape[0] == (fasta_unaligned_new_df.shape[0] + fasta_unaligned_old_df.shape[0] - len(common_sequences))

fasta_unaligned_fixbug_df.to_csv(f'{p.DATA_DIR}/csv/fasta_unaligned_fixbug/fasta_all.csv')
fasta_unaligned_fixbug_L_df = u.select_only_chain_sequences(fasta_unaligned_fixbug_df, 'L')
fasta_unaligned_fixbug_L_df.to_csv(f'{p.DATA_DIR}/csv/fasta_unaligned_fixbug/fasta_L.csv')
fasta_unaligned_fixbug_H_df = u.select_only_chain_sequences(fasta_unaligned_fixbug_df, 'H')
fasta_unaligned_fixbug_H_df.to_csv(f'{p.DATA_DIR}/csv/fasta_unaligned_fixbug/fasta_H.csv')

assert (fasta_unaligned_fixbug_L_df.shape[0] + fasta_unaligned_fixbug_H_df.shape[0]) == fasta_unaligned_fixbug_df.shape[0]
print(fasta_unaligned_fixbug_L_df.shape, fasta_unaligned_fixbug_H_df.shape)

fasta_unaligned_old_df.shape (6572, 444) fasta_unaligned_new_df.shape (1776, 595)
len(common_sequences) 64 common sequences: ['7QU1:H', '7QU1:L', '7QU2:H', '7QU2:L', '7R40:H', '7R40:L', '7S5P:H', '7S5P:L', '7S5Q:H', '7S5Q:L', '7S5R:H', '7S5R:L', '7TE4:H', '7TE4:L', '7TLY:H', '7TLY:L', '7TN0:H', '7TN0:L', '7TOW:H', '7TOW:L', '7TP3:H', '7TP3:L', '7TP4:H', '7TP4:L', '7TQA:H', '7TQA:L', '7TTM:H', '7TTM:L', '7TTX:H', '7TTX:L', '7TUY:H', '7TUY:L', '7U2D:H', '7U2D:L', '7U2E:H', '7U2E:L', '7UAP:H', '7UAP:L', '7URQ:H', '7URQ:L', '7URS:H', '7URS:L', '7WKX:H', '7WKX:L', '7WLZ:H', '7WLZ:L', '7WPE:H', '7WPE:L', '7WPF:H', '7WPF:L', '7WPH:H', '7WPH:L', '7WPV:H', '7WPV:L', '7WRV:H', '7WRV:L', '7X08:H', '7X08:L', '7X9E:H', '7X9E:L', '7Z0X:H', '7Z0X:L', '7Z0Y:H', '7Z0Y:L']
8284 1712 6572 64
(4142, 595) (4142, 595)


In [59]:
sasa_unaligned_old_df = pd.read_csv(f'{p.DATA_DIR}/csv/sasa_relative_unaligned/sasa_all.csv', index_col=0)
sasa_unaligned_new_df = pd.read_csv(f'{p.DATA_DIR}/csv/sasa_relative_unalignedJuly2024/sasa_all.csv', index_col=0)

common_sequences = list(sasa_unaligned_old_df.index.intersection(sasa_unaligned_new_df.index))
print('len(common_sequences)', len(common_sequences), 'common sequences:', common_sequences)

sasa_unaligned_new_df_duprems = sasa_unaligned_new_df.drop(index=common_sequences)
sasa_unaligned_fixbug_df = pd.concat([sasa_unaligned_new_df_duprems, sasa_unaligned_old_df], axis=0).sort_index()

assert len(sasa_unaligned_fixbug_df.columns) == max(len(sasa_unaligned_old_df.columns), len(sasa_unaligned_new_df_duprems.columns))
print(sasa_unaligned_fixbug_df.shape[0], sasa_unaligned_new_df_duprems.shape[0], sasa_unaligned_old_df.shape[0], len(common_sequences))
assert sasa_unaligned_fixbug_df.shape[0] == (sasa_unaligned_new_df.shape[0] + sasa_unaligned_old_df.shape[0] - len(common_sequences))

sasa_unaligned_fixbug_df.to_csv(f'{p.DATA_DIR}/csv/sasa_relative_unaligned_fixbug/sasa_all.csv')
sasa_unaligned_fixbug_L_df = u.select_only_chain_sequences(sasa_unaligned_fixbug_df, 'L')
sasa_unaligned_fixbug_L_df.to_csv(f'{p.DATA_DIR}/csv/sasa_relative_unaligned_fixbug/sasa_L.csv')
sasa_unaligned_fixbug_H_df = u.select_only_chain_sequences(sasa_unaligned_fixbug_df, 'H')
sasa_unaligned_fixbug_H_df.to_csv(f'{p.DATA_DIR}/csv/sasa_relative_unaligned_fixbug/sasa_H.csv')

assert (sasa_unaligned_fixbug_L_df.shape[0] + sasa_unaligned_fixbug_H_df.shape[0]) == sasa_unaligned_fixbug_df.shape[0]
print(sasa_unaligned_fixbug_L_df.shape, sasa_unaligned_fixbug_H_df.shape)

len(common_sequences) 64 common sequences: ['7QU1:H', '7QU1:L', '7QU2:H', '7QU2:L', '7R40:H', '7R40:L', '7S5P:H', '7S5P:L', '7S5Q:H', '7S5Q:L', '7S5R:H', '7S5R:L', '7TE4:H', '7TE4:L', '7TLY:H', '7TLY:L', '7TN0:H', '7TN0:L', '7TOW:H', '7TOW:L', '7TP3:H', '7TP3:L', '7TP4:H', '7TP4:L', '7TQA:H', '7TQA:L', '7TTM:H', '7TTM:L', '7TTX:H', '7TTX:L', '7TUY:H', '7TUY:L', '7U2D:H', '7U2D:L', '7U2E:H', '7U2E:L', '7UAP:H', '7UAP:L', '7URQ:H', '7URQ:L', '7URS:H', '7URS:L', '7WKX:H', '7WKX:L', '7WLZ:H', '7WLZ:L', '7WPE:H', '7WPE:L', '7WPF:H', '7WPF:L', '7WPH:H', '7WPH:L', '7WPV:H', '7WPV:L', '7WRV:H', '7WRV:L', '7X08:H', '7X08:L', '7X9E:H', '7X9E:L', '7Z0X:H', '7Z0X:L', '7Z0Y:H', '7Z0Y:L']
8284 1712 6572 64
(4142, 595) (4142, 595)


----

**Load the unaligned FASTA csv dataset:**

In [57]:
fasta_unaligned_df = pd.read_csv(FASTA_UNALIGNED_CSV_PATH, index_col=0)
print(f'fasta unaligned path: {FASTA_UNALIGNED_CSV_PATH}')

fasta_unaligned_df.columns = [int(c) for c in fasta_unaligned_df.columns]
print('fasta_unaligned_df.shape', fasta_unaligned_df.shape)
fasta_unaligned_df.head()

fasta unaligned path: ../../data/csv/fasta_unalignedIB2July2024/fasta_all.csv
fasta_unaligned_df.shape (1758, 156)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,146,147,148,149,150,151,152,153,154,155
7DF1:H,V,Q,L,V,Q,S,X,G,A,E,...,S,-,-,-,-,-,-,-,-,-
7DF1:L,D,I,V,M,T,Q,S,P,D,S,...,I,-,-,-,-,-,-,-,-,-
7E9O:H,Q,V,Q,L,L,E,S,X,G,G,...,V,S,S,-,-,-,-,-,-,-
7E9O:L,D,I,V,M,T,Q,S,P,L,S,...,I,K,R,-,-,-,-,-,-,-
7FGJ:H,V,Q,L,Q,Q,S,X,G,A,A,...,S,S,-,-,-,-,-,-,-,-


In [58]:
def get_sasa(structure_code, output_lines):
    matches = []
    sasa_dict = collections.defaultdict(list)
    for line in output_lines:
        tokens = re.split(' +', line)
        if tokens[0] != 'RES':
            continue
        chain = tokens[2]
        chain_id = f"{structure_code.upper()}:{chain}"
        total_side_rel_rsa = tokens[7] # 'Total-Side REL column'
        sasa_dict[chain_id].append(total_side_rel_rsa)
        matches.append(True)
    return sasa_dict, matches

# ---------------------------

def get_sasa_aho(structure_code, output_lines):
    matches = []
    sasa_L_dict = collections.defaultdict(dict)
    sasa_H_dict = collections.defaultdict(dict)
    fasta_L_dict = collections.defaultdict(dict)
    fasta_H_dict = collections.defaultdict(dict)
    for line in output_lines:
        tokens = re.split(' +', line)
        if tokens[0] != 'RES':
            continue
            
        aa = protein_letters_3to1[tokens[1].capitalize()]
        chain = tokens[2]
        chain_id = f"{structure_code.upper()}:{chain}"
        position = tokens[3] 
        total_side_rel_rsa = tokens[7] # 'Total-Side REL column'
        
        if total_side_rel_rsa == 'N/A':
            total_side_rel_rsa = np.nan
            
        if chain == 'L':
            fasta_L_dict[chain_id][position] = aa
            sasa_L_dict[chain_id][position] = total_side_rel_rsa
        elif chain == 'H':
            fasta_H_dict[chain_id][position] = aa
            sasa_H_dict[chain_id][position] = total_side_rel_rsa
        else:
            raise ValueError('invalid chain type')
        matches.append(True)
    return sasa_L_dict, sasa_H_dict, fasta_L_dict, fasta_H_dict, matches

In [None]:
# freesasa 7df1.pdb --depth residue --format rsa

In [None]:
# ! cat ../../data/pdb/immunobuilderJuly2024_aho/8QH0.pdb

In [None]:
# INVESTIGATION ONLY 
# comparison of numbers of output lines in incremental folder PDBs vs IB predictions
CLEANED_PDB_IB_DIR = f'{p.DATA_DIR}/pdb/immunobuilder2July2024_{p.FINAL_NUMBERING_SCHEME}'
CLEANED_PDB_IB_DIR = f'{p.DATA_DIR}/pdb/immunobuilder3July2024_raw'

CLEANED_PDB_CLASSIC_DIR = f'{p.DATA_DIR}/pdb/incrementalJuly2024'
filenames = [fn for fn in os.listdir(CLEANED_PDB_IB_DIR) if fn.endswith('.pdb') and fn[:4] not in fidf.index]

for filename in tqdm(filenames, desc='Computing SASA for PDB files..'):
    structure_code = filename.split('.')[0] 
    #print(subprocess.check_output(["ls", "../.."]).decode(sys.stdout.encoding).split(os.linesep))
    f_command_ib = ['freesasa', f"{CLEANED_PDB_IB_DIR}/{structure_code.upper()}.pdb", '--depth', 'residue', '--format', 'rsa']
    print(' '.join(f_command_ib))
    print(' '.join(f_command_classic))
    f_command_classic = ['freesasa', f"{CLEANED_PDB_CLASSIC_DIR}/{structure_code.upper()}.pdb", '--depth', 'residue', '--format', 'rsa']
    ib_output_lines_no = len(subprocess.check_output(f_command_ib).decode(sys.stdout.encoding).split(os.linesep))
    classic_output_lines_no = len(subprocess.check_output(f_command_classic).decode(sys.stdout.encoding).split(os.linesep))
    print('IB output lines', ib_output_lines_no, '| Classic output lines', classic_output_lines_no)
    break
    

In [62]:
new_status_data = collections.defaultdict(list)
df_dict, sasa_dict = dict(), dict()
df_sasa_L_dict, df_sasa_H_dict = dict(), dict() # used only on AHO PDBs, otherwise neskodne
df_fasta_L_dict, df_fasta_H_dict = dict(), dict() # used only on AHO PDBs, otherwise neskodne

fidf = freesasa_status_df.copy(); fidf.index = fidf['structure_code']
filenames = [fn for fn in os.listdir(CLEANED_PDB_DIR) if fn.endswith('.pdb') and fn[:4] not in fidf.index]
print('input dir path:',  CLEANED_PDB_DIR)

for filename in tqdm(filenames, desc='Computing SASA for PDB files..'):
    structure_code = filename.split('.')[0] 
    freesasa_command = ['freesasa', os.path.join(CLEANED_PDB_DIR, filename), 
                        '--depth', 'residue', 
                        '--format', 'rsa']    
    try:
        time_start = time.time()
        output_bytes = subprocess.check_output(freesasa_command)
        output_lines = output_bytes.decode(sys.stdout.encoding).split(os.linesep)
        time_end = time.time()
        time_diff = time_end - time_start
        new_status_data['freesasa_time'] = time_diff
        # print("\n".join(output_lines))
        # sasa_dict, matches = get_sasa_aho(structure_code, output_lines); print(sasa_dict); break # todo remove this line later
    except IOError:
        print('problem', filename, ' '.join(freesasa_command))
        new_status_data['structure_code'].append(structure_code); new_status_data['status'] = 'freesasa-error'
        continue

    light_sasa, heavy_sasa = [], []
    fasta_L_dict, fasta_H_dict = dict(), dict()
    if 'aho' in CLEANED_PDB_DIR:
        sasa_L_dict, sasa_H_dict,fasta_L_dict, fasta_H_dict, matches = get_sasa_aho(structure_code, output_lines)
    else:
        sasa_dict, matches = get_sasa(structure_code, output_lines)

    if not sasa_dict and not matches:
        print(f'Structure {structure_code} is missing some chain in unaligned_fasta dataset')
        new_status_data['structure_code'].append(structure_code); new_status_data['status'] = 'chain-missing'
        continue

    # report any mismatches between FASTA sequence and FREESASA-generated RSA aminoacid records
    if matches.count(True) != len(matches):
        print(f'{len(matches) - matches.count(True)} mismatches at {structure_code}')
        new_status_data['structure_code'].append(structure_code); new_status_data['status'] = 'fasta-sasa-mismatch'
        continue

    # this is like df_dict += sasa_dict
    # we want to add up all the small dicts to the big one 
    if 'aho' in CLEANED_PDB_DIR: 
        df_sasa_L_dict = {**df_sasa_L_dict, **sasa_L_dict}
        df_sasa_H_dict = {**df_sasa_H_dict, **sasa_H_dict}
        df_fasta_L_dict = {**df_fasta_L_dict, **fasta_L_dict}
        df_fasta_H_dict = {**df_fasta_H_dict, **fasta_H_dict}
    else:
        df_dict = {**df_dict, **sasa_dict}

    # save progress
    new_status_data['structure_code'].append(structure_code); new_status_data['status'] = 'ok'

input dir path: ../../data/pdb/immunobuilderJuly2024_aho


Computing SASA for PDB files..:   0%|          | 0/879 [00:00<?, ?it/s]

**There are no mismatches between FASTA and FREESASA-generated aminoacid sequences, nor any other `freesasa` related problems.**

**View summary and store the whole status dataframe to `.csv` file:**

In [64]:
new_freesasa_status_df = pd.DataFrame(new_status_data)
freesasa_status_df = pd.concat([freesasa_status_df, new_freesasa_status_df]).drop_duplicates()
freesasa_status_df.index = np.arange(0, len(freesasa_status_df))
print('storing to:', FREESASA_STATUS_PATH)
freesasa_status_df.to_csv(FREESASA_STATUS_PATH)
print(freesasa_status_df['status'].value_counts())
freesasa_status_df.head(n=1)

storing to: ../../data/csv/freesasa_statusIBJuly2024.csv
ok    879
Name: status, dtype: int64


Unnamed: 0,structure_code,status,freesasa_time
0,8QH0,ok,0.035746


In [66]:
freesasa_status_df['freesasa_time'].describe()

count    879.000000
mean       0.035746
std        0.000000
min        0.035746
25%        0.035746
50%        0.035746
75%        0.035746
max        0.035746
Name: freesasa_time, dtype: float64

----

### IB-AHO comparison with our results

In [67]:
if 'aho' in CLEANED_PDB_DIR and 'immunobuilder' in CLEANED_PDB_DIR:
    # fasta
    new_fasta_H_df = pd.DataFrame(list(df_fasta_H_dict.values()), index=list(df_fasta_H_dict.keys())).sort_index().replace(np.nan, '-')
    new_fasta_L_df = pd.DataFrame(list(df_fasta_L_dict.values()), index=list(df_fasta_L_dict.keys())).sort_index().replace(np.nan, '-')
    new_fasta_H_df.to_csv(f"{FASTA_ALIGNED_DIR}/fasta_aho_H.csv")
    new_fasta_L_df.to_csv(f"{FASTA_ALIGNED_DIR}/fasta_aho_L.csv")
    print('saving to dir:', FASTA_ALIGNED_DIR)
    print('new_fasta_H_df', new_fasta_H_df.shape, new_fasta_H_df)
    print('new_fasta_L_df', new_fasta_L_df.shape, new_fasta_L_df)
    
    # sasa
    new_sasa_H_df = pd.DataFrame(list(df_sasa_H_dict.values()), index=list(df_sasa_H_dict.keys())).sort_index()
    new_sasa_L_df = pd.DataFrame(list(df_sasa_L_dict.values()), index=list(df_sasa_L_dict.keys())).sort_index()
    new_sasa_H_df.to_csv(f"{SASA_ALIGNED_DIR_PATH}/sasa_H.csv")
    new_sasa_L_df.to_csv(f"{SASA_ALIGNED_DIR_PATH}/sasa_L.csv")
    print('saving to dir:', SASA_ALIGNED_DIR_PATH)
    print('new_sasa_H_df', new_sasa_H_df.shape, new_sasa_H_df)
    print('new_sasa_L_df', new_sasa_L_df.shape, new_sasa_L_df)
    # and now skip to results viz (TODO) copy the sasa to test results Y + do some transformations
    # TODO veci z notebooku 5 atd

saving to dir: ../../data/csv/fasta_alignedIB2July2024
new_fasta_H_df (879, 154)         1  2  3  4  5  6  7  9 10 11  ... 36E 85A 85B 85C 85D 85E 85F 85G 63  \
7DF1:H  -  V  Q  L  V  Q  S  G  A  E  ...   -   -   -   -   -   -   -   -  -   
7E9O:H  Q  V  Q  L  L  E  S  G  G  G  ...   -   -   -   -   -   -   -   -  -   
7FGJ:H  -  V  Q  L  Q  Q  S  G  A  A  ...   -   -   -   -   -   -   -   -  -   
7FGK:H  Q  V  Q  L  Q  Q  S  G  A  A  ...   -   -   -   -   -   -   -   -  -   
7FGL:H  Q  V  Q  L  Q  Q  S  G  A  A  ...   -   -   -   -   -   -   -   -  -   
...    .. .. .. .. .. .. .. .. .. ..  ...  ..  ..  ..  ..  ..  ..  ..  .. ..   
8Y6A:H  Q  V  Q  L  V  Q  S  G  A  E  ...   -   -   -   -   -   -   -   -  -   
8Y6H:H  E  V  Q  L  Q  E  S  G  P  E  ...   -   -   -   -   -   -   -   -  -   
8Y6I:H  E  V  Q  L  Q  E  S  G  P  E  ...   -   -   -   -   -   -   -   -  -   
8YX1:H  -  L  Q  L  Q  E  S  G  P  G  ...   -   -   -   -   -   -   -   -  -   
8YX9:H  E  V  Q  L  V  E  S  G  G  G  .

In [50]:
def renumber_columns(df_path):
    df = pd.read_csv(df_path, index_col=0)
    df = u.sort_numbering_columns(df)
    print(df_path, df.shape, df.columns)
    print('-----------------------------')
    print()
    df.to_csv(df_path)
    return df

fasta_H_df = renumber_columns(f"{FASTA_ALIGNED_DIR}/fasta_aho_H.csv")
fasta_L_df = renumber_columns(f"{FASTA_ALIGNED_DIR}/fasta_aho_L.csv")
sasa_H_df = renumber_columns(f"{SASA_ALIGNED_DIR_PATH}/sasa_H.csv")
sasa_L_df = renumber_columns(f"{SASA_ALIGNED_DIR_PATH}/sasa_L.csv")

../../data/csv/fasta_alignedIB2July2024/fasta_aho_H.csv (879, 154) Index(['1', '2', '3', '4', '5', '6', '7', '9', '10', '11',
       ...
       '140', '141', '142', '143', '144', '145', '146', '147', '148', '149'],
      dtype='object', length=154)
-----------------------------

../../data/csv/fasta_alignedIB2July2024/fasta_aho_L.csv (879, 129) Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       ...
       '140', '141', '142', '143', '144', '145', '146', '147', '148', '149'],
      dtype='object', length=129)
-----------------------------

../../data/csv/sasa_alignedIB2July2024/sasa_H.csv (879, 154) Index(['1', '2', '3', '4', '5', '6', '7', '9', '10', '11',
       ...
       '140', '141', '142', '143', '144', '145', '146', '147', '148', '149'],
      dtype='object', length=154)
-----------------------------

../../data/csv/sasa_alignedIB2July2024/sasa_L.csv (879, 129) Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       ...
       '140', '141', '142', '143', '1

In [51]:
fasta_test_new_234_H, sasa_test_new_234_H = u.load_dataset('test_new_234', chains='H')
fasta_test_new_234_H.index = fasta_test_new_234_H['Id']
fasta_test_new_234_H = fasta_test_new_234_H.drop(columns=['Id'])

ib_seqs = set(fasta_H_df.index)
classic_seqs = set(fasta_test_new_234_H.index)
missing_in_classic, missing_in_ib = ib_seqs.difference(classic_seqs), classic_seqs.difference(ib_seqs)
print('sequences missing in sabdab data', missing_in_classic)
print('sequences missing in IB data', missing_in_ib)

load_dataset: test_new_234, metadata file path: ../../data/csv/metadataJuly2024/metadata_H.csv, chains: H, shape: (888, 19)
load_dataset: test_new_234, X file path: ../../data/csv/fasta_aligned_cleaned_dlJuly2024/fasta_aho_H.csv, chains: H, shape: (888, 165)
load_dataset: test_new_234, Y file path: ../../data/csv/sasa_aligned_dlJuly2024/sasa_H.csv, chains: H, shape: (888, 165)
sequences missing in sabdab data set()
sequences missing in IB data {'7YAR:H', '7X08:H', '8HHY:H', '8VZO:H', '7UL3:H', '8DGV:H', '8G2M:H', '8SVE:H', '8IV0:H'}


In [5]:
fasta_test_new_234_H.head(n=1)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,141,142,143,143A,144,145,146,147,148,149
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7DF1:H,-,V,Q,L,V,Q,S,-,G,A,...,Q,G,T,-,M,V,T,V,S,-


In [6]:
ib_seqs_c = set(fasta_H_df.columns)
classic_seqs_c = set(fasta_test_new_234_H.columns)
missing_in_classic_c = ib_seqs_c.difference(classic_seqs_c)
missing_in_ib_c = classic_seqs_c.difference(ib_seqs_c)
print('missing in classic columns', missing_in_classic_c)
print('missing in IB columns', missing_in_ib_c)

missing in classic columns {'36E', '36C', '36B', '36D', '36A'}
missing in IB columns {'123', '123D', '120', '123C', '63B', '8', '122', '125', '123A', '124', '85H', '143A', '126', '121', '123B'}


In [7]:
# add columns from CLASSIC to IB and make them empty
add_columns = list(missing_in_ib_c)
fasta_H_df[add_columns] = '-'
sasa_H_df[add_columns] = np.nan

# remove columns from IB that are not in CLASSIC
fasta_H_df = fasta_H_df.drop(columns=missing_in_classic_c)
sasa_H_df = sasa_H_df.drop(columns=missing_in_classic_c)

In [11]:
fasta_H_df = u.sort_numbering_columns(fasta_H_df)
sasa_H_df = u.sort_numbering_columns(sasa_H_df)
assert list(sasa_H_df.columns) == list(fasta_H_df.columns)
assert list(fasta_test_new_234_H.columns) == list(fasta_H_df.columns)

In [12]:
fasta_H_df.head(n=1)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,141,142,143,143A,144,145,146,147,148,149
7DF1:H,-,V,Q,L,V,Q,S,-,G,A,...,Q,G,T,-,M,V,T,V,S,-


In [13]:
sasa_H_df.head(n=1)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,141,142,143,143A,144,145,146,147,148,149
7DF1:H,,47.3,78.8,0.0,72.2,0.0,27.0,,,73.9,...,54.3,,0.0,,41.7,0.0,50.1,0.0,83.8,


In [14]:
TEST_PREDICTIONS_RAW_DIR_PATH = f'{p.DATA_DIR}/csv/test/raw_predictions'
TEST_PREDICTIONS_DIR_PATH = f'{p.DATA_DIR}/csv/test/predictions'
FEATURES = 'whole_sequence_all_H'
MODEL_NAME = 'IB'
raw_preds_path = f'{TEST_PREDICTIONS_RAW_DIR_PATH}/{FEATURES}_{MODEL_NAME}.csv'
preds_path = f'{TEST_PREDICTIONS_DIR_PATH}/{FEATURES}_{MODEL_NAME}.csv'

In [15]:
print('raw_preds_path', raw_preds_path)
print('sasa_H_df.shape', sasa_H_df.shape)
sasa_H_df.to_csv(raw_preds_path)

raw_preds_path ../../data/csv/test/raw_predictions/whole_sequence_all_H_IB.csv
sasa_H_df.shape (879, 164)


In [16]:
sasa_H_df = pd.read_csv(raw_preds_path, index_col=0)
sasa_H_df.head(n=1)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,141,142,143,143A,144,145,146,147,148,149
7DF1:H,,47.3,78.8,0.0,72.2,0.0,27.0,,,73.9,...,54.3,,0.0,,41.7,0.0,50.1,0.0,83.8,


In [18]:
res_sasa_H_ids = sasa_H_df.index.to_series().reset_index(drop=True).rename('sequence_id')
res_sasa_H_df = u.positionize_sasa_df(sasa_H_df, res_sasa_H_ids)
res_sasa_H_df['position'] = res_sasa_H_df['position'].astype('string')
res_sasa_H_df['position'] = res_sasa_H_df['position'].apply(lambda x: str(x))
assert list(res_sasa_H_df['sequence_id'].unique()) == list(sasa_H_df.index)
print('positionized SASA shape', res_sasa_H_df.shape, 
      '| number of unique sequence_ids in res_sasa_H_df', res_sasa_H_df['sequence_id'].nunique(),
      '| number of unique positions', res_sasa_H_df['position'].nunique())
print('saving to', preds_path)
res_sasa_H_df.to_csv(preds_path)
res_sasa_H_df.head(n=3)

positionized SASA shape (144156, 3) | number of unique sequence_ids in res_sasa_H_df 879 | number of unique positions 164
saving to ../../data/csv/test/predictions/whole_sequence_all_H_IB.csv


Unnamed: 0,sequence_id,position,prediction
0,7DF1:H,1,
1,7E9O:H,1,108.6
2,7FGJ:H,1,


In [19]:
res_sasa_H_df = pd.read_csv(preds_path, index_col=0, low_memory=False)
res_sasa_H_df.head(n=1)

Unnamed: 0,sequence_id,position,prediction
0,7DF1:H,1,


----

### Continueing with the classic processing

**View and store `sasa` dataframe:**

In [None]:
# do not execute these cells if you are running IB predictions now, skip
new_sasa_df = pd.DataFrame(list(df_dict.values()), index=list(df_dict.keys())).sort_index()
sasa_df = new_sasa_df if old_sasa_df is None else pd.concat([old_sasa_df, new_sasa_df])
print('freesasa results unaligned | sasa_df shape:', sasa_df.shape, 'fasta unaligned shape:', fasta_unaligned_df.shape)
sasa_df.to_csv(f'{SASA_RELATIVE_UNALIGNED_DIR}/sasa_all.csv')
sasa_df.head()

In [None]:
sasa_df.count(axis=0)

**Check the numbers of SASA and FASTA values:**

In [None]:
# suddenly we are missing many SASA values... - investigating
# SASA_RELATIVE_UNALIGNED_DIR = f'{p.DATA_DIR}/csv/sasa_relative_unalignedJuly2024'
fasta_unaligned_older_df = pd.read_csv(f'{p.DATA_DIR}/csv/fasta_unalignedJuly2024/fasta_all.csv', index_col=0)
fasta_unaligned_ib1_df = pd.read_csv(f'{p.DATA_DIR}/csv/fasta_unalignedIBJuly2024/fasta_all.csv', index_col=0)
fasta_unaligned_ib2_df = pd.read_csv(f'{p.DATA_DIR}/csv/fasta_unalignedIB2July2024/fasta_all.csv', index_col=0)
fasta_unaligned_ib3_df = pd.read_csv(f'{p.DATA_DIR}/csv/fasta_unalignedIB3July2024/fasta_all.csv', index_col=0)
print('mean # unaligned-FASTA values per test_new_234 sequence - classic:', u.nondash_counts(fasta_unaligned_older_df).mean(),
     'IB method 1:', u.nondash_counts(fasta_unaligned_ib1_df).mean(), 
     'IB method 2:', u.nondash_counts(fasta_unaligned_ib2_df).mean(),
     'IB method 3:', u.nondash_counts(fasta_unaligned_ib3_df).mean())

sasa_older_df = pd.read_csv(f'{p.DATA_DIR}/csv/sasa_relative_unalignedJuly2024/sasa_all.csv', index_col=0)
sasa_ib1_df = pd.read_csv(f'{p.DATA_DIR}/csv/sasa_relative_unalignedIBJuly2024/sasa_all.csv', index_col=0)
sasa_ib2_df = pd.read_csv(f'{p.DATA_DIR}/csv/sasa_relative_unalignedIB2July2024/sasa_all.csv', index_col=0)
sasa_ib3_df = pd.read_csv(f'{p.DATA_DIR}/csv/sasa_relative_unalignedIB3July2024/sasa_all.csv', index_col=0)
print('mean # SASA values per test_new_234 sequence - classic:', sasa_older_df.count(axis=1).mean(),
     'IB method 1:', sasa_ib1_df.count(axis=1).mean(), 
     'IB method 2:', sasa_ib2_df.count(axis=1).mean(),
     'IB method 3:', sasa_ib2_df.count(axis=1).mean())

**Create L and H separate dataframes as well:**

In [None]:
sasa_df = pd.read_csv(f'{SASA_RELATIVE_UNALIGNED_DIR}/sasa_all.csv', index_col=0)
sasa_L_df = u.select_only_chain_sequences(sasa_df, 'L')
sasa_L_df.to_csv(f'{SASA_RELATIVE_UNALIGNED_DIR}/sasa_L.csv')
sasa_H_df = u.select_only_chain_sequences(sasa_df, 'H')
sasa_H_df.to_csv(f'{SASA_RELATIVE_UNALIGNED_DIR}/sasa_H.csv')
print('sasa_df.shape', sasa_df.shape, 'sasa_L_df.shape', sasa_L_df.shape, 'sasa_H_df.shape', sasa_H_df.shape)

## Sanity check

**Check if both `fasta` and `sasa` datasets contain exactly the same sets of keys:**

In [None]:
fasta_df = pd.read_csv(FASTA_UNALIGNED_CSV_PATH, index_col=0)
sasa_df  = pd.read_csv(f'{SASA_RELATIVE_UNALIGNED_DIR}/sasa_all.csv', index_col=0)
print('FASTA_UNALIGNED_CSV_PATH:', FASTA_UNALIGNED_CSV_PATH, 'SASA_RELATIVE_UNALIGNED_DIR:', SASA_RELATIVE_UNALIGNED_DIR)

In [None]:
fasta_abundant = fasta_df.index.difference(sasa_df.index)
sasa_abundant  = sasa_df.index.difference(fasta_df.index)
assert fasta_abundant.empty and sasa_abundant.empty # .empty is just a boolean property

**It appears they do :)**