# 4. Generate Relative SASA (RSA) data

Generate SASA sequences for chains contained in PDB files.

For this, we will use the `freesasa` tool (https://freesasa.github.io).

If you wish to start generating SASA sequences anew, first delete the status file configured as `FREESASA_STATUS_PATH` in the Setup section (e.g. `data/csv/freesasa_status.csv`), if it exists.

---

## Installing FreeSASA

In [3]:
import sys; sys.path.append('../..')
import bin.params as p

Although a Python `freesasa` module exists, it does not provide residue-depth `freesasa` values, which we need.

Therefore we will install the package and use the tool from `bash`.

**Note**: check https://freesasa.github.io first; a newer version of freesasa may be available, and the download link for the version used below may return a 404 error.

In [4]:
# Shell recipe for downloading and building FreeSASA 2.1.1 inside p.BIN_DIR.
# NOTE: this cell only prints the recipe (see the print call below); it does
# not execute it, so the commands have to be run manually in a shell.
command = f"""
echo 'Installing prerequisite: JSON-C ...'
brew install json-c

echo 'Downloading freesasa ...'
cd {p.BIN_DIR}
wget https://freesasa.github.io/freesasa-2.1.1.zip

echo 'Unzipping ...'
unzip -qq freesasa-2.1.1.zip

echo 'Installing ...'
cd freesasa-2.1.1
./configure 
make 
make install
"""
print(command)


echo 'Downloading freesasa ...'
cd ../../bin
wget https://freesasa.github.io/freesasa-2.1.1.zip

echo 'Unzipping ...'
unzip -qq freesasa-2.1.1.zip

echo 'Installing ...'
cd freesasa-2.1.1
./configure 
make 
make install



---

## Setup

In [24]:
import collections
import os
import pickle
import re
import subprocess
import sys; sys.path.append('../..')

import numpy as np
import pandas as pd
from Bio.Data.IUPACData import protein_letters_3to1
from tqdm.notebook import tqdm

import bin.params as p
import bin.utils as u

In [15]:
# Suffix selecting which dataset snapshot all paths below point at.
# 'July2024' -> July 2024 snapshot; '' -> the original ("old") dataset.
# A single switch keeps the four paths on the same snapshot, instead of
# toggling four commented-out alternatives independently.
DATASET_TAG = 'July2024'

# Input: unaligned FASTA sequences, one row per chain.
FASTA_UNALIGNED_CSV_PATH = f'{p.DATA_DIR}/csv/fasta_unaligned{DATASET_TAG}/fasta_all.csv'
# Bookkeeping: which structures freesasa has already been run on, and the outcome.
FREESASA_STATUS_PATH = f'{p.DATA_DIR}/csv/freesasa_status{DATASET_TAG}.csv'
# Output directory for the relative SASA tables.
SASA_RELATIVE_UNALIGNED_DIR = f'{p.DATA_DIR}/csv/sasa_relative_unaligned{DATASET_TAG}'
# Input: directory holding the PDB files that freesasa is run on.
CLEANED_PDB_DIR = f'{p.DATA_DIR}/pdb/incremental{DATASET_TAG}'

FASTA_UNALIGNED_CSV_PATH, FREESASA_STATUS_PATH, SASA_RELATIVE_UNALIGNED_DIR, CLEANED_PDB_DIR

('../../data/csv/fasta_unalignedJuly2024/fasta_all.csv',
 '../../data/csv/freesasa_statusJuly2024.csv',
 '../../data/csv/sasa_relative_unalignedJuly2024',
 '../../data/pdb/incrementalJuly2024')

In [11]:
# Resume from an earlier run when a status file exists; otherwise start empty.
if os.path.exists(FREESASA_STATUS_PATH):
    # Read structure codes as text. Codes such as '1e12' look like floats, and
    # when every code in the file does, pandas would parse the whole column as
    # numbers, so they would no longer match the file-name based codes used
    # to skip already-processed structures.
    freesasa_status_df = pd.read_csv(FREESASA_STATUS_PATH, index_col=0,
                                     dtype={'structure_code': str})
    print('loading freesasa status from file')
else:
    freesasa_status_df = pd.DataFrame(columns=['structure_code', 'status'])
    print('creating freesasa status anew')

creating freesasa status anew


In [33]:
# Create the output directory (including missing parents); a no-op when it
# already exists. Done in Python rather than through a shell so that paths
# containing spaces or shell metacharacters are handled safely.
os.makedirs(SASA_RELATIVE_UNALIGNED_DIR, exist_ok=True)

In [34]:
# Load SASA values stored by a previous run, if any, so new results can be appended.
if os.path.exists(f'{SASA_RELATIVE_UNALIGNED_DIR}/sasa_all.csv'):
    old_sasa_df = pd.read_csv(f'{SASA_RELATIVE_UNALIGNED_DIR}/sasa_all.csv', index_col=0)
    # Column labels come back from CSV as strings ('0', '1', ...). Convert them
    # to integers (as is done for the FASTA frame) so they line up with the
    # integer-labelled frame built from fresh results; otherwise concatenating
    # the two would produce two disjoint sets of columns.
    old_sasa_df.columns = [int(c) for c in old_sasa_df.columns]
    print('loading sasa from file')
else:
    print('no sasa loaded yet')
    old_sasa_df = None

no sasa loaded yet


## Generate SASA 

**Load the unaligned FASTA csv dataset:**

In [13]:
# Read the unaligned FASTA table (first CSV column is the row index) and turn
# its string column labels into integer positions.
fasta_unaligned_df = pd.read_csv(FASTA_UNALIGNED_CSV_PATH, index_col=0)
fasta_unaligned_df.columns = fasta_unaligned_df.columns.map(int)
fasta_unaligned_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,585,586,587,588,589,590,591,592,593,594
7DF1:H,V,Q,L,V,Q,S,G,A,E,V,...,-,-,-,-,-,-,-,-,-,-
7DF1:L,D,I,V,M,T,Q,S,P,D,S,...,-,-,-,-,-,-,-,-,-,-
7E9O:H,Q,V,Q,L,L,E,S,G,G,G,...,-,-,-,-,-,-,-,-,-,-
7E9O:L,D,I,V,M,T,Q,S,P,L,S,...,-,-,-,-,-,-,-,-,-,-
7FGJ:H,V,Q,L,Q,Q,S,G,A,A,L,...,-,-,-,-,-,-,-,-,-,-


In [28]:
def get_sasa(structure_code, output_lines):
    """Group per-residue values from `freesasa` RSA output by chain.

    Each line is split on runs of spaces; only lines whose first token is
    ``RES`` are used, every other line is ignored.

    Parameters
    ----------
    structure_code : str
        Structure identifier; it is upper-cased to build the chain keys.
    output_lines : iterable of str
        Lines of the `freesasa` output.

    Returns
    -------
    tuple
        ``(sasa_dict, matches)`` where ``sasa_dict`` is a
        ``collections.defaultdict(list)`` mapping ``"<CODE>:<chain>"`` to the
        list of token-7 strings in line order, and ``matches`` is a list
        holding one ``True`` per ``RES`` line.
    """
    values_by_chain = collections.defaultdict(list)
    res_flags = []
    for raw_line in output_lines:
        fields = re.split(' +', raw_line)
        if fields[0] == 'RES':
            # fields[2] is the chain label; fields[7] is the value kept per residue
            # (presumably the "Total-Side REL" column of the RSA format -- TODO confirm).
            chain_key = structure_code.upper() + ':' + fields[2]
            values_by_chain[chain_key].append(fields[7])
            res_flags.append(True)
    return values_by_chain, res_flags

In [30]:
# df_dict:         chain id -> list of per-residue values, accumulated over all files.
# new_status_data: column name -> list of values, one entry per file processed in this run.
df_dict, new_status_data = dict(), collections.defaultdict(list)


def _record_status(structure_code, status):
    """Queue one (structure_code, status) row for the status table.

    Both columns are appended to together so they always stay the same length.
    """
    new_status_data['structure_code'].append(structure_code)
    new_status_data['status'].append(status)


def _structure_code_of(filename):
    """Return the structure code of a PDB file name: everything before the first dot."""
    return filename.split('.')[0]


# Structures already listed in the status table are skipped, so the cell can be re-run
# to resume. The same code derivation is used for skipping and for recording.
fidf = freesasa_status_df.copy(); fidf.index = fidf['structure_code']
processed_codes = set(fidf.index)
filenames = [fn for fn in os.listdir(CLEANED_PDB_DIR)
             if fn.endswith('.pdb') and _structure_code_of(fn) not in processed_codes]

for filename in tqdm(filenames, desc='Computing SASA for PDB files..'):
    structure_code = _structure_code_of(filename)
    freesasa_command = ['freesasa', os.path.join(CLEANED_PDB_DIR, filename),
                        '--depth', 'residue',
                        '--format', 'rsa']
    try:
        output_bytes = subprocess.check_output(freesasa_command)
    except (OSError, subprocess.CalledProcessError):
        # OSError: the executable could not be started;
        # CalledProcessError: freesasa exited with a non-zero status.
        # Either way, record the failure and carry on with the next file.
        print('problem', filename, ' '.join(freesasa_command))
        _record_status(structure_code, 'freesasa-error')
        continue

    # sys.stdout.encoding can be None (e.g. redirected output); fall back to UTF-8.
    # splitlines() handles any line-ending convention.
    output_lines = output_bytes.decode(sys.stdout.encoding or 'utf-8').splitlines()

    sasa_dict, matches = get_sasa(structure_code, output_lines)

    # Nothing parsed at all: the output contained no residue records.
    if not sasa_dict and not matches:
        print(f'Structure {structure_code} is missing some chain in unaligned_fasta dataset')
        _record_status(structure_code, 'chain-missing')
        continue

    # report any mismatches between FASTA sequence and FREESASA-generated RSA aminoacid records
    # NOTE(review): get_sasa currently only ever appends True, so this branch cannot trigger.
    if matches.count(True) != len(matches):
        print(f'{len(matches) - matches.count(True)} mismatches at {structure_code}')
        _record_status(structure_code, 'fasta-sasa-mismatch')
        continue

    # Merge this structure's chains into the accumulated result in place
    # (avoids rebuilding the whole dict on every iteration).
    df_dict.update(sasa_dict)

    # save progress
    _record_status(structure_code, 'ok')

Computing SASA for PDB files..:   0%|          | 0/888 [00:00<?, ?it/s]

**There are no mismatches between FASTA and FREESASA-generated aminoacid sequences, nor any other `freesasa` related problems.**

**View summary and store the whole status dataframe to `.csv` file:**

In [31]:
# Append this run's statuses to the persisted table, drop exact duplicate rows,
# renumber the rows from zero, save, and show how many structures ended in each status.
new_freesasa_status_df = pd.DataFrame(new_status_data)
merged_status_df = pd.concat([freesasa_status_df, new_freesasa_status_df])
freesasa_status_df = merged_status_df.drop_duplicates()
freesasa_status_df.index = np.arange(len(freesasa_status_df))
freesasa_status_df.to_csv(FREESASA_STATUS_PATH)
freesasa_status_df['status'].value_counts()

ok    888
Name: status, dtype: int64

**View and store `sasa` dataframe:**

In [35]:
# Assemble this run's per-chain values into a frame: one row per chain id, sorted by id.
new_sasa_df = pd.DataFrame(list(df_dict.values()), index=list(df_dict.keys())).sort_index()
if old_sasa_df is None:
    sasa_df = new_sasa_df
else:
    sasa_df = pd.concat([old_sasa_df, new_sasa_df])
    # A chain recomputed in this run (e.g. after the status file was deleted while
    # sasa_all.csv was kept) would otherwise be stored twice; keep the newest row.
    sasa_df = sasa_df[~sasa_df.index.duplicated(keep='last')]
sasa_df.to_csv(f'{SASA_RELATIVE_UNALIGNED_DIR}/sasa_all.csv')
sasa_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,585,586,587,588,589,590,591,592,593,594
7DF1:H,62.1,74.7,0.0,55.9,0.0,29.8,,69.3,30.9,82.2,...,,,,,,,,,,
7DF1:L,74.7,3.9,78.1,0.0,75.8,0.0,78.1,41.7,87.4,73.2,...,,,,,,,,,,
7E9O:H,93.4,38.7,58.4,1.1,72.5,19.0,75.6,,,,...,,,,,,,,,,
7E9O:L,88.5,16.0,82.5,0.1,67.6,0.5,46.2,55.8,73.9,45.6,...,,,,,,,,,,
7FGJ:H,28.6,74.3,0.2,53.6,0.0,33.5,,25.7,25.8,14.4,...,,,,,,,,,,


In [36]:
# Show (rows, columns) of the SASA table next to the FASTA table for a quick visual comparison.
sasa_df.shape, fasta_unaligned_df.shape

((1776, 595), (1776, 595))

**Create L and H separate dataframes as well:**

In [38]:
# Reload the stored table and write one CSV per chain type (light first, then heavy).
sasa_df = pd.read_csv(f'{SASA_RELATIVE_UNALIGNED_DIR}/sasa_all.csv', index_col=0)
chain_csv_paths = {
    'L': f'{SASA_RELATIVE_UNALIGNED_DIR}/sasa_L.csv',
    'H': f'{SASA_RELATIVE_UNALIGNED_DIR}/sasa_H.csv',
}
for chain_type, chain_csv_path in chain_csv_paths.items():
    chain_sasa_df = u.select_only_chain_sequences(sasa_df, chain_type)
    chain_sasa_df.to_csv(chain_csv_path)

## Sanity check

**Check if both `fasta` and `sasa` datasets contain exactly the same sets of keys:**

In [39]:
# Reload both tables from disk so the check below compares what was actually stored.
# Only the row indices (chain ids) are compared afterwards.
fasta_df = pd.read_csv(FASTA_UNALIGNED_CSV_PATH, index_col=0)
sasa_df  = pd.read_csv(f'{SASA_RELATIVE_UNALIGNED_DIR}/sasa_all.csv', index_col=0)

In [40]:
# Chain ids present in the FASTA table but absent from the SASA table (expected to be empty).
fasta_abundant = fasta_df.index.difference(sasa_df.index)
fasta_abundant

Index([], dtype='object')

In [41]:
# Chain ids present in the SASA table but absent from the FASTA table (expected to be empty).
sasa_abundant  = sasa_df.index.difference(fasta_df.index)
sasa_abundant

Index([], dtype='object')

**It appears they do :)**