# 3. Generate FASTA data

Convert cleaned PDB files into FASTA amino-acid sequence files. \
Each generated FASTA file contains the light-chain sequence
and the heavy-chain sequence of one structure.

Then generate `.csv` files from the FASTA sequences so that the sequences can be conveniently loaded into DataFrames.

---

## Setup

In [64]:
import os
import subprocess
import sys; sys.path.append('../..')

import pandas as pd
import warnings
from Bio import SeqIO
from Bio.PDB.PDBExceptions import PDBConstructionWarning
from tqdm.notebook import tqdm

import bin.params as p
import bin.utils as utils
import _file_paths as fp                

In [69]:
def pdb2fasta(input_file_path, output_file_path):
    """Write the sequences found in a PDB file to a FASTA file.

    Delegates to Biopython's ``SeqIO.convert``, reading the input with the
    ``'pdb-atom'`` format and writing the output with the ``'fasta'`` format.

    Args:
        input_file_path: Path of the PDB file to read.
        output_file_path: Path of the FASTA file to write.
    """
    source_format, target_format = 'pdb-atom', 'fasta'
    SeqIO.convert(input_file_path, source_format, output_file_path, target_format)

In [70]:
# Input/output locations used by the rest of this notebook:
#   CLEANED_PDB_DIR          - directory the `.pdb` input files are read from
#   CONVERTED_FASTA_DIR      - directory the per-structure `.fasta` files are written to
#   FASTA_UNALIGNED_CSV_DIR  - directory the exported `.csv` tables are written to
#
# Several dataset variants are listed one after another. Each later group
# reassigns the same three names, so only the LAST group ("IB3 july2024")
# is in effect once the cell has run; the earlier groups are kept as a
# record of previous runs.

# CLEANED_PDB_DIR = f'{p.DATA_DIR}/pdb/incremental'
CLEANED_PDB_DIR = f'{p.DATA_DIR}/pdb/incrementalJuly2024' # July2024
# CONVERTED_FASTA_DIR = f'{p.DATA_DIR}/sequences/incremental-converted' # old
CONVERTED_FASTA_DIR = f'{p.DATA_DIR}/sequences/incremental-convertedJuly2024' #july2024
# FASTA_UNALIGNED_CSV_DIR = f'{p.DATA_DIR}/csv/fasta_unaligned' # old
FASTA_UNALIGNED_CSV_DIR = f'{p.DATA_DIR}/csv/fasta_unalignedJuly2024' #july2024

# IB july2024 (overrides the group above)
CLEANED_PDB_DIR = f'{p.DATA_DIR}/pdb/immunobuilderJuly2024_{p.FINAL_NUMBERING_SCHEME}'
CONVERTED_FASTA_DIR = f'{p.DATA_DIR}/sequences/incremental-convertedIB2July2024'
FASTA_UNALIGNED_CSV_DIR = f'{p.DATA_DIR}/csv/fasta_unalignedIB2July2024'

# IB3 july2024 (overrides the group above; these are the effective values)
CLEANED_PDB_DIR = f'{p.DATA_DIR}/pdb/immunobuilder3July2024_raw'
CONVERTED_FASTA_DIR = f'{p.DATA_DIR}/sequences/incremental-convertedIB3July2024'
FASTA_UNALIGNED_CSV_DIR = f'{p.DATA_DIR}/csv/fasta_unalignedIB3July2024'

# FIXBUG
# NOTE(review): the FIXBUG marker above carries no explanation - confirm what
# still needs fixing here, or remove it.

# Echo the effective paths as the cell output.
(CLEANED_PDB_DIR, CONVERTED_FASTA_DIR, FASTA_UNALIGNED_CSV_DIR)

('../../data/pdb/immunobuilder3July2024_raw',
 '../../data/sequences/incremental-convertedIB3July2024',
 '../../data/csv/fasta_unalignedIB3July2024')

In [71]:
# Create both output directories (including any missing parents) up front.
# os.makedirs(..., exist_ok=True) is the in-process equivalent of `mkdir -p`:
# it needs no shell, works for paths containing spaces, leaves existing
# directories untouched, and raises if a directory cannot be created instead
# of only printing a shell error.
for output_dir in (CONVERTED_FASTA_DIR, FASTA_UNALIGNED_CSV_DIR):
    os.makedirs(output_dir, exist_ok=True)

---

## PDB -> FASTA

**Export one short FASTA file per PDB structure:**

In [72]:
# Names of input files whose conversion raised an error (filled in by the
# conversion loop that follows).
error_file_names = []

# Every `.pdb` entry of the input directory, in directory-listing order.
filenames = [
    entry
    for entry in os.listdir(CLEANED_PDB_DIR)
    if entry.endswith('.pdb')
]
len(filenames)

879

In [73]:
# Convert every PDB file into one FASTA file named after its structure code.
# A failing file must not abort the whole batch, so errors are reported and
# the offending file name is collected for the summary printed at the end.
for filename in tqdm(filenames, desc='Converting PDB to FASTA...'):
    input_file_path = os.path.join(CLEANED_PDB_DIR, filename)
    # The structure code is the 4 characters directly before the '.pdb' suffix.
    structure_code = filename[-8:-4]
    output_file_path = os.path.join(CONVERTED_FASTA_DIR, structure_code + '.fasta')
    try:
        with warnings.catch_warnings():
            # Silence only PDBConstructionWarning, and only for this conversion.
            warnings.simplefilter('ignore', PDBConstructionWarning)
            pdb2fasta(input_file_path, output_file_path)
    except Exception as e:
        # Record the name of the failing file (previously the list was appended
        # to itself and an undefined name was printed, raising NameError).
        error_file_names.append(filename)
        print(filename, 'error', e, type(e))

print(f'{len(error_file_names)} errors out of {len(filenames)} files')

Converting PDB to FASTA...:   0%|          | 0/879 [00:00<?, ?it/s]

0 errors out of 879 files


**Check how many `.fasta` files have been created:** (the number should match the total shown in the progress bar above)

In [74]:
# Count the generated FASTA files directly in Python rather than piping
# `ls -l` through `tail` and `wc`: this counts only `.fasta` entries (the same
# filter the next step uses), needs no shell, and works for paths with spaces.
print(CONVERTED_FASTA_DIR)
fasta_file_count = sum(
    1 for entry in os.listdir(CONVERTED_FASTA_DIR) if entry.endswith('.fasta')
)
print(fasta_file_count)

../../data/sequences/incremental-convertedIB3July2024
     879


## FASTA -> DataFrame

**Check for badly generated FASTA files and collect the sequences of all valid ones in a Python dictionary:**

In [75]:
# Inputs and containers for the FASTA -> DataFrame step.
filenames = [
    entry
    for entry in os.listdir(CONVERTED_FASTA_DIR)
    if entry.endswith('.fasta')
]
df_dict = {}    # sequence id -> list of single residue letters
problems = []   # names of FASTA files that fail the two-sequence check

In [76]:
# Read every generated FASTA file, verify that it holds exactly two records,
# and store each record in `df_dict` as a list of single residue letters.
# Files failing the check are reported, collected in `problems`, and skipped.
for filename in tqdm(filenames, desc='Parsing FASTA files...'):
    input_file_path = os.path.join(CONVERTED_FASTA_DIR, filename)
    # Parse inside a `with` block so the file handle is always closed.
    with open(input_file_path, 'r', encoding='utf-8') as fasta_file:
        fasta_sequences = list(SeqIO.parse(fasta_file, 'fasta'))
    # The first 4 characters of the file name are the structure code.
    pdb_code = filename[:4]

    # check again - maybe there was problem with parsing the sequence but the file exists
    if len(fasta_sequences) != 2:
        print(
            f'PROBLEM: File {filename} only has {len(fasta_sequences)} sequences, ' +
            f' should have 2 (renumber script failed somehow)')
        problems.append(filename)
        # problematic structures will not be included in the further analysis
        # throw it away
        continue

    for seq in fasta_sequences:
        # Replace the first 4 characters of the record id with the code taken
        # from the file name; the result is expected to be 6 characters long
        # (e.g. '7DF1:H').
        seq_id = pdb_code + seq.id[4:]
        assert len(seq_id) == 6, f'unexpected sequence id {seq_id!r} in {filename}'
        assert not seq.id.startswith('????'), (
            f'record id {seq.id!r} in {filename} has no structure code'
        )  # or seq.id[:4] == pdb_code
        df_dict[seq_id] = list(str(seq.seq))  # ['a', 'b', 'c' ... ]

Converting PDB to FASTA...:   0%|          | 0/879 [00:00<?, ?it/s]

In [78]:
# Mean number of residues over all chains stored in `df_dict`.
chain_lengths = pd.Series([len(residues) for residues in df_dict.values()])
print('average len of a chain: ', chain_lengths.mean())

average len of a chain:  128.31968145620021


**Apparently there were no problems.**

**Create a DataFrame from the dictionary and export it to CSV:**

In [79]:
# One row per sequence id, one column per residue position. Shorter chains
# are padded on the right with '-' so all rows have the same width, and the
# rows are ordered by sequence id.
fasta_df = pd.DataFrame(list(df_dict.values()), index=list(df_dict))
fasta_df = fasta_df.fillna('-').sort_index()
print('fasta_df.shape', fasta_df.shape)

# Export the merged table, then preview its first two rows as the cell output.
fasta_df.to_csv(f'{FASTA_UNALIGNED_CSV_DIR}/fasta_all.csv')
fasta_df.head(n=2)

fasta_df.shape (1758, 144)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,134,135,136,137,138,139,140,141,142,143
7DF1:H,V,Q,L,V,Q,S,G,A,X,E,...,-,-,-,-,-,-,-,-,-,-
7DF1:L,D,I,V,M,T,Q,S,P,D,S,...,-,-,-,-,-,-,-,-,-,-


**Split the merged DataFrame into two smaller ones, one containing only the L rows and the other \
containing only the H rows:**

In [80]:
# Extract the 'L' rows of the merged table and export them to their own CSV.
# NOTE(review): the filtering is done by the project helper
# `utils.select_only_chain_sequences`, which is not visible here - presumably
# it keeps the rows whose index ends in ':L'; confirm against the helper.
fasta_L_df = utils.select_only_chain_sequences(fasta_df, 'L')
print('fasta_L_df.shape', fasta_L_df.shape)
fasta_L_df.to_csv(f'{FASTA_UNALIGNED_CSV_DIR}/fasta_L.csv')
# Preview the first two rows as the cell output.
fasta_L_df.head(n=2)

fasta_L_df.shape (879, 144)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,134,135,136,137,138,139,140,141,142,143
7DF1:L,D,I,V,M,T,Q,S,P,D,S,...,-,-,-,-,-,-,-,-,-,-
7E9O:L,D,I,V,M,T,Q,S,P,L,S,...,-,-,-,-,-,-,-,-,-,-


In [81]:
# Extract the 'H' rows of the merged table and export them to their own CSV.
# NOTE(review): the filtering is done by the project helper
# `utils.select_only_chain_sequences`, which is not visible here - presumably
# it keeps the rows whose index ends in ':H'; confirm against the helper.
fasta_H_df = utils.select_only_chain_sequences(fasta_df, 'H')
print('fasta_H_df.shape', fasta_H_df.shape)
fasta_H_df.to_csv(f'{FASTA_UNALIGNED_CSV_DIR}/fasta_H.csv')
# Preview the first two rows as the cell output.
fasta_H_df.head(n=2)

fasta_H_df.shape (879, 144)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,134,135,136,137,138,139,140,141,142,143
7DF1:H,V,Q,L,V,Q,S,G,A,X,E,...,-,-,-,-,-,-,-,-,-,-
7E9O:H,Q,V,Q,L,L,E,S,G,G,X,...,S,-,-,-,-,-,-,-,-,-
