## PDB format description

## Generate FASTA data

Convert cleaned PDB files into FASTA amino-acid sequence files. \
Each generated FASTA file contains the light-chain sequence
and the heavy-chain sequence.

In [2]:
# generate fasta files

import os
import subprocess
import sys

import pandas as pd
import warnings
from Bio import SeqIO
from Bio.PDB.PDBExceptions import PDBConstructionWarning
from tqdm.notebook import tqdm

import _file_paths as fp                
        
def pdb2fasta(input_file_path, output_file_path):
    """
    Convert one PDB file into a FASTA file via Biopython's SeqIO.convert.

    The input is read with the 'pdb-atom' format and written out as 'fasta'.
    The record count returned by SeqIO.convert is discarded, so this
    function returns None.

    :param input_file_path: path of the PDB file to read
    :param output_file_path: path of the FASTA file to write
    """
    source_format, target_format = 'pdb-atom', 'fasta'
    SeqIO.convert(input_file_path, source_format, output_file_path, target_format)
    

# Marked as obsolete: the cleanup script is meant to solve this now.
# NOTE: the function is nevertheless still called from the conversion loop below.
def fix_pdb_file(fn):
    """
    Normalize the residue-number field of every ATOM record in a PDB file, in place.

    Some PDBs have float values in the within-chain-position column;
    SeqIO.convert and possibly other functions have problems with that.
    This function rewrites such values in the usual integer format, while
    solving related problems as well (incorrectly spaced files etc.).

    For each line starting with 'ATOM', characters 22..25 (0-based) are
    stripped, cut at the first '.', and right-justified to width 4. If the
    character at index 26 is '0', it is dropped as well. All other lines are
    copied verbatim. ATOM lines too short to contain index 26 are left
    unchanged instead of raising IndexError.

    :param fn: path of the PDB file; the file is overwritten with the result
    """
    fixed_lines = []
    with open(fn, 'r') as f:
        for line in f:
            # A truncated ATOM record has no character at index 26 to inspect,
            # so it cannot be repaired here; pass it through untouched.
            if line.startswith('ATOM') and len(line) > 26:
                # adhering exact PDB format definition here
                resseq = line[22:26].strip().split('.')[0]
                # presumably a '0' at index 26 is the fractional digit spilled
                # over from a value such as '123.0' - skip it in that case
                tail_start = 27 if line[26] == '0' else 26
                line = line[:22] + resseq.rjust(4) + line[tail_start:]
            fixed_lines.append(line)

    with open(fn, 'w') as f:
        f.write(''.join(fixed_lines))

Export one short FASTA file per PDB structure

In [3]:
cleaned_dir_path = fp.PDB_CLEANED_SCHEME_RAW_DIR_PATH

# names of the PDB files whose conversion raised an exception
error_file_names = []
file_names = os.listdir(cleaned_dir_path)
for file_name in tqdm(file_names, desc='Converting PDB to FASTA...'):
    # some hidden files may occur in the directory. skip them
    if not file_name.endswith('pdb'):
        continue
    input_file_path = os.path.join(cleaned_dir_path, file_name)

    # the four characters right before the 4-character extension
    # are used as the output file stem
    structure_code = file_name[-8:-4]
    output_file_path = os.path.join(fp.AA_RAW_DIR_PATH,
                                    structure_code + '.fasta')

    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', PDBConstructionWarning)
            # NOTE(review): fix_pdb_file is described as no longer needed
            # (the cleanup script should cover it), yet it is still invoked
            # here and rewrites the input file in place - confirm whether
            # this call can be dropped.
            fix_pdb_file(input_file_path)
            pdb2fasta(input_file_path, output_file_path)
    except Exception as e:
        # record the failing file itself (not the list) so the failures
        # can be inspected after the loop
        error_file_names.append(file_name)
        print(file_name, 'error', e, type(e))


print(f'{len(error_file_names)} errors out of {len(file_names)} files')

Converting PDB to FASTA...:   0%|          | 0/4247 [00:00<?, ?it/s]

0 errors out of 4247 files


Merge all FASTA files into one big CSV that can be loaded as a pandas DataFrame

In [5]:
# merge fastas into fasta_merged.csv
import os, pandas as pd, _file_paths as fp
from Bio import SeqIO
from tqdm.notebook import tqdm

file_names = os.listdir(fp.AA_RAW_DIR_PATH)
# maps sequence id -> list of single-character residues
df_dict = dict()

for file_name in tqdm(file_names, desc='Merging FASTA files...'):
    # some hidden files may occur in the directory. skip them
    if not file_name.endswith('fasta'):
        continue

    # load fasta sequences into the dictionary; the context manager makes
    # sure every file handle is closed once its records have been read
    input_file_path = os.path.join(fp.AA_RAW_DIR_PATH, file_name)
    with open(input_file_path, 'r', encoding='utf-8') as fasta_file:
        for seq in SeqIO.parse(fasta_file, 'fasta'):
            df_dict[seq.id] = list(str(seq.seq))  # ['a', 'b', 'c' ... ]

# create dataframe from the dictionary (one row per sequence id, one column
# per position; shorter sequences are padded with missing values),
# sort it by id and export it to CSV
fasta_df = pd.DataFrame(list(df_dict.values()),
                        index=list(df_dict.keys())).sort_index()
fasta_df.to_csv(fp.AA_RAW_MERGED_FILE_PATH)
fasta_df.head()

Converting PDB to FASTA...:   0%|          | 0/4252 [00:00<?, ?it/s]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,513,514,515,516,517,518,519,520,521,522
12E8:H,E,V,Q,L,Q,Q,S,G,A,E,...,,,,,,,,,,
12E8:L,D,I,V,M,T,Q,S,Q,K,F,...,,,,,,,,,,
15C8:H,E,V,Q,L,Q,Q,S,G,A,E,...,,,,,,,,,,
15C8:L,D,I,V,L,T,Q,S,P,A,I,...,,,,,,,,,,
1A0Q:H,V,Q,L,Q,E,S,D,A,E,L,...,,,,,,,,,,


In [6]:
# (rows, columns): one row per sequence id, one column per sequence position
fasta_df.shape

(8492, 523)