In generated cleaned PDB files, there are gaps within chains.

Create FASTA and SASA files that contain gap character `-` where needed, \
so working with such data is as convenient as possible later on.

In [1]:
import collections as c
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
from Bio import SeqIO
from Bio.PDB import PDBParser
from Bio.PDB.PDBExceptions import PDBConstructionWarning
from tqdm.notebook import tqdm



import _file_paths as fp

MAXIMUM_CHAIN_LEN = 300
PROGRESS_CHECK_004_PATH = 'progress_check_004_data.p'

try:
    progress = pickle.load(open(PROGRESS_CHECK_004_PATH, 'rb'))
except FileNotFoundError:
    progress = {
        'processed': set(),
        'cons_sasa_alignment': dict(L=[0] * MAXIMUM_CHAIN_LEN, H=[0] * MAXIMUM_CHAIN_LEN),
        'cons_sasa_counts': dict(L=c.defaultdict(int), H=c.defaultdict(int))
    }

In [39]:
# progress['cons_sasa_counts']

In [6]:
SASA_EMPTY_VALUE = np.nan


def write_structure_property_files(dir_path: str, structure_code: str, l: list, h: list):
    l = [str(x) for x in l]
    h = [str(x) for x in h]
    with open(os.path.join(dir_path, 'light', structure_code + '.csv'),
              'w', encoding='utf-8') as light_file:
        light_file.write(' '.join(l) + os.linesep)
        
    with open(os.path.join(dir_path, 'heavy', structure_code + '.csv'),
              'w', encoding='utf-8') as heavy_file:
        heavy_file.write(' '.join(h) + os.linesep)
        
    with open(os.path.join(dir_path, 'merged', structure_code + '.csv'),
              'w', encoding='utf-8') as merged_file:
        merged_file.write(' '.join(l) + os.linesep)   
        merged_file.write(' '.join(h) + os.linesep)                    

        
def generate_sasa_heatmap(data: dict, output_file_path: str):
    plt.clf()
    sns.set(rc={'figure.figsize':(20, 2)})
    ax = sns.heatmap(data)
    plt.savefig(output_file_path)

    
def handle_file(file_name: str) -> dict:
    input_file_path = os.path.join(fp.PDB_CLEANED_DIR_PATH, file_name)
    
    structure_code = input_file_path[-8:-4]
    
    with open(input_file_path, 'r', encoding='utf-8') as input_file:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', PDBConstructionWarning) 
            
            # load PDB file
            pp = PDBParser()
            structure = pp.get_structure(structure_code, input_file_path)

            # prepare numbering data
            chain_positions = dict(L=list(), H=list())
            for chain in structure.get_chains():
                prev = 0
                for residue in chain.get_residues():
                    pos = residue.get_id()[1]
                    for p in range(prev+1, pos):
                        chain_positions[chain.id].append('-')
                    chain_positions[chain.id].append(pos)
                    prev = pos
            # write numbering data
            structure_code = input_file_path[-8:-4]
            write_structure_property_files(
                fp.POSITIONS_ALIGNED_DIR_PATH, 
                structure_code,
                chain_positions['L'],
                chain_positions['H'])

            # prepare fasta data
            fasta_file = os.path.join(
                fp.AA_RAW_DIR_PATH,
                structure_code + '.fasta')
            fasta_sequences = SeqIO.parse(
                open(fasta_file, 'r', encoding='utf-8'), 
                'fasta')
            # write fasta data
            fasta_dashes = dict(L='', H='')
            for seq in fasta_sequences:
                fasta_dashes[seq.id[-1]] = str(seq.seq).replace('X', '-')
            write_structure_property_files(
                fp.AA_ALIGNED_DIR_PATH, 
                structure_code,
                fasta_dashes['L'],
                fasta_dashes['H'])

            # prepare sasa data
            sasa_alignment = dict(L=[], H=[])
            sasa_file_path = os.path.join(
                fp.SASA_RAW_DIR_PATH, 
                structure_code + '.txt')
            with open(sasa_file_path, 'r', encoding='utf-8') as sasa_file:
                for chain_id, sasa_line in zip(['L', 'H'], sasa_file):
                    sasa_line_i = iter(sasa_line.split())
                    for pos in chain_positions[chain_id]:
                        if pos == '-':
                            sasa_alignment[chain_id].append(SASA_EMPTY_VALUE)
                        else:
                            sasa_alignment[chain_id].append(next(sasa_line_i))
            for chain_id, alignment in sasa_alignment.items():
                sasa_alignment[chain_id] = [float(x) for x in alignment] 
                
            # write sasa data
            write_structure_property_files(
                fp.SASA_ALIGNED_DIR_PATH,
                structure_code,
                sasa_alignment['L'],
                sasa_alignment['H'])
            
            # create SASA visualization
            # 1) append zeroes to the shorter chain
            max_len = max(len(alignment) for alignment in sasa_alignment.values())
            for chain_id, alignment in sasa_alignment.items():
                if max_len > len(alignment):
                    sasa_alignment[chain_id] += ([SASA_EMPTY_VALUE] * (max_len-len(alignment)))
            sasa_data = np.array([sasa_alignment['L'], sasa_alignment['H']])
            # 2) generate the heatmap
            heatmap_path = os.path.join(fp.SASA_VIZ_DIR_PATH, structure_code+'.png')
            generate_sasa_heatmap(sasa_data, heatmap_path)
            
            return {'sasa_alignment': sasa_alignment}

import collections as c           
consensual_sasa_alignment = progress['cons_sasa_alignment']
cons_sasa_counts = progress['cons_sasa_counts']
file_names = os.listdir(fp.PDB_CLEANED_DIR_PATH)

for file_name in tqdm(file_names, desc='Generating aligned data files'):
    if file_name.startswith('.'): continue
    if file_name in progress['processed']: continue
    if file_name.endswith('.pdb'): continue

    try:
        stats = handle_file(file_name)
        sasa_alignment = stats['sasa_alignment']
        for chain_id, alignment in sasa_alignment.items():
            for index, sasa in enumerate(alignment):
                if sasa == SASA_EMPTY_VALUE: continue
                consensual_sasa_alignment[chain_id][index] += sasa
                cons_sasa_counts[chain_id][index] += 1
        
        # save progress
        progress['cons_sasa_alignment'] = consensual_sasa_alignment
        progress['cons_sasa_counts'] = cons_sasa_counts
        progress['processed'].add(file_name)
        pickle.dump(progress, open(PROGRESS_CHECK_004_PATH, 'wb'))
    except ValueError as e:
        print(file_name, e, type(e))
    except StopIteration as e:
        print(type(e))
        
for chain_id, alignment in consensual_sasa_alignment.items():
    for index, sasa in enumerate(alignment):
        if cons_sasa_counts[chain_id][index] == 0: continue
        consensual_sasa_alignment[chain_id][index] /= cons_sasa_counts[chain_id][index]
                                 
csa = consensual_sasa_alignment
summary_heatmap_path = os.path.join(fp.SASA_VIZ_DIR_PATH, 'all.png')
generate_sasa_heatmap(np.array([csa['L'], csa['H']]), summary_heatmap_path)

Generating aligned data files:   0%|          | 0/10001 [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: '../../data/sequences/structure-chains/nxy..fasta'

In [2]:
# renumbering from RAW to AHO pdb

import os

from tqdm.notebook import tqdm

import _file_paths as fp
import _renumber

PROGRESS_CHECK_PATH = 'progress_check_004_renumber_aho.p'
progress = _renumber.get_progress(progress_check_path=PROGRESS_CHECK_PATH)    
file_names = os.listdir(fp.PDB_CLEANED_SCHEME_RAW_DIR_PATH)

for file_name in tqdm(file_names, desc='Cleaning raw PDB data...'):
    _renumber.renumber(input_dir_path=fp.PDB_CLEANED_SCHEME_RAW_DIR_PATH, 
             file_name=file_name,
             output_dir_path=fp.PDB_CLEANED_SCHEME_AHO_DIR_PATH,
             numbering_scheme='aho',
             progress=progress, 
             progress_check_path=PROGRESS_CHECK_PATH)

Cleaning raw PDB data...:   0%|          | 0/240 [00:00<?, ?it/s]

In [4]:
# unecessary
import os
import subprocess
from tqdm.notebook import tqdm

import _file_paths as fp

file_names = os.listdir(fp.AA_RAW_DIR_PATH)
for file_name in tqdm(file_names, desc='Cleaning raw PDB data...'):
    if not file_name.endswith('.fasta'):
        continue
    input_path = os.path.join(fp.AA_RAW_DIR_PATH, file_name)
    output_path = os.path.join('../../data/csv/anarci-from-fasta/', file_name[:-6]) # without extension
    cmd = ['anarci', '-i', input_path, 
           '--scheme', 'aho', 
           '-o', output_path, 
           '--csv']
    code = subprocess.call(cmd)
    if code != 0:
        print(file_name, code)

Cleaning raw PDB data...:   0%|          | 0/4251 [00:00<?, ?it/s]