In [2]:
import os
import pandas as pd
from Bio import PDB

def extract_residue_info(pdb_file, chain_id='A'):
    """
    Extract amino-acid residue numbers and names for one chain of a PDB file.

    Only the first model of the structure is read. Multi-model files (for
    example NMR ensembles) typically repeat the same chain in every model, so
    walking all models would return each residue once per model and produce
    duplicated entries.

    Parameters:
    - pdb_file (str): Path to the PDB file.
    - chain_id (str): Chain ID to filter residues.

    Returns:
    - List of dictionaries, one per residue accepted by PDB.is_aa, with keys
      'residue_id' (residue sequence number) and 'residue_name'. The list is
      empty when the structure has no models or no chain matches chain_id.
    """
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure('structure', pdb_file)

    # Take the first model only; see the docstring for why later models are skipped.
    first_model = next(iter(structure), None)
    if first_model is None:
        return []

    residues = []
    for chain in first_model:
        if chain.get_id() != chain_id:
            continue
        for residue in chain:
            # is_aa filters out waters, ligands and other non-amino-acid entries.
            if PDB.is_aa(residue):
                residues.append({
                    # Index 1 of the residue id tuple is the sequence number;
                    # the insertion code (index 2) is not kept.
                    'residue_id': residue.get_id()[1],
                    'residue_name': residue.get_resname()
                })
    return residues

def align_sequences(ref_seq, pdb_sequences):
    """
    Line up each PDB sequence against the reference by matching residue IDs.

    Parameters:
    - ref_seq (list of dicts): Reference residues; each dict has the keys
      'residue_id' and 'residue_name'.
    - pdb_sequences (dict): Maps a sequence title to a list of residue dicts
      with the same two keys.

    Returns:
    - Pandas DataFrame with one row per reference residue. Columns are
      'residue_id', 'ref_seq', then one column per title holding the residue
      name found at that ID, or '-' when that sequence has no such ID.
    """
    # Start the table from the reference: IDs first, then the residue names.
    table = pd.DataFrame(
        [entry['residue_id'] for entry in ref_seq],
        columns=['residue_id'],
    ).assign(ref_seq=[entry['residue_name'] for entry in ref_seq])

    for title, residues in pdb_sequences.items():
        # When an ID occurs more than once, the last occurrence wins.
        name_by_id = dict(
            (entry['residue_id'], entry['residue_name']) for entry in residues
        )
        # Residues whose ID is absent from the reference are not reported.
        table[title] = table['residue_id'].map(
            lambda residue_id: name_by_id.get(residue_id, '-')
        )

    return table

def compare_seq(ref_seq, pdb_file_paths, protein_id, chain_id='A'):
    """
    Compares and aligns sequences from reference and PDB files, and writes results to CSV.

    The CSV is written to '<protein_id>_outputs/<protein_id>_aligned_sequences1.csv'
    and the directory is created if it does not exist. Nothing is returned.

    Parameters:
    - ref_seq (str): Path to the reference PDB file.
    - pdb_file_paths (list of str): List of PDB file paths to compare.
    - protein_id (str): Identifier for the output directory and file.
    - chain_id (str): Chain to read from the reference and from every PDB
      file. Defaults to 'A'.
    """
    # Create the output directory if it doesn't exist
    output_dir = f'{protein_id}_outputs'
    os.makedirs(output_dir, exist_ok=True)

    # Extract residue information from the reference sequence
    ref_residues = extract_residue_info(ref_seq, chain_id=chain_id)

    # Extract residue information from each PDB file
    pdb_sequences = {}
    for pdb_file in pdb_file_paths:
        # Use the file name as the column title. Only a trailing '.pdb' is
        # removed, so a '.pdb' elsewhere in the name stays intact.
        pdb_title = os.path.basename(pdb_file)
        if pdb_title.endswith('.pdb'):
            pdb_title = pdb_title[:-len('.pdb')]
        # Files that share a base name map to the same title; the later one wins.
        pdb_sequences[pdb_title] = extract_residue_info(pdb_file, chain_id=chain_id)

    # Align the sequences
    df = align_sequences(ref_residues, pdb_sequences)

    # Output the DataFrame to a CSV file
    output_file = os.path.join(output_dir, f'{protein_id}_aligned_sequences1.csv')
    df.to_csv(output_file, index=False)
    print(f"Aligned sequences have been written to {output_file}")


In [3]:
# Reference structure that every other file is compared against.
ref_seq = './ASPG_pdb/AF-G3V1Y8-F1-model_v4.pdb'

# Structures to line up against the reference, one path per line.
pdb_file_paths = [
    './ASPG_pdb/pdb1apy.pdb',
    './ASPG_pdb/pdb1apz.pdb',
    './ASPG_pdb/pdb1ayy.pdb',
    './ASPG_pdb/pdb1p4k.pdb',
    './ASPG_pdb/pdb1p4v.pdb',
    './ASPG_pdb/pdb2gac.pdb',
    './ASPG_pdb/pdb2gaw.pdb',
    './ASPG_pdb/pdb2gl9.pdb',
    './ASPG_pdb/pdb3ljq.pdb',
    './ASPG_pdb/pdb4r4y.pdb',
    './ASPG_pdb/pdb5v2i.pdb',
    './ASPG_pdb/pdb9gaa.pdb',
    './ASPG_pdb/pdb9gac.pdb',
    './ASPG_pdb/pdb9gaf.pdb',
]

# Writes the aligned table under the 'ASPG_outputs' directory.
compare_seq(ref_seq, pdb_file_paths, 'ASPG')

Aligned sequences have been written to ASPG_outputs/ASPG_aligned_sequences1.csv
