In [10]:
import os
import pandas as pd
from Bio import PDB

def extract_residue_info(pdb_file):
    """Collect number and name of every amino-acid residue in a PDB file.

    The file is parsed with ``PDB.PDBParser`` (created with ``QUIET=True``)
    and every model, chain and residue of the resulting structure is
    visited in iteration order. Residues for which ``PDB.is_aa`` is false
    are skipped.

    Args:
        pdb_file: Path (or handle) passed straight to
            ``PDBParser.get_structure``.

    Returns:
        list[dict]: One dict per accepted residue, in traversal order, with
        keys ``'residue_id'`` (element ``[1]`` of ``residue.get_id()``, i.e.
        the residue number) and ``'residue_name'`` (``residue.get_resname()``).
        Entries from different models/chains are concatenated into the same
        list, so residue numbers are not guaranteed to be unique.
    """
    structure = PDB.PDBParser(QUIET=True).get_structure('structure', pdb_file)
    return [
        {
            'residue_id': res.get_id()[1],      # only the number part of the id tuple is kept
            'residue_name': res.get_resname(),
        }
        for model in structure
        for chain in model
        for res in chain
        if PDB.is_aa(res)
    ]

def align_sequences(ref_seq, pdb_sequences):
    """Lay out several residue lists side by side, keyed by reference residue number.

    This is a positional lookup, not a sequence-alignment algorithm: for each
    residue number present in the reference, the residue name carrying the
    same number in every other sequence is looked up.

    Args:
        ref_seq: List of dicts with ``'residue_id'`` and ``'residue_name'``
            keys describing the reference; it defines the rows.
        pdb_sequences: Mapping of column title -> list of dicts with the same
            two keys.

    Returns:
        pandas.DataFrame: Columns ``'residue_id'`` and ``'ref_seq'`` followed
        by one column per entry of ``pdb_sequences`` (in mapping order). A
        cell holds ``'-'`` when that sequence has no residue with the row's
        number. Residues whose number does not occur in the reference are
        not represented.
    """
    positions = []
    ref_names = []
    for entry in ref_seq:
        positions.append(entry['residue_id'])
        ref_names.append(entry['residue_name'])

    aligned = pd.DataFrame(positions, columns=['residue_id'])
    aligned['ref_seq'] = ref_names

    for title, residues in pdb_sequences.items():
        # Later entries win when a residue number occurs more than once.
        name_by_number = {}
        for entry in residues:
            name_by_number[entry['residue_id']] = entry['residue_name']

        aligned[title] = [
            name_by_number.get(number, '-') for number in aligned['residue_id']
        ]

    return aligned

def compare_seq(ref_seq, pdb_file_paths, protein_id):
    """Compare residues of several PDB files against a reference and save a CSV.

    Residues are read from the reference file and from every file in
    ``pdb_file_paths`` via ``extract_residue_info``, tabulated with
    ``align_sequences`` and written (without the index) to
    ``<protein_id>_outputs/<protein_id>_aligned_sequences.csv``. The output
    directory is created if missing, and a confirmation line is printed.

    Args:
        ref_seq: Path to the reference PDB file (despite the name, this is a
            file path, not a sequence).
        pdb_file_paths: Iterable of PDB file paths to compare with the
            reference. Each file's base name becomes its column title, so
            two files sharing a base name end up in a single column (the
            later one wins).
        protein_id: Label used to build the output directory and file names.

    Returns:
        None
    """
    output_dir = f'{protein_id}_outputs'
    os.makedirs(output_dir, exist_ok=True)

    # Residues of the reference structure define the rows of the table.
    ref_residues = extract_residue_info(ref_seq)

    # os.path.basename honours the platform's separators (and accepts
    # path-like objects), unlike splitting the string on '/'.
    pdb_sequences = {
        os.path.basename(pdb_file): extract_residue_info(pdb_file)
        for pdb_file in pdb_file_paths
    }

    df = align_sequences(ref_residues, pdb_sequences)

    output_file = os.path.join(output_dir, f'{protein_id}_aligned_sequences.csv')
    df.to_csv(output_file, index=False)
    print(f"Aligned sequences have been written to {output_file}")



In [13]:
# Reference structure file (presumably an AlphaFold model, judging by the
# "AF-...-model_v4" filename — confirm).
ref_seq = './HYAL1_pdb/AF-Q12794-F1-model_v4.pdb'
# Structures to compare against the reference; each file becomes one CSV column.
pdb_file_paths = ['./HYAL1_pdb/pdb2pe4.pdb']

# NOTE(review): the inputs live under HYAL1_pdb/ but the protein_id passed here
# is 'ASPG', so the results land in ASPG_outputs/ — confirm the label is intended.
compare_seq(ref_seq, pdb_file_paths,'ASPG')

Aligned sequences have been written to ASPG_outputs/ASPG_aligned_sequences.csv
