In [4]:
import os
from Bio.PDB import PDBParser
import pandas as pd

# Define the path to your PDB files and DataFrame CSV

# Paths for testing
# pdb_folder_path = '/blue/raquel.dias/laylaaschuster/protein_proj/examples2'
# output_folder_path = '/blue/raquel.dias/laylaaschuster/protein_proj/examples2/testing'

# Input folder with the PDB files, CSV listing the (PDB_ID, Chain_ID) pairs to
# keep, and the folder that receives the per-chain PDB files.
pdb_folder_path = './pdb_dataset_REDO_files'
df_csv_path = './pdbOrig_unique_chain_molecule_names.csv'
output_folder_path = './pdb_dataset_REDO_chain_files'

# Read the DataFrame from CSV.
# The two key columns are forced to text: with pandas' default type inference a
# chain ID such as "1" can be loaded as an integer (and an ID such as "1E10" as
# a float), and a numeric key never compares equal to the string IDs these
# columns are matched against later in the notebook.
df = pd.read_csv(df_csv_path, dtype={'PDB_ID': str, 'Chain_ID': str})

In [5]:
# Create a set of (PDB ID, chain ID) tuples for the allowed chains.
# Built column-wise with zip() instead of a row-wise df.apply(): this avoids
# constructing one Series per row, and it yields an empty set for an empty
# DataFrame (df.apply(..., axis=1) returns an empty DataFrame in that case, so
# set() would collect the column names instead of tuples).
# PDB IDs are upper-cased because the IDs derived from the file names are
# upper-cased before the membership test; chain IDs stay case-sensitive.
allowed_chains = set(
    zip(
        df['PDB_ID'].astype(str).str.upper(),
        df['Chain_ID'],
    )
)

In [6]:
import warnings
from Bio.PDB.PDBExceptions import PDBConstructionWarning
from Bio.PDB import PDBIO, Select

# Ignore every PDBConstructionWarning raised from here on (presumably to keep
# the cell output readable while many structures are parsed).
warnings.simplefilter(action='ignore', category=PDBConstructionWarning)

class ChainSelect(Select):
    """Selection filter that keeps only the chains whose ID is in ``chain_letters``.

    ``chain_letters`` may be any object supporting the ``in`` operator. When a
    plain string is given the check is a substring test, so ``"AB"`` accepts
    both chain ``A`` and chain ``B``.
    """

    def __init__(self, chain_letters):
        """Store the collection of chain IDs to accept."""
        self.chain_letters = chain_letters

    def accept_chain(self, chain):
        """Return True when ``chain``'s ID is among the stored chain IDs."""
        wanted = self.chain_letters
        return chain.get_id() in wanted

# Imported here so this cell brings its own dependency into scope: the parser
# raises PDBConstructionException for files it cannot interpret (for example
# invalid coordinate fields).
from Bio.PDB.PDBExceptions import PDBConstructionException

# Initialize PDB parser
parser = PDBParser()

# PDBIO.save() opens the output path directly and does not create missing
# directories, so make sure the destination folder exists before writing.
os.makedirs(output_folder_path, exist_ok=True)

# Loop over PDB files in the directory (sorted so the processing order is
# reproducible between runs)
for pdb_file_name in sorted(os.listdir(pdb_folder_path)):
    if not pdb_file_name.endswith(".pdb"):  # Skip files that do not end with .pdb
        continue

    pdb_file_path = os.path.join(pdb_folder_path, pdb_file_name)

    if os.path.isdir(pdb_file_path):  # Skip directories
        continue

    # Extract the 4 alphanumeric PDB ID code from the file name
    pdb_id = os.path.splitext(pdb_file_name)[0][:4].upper()

    # Parse the structure; an unreadable or malformed file is reported and
    # skipped so that it does not abort the whole batch.
    try:
        structure = parser.get_structure(pdb_id, pdb_file_path)
    except IOError as e:
        print(f"Could not open PDB file: {e}")
        continue
    except PDBConstructionException as e:
        print(f"Could not parse PDB file {pdb_file_path}: {e}")
        continue

    # Collect every chain ID once across all models. The same chain ID appears
    # in each model of a multi-model structure, and every save below already
    # writes the selected chain from the whole structure, so one write per
    # chain ID is enough.
    chain_ids = sorted({chain_model.id for model in structure for chain_model in model})
    allowed_ids = [cid for cid in chain_ids if (pdb_id, cid) in allowed_chains]
    if not allowed_ids:
        continue

    # One writer per structure; only the selection changes between saves.
    io = PDBIO()
    io.set_structure(structure)

    for chain_id in allowed_ids:
        # Write the chain PDB file for each allowed chain
        output_filename = f"{pdb_id}_{chain_id}.pdb"
        output_filepath = os.path.join(output_folder_path, output_filename)
        io.save(output_filepath, ChainSelect(chain_id))