# Engineer binders based on PDB structures 

In [36]:
%pip install biopython

Note: you may need to restart the kernel to use updated packages.


In [35]:
from Bio.PDB import PDBParser, NeighborSearch, Selection
from Bio.SeqUtils import seq1
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import random

def extract_proximal_sequence(pdb_file,
                              chain_id,
                              target_residues_range,
                              neighbor_chain_id,
                              N,
                              max_distance,
                              increment,
                              output_fasta):
    """Write the part of a neighbouring chain that lies closest to a target segment.

    The search radius starts at 1.0 and grows by ``increment``.  At each radius
    the residues of ``neighbor_chain_id`` that have an atom within that radius
    of any target atom are collected.  As soon as the span of their residue
    numbers (max - min) exceeds ``N``, the neighbour set from the previous
    radius is taken, cut or padded to ``N`` residues by
    ``adjust_sequence_length`` and written to ``output_fasta`` as one FASTA
    record.

    Args:
        pdb_file: Path of the PDB file to parse.
        chain_id: Id of the chain holding the target residues.
        target_residues_range: ``(start, end)`` residue numbers, both inclusive.
        neighbor_chain_id: Id of the chain the neighbour search is limited to.
        N: Desired number of residues in the written sequence.
        max_distance: Largest search radius to try (same units as the
            coordinates in the PDB file).
        increment: Amount added to the radius after each round; must be > 0.
        output_fasta: Path of the FASTA file to write.

    Returns:
        None. The outcome is reported with ``print``.

    Raises:
        ValueError: If ``increment`` is not positive (the radius would never
            reach ``max_distance``).
    """
    if increment <= 0:
        raise ValueError("increment must be positive")

    # Parse the PDB file; only the first model is used.
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('structure', pdb_file)
    model = structure[0]

    # Get the target chain and residues
    chain = model[chain_id]
    start_residue, end_residue = target_residues_range
    target_residues = [residue for residue in chain if start_residue <= residue.get_id()[1] <= end_residue]
    target_atoms = Selection.unfold_entities(target_residues, 'A')
    target_residue_set = set(target_residues)

    # Get the neighbor chain and index its atoms once: the atom set does not
    # depend on the radius, so the search tree can be shared by every round.
    neighbor_chain = model[neighbor_chain_id]
    ns = NeighborSearch(list(neighbor_chain.get_atoms()))

    distance_threshold = 1.0
    previous_neighbor_resseqs = []

    while distance_threshold <= max_distance:
        # Find neighbor residues within the distance threshold
        neighbor_residues = set()
        for atom in target_atoms:
            neighbor_residues.update(ns.search(atom.get_coord(), distance_threshold, level='R'))

        # Filter out target residues (in case chains are the same)
        neighbor_residues -= target_residue_set

        # Residue sequence numbers (resseq) of the neighbours found at this radius
        neighbor_resseqs = sorted(residue.get_id()[1] for residue in neighbor_residues)

        if neighbor_resseqs:
            resseq_range = neighbor_resseqs[-1] - neighbor_resseqs[0]

            if resseq_range > N:
                # The span just became too wide; fall back to the previous round.
                if previous_neighbor_resseqs:
                    min_resseq_prev = min(previous_neighbor_resseqs)
                    max_resseq_prev = max(previous_neighbor_resseqs)
                    sequence_residues = [
                        residue for residue in neighbor_chain
                        if min_resseq_prev <= residue.get_id()[1] <= max_resseq_prev
                    ]

                    # Adjust sequence to length N
                    sequence_residues = adjust_sequence_length(sequence_residues, neighbor_chain, N)

                    # Convert residues to a one-letter sequence
                    sequence = ''.join([seq1(residue.get_resname()) for residue in sequence_residues])
                    seq_record = SeqRecord(Seq(sequence), id=f"Residues_{sequence_residues[0].get_id()[1]}_{sequence_residues[-1].get_id()[1]}", description="")

                    # Write to FASTA
                    SeqIO.write([seq_record], output_fasta, 'fasta')
                    print(f"Sequence saved to {output_fasta}")
                else:
                    print("No previous iteration data available.")
                return  # Exit the function after saving the sequence

            # Still within the allowed span; remember this round.
            previous_neighbor_resseqs = neighbor_resseqs

        distance_threshold += increment

    print("No sequence found that meets the criteria within the maximum distance threshold.")

def adjust_sequence_length(sequence_residues, neighbor_chain, N):
    """Return a run of residues of length ``N`` built around ``sequence_residues``.

    * Exactly ``N`` residues: returned unchanged.
    * More than ``N``: the middle ``N`` residues are kept (trimmed from both ends).
    * Fewer than ``N``: flanking residues of ``neighbor_chain`` are added, one at
      a time on a randomly chosen side, nearest residue first.  Residues taken
      from the left are placed before the core and residues taken from the
      right after it, so the result stays in chain order.

    Args:
        sequence_residues: Core residues; each must provide ``get_id()`` whose
            item ``[1]`` is the residue number.
        neighbor_chain: Iterable of residues from which flanking residues are drawn.
        N: Desired number of residues.

    Returns:
        A list of residues. It is shorter than ``N`` only when ``neighbor_chain``
        has too few flanking residues to reach ``N``.
    """
    current_length = len(sequence_residues)
    if current_length == N:
        return sequence_residues
    if current_length > N:
        # Trim the sequence from both ends to get length N
        start = (current_length - N) // 2
        return sequence_residues[start:start + N]

    residues_needed = N - current_length
    resseq_numbers = [residue.get_id()[1] for residue in sequence_residues]
    min_resseq = min(resseq_numbers)
    max_resseq = max(resseq_numbers)

    # Flanking residues on each side, ordered nearest-first. Sorting on
    # (resseq, insertion code) keeps residues sharing a number in chain order.
    left_residues = sorted(
        (residue for residue in neighbor_chain if residue.get_id()[1] < min_resseq),
        key=lambda residue: residue.get_id()[1:3],
        reverse=True,
    )
    right_residues = sorted(
        (residue for residue in neighbor_chain if residue.get_id()[1] > max_resseq),
        key=lambda residue: residue.get_id()[1:3],
    )

    left_added = []
    right_added = []
    while residues_needed > 0 and (left_residues or right_residues):
        side = random.choice(['left', 'right'])
        # If the chosen side is exhausted, take from the other one instead of
        # wasting the slot.
        if side == 'left' and not left_residues:
            side = 'right'
        elif side == 'right' and not right_residues:
            side = 'left'

        if side == 'left':
            left_added.insert(0, left_residues.pop(0))
        else:
            right_added.append(right_residues.pop(0))
        residues_needed -= 1

    # Left flank, core, right flank, in chain order.
    return left_added + sequence_residues + right_added

# Example usage: one design for 6vja, chain I residues nearest to chain D 166-183
pdb_file = '../../data/structures/6vja.pdb'
output_fasta = '../../results/predictions/6vja_chainI_rational_design1.fasta'

# Target segment
chain_id = 'D'                       # chain containing the target residues
target_residues_range = (166, 183)   # first and last residue number of the target

# Neighbour search settings
neighbor_chain_id = 'I'              # chain the neighbour search is limited to
N = 80                               # desired length of the sequence
max_distance = 50.0                  # maximum distance threshold
increment = 0.5                      # step added to the distance threshold each round

extract_proximal_sequence(
    pdb_file=pdb_file,
    chain_id=chain_id,
    target_residues_range=target_residues_range,
    neighbor_chain_id=neighbor_chain_id,
    N=N,
    max_distance=max_distance,
    increment=increment,
    output_fasta=output_fasta,
)


Sequence saved to ../../results/predictions/6vja_chainI_rational_design1.fasta


In [44]:
import os
from Bio.PDB import PDBParser, NeighborSearch, Selection
from Bio.SeqUtils import seq1
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import random

def extract_proximal_sequence(pdb_file,
                              chain_id,
                              target_residues_range,
                              neighbor_chain_id,
                              N,
                              max_distance,
                              increment):
    """Build one randomly sampled sequence record from a chain near a target segment.

    A starting radius is drawn uniformly from ``[1.0, max_distance / 2]`` and
    increased by ``increment`` up to ``max_distance``.  At every radius the
    residues of ``neighbor_chain_id`` with an atom within that radius of a
    target atom are grouped into runs of consecutive residue numbers.  One run
    is then picked at random from all runs collected (a run found at several
    radii is listed several times), cut or padded to ``N`` residues by
    ``adjust_sequence_length`` and returned as a ``SeqRecord``.

    Args:
        pdb_file: Path of the PDB file; its base name without extension is used
            as the id prefix of the record.
        chain_id: Id of the chain holding the target residues.
        target_residues_range: ``(start, end)`` residue numbers, both inclusive.
        neighbor_chain_id: Id of the chain the neighbour search is limited to.
        N: Desired number of residues in the returned sequence.
        max_distance: Largest search radius to try.
        increment: Amount added to the radius after each round; must be > 0.

    Returns:
        A ``SeqRecord``, or ``None`` when no neighbouring residue was found at
        any radius.

    Raises:
        ValueError: If ``increment`` is not positive (the radius would never
            reach ``max_distance``).
    """
    if increment <= 0:
        raise ValueError("increment must be positive")

    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('structure', pdb_file)
    pdb_id = os.path.splitext(os.path.basename(pdb_file))[0]

    chain = structure[0][chain_id]
    start_residue, end_residue = target_residues_range
    target_residues = [residue for residue in chain if start_residue <= residue.get_id()[1] <= end_residue]
    target_atoms = Selection.unfold_entities(target_residues, 'A')
    excluded_residues = set(target_residues)

    # The atom set is the same for every radius, so index it a single time.
    neighbor_chain = structure[0][neighbor_chain_id]
    ns = NeighborSearch(list(neighbor_chain.get_atoms()))

    distance_threshold = random.uniform(1.0, max_distance / 2)
    neighbor_resseqs_list = []

    while distance_threshold <= max_distance:
        neighbor_residues = set()
        for atom in target_atoms:
            neighbor_residues.update(ns.search(atom.get_coord(), distance_threshold, level='R'))
        # Drop the target residues themselves (relevant when both chains are the same).
        neighbor_residues -= excluded_residues
        neighbor_resseqs = sorted({residue.get_id()[1] for residue in neighbor_residues})

        if neighbor_resseqs:
            neighbor_resseqs_list.extend(get_contiguous_ranges(neighbor_resseqs))

        distance_threshold += increment

    if not neighbor_resseqs_list:
        return None

    min_resseq, max_resseq = random.choice(neighbor_resseqs_list)
    sequence_residues = [residue for residue in neighbor_chain if min_resseq <= residue.get_id()[1] <= max_resseq]
    sequence_residues = adjust_sequence_length(sequence_residues, neighbor_chain, N)
    sequence = ''.join([seq1(residue.get_resname()) for residue in sequence_residues])

    # Label the record with the residue-number span actually covered after
    # trimming or padding.
    resseq_numbers = [residue.get_id()[1] for residue in sequence_residues]
    min_resseq = min(resseq_numbers)
    max_resseq = max(resseq_numbers)

    seq_id = f"{pdb_id}_chain{neighbor_chain_id}_resseq_{min_resseq}-{max_resseq}"
    return SeqRecord(Seq(sequence), id=seq_id, description="")

def get_contiguous_ranges(numbers):
    """Collapse an ascending list of integers into runs of consecutive values.

    Args:
        numbers: Integers in ascending order, e.g. ``[1, 2, 3, 7, 8]``.

    Returns:
        A list of ``(first, last)`` tuples, one per run of values that each
        differ from the previous one by exactly 1, e.g. ``[(1, 3), (7, 8)]``.
        An empty input yields an empty list.
    """
    if not numbers:
        return []

    ranges = []
    start = prev = numbers[0]
    for number in numbers[1:]:
        if number != prev + 1:
            # Gap found: close the current run and open a new one.
            ranges.append((start, prev))
            start = number
        prev = number
    ranges.append((start, prev))
    return ranges

def adjust_sequence_length(sequence_residues, neighbor_chain, N):
    """Return a run of residues of length ``N`` built around ``sequence_residues``.

    * Exactly ``N`` residues: returned unchanged.
    * More than ``N``: a window of ``N`` consecutive list items is kept, its
      start chosen at random.
    * Fewer than ``N``: flanking residues of ``neighbor_chain`` are added, one at
      a time on a randomly chosen side, nearest residue first.  Residues taken
      from the left are placed before the core and residues taken from the
      right after it, so the result stays in chain order.

    Args:
        sequence_residues: Core residues; each must provide ``get_id()`` whose
            item ``[1]`` is the residue number.
        neighbor_chain: Iterable of residues from which flanking residues are drawn.
        N: Desired number of residues.

    Returns:
        A list of residues. It is shorter than ``N`` only when ``neighbor_chain``
        has too few flanking residues to reach ``N``.
    """
    current_length = len(sequence_residues)
    if current_length == N:
        return sequence_residues
    if current_length > N:
        excess = current_length - N
        start = random.randint(0, excess)
        return sequence_residues[start:start + N]

    residues_needed = N - current_length
    resseq_numbers = [residue.get_id()[1] for residue in sequence_residues]
    min_resseq = min(resseq_numbers)
    max_resseq = max(resseq_numbers)

    # Flanking residues on each side, ordered nearest-first. Sorting on
    # (resseq, insertion code) keeps residues sharing a number in chain order.
    left_residues = sorted(
        (residue for residue in neighbor_chain if residue.get_id()[1] < min_resseq),
        key=lambda residue: residue.get_id()[1:3],
        reverse=True,
    )
    right_residues = sorted(
        (residue for residue in neighbor_chain if residue.get_id()[1] > max_resseq),
        key=lambda residue: residue.get_id()[1:3],
    )

    left_added = []
    right_added = []
    while residues_needed > 0 and (left_residues or right_residues):
        side = random.choice(['left', 'right'])
        if side == 'left' and left_residues:
            take_left = True
        elif side == 'right' and right_residues:
            take_left = False
        else:
            # The chosen side is exhausted; use whichever side still has residues.
            take_left = bool(left_residues)

        if take_left:
            left_added.insert(0, left_residues.pop(0))
        else:
            right_added.append(right_residues.pop(0))
        residues_needed -= 1

    # Left flank, core, right flank, in chain order.
    return (left_added + sequence_residues + right_added)[:N]

# Example usage: sample up to 10 distinct designs and write them to one FASTA file
# SeqIO is not part of this cell's import block, so import it here to keep the
# cell runnable without relying on an earlier cell having been executed.
from Bio import SeqIO

pdb_file = '../../data/structures/6vja.pdb'
output_fasta = '../../results/predictions/6vja_chainI_rational_designs.fasta'
chain_id = 'D'
target_residues_range = (166, 183)
neighbor_chain_id = 'I'
N = 80
max_distance = 20.0
increment = 0.5

seq_records = []
unique_sequences = set()
for _ in range(10):
    seq_record = extract_proximal_sequence(pdb_file, chain_id, target_residues_range, neighbor_chain_id, N, max_distance, increment)
    if not seq_record:
        continue  # No sequence found
    sequence_str = str(seq_record.seq)
    if sequence_str in unique_sequences:
        continue  # Skip duplicate sequence
    unique_sequences.add(sequence_str)
    seq_records.append(seq_record)

SeqIO.write(seq_records, output_fasta, 'fasta')
print(f"{len(seq_records)} unique sequences saved to {output_fasta}")

10 unique sequences saved to ../../results/predictions/6vja_chainI_rational_designs.fasta


In [47]:
import os
from Bio.PDB import PDBParser, NeighborSearch, Selection
from Bio.SeqUtils import seq1
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import random


def parse_pdb_file(pdb_file):
    """Parse a PDB file and derive an identifier from its file name.

    Args:
        pdb_file: Path of the PDB file.

    Returns:
        ``(structure, pdb_id)`` where ``structure`` is the parsed structure and
        ``pdb_id`` is the file's base name without its extension.
    """
    structure = PDBParser(QUIET=True).get_structure('structure', pdb_file)
    pdb_id, _extension = os.path.splitext(os.path.basename(pdb_file))
    return structure, pdb_id


def get_target_residues(chain, start_residue, end_residue):
    """Select the residues of ``chain`` numbered ``start_residue`` to ``end_residue``.

    Both bounds are inclusive and are compared with item ``[1]`` of each
    residue's ``get_id()``.  Residues are returned in the order ``chain``
    yields them.
    """
    selected = []
    for residue in chain:
        resseq = residue.get_id()[1]
        if resseq < start_residue or resseq > end_residue:
            continue
        selected.append(residue)
    return selected


def find_neighbor_residues(target_atoms, neighbor_chain, distance_threshold):
    """Collect residues of ``neighbor_chain`` lying near any of ``target_atoms``.

    A ``NeighborSearch`` index is built over the atoms of ``neighbor_chain`` and
    queried at residue level once per target atom.

    Args:
        target_atoms: Atoms whose coordinates are used as query centres.
        neighbor_chain: Chain whose atoms are searched.
        distance_threshold: Search radius passed to ``NeighborSearch.search``.

    Returns:
        A set with every residue returned by any of the queries.
    """
    search_index = NeighborSearch(list(neighbor_chain.get_atoms()))
    nearby_residues = set()
    for target_atom in target_atoms:
        hits = search_index.search(target_atom.get_coord(), distance_threshold, level='R')
        nearby_residues |= set(hits)
    return nearby_residues


def extract_proximal_sequence(pdb_file, chain_id, target_residues_range, neighbor_chain_id, N, max_distance, increment):
    """Build one randomly sampled sequence record from a chain near a target segment.

    A starting radius is drawn uniformly from ``[1.0, max_distance / 2]`` and
    raised by ``increment`` until it exceeds ``max_distance``.  For each radius
    the neighbouring residues are grouped into contiguous id ranges by
    ``get_contiguous_residue_ids``; all ranges from all radii are pooled.  One
    range is picked at random, its residues are resized to ``N`` by
    ``adjust_sequence_length`` and turned into a record by
    ``create_sequence_record``.

    Returns:
        The sequence record, or ``None`` if no neighbour was found at any radius
        or the selected range could not be resized to ``N`` residues.
    """
    structure, pdb_id = parse_pdb_file(pdb_file)
    model = structure[0]
    target_chain = model[chain_id]
    neighbor_chain = model[neighbor_chain_id]

    first_resseq, last_resseq = target_residues_range
    target_residues = get_target_residues(target_chain, first_resseq, last_resseq)
    target_atoms = Selection.unfold_entities(target_residues, 'A')
    # Target residues are removed from every hit set (matters if both chains coincide).
    excluded = set(target_residues)

    radius = random.uniform(1.0, max_distance / 2)
    candidate_ranges = []
    while radius <= max_distance:
        nearby = find_neighbor_residues(target_atoms, neighbor_chain, radius) - excluded
        nearby_ids = sorted({residue.get_id() for residue in nearby})
        if nearby_ids:
            candidate_ranges += get_contiguous_residue_ids(nearby_ids)
        radius += increment

    if not candidate_ranges:
        return None

    lower_id, upper_id = random.choice(candidate_ranges)
    window = [
        residue for residue in neighbor_chain
        if lower_id <= residue.get_id() <= upper_id
    ]

    # Resize the window to N residues; None means it could not be done.
    window = adjust_sequence_length(window, neighbor_chain, N)
    if window is None:
        return None

    return create_sequence_record(neighbor_chain, window, pdb_id, neighbor_chain_id)


def create_sequence_record(neighbor_chain, sequence_residues, pdb_id, neighbor_chain_id):
    """Turn a list of residues into a ``SeqRecord`` with a descriptive id.

    The sequence is the one-letter code of every residue in list order.  The
    record id has the form ``{pdb_id}_chain{neighbor_chain_id}_resseq_{first}-{last}``,
    where ``first`` and ``last`` are the residue number plus insertion code (if
    any) of the first and last list items.  ``neighbor_chain`` is accepted but
    not used.
    """
    letters = []
    for residue in sequence_residues:
        letters.append(seq1(residue.get_resname()))

    first_id = sequence_residues[0].get_id()
    last_id = sequence_residues[-1].get_id()

    # id[1] is the residue number, id[2] the insertion code (blank when absent,
    # so stripping it leaves an empty string).
    first_label = f"{first_id[1]}{first_id[2].strip()}"
    last_label = f"{last_id[1]}{last_id[2].strip()}"

    seq_id = f"{pdb_id}_chain{neighbor_chain_id}_resseq_{first_label}-{last_label}"
    return SeqRecord(Seq(''.join(letters)), id=seq_id, description="")


def get_contiguous_residue_ids(residue_ids):
    """Group sorted residue ids into runs of consecutive residue numbers.

    Each id is a tuple whose item ``[1]`` is the residue number and item ``[2]``
    the insertion code.  An id continues the current run only if its residue
    number is exactly one above the previous id's and its insertion code is a
    blank (``' '``); anything else starts a new run.

    Args:
        residue_ids: Residue id tuples in ascending order.

    Returns:
        A list of ``(first_id, last_id)`` tuples, one per run.  An empty input
        yields an empty list.
    """
    if not residue_ids:
        return []

    ranges = []
    start = prev = residue_ids[0]
    for residue_id in residue_ids[1:]:
        continues_run = residue_id[1] == prev[1] + 1 and residue_id[2] == ' '
        if not continues_run:
            # Close the current run and open a new one at this id.
            ranges.append((start, prev))
            start = residue_id
        prev = residue_id
    ranges.append((start, prev))
    return ranges


def adjust_sequence_length(sequence_residues, neighbor_chain, N):
    """Resize ``sequence_residues`` to exactly ``N`` items.

    A list that already has ``N`` items is returned as is.  A longer list is cut
    to a window of ``N`` consecutive items whose start is chosen at random.  A
    shorter list is handed to ``extend_sequence``, whose result (possibly
    ``None``) is returned unchanged.
    """
    surplus = len(sequence_residues) - N
    if surplus < 0:
        return extend_sequence(sequence_residues, neighbor_chain, N)
    if surplus == 0:
        return sequence_residues
    offset = random.randint(0, surplus)
    return sequence_residues[offset:offset + N]


def extend_sequence(sequence_residues, neighbor_chain, N):
    """Pad ``sequence_residues`` with flanking residues until it has ``N`` items.

    Residues of ``neighbor_chain`` whose id sorts below the smallest core id form
    the left flank, those above the largest core id the right flank.  Residues
    are added one at a time, nearest first, on a randomly chosen side while both
    flanks have residues left, otherwise on the side that still has some.
    Left-flank residues are placed before the core and right-flank residues
    after it, so the result stays ordered by residue id.

    Args:
        sequence_residues: Core residues; each must provide ``get_id()``.
        neighbor_chain: Iterable of residues from which flanking residues are drawn.
        N: Desired number of residues.

    Returns:
        A list of ``N`` residues, or ``None`` if the flanks do not hold enough
        residues to reach ``N``.
    """
    residues_needed = N - len(sequence_residues)
    residue_ids = [residue.get_id() for residue in sequence_residues]
    min_residue_id = min(residue_ids)
    max_residue_id = max(residue_ids)

    # Flanks ordered nearest-first, so pop(0) always yields the closest residue.
    left_residues = sorted(
        (residue for residue in neighbor_chain if residue.get_id() < min_residue_id),
        key=lambda residue: residue.get_id(),
        reverse=True,
    )
    right_residues = sorted(
        (residue for residue in neighbor_chain if residue.get_id() > max_residue_id),
        key=lambda residue: residue.get_id(),
    )

    left_added = []
    right_added = []
    while residues_needed > 0 and (left_residues or right_residues):
        if left_residues and right_residues:
            side = random.choice(['left', 'right'])
        elif left_residues:
            side = 'left'
        else:
            side = 'right'

        if side == 'left':
            left_added.insert(0, left_residues.pop(0))
        else:
            right_added.append(right_residues.pop(0))
        residues_needed -= 1

    # Left flank, core, right flank, in order of residue id.
    final_sequence = left_added + sequence_residues + right_added
    if len(final_sequence) < N:
        return None
    return final_sequence[:N]


def main():
    """Sample up to 10 distinct designs for 6vja chain I and write them to FASTA.

    ``extract_proximal_sequence`` is called repeatedly (at most 100 times).  A
    record is kept only if neither its sequence nor its id has been seen before.
    All kept records are written to one FASTA file and the count is printed.
    """
    # SeqIO is not part of this cell's import block; import it here so the cell
    # does not depend on an earlier cell having been executed.
    from Bio import SeqIO

    pdb_file = '../../data/structures/6vja.pdb'
    output_fasta = '../../results/predictions/6vja_chainI_rational_designs.fasta'
    chain_id = 'D'
    target_residues_range = (166, 183)
    neighbor_chain_id = 'I'
    N = 80
    max_distance = 20.0
    increment = 0.5

    seq_records = []
    unique_sequences = set()
    seen_ids = set()  # ids of the kept records, for constant-time duplicate checks
    attempts = 0
    max_attempts = 100

    while len(seq_records) < 10 and attempts < max_attempts:
        seq_record = extract_proximal_sequence(pdb_file, chain_id, target_residues_range, neighbor_chain_id, N, max_distance, increment)
        attempts += 1
        if not seq_record:
            continue

        sequence_str = str(seq_record.seq)
        seq_id = seq_record.id
        if sequence_str in unique_sequences or seq_id in seen_ids:
            continue

        unique_sequences.add(sequence_str)
        seen_ids.add(seq_id)
        seq_records.append(seq_record)

    SeqIO.write(seq_records, output_fasta, 'fasta')
    print(f"{len(seq_records)} unique sequences saved to {output_fasta}")


if __name__ == '__main__':
    main()


10 unique sequences saved to ../../results/predictions/6vja_chainI_rational_designs.fasta


In [50]:
# Display the module-level structure path set by the example-usage cells above;
# the cells below parse this file.
pdb_file

'../../data/structures/6vja.pdb'

In [63]:
from Bio.PDB import PDBParser
from Bio.SeqUtils import seq1

# Parse the PDB file (`pdb_file` is the module-level path set in an earlier cell)
parser = PDBParser(QUIET=True)
structure = parser.get_structure('6vja', pdb_file)

# Get chain I from the first model
chain_I = structure[0]['I']

# Materialise the chain's residues as a list, in the order the chain yields them
residues = list(chain_I)

# Take the first 80 residues by list position. This is a positional slice, not a
# selection by PDB residue number (resseq): it corresponds to "residues 1-80"
# only if the chain is numbered from 1 without gaps.
sequence_residues = residues[0:80]  # list indices 0-79
# Convert each three-letter residue name to its one-letter code
sequence = ''.join([seq1(residue.get_resname()) for residue in sequence_residues])

print(sequence)


QVQLQQPGAELVKPGASVKMSCKASGYTFTSYNMHWVKQTPGRGLEWIGAIYPGNGDTSYNQKFKGKATLTADKSSSTAY


In [61]:
# Slice chain I by list position. The 46 presumably is the offset between resseq
# and list index for this chain (TODO confirm; it breaks if numbering has gaps).
# The slice end is exclusive, so the last residue shown is the one at index 183-46-1.
list(chain_I)[166-46:183-46]

[<Residue ASN het=  resseq=166 icode= >,
 <Residue CYS het=  resseq=167 icode= >,
 <Residue GLU het=  resseq=168 icode= >,
 <Residue PRO het=  resseq=169 icode= >,
 <Residue ALA het=  resseq=170 icode= >,
 <Residue ASN het=  resseq=171 icode= >,
 <Residue PRO het=  resseq=172 icode= >,
 <Residue SER het=  resseq=173 icode= >,
 <Residue GLU het=  resseq=174 icode= >,
 <Residue LYS het=  resseq=175 icode= >,
 <Residue ASN het=  resseq=176 icode= >,
 <Residue SER het=  resseq=177 icode= >,
 <Residue PRO het=  resseq=178 icode= >,
 <Residue SER het=  resseq=179 icode= >,
 <Residue THR het=  resseq=180 icode= >,
 <Residue GLN het=  resseq=181 icode= >,
 <Residue TYR het=  resseq=182 icode= >]