***Analysis script for the trifunctional cross-links of the DDI1_DDI2 tetramer protein complex***

In [3]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import argparse
from Bio import SeqIO

In [4]:
# Read in the csv file for the trifunctional XL-MS data.
# All paths are resolved against the current working directory.
dir_path = os.getcwd()
xls_data_dir = os.path.join(dir_path, 'derived_data/xls/')
fasta_data_dir = os.path.join(dir_path, 'data/fasta/')
df = pd.read_csv(os.path.join(xls_data_dir, 'ddi_trifunctional.csv'))

# Read in the FASTA files for DDI1 and DDI2 with Biopython; the records are
# used later to check the cross-linked positions against the sequences.
ddi1_fasta = list(SeqIO.parse(os.path.join(fasta_data_dir, 'ddi1.fasta'), "fasta"))
ddi2_fasta = list(SeqIO.parse(os.path.join(fasta_data_dir, 'ddi2.fasta'), "fasta"))

# Remove the K from the Residue1/2/3 columns. regex=False makes it explicit
# that 'K' is a literal character, independent of the pandas version default.
# The residue numbers remain strings after this step.
df['Residue1'] = df['Residue1'].str.replace('K', '', regex=False)
df['Residue2'] = df['Residue2'].str.replace('K', '', regex=False)
df['Residue3'] = df['Residue3'].str.replace('K', '', regex=False)
print(f"raw trifunctional XL-MS data: \n", df)

# Remove redundant rows: two rows describe the same cross-link when they hold
# the same three (protein, residue) sites in any order.
# The key is built from the sorted (protein, residue) PAIRS. Sorting the
# residues and the proteins independently would lose the pairing and merge
# distinct cross-links, e.g. DDI1-133/DDI1-161/DDI2-337 with
# DDI1-133/DDI1-337/DDI2-161.
site_keys = pd.Series(
    [
        tuple(sorted(zip(proteins, residues)))
        for proteins, residues in zip(
            df[['Protein1', 'Protein2', 'Protein3']].values.tolist(),
            df[['Residue1', 'Residue2', 'Residue3']].values.tolist(),
        )
    ],
    index=df.index,
    dtype=object,
)
# Keep the first occurrence of every site combination (original row index is kept).
df = df[~site_keys.duplicated()].copy()
print(f"non-redundant trifunctional XL-MS data: \n", df)

# Report, for each requested position, whether the FASTA record has a lysine there
def check_lysines(fasta_record, residue_list):
    """
    Print whether each residue position is a lysine (K) in the given record.

    Args:
        fasta_record: Record exposing ``.seq`` (the sequence) and ``.id``
        residue_list: Iterable of 1-based positions; each item is passed
            through ``int()`` before the lookup

    Returns:
        None. One line is printed per residue, prefixed with "Warning:" when
        the position is not a lysine (this includes positions outside the
        sequence).
    """
    # 1-based positions of every K in the sequence
    lysine_sites = {
        position
        for position, amino_acid in enumerate(str(fasta_record.seq), start=1)
        if amino_acid == 'K'
    }
    for residue in residue_list:
        if int(residue) in lysine_sites:
            print(f"Residue {residue} is a lysine in {fasta_record.id}")
            continue
        print(f"Warning: Residue {residue} is not a lysine in {fasta_record.id}")

# Verify position 345 against the first DDI1 record and position 337 against
# the first DDI2 record
check_lysines(fasta_record=ddi1_fasta[0], residue_list=[345])
check_lysines(fasta_record=ddi2_fasta[0], residue_list=[337])

raw trifunctional XL-MS data: 
    Protein1 Residue1 Protein2 Residue2 Protein3 Residue3
0      DDI1       77     DDI1      161     DDI1      291
1      DDI1       77     DDI1      291     DDI1      382
2      DDI1      133     DDI1      133     DDI1      213
3      DDI1      133     DDI1      133     DDI1      291
4      DDI1      133     DDI1      161     DDI1      213
5      DDI1      133     DDI1      161     DDI1      291
6      DDI1      133     DDI1      161     DDI1      345
7      DDI1      133     DDI1      161     DDI2      337
8      DDI1      133     DDI1      213     DDI1      291
9      DDI1      133     DDI1      213     DDI1      345
10     DDI1      133     DDI1      213     DDI2      337
11     DDI1      133     DDI1      291     DDI1      291
12     DDI1      133     DDI1      291     DDI1      382
13     DDI1      161     DDI1      161     DDI1      291
14     DDI1      161     DDI1      213     DDI1      291
15     DDI1      161     DDI1      291     DDI1      382

In [None]:
import numpy as np
import json
import os
from pathlib import Path
from Bio import SeqIO
from Bio.PDB import PDBParser, DSSP

class DomainAnalyzer:
    """
    Analyze protein domains using AlphaFold structure predictions
    and DSSP secondary structure analysis.

    Pipeline (see ``analyze_domains``):
      1. per-residue disorder call from the B-factor column of the CA atoms,
      2. contiguous ordered runs become initial domains, close ones are merged,
      3. domain boundaries are grown over adjacent helix/strand residues,
      4. the result is written to a JSON file in ``self.output_dir``.

    The thresholds are class constants so that the values used by the
    methods and the values recorded in the JSON output cannot drift apart.
    """

    # B-factor (pLDDT) below which a residue is called disordered
    DISORDER_PLDDT_THRESHOLD = 70
    # Minimum number of consecutive ordered residues that form a domain
    MIN_DOMAIN_LENGTH = 8
    # Two domains are merged when (next start - previous end) is below this value
    MERGE_GAP_THRESHOLD = 5

    def __init__(self, dssp_bin="dssp", output_dir="output_data/domain_analysis"):
        """
        Initialize the DomainAnalyzer.

        Creating an instance creates ``output_dir`` (including parents) if it
        does not exist yet.

        Args:
            dssp_bin (str): Path to DSSP binary executable
            output_dir (str or Path): Directory the JSON results are written to
        """
        self.dssp_bin = dssp_bin
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def get_disorder_from_alphafold(self, pdb_file):
        """
        Use B-factor pLDDT < DISORDER_PLDDT_THRESHOLD as disorder indicator
        per CA atom.

        Only the first model of the file is read (parsing stops at the first
        ENDMDL record), matching ``run_dssp_analysis`` which also uses the
        first model only.

        Args:
            pdb_file (str): Path to PDB file

        Returns:
            np.array: Boolean array with one entry per CA ATOM record, True
            where the residue is called disordered. Empty input gives an
            empty boolean array.
        """
        disorder = []
        with open(pdb_file, 'r') as f:
            for line in f:
                if line.startswith("ENDMDL"):
                    # Further models would repeat the same residues
                    break
                # Atom name is in columns 13-16, B-factor in columns 61-66
                if line.startswith("ATOM") and line[12:16].strip() == "CA":
                    b_factor = float(line[60:66])
                    disorder.append(b_factor < self.DISORDER_PLDDT_THRESHOLD)
        # dtype=bool keeps the array boolean even when no CA atom was found
        return np.array(disorder, dtype=bool)

    def find_domains(self, disorder_pred):
        """
        Call contiguous ordered runs of at least MIN_DOMAIN_LENGTH residues
        as domains, then merge domains that lie close together.

        Args:
            disorder_pred (np.array): Boolean array of disorder predictions
                (True = disordered)

        Returns:
            list: List of domain tuples (start, end), 1-based and inclusive
        """
        domains = []
        start = None  # 0-based index where the current ordered run began
        n = len(disorder_pred)

        for i in range(n):
            if not disorder_pred[i]:
                if start is None:
                    start = i
                continue
            # Disordered residue at i closes a run covering start .. i-1
            if start is not None:
                if i - start >= self.MIN_DOMAIN_LENGTH:
                    domains.append((start + 1, i))  # 1-based, inclusive
                start = None

        # A run that is still open reaches the last residue: it covers
        # start .. n-1, i.e. n - start residues ending at 1-based position n.
        if start is not None and n - start >= self.MIN_DOMAIN_LENGTH:
            domains.append((start + 1, n))

        # Merge neighbouring domains. The test compares positions, so with a
        # threshold of 5 at most 3 residues may lie between the two domains.
        # NOTE(review): if "separated by <5 aa" was meant to count the
        # intervening residues, the comparison needs an extra "- 1" - confirm.
        merged = []
        for dom in domains:
            if merged and dom[0] - merged[-1][1] < self.MERGE_GAP_THRESHOLD:
                merged[-1] = (merged[-1][0], dom[1])
            else:
                merged.append(dom)

        return merged

    def run_dssp_analysis(self, pdb_file):
        """
        Run DSSP to extract secondary structure per residue.

        Only the first model of the parsed structure is passed to DSSP.

        Args:
            pdb_file (str): Path to PDB file

        Returns:
            DSSP object containing secondary structure information
        """
        parser = PDBParser(QUIET=True)
        struct = parser.get_structure("protein", pdb_file)
        dssp = DSSP(struct[0], pdb_file, dssp=self.dssp_bin)
        return dssp

    def extend_domains_with_ss(self, domains, pdb_file):
        """
        Grow domain boundaries while adjacent residues are helix/strand.

        If DSSP cannot be run, a warning is printed and the domains are
        returned unchanged (best effort).

        Args:
            domains (list): List of 1-based inclusive domain tuples
            pdb_file (str): Path to PDB file

        Returns:
            list: List of extended domain tuples (1-based, inclusive). Each
            domain is extended independently, so the results may overlap.
        """
        try:
            dssp_data = self.run_dssp_analysis(pdb_file)
            # Element 2 of every DSSP entry is the secondary-structure code
            ss = [residue[2] for residue in dssp_data]
        except Exception as e:
            print(f"Warning: DSSP analysis failed for {pdb_file}: {e}")
            return domains

        # NOTE(review): assumes the DSSP entries are in the same order as the
        # CA atoms used for the disorder call - confirm for structures where
        # DSSP skips residues.
        ordered_codes = ("H", "E")
        n_ss = len(ss)

        extended = []
        for start, end in domains:
            i0, i1 = start - 1, end - 1  # Convert to 0-based

            # Extend start while previous residues are helix/strand; the upper
            # bound avoids an IndexError when DSSP returned fewer entries
            # than the domain index.
            while 0 < i0 <= n_ss and ss[i0 - 1] in ordered_codes:
                i0 -= 1

            # Extend end while next residues are helix/strand
            while i1 < n_ss - 1 and ss[i1 + 1] in ordered_codes:
                i1 += 1

            extended.append((i0 + 1, i1 + 1))  # Convert back to 1-based

        return extended

    def get_protein_info(self, fasta_file):
        """
        Extract protein information from FASTA file.

        Only the first record of the file is used.

        Args:
            fasta_file (str): Path to FASTA file

        Returns:
            dict: Dictionary containing protein ID, description, and sequence length

        Raises:
            ValueError: If the file contains no sequences
        """
        fasta_records = list(SeqIO.parse(fasta_file, "fasta"))
        if not fasta_records:
            raise ValueError(f"No sequences found in {fasta_file}")

        record = fasta_records[0]  # Take first sequence
        return {
            "protein_id": record.id,
            "description": record.description,
            "sequence_length": len(record.seq)
        }

    def analyze_domains(self, fasta_file, pdb_file, output_name=None):
        """
        Perform complete domain analysis and save results.

        The results are written to ``<output_dir>/<output_name>.json`` and a
        short summary is printed.

        Args:
            fasta_file (str): Path to FASTA file
            pdb_file (str): Path to PDB file
            output_name (str, optional): Custom output filename (without
                extension). Defaults to "<protein id>_domains" with "|"
                replaced by "_".

        Returns:
            dict: Domain analysis results (same content as the JSON file)
        """
        # Get protein information
        protein_info = self.get_protein_info(fasta_file)

        # Generate output filename if not provided
        if output_name is None:
            protein_id = protein_info["protein_id"].replace("|", "_")
            output_name = f"{protein_id}_domains"

        # Perform domain analysis
        disorder = self.get_disorder_from_alphafold(pdb_file)
        initial_domains = self.find_domains(disorder)
        extended_domains = self.extend_domains_with_ss(initial_domains, pdb_file)

        # Count every residue once, even if extended domains overlap
        covered_residues = set()
        for start, end in extended_domains:
            covered_residues.update(range(start, end + 1))

        # Prepare results
        results = {
            "protein_info": protein_info,
            "analysis_parameters": {
                "disorder_threshold": self.DISORDER_PLDDT_THRESHOLD,
                "minimum_domain_length": self.MIN_DOMAIN_LENGTH,
                "merge_gap_threshold": self.MERGE_GAP_THRESHOLD
            },
            "input_files": {
                "fasta_file": str(fasta_file),
                "pdb_file": str(pdb_file)
            },
            "domain_analysis": {
                "initial_domains": initial_domains,
                "extended_domains": extended_domains,
                "num_domains": len(extended_domains),
                "total_domain_residues": len(covered_residues)
            }
        }

        # Save to JSON file
        output_file = self.output_dir / f"{output_name}.json"
        with open(output_file, 'w') as f:
            json.dump(results, f, indent=2)

        print(f"Domain analysis completed for {protein_info['protein_id']}")
        print(f"Found {len(extended_domains)} domains: {extended_domains}")
        print(f"Results saved to: {output_file}")

        return results

# Example usage function
def analyze_ddi_proteins(proteins=None, dssp_bin="dssp"):
    """
    Example function to analyze DDI1 and DDI2 proteins.

    Args:
        proteins (list, optional): List of dicts with the keys "name",
            "fasta" (path to the FASTA file) and "pdb" (path to the PDB
            file). Defaults to the DDI1 and DDI2 files under ``data/``.
        dssp_bin (str): DSSP executable handed to DomainAnalyzer, e.g.
            ``os.path.expanduser("~/.local/bin/mkdssp")`` for a local
            mkdssp installation.

    Returns:
        dict: Maps each protein name to the result of
        ``DomainAnalyzer.analyze_domains``. Proteins whose FASTA or PDB file
        is missing are skipped with a printed warning.
    """
    analyzer = DomainAnalyzer(dssp_bin=dssp_bin)

    if proteins is None:
        # Default file paths (adjust these to your actual file locations)
        proteins = [
            {
                "name": "DDI1",
                "fasta": "data/fasta/ddi1.fasta",
                "pdb": "data/pdb/ddi1.pdb"
            },
            {
                "name": "DDI2",
                "fasta": "data/fasta/ddi2.fasta",
                "pdb": "data/pdb/ddi2.pdb"
            }
        ]

    results = {}
    for protein in proteins:
        if os.path.exists(protein["fasta"]) and os.path.exists(protein["pdb"]):
            # The lower-cased protein name is used as the JSON file name
            result = analyzer.analyze_domains(
                protein["fasta"],
                protein["pdb"],
                protein["name"].lower()
            )
            results[protein["name"]] = result
        else:
            print(f"Warning: Files not found for {protein['name']}")

    return results

# Run the DDI1/DDI2 domain analysis when this code is executed as the main
# module. `results` maps each analysed protein name to its result dictionary.
if __name__ == "__main__":
    # Run analysis
    results = analyze_ddi_proteins()

Domain analysis completed for DDI1
Found 3 domains: [(1, 79), (142, 291), (295, 371)]
Results saved to: output_data/domain_analysis/ddi1.json
Domain analysis completed for DDI2
Found 3 domains: [(1, 78), (133, 364), (375, 398)]
Results saved to: output_data/domain_analysis/ddi2.json
