***Analysis script for the trifunctional crosslinks of DDI1_DDI2 tetramer protein complex***

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import argparse
from Bio import SeqIO

In [2]:
# Read in the csv file for the trifunctional XL-MS data
# path to current working directory 
dir_path = os.getcwd()
xls_data_dir = os.path.join(dir_path, 'derived_data/xls/')
fasta_data_dir = os.path.join(dir_path, 'data/fasta/')
df = pd.read_csv(os.path.join(xls_data_dir, 'ddi_trifunctional.csv'))

# Use some biopython module to read in fasta files for analysis later
# Read in fasta files for DDI1 and DDI2
ddi1_fasta = list(SeqIO.parse(os.path.join(fasta_data_dir, 'ddi1.fasta'), "fasta"))
ddi2_fasta = list(SeqIO.parse(os.path.join(fasta_data_dir, 'ddi2.fasta'), "fasta"))

# Remove the K from residue1/2/3 columns 
df['Residue1'] = df['Residue1'].str.replace('K', '')
df['Residue2'] = df['Residue2'].str.replace('K', '')
df['Residue3'] = df['Residue3'].str.replace('K', '')
print(f"raw trifunctional XL-MS data: \n", df)

# Remove redundant rows, where the order of the residues in a row, as long as the protein 
# names are the same, does not matter
df['residue_list'] = df[['Residue1', 'Residue2', 'Residue3']].values.tolist()
df['residue_list'] = df['residue_list'].apply(lambda x: sorted(x))
df['protein_list'] = df[['Protein1', 'Protein2', 'Protein3']].values.tolist()
df['protein_list'] = df['protein_list'].apply(lambda x: sorted(x))
df['unique_id'] = df['residue_list'].astype(str) + '_' + df['protein_list'].astype(str)
df = df.drop_duplicates(subset=['unique_id'])
df = df.drop(columns=['residue_list', 'protein_list', 'unique_id'])
print(f"non-redundant trifunctional XL-MS data: \n", df)

# Function to check if residues are lysines in the given fasta record
def check_lysines(fasta_record, residue_list):
    sequence = str(fasta_record.seq)
    lysine_positions = [m.start() + 1 for m in re.finditer('K', sequence)]  # +1 for 1-based indexing
    for residue in residue_list:
        if int(residue) not in lysine_positions:
            print(f"Warning: Residue {residue} is not a lysine in {fasta_record.id}")
        else:
            print(f"Residue {residue} is a lysine in {fasta_record.id}")

# Check residues in DDI1
check_lysines(ddi1_fasta[0], [346])
check_lysines(ddi2_fasta[0], [337]) 

raw trifunctional XL-MS data: 
    Protein1 Residue1 Protein2 Residue2 Protein3 Residue3
0      DDI1       77     DDI1      161     DDI1      291
1      DDI1       77     DDI1      291     DDI1      382
2      DDI1      133     DDI1      133     DDI1      213
3      DDI1      133     DDI1      133     DDI1      291
4      DDI1      133     DDI1      161     DDI1      213
5      DDI1      133     DDI1      161     DDI1      291
6      DDI1      133     DDI1      161     DDI1      345
7      DDI1      133     DDI1      161     DDI2      337
8      DDI1      133     DDI1      213     DDI1      291
9      DDI1      133     DDI1      213     DDI1      345
10     DDI1      133     DDI1      213     DDI2      337
11     DDI1      133     DDI1      291     DDI1      291
12     DDI1      133     DDI1      291     DDI1      382
13     DDI1      161     DDI1      161     DDI1      291
14     DDI1      161     DDI1      213     DDI1      291
15     DDI1      161     DDI1      291     DDI1      382

In [3]:
import numpy as np
import json
import os
from pathlib import Path
from Bio import SeqIO
from Bio.PDB import PDBParser, DSSP

class DomainAnalyzer:
    """
    A class to analyze protein domains using AlphaFold structure predictions
    and DSSP secondary structure analysis.
    """
    
    def __init__(self, dssp_bin="dssp"):
        """
        Initialize the DomainAnalyzer.
        
        Args:
            dssp_bin (str): Path to DSSP binary executable
        """
        self.dssp_bin = dssp_bin
        self.output_dir = Path("output_data/domain_analysis")
        self.output_dir.mkdir(parents=True, exist_ok=True)
    
    def get_disorder_from_alphafold(self, pdb_file):
        """
        Use B-factor pLDDT<70 as disorder indicator per CA atom.
        
        Args:
            pdb_file (str): Path to PDB file
            
        Returns:
            np.array: Boolean array indicating disorder for each residue
        """
        disorder = []
        with open(pdb_file, 'r') as f:
            for line in f:
                if line.startswith("ATOM") and line[12:16].strip() == "CA":
                    b_factor = float(line[60:66])
                    disorder.append(b_factor < 70)
        return np.array(disorder)
    
    def find_domains(self, disorder_pred):
        """
        Call contiguous ordered runs ≥8 aa as domains, merge small gaps.
        
        Args:
            disorder_pred (np.array): Boolean array of disorder predictions
            
        Returns:
            list: List of domain tuples (start, end) in 1-based indexing
        """
        domains = []
        start = None
        n = len(disorder_pred)
        
        for i in range(n):
            if not disorder_pred[i] and start is None:
                start = i
            if (disorder_pred[i] or i == n-1) and start is not None:
                if i - start >= 8:
                    domains.append((start + 1, i))  # 1-based indexing
                start = None
        
        # Merge domains separated by <5 aa
        merged = []
        for dom in domains:
            if not merged:
                merged.append(dom)
            else:
                prev = merged[-1]
                if dom[0] - prev[1] < 5:
                    merged[-1] = (prev[0], dom[1])
                else:
                    merged.append(dom)
        
        return merged
    
    def run_dssp_analysis(self, pdb_file):
        """
        Run DSSP to extract secondary structure per residue.
        
        Args:
            pdb_file (str): Path to PDB file
            
        Returns:
            DSSP object containing secondary structure information
        """
        parser = PDBParser(QUIET=True)
        struct = parser.get_structure("protein", pdb_file)
        dssp = DSSP(struct[0], pdb_file, dssp=self.dssp_bin)
        return dssp
    
    def extend_domains_with_ss(self, domains, pdb_file):
        """
        Grow domain boundaries while adjacent residues are helix/strand.
        
        Args:
            domains (list): List of domain tuples
            pdb_file (str): Path to PDB file
            
        Returns:
            list: List of extended domain tuples
        """
        try:
            dssp_data = self.run_dssp_analysis(pdb_file)
            ss = [residue[2] for residue in dssp_data]
        except Exception as e:
            print(f"Warning: DSSP analysis failed for {pdb_file}: {e}")
            return domains
        
        extended = []
        for start, end in domains:
            i0, i1 = start - 1, end - 1  # Convert to 0-based
            
            # Extend start while previous residues are helix/strand
            while i0 > 0 and ss[i0 - 1] in ("H", "E"):
                i0 -= 1
            
            # Extend end while next residues are helix/strand
            while i1 < len(ss) - 1 and ss[i1 + 1] in ("H", "E"):
                i1 += 1
            
            extended.append((i0 + 1, i1 + 1))  # Convert back to 1-based
        
        return extended
    
    def get_protein_info(self, fasta_file):
        """
        Extract protein information from FASTA file.
        
        Args:
            fasta_file (str): Path to FASTA file
            
        Returns:
            dict: Dictionary containing protein ID, description, and sequence length
        """
        fasta_records = list(SeqIO.parse(fasta_file, "fasta"))
        if not fasta_records:
            raise ValueError(f"No sequences found in {fasta_file}")
        
        record = fasta_records[0]  # Take first sequence
        return {
            "protein_id": record.id,
            "description": record.description,
            "sequence_length": len(record.seq)
        }
    
    def analyze_domains(self, fasta_file, pdb_file, output_name=None):
        """
        Perform complete domain analysis and save results.
        
        Args:
            fasta_file (str): Path to FASTA file
            pdb_file (str): Path to PDB file
            output_name (str, optional): Custom output filename (without extension)
            
        Returns:
            dict: Domain analysis results
        """
        # Get protein information
        protein_info = self.get_protein_info(fasta_file)
        
        # Generate output filename if not provided
        if output_name is None:
            protein_id = protein_info["protein_id"].replace("|", "_")
            output_name = f"{protein_id}_domains"
        
        # Perform domain analysis
        disorder = self.get_disorder_from_alphafold(pdb_file)
        initial_domains = self.find_domains(disorder)
        extended_domains = self.extend_domains_with_ss(initial_domains, pdb_file)
        
        # Prepare results
        results = {
            "protein_info": protein_info,
            "analysis_parameters": {
                "disorder_threshold": 60,
                "minimum_domain_length": 6,
                "merge_gap_threshold": 4
            },
            "input_files": {
                "fasta_file": str(fasta_file),
                "pdb_file": str(pdb_file)
            },
            "domain_analysis": {
                "initial_domains": initial_domains,
                "extended_domains": extended_domains,
                "num_domains": len(extended_domains),
                "total_domain_residues": sum(end - start + 1 for start, end in extended_domains)
            }
        }
        
        # Save to JSON file
        output_file = self.output_dir / f"{output_name}.json"
        with open(output_file, 'w') as f:
            json.dump(results, f, indent=2)
        
        print(f"Domain analysis completed for {protein_info['protein_id']}")
        print(f"Found {len(extended_domains)} domains: {extended_domains}")
        print(f"Results saved to: {output_file}")
        
        return results

# Example usage function
def analyze_ddi_proteins():
    """Example function to analyze DDI1 and DDI2 proteins."""
    mkdssp_path = os.path.expanduser("~/.local/bin/mkdssp")
    analyzer = DomainAnalyzer(dssp_bin=mkdssp_path)
    
    # Define file paths (adjust these to your actual file locations)
    proteins = [
        {
            "name": "DDI1",
            "fasta": "data/fasta/ddi1.fasta",
            "pdb": "data/pdb/ddi1.pdb"
        },
        {
            "name": "DDI2", 
            "fasta": "data/fasta/ddi2.fasta",
            "pdb": "data/pdb/ddi2.pdb"
        }
    ]
    
    results = {}
    for protein in proteins:
        if os.path.exists(protein["fasta"]) and os.path.exists(protein["pdb"]):
            result = analyzer.analyze_domains(
                protein["fasta"], 
                protein["pdb"], 
                protein["name"].lower()
            )
            results[protein["name"]] = result
        else:
            print(f"Warning: Files not found for {protein['name']}")
    
    return results

if __name__ == "__main__":
    # Run analysis
    results = analyze_ddi_proteins()

Domain analysis completed for DDI1
Found 3 domains: [(1, 79), (142, 290), (295, 371)]
Results saved to: output_data/domain_analysis/ddi1.json
Domain analysis completed for DDI2
Found 3 domains: [(1, 78), (133, 363), (386, 398)]
Results saved to: output_data/domain_analysis/ddi2.json


In [4]:
# Analysis of the original dataset to obtain csv file for the bifunctional crosslinks 

# Read in the csv file containing the raw data
rawdata_file = os.path.join(xls_data_dir, 'raw_data.csv')
# make sure that empty values are read as empty strings and not NaN
df_raw = pd.read_csv(rawdata_file, dtype=str)
print(f"raw data dimensions: {df_raw.shape}")
# Process only bifunctional crosslinks from this raw data, if the entry in the column 
# Gene C is empty, it is a bifunctional crosslink, in this case, drop the
# columns Gene C, Resi C, and any other columns that have empty values for that row, and also
# drop rows that have non empty values for Gene C and Resi C
df_bifunctional = df_raw[df_raw['Gene C'].isna()].copy()
df_bifunctional = df_bifunctional.drop(columns=['Gene C', 'Resi C', 'Peptide', 'R1_L_only', 'R2_L_only', 'R1_H_only', 'R2_H_only', 'R1_Mixed', 'R2_Mixed'])
# Drop any other columns that have empty values for that row
df_bifunctional = df_bifunctional.dropna(axis=1, how='all')
print(f"bifunctional data dimensions: {df_bifunctional.shape}")
# Remove duplicate rows
#df_bifunctional = df_bifunctional.drop_duplicates() 
#print(f"bifunctional data dimensions after removing duplicates: {df_bifunctional.shape}")
# rename column names Gene A to Protein1, Resi A to Residue1, Gene B to Protein2, Resi B to Residue2
df_bifunctional = df_bifunctional.rename(columns={'Gene A': 'Protein1', 'Resi A': 'Residue1', 
                                                  'Gene B': 'Protein2', 'Resi B': 'Residue2'})

# Some rows have entries such as DDI1, DDI2 in the Gene A/B columns 
# and also entries correspondingly in the Resi A/B columns with a comma separating 
# two residue numbers, in this case we need to create two separate rows, for example if the 
# entries are DDI1, DDI2 in Gene A/B and 345, 337 in Resi A/B, we need to create two rows
# row 1 will have DDI1 in whichever column it was present in and 345 in the the corresponding
# Resi column, and row 2 will have DDI2 in the Gene column and 337 in the Resi column
# use regex to check if there is a comma or ; in any of the entries
#print(df_bifunctional)
pattern = re.compile(r'[;,]')
# only copy rows with these patterns separately, and we shall process them now
pattern = re.compile(r'[;,]')
df_multi = df_bifunctional[df_bifunctional.apply(
    lambda row: (bool(pattern.search(str(row['Protein1']))) or 
                 bool(pattern.search(str(row['Protein2']))) or 
                 bool(pattern.search(str(row['Residue1']))) or 
                 bool(pattern.search(str(row['Residue2'])))), axis=1)].copy()
print(f"rows with multiple entries dimensions: {df_multi.shape}")
# remove the df_multi from df_bifunctional
df_bifunctional = df_bifunctional.drop(df_multi.index)
print(f"bifunctional data dimensions after removing multi-entry rows: {df_bifunctional.shape}")

raw data dimensions: (118, 23)
bifunctional data dimensions: (99, 14)
rows with multiple entries dimensions: (14, 14)
bifunctional data dimensions after removing multi-entry rows: (85, 14)


In [5]:
import pandas as pd
import re
from itertools import product

def split_cell(cell):
    """
    Splits a cell on commas or semicolons.
    Strips whitespace and ignores empty strings.
    Returns a list of entries.
    """
    if pd.isna(cell):
        return []
    parts = re.split(r'[;,]', str(cell))
    return [p.strip() for p in parts if p.strip()]

def split_row(row):
    """
    Given a row, split the multi-entry key columns (Protein1/Residue1 and Protein2/Residue2)
    by generating the full Cartesian product.
    Non-key columns are copied unchanged.
    """
    new_rows = []
    
    # Split key columns
    p1_list = split_cell(row['Protein1'])
    r1_list = split_cell(row['Residue1'])
    p2_list = split_cell(row['Protein2'])
    r2_list = split_cell(row['Residue2'])
    
    # If splitting returned an empty list, revert to original string.
    if not p1_list: p1_list = [row['Protein1']]
    if not r1_list: r1_list = [row['Residue1']]
    if not p2_list: p2_list = [row['Protein2']]
    if not r2_list: r2_list = [row['Residue2']]
    
    # If one column in a pair is single and the other multiple, replicate the single value.
    if len(p1_list) == 1 and len(r1_list) > 1:
        p1_list = p1_list * len(r1_list)
    elif len(r1_list) == 1 and len(p1_list) > 1:
        r1_list = r1_list * len(p1_list)
        
    if len(p2_list) == 1 and len(r2_list) > 1:
        p2_list = p2_list * len(r2_list)
    elif len(r2_list) == 1 and len(p2_list) > 1:
        r2_list = r2_list * len(p2_list)
    
    # Take the cross product between the two pairs (i.e. full Cartesian product).
    for (p1, r1) in product(p1_list, r1_list):
        for (p2, r2) in product(p2_list, r2_list):
            new_row = row.copy()
            new_row['Protein1'] = p1
            new_row['Residue1'] = r1
            new_row['Protein2'] = p2
            new_row['Residue2'] = r2
            new_rows.append(new_row)
    
    return new_rows

def explode_multiple_entries(df):
    """
    Expand the DataFrame by exploding rows that contain multiple entries in the key columns.
    """
    rows = []
    for _, row in df.iterrows():
        exploded = split_row(row)
        rows.extend(exploded)
    return pd.DataFrame(rows)

# Explode the DataFrame:
df_exploded = explode_multiple_entries(df_multi)
print("Exploded DataFrame:")
#print(df_exploded)
print(f"exploded data dimensions: {df_exploded.shape}")
# merge df_exploded into df_bifunctional - df_multi 
df_bifunctional = pd.concat([df_bifunctional, df_exploded], ignore_index=True)
#print("Final Bifunctional DataFrame:")
#print(df_bifunctional)
print(f"final bifunctional data dimensions: {df_bifunctional.shape}")

# Process rows and compare them, for example, if the pair DDI1-345 and DDI2-337 
# is repeated in two rows, then one of them need to be dropped, regardless of the order
# of the entries in the row, i.e. DDI1-345 and DDI2-337 is the same as DDI2-337 and DDI1-345
# Create a unique key by sorting the pairs (ignoring order)
# First, create the 'pair_key' as before.
df_bifunctional['pair_key'] = df_bifunctional.apply(
    lambda row: '_'.join(sorted([f"{row['Protein1']}-{row['Residue1']}",
                                  f"{row['Protein2']}-{row['Residue2']}"])),
    axis=1
)

# Calculate frequency for each pair_key.
frequency = df_bifunctional['pair_key'].value_counts()

# Map the frequency to each row.
df_bifunctional['frequency'] = df_bifunctional['pair_key'].map(frequency)

# Drop duplicate rows based on pair_key and then remove pair_key.
df_bifunctional_unique = df_bifunctional.drop_duplicates(subset=['pair_key']).drop(columns=['pair_key'])

print(f"bifunctional data dimensions after removing duplicates: {df_bifunctional_unique.shape}")
# Save to CSV
bifunctional_file = os.path.join(xls_data_dir, 'ddi_bifunctional_unique_processed.csv')
df_bifunctional_unique.to_csv(bifunctional_file, index=False)

bifunctional_file = os.path.join(xls_data_dir, 'ddi_bifunctional_processed.csv')
df_bifunctional.to_csv(bifunctional_file, index=False)

Exploded DataFrame:
exploded data dimensions: (68, 14)
final bifunctional data dimensions: (153, 14)
bifunctional data dimensions after removing duplicates: (74, 15)


In [6]:
# Now, we look for residue numbers from the dataframe that are less than 234 for DDI1 and less than 
# 230 for DDI2 as these are the NTD domains, and we only use these crosslinks for modeling
def filter_crosslinks(df):
    cond1 = ((df['Protein1'] == 'DDI1') & (df['Residue1'].astype(int) < 234)) | \
            ((df['Protein1'] == 'DDI2') & (df['Residue1'].astype(int) < 230))
    cond2 = ((df['Protein2'] == 'DDI1') & (df['Residue2'].astype(int) < 234)) | \
            ((df['Protein2'] == 'DDI2') & (df['Residue2'].astype(int) < 230))
    return df[cond1 & cond2]

df_filtered = filter_crosslinks(df_bifunctional_unique)
# arrange the df_filtered in ascending order based on Residue1, so all rows will be sorted
# based on this criteria
df_filtered = df_filtered.sort_values(
    by=['Residue1', 'Residue2'],
    key=lambda col: col.astype(int)
).reset_index(drop=True)
print(f"filtered bifunctional data dimensions: {df_filtered.shape}")
print(df_filtered)

# write to file
filtered_file = os.path.join(xls_data_dir, 'ddi_bifunctional_ntdomain_filtered.csv')
df_filtered.to_csv(filtered_file, index=False)


filtered bifunctional data dimensions: (17, 15)
   Protein1 Residue1 Protein2 Residue2 NR1_L_only NR1_Mixed NR1_H_only  \
0      DDI1       31     DDI1       31       0.81      0.51          1   
1      DDI1       31     DDI1       77          1      0.07       0.57   
2      DDI1       31     DDI1      133          1      0.15       0.57   
3      DDI1       31     DDI1      161          1       0.1       0.48   
4      DDI1       31     DDI1      213          1      0.12       0.45   
5      DDI2       67     DDI2       77          1      0.51       0.81   
6      DDI2       77     DDI2      153          1      0.07       0.75   
7      DDI1       77     DDI1      161          1      0.29       0.62   
8      DDI1       77     DDI1      213          1      0.09       0.67   
9      DDI1      133     DDI1      133          1      0.51       0.54   
10     DDI1      133     DDI1      161          1      0.23       0.57   
11     DDI1      133     DDI1      190          1      0.03     

In [7]:
# Process the dataframe further to create IMP input files for modeling

# read in the csv file
df_imp_process = pd.read_csv(os.path.join(xls_data_dir, 'final_bifunctional_data.csv'))
print(f"data dimensions for IMP processing: {df_imp_process.shape}")

# We are going to add three more columns to this dataframe: Copy1,Copy2,Iid
# For each row, if Protein1 == Protein2 and Residue1 == Residue2, then the crosslink 
# has to be between two copies of the same protein, then Copy1 and Copy2 will be 0 and 1,
# and Iid will be 1. Now, when we have Protein1 = Protein2 and Residue1 != Residue2, then
# Copy1 and Copy2 could be 0 and 1, or 1 and 0 so two new rows will be created, and Iid
# will be same number for both these rows which is continuously incremented for each new
# entry created starting from 1. case 3 is when Protein1 != Protein2, in this case, 
# Copy1 and Copy2 could be 0 and 0 or 1 and 1 or 0 and 1 or 1 and 0, so Copy1 and Copy2 will be
# 0 and 0, and Iid will be continuously incremented for each new entry

rows = []
iid = 1

for row in df_imp_process.to_dict(orient='records'):
    if row['Protein1'] == row['Protein2']:
        if row['Residue1'] == row['Residue2']:
            # Same protein and same residue:
            row.update({'Copy1': 0, 'Copy2': 1, 'Iid': iid})
            rows.append(row)
            iid += 1
        else:
            # Same protein but different residues, create two rows with complementary Copy assignments
            row1 = row.copy()
            row1.update({'Copy1': 0, 'Copy2': 1, 'Iid': iid})
            row2 = row.copy()
            row2.update({'Copy1': 1, 'Copy2': 0, 'Iid': iid})
            rows.extend([row1, row2])
            iid += 1
    else:
        # Different proteins:
        row.update({'Copy1': 0, 'Copy2': 0, 'Iid': iid})
        rows.append(row)
        iid += 1

df_imp_process = pd.DataFrame(rows)
print(df_imp_process)

# save to file
imp_file = os.path.join(xls_data_dir, 'ddi_bifunctional_imp_processed.csv')
df_imp_process.to_csv(imp_file, index=False)

data dimensions for IMP processing: (17, 4)
   Protein1  Residue1 Protein2  Residue2  Copy1  Copy2  Iid
0      DDI1        31     DDI1        31      0      1    1
1      DDI1        31     DDI1        77      0      1    2
2      DDI1        31     DDI1        77      1      0    2
3      DDI1        31     DDI1       133      0      1    3
4      DDI1        31     DDI1       133      1      0    3
5      DDI1        31     DDI1       161      0      1    4
6      DDI1        31     DDI1       161      1      0    4
7      DDI1        31     DDI1       213      0      1    5
8      DDI1        31     DDI1       213      1      0    5
9      DDI1        77     DDI1       161      0      1    6
10     DDI1        77     DDI1       161      1      0    6
11     DDI1        77     DDI1       213      0      1    7
12     DDI1        77     DDI1       213      1      0    7
13     DDI1       133     DDI1       133      0      1    8
14     DDI1       133     DDI1       161      0      1  