In [22]:
import os
import subprocess
import pandas as pd
from collections import defaultdict
import re

In [31]:
# ============================================================================
# CONFIGURATION
# ============================================================================
# Each query folder corresponds to a txt file
QUERY_CONFIGS = [
    {
        'query_folder': '/home/lead/notebooks/sol_up_pdb_files',  # Folder with PDB files (queries from sheet 1)
        'uniprot_txt': '/home/lead/notebooks/sol_up_ids.txt',      # Corresponding txt file with UniProt IDs
        'label': 'solubility_up'                              # Label for this query set
    },
    {
        'query_folder': '/home/lead/notebooks/sol_down_pdb_files',  # Folder with PDB files (queries from sheet 2)
        'uniprot_txt': '/home/lead/notebooks/sol_down_ids.txt',      # Corresponding txt file with UniProt IDs
        'label': 'solubility_down'                              # Label for this query set
    }
]

FOLDDISCO_INDEX = "/home/lead/notebooks/folddisco_index/index_murine_sk/index"  # Path to your Folddisco index
TOP_N = 50  # Number of top results to consider per query
THREADS = 4  # Number of threads for Folddisco
OUTPUT_DIR = "folddisco_results"  # Directory to store individual results

In [32]:
# ============================================================================
# SETUP
# ============================================================================

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [33]:
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def extract_uniprot_from_id(structure_id):
    """
    Extract UniProt ID from structure identifier.
    Handles formats like: AF-P09813-F1-model_v4.pdb or P09813_*.pdb
    """
    # Try pattern: AF-UNIPROT-*
    match = re.search(r'AF-([A-Z0-9]+)-', structure_id)
    if match:
        return match.group(1)
    
    # Try pattern: UNIPROT_*
    match = re.search(r'([A-Z0-9]+)_', structure_id)
    if match:
        return match.group(1)
    
    # Try pattern: just the uniprot ID
    match = re.search(r'([A-Z][0-9][A-Z0-9]{3,}[0-9])', structure_id)
    if match:
        return match.group(1)
    
    return None

def load_uniprot_ids(txt_file):
    """Load UniProt IDs from a text file."""
    with open(txt_file, 'r') as f:
        return set([line.strip() for line in f if line.strip()])

def run_folddisco_query(query_pdb, output_file):
    """
    Run Folddisco query for a single PDB file.
    """
    cmd = [
        "folddisco", "query",
        "-i", FOLDDISCO_INDEX,
        "-p", query_pdb,
        "-t", str(THREADS),
        "--per-structure",
        "--sort-by-score",
        "--header",
        "--skip-match"
    ]
    
    try:
        with open(output_file, 'w') as f:
            result = subprocess.run(cmd, stdout=f, stderr=subprocess.PIPE, text=True)
        
        if result.returncode != 0:
            print(f"  ERROR: {result.stderr}")
            return False
        return True
    except Exception as e:
        print(f"  ERROR running Folddisco: {e}")
        return False

def parse_folddisco_output(output_file, top_n, uniprot_ids_set):
    """
    Parse Folddisco output and return top N results with UniProt IDs.
    """
    try:
        df = pd.read_csv(output_file, sep='\t')
        
        # Take top N results
        df_top = df.head(top_n).copy()
        
        # Extract UniProt IDs from structure identifiers
        df_top['uniprot_id'] = df_top['id'].apply(extract_uniprot_from_id)
        
        # Filter to only those in our UniProt list
        df_filtered = df_top[df_top['uniprot_id'].isin(uniprot_ids_set)]
        
        return df_filtered
    except Exception as e:
        print(f"  ERROR parsing output: {e}")
        return pd.DataFrame()

In [34]:
# ============================================================================
# MAIN PROCESSING
# ============================================================================

# Storage for results across all datasets
all_results = []
query_summary = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

# Process each query configuration (folder + txt file pair)
for config_idx, config in enumerate(QUERY_CONFIGS, 1):
    query_folder = config['query_folder']
    uniprot_txt = config['uniprot_txt']
    label = config['label']
    
    print("="*70)
    print(f"PROCESSING CONFIGURATION {config_idx}/{len(QUERY_CONFIGS)}: {label}")
    print("="*70)
    print(f"Query folder: {query_folder}")
    print(f"UniProt file: {uniprot_txt}")
    
    # Load UniProt IDs for this dataset
    uniprot_ids_set = load_uniprot_ids(uniprot_txt)
    print(f"Loaded {len(uniprot_ids_set)} UniProt IDs\n")
    
    # Get all PDB files in query directory
    query_pdbs = [f for f in os.listdir(query_folder) 
                  if f.endswith('.pdb') and os.path.isfile(os.path.join(query_folder, f))]
    
    print(f"Found {len(query_pdbs)} query PDB files\n")
    
    # Create subdirectory for this dataset's results
    dataset_output_dir = os.path.join(OUTPUT_DIR, label)
    os.makedirs(dataset_output_dir, exist_ok=True)
    
    # Process each query PDB
    for i, pdb_file in enumerate(query_pdbs, 1):
        query_path = os.path.join(query_folder, pdb_file)
        output_file = os.path.join(dataset_output_dir, f"{pdb_file}.tsv")
        
        print(f"[{i}/{len(query_pdbs)}] Processing: {pdb_file}")
        
        # Extract query UniProt ID
        query_uniprot = extract_uniprot_from_id(pdb_file)
        
        # Run Folddisco
        if run_folddisco_query(query_path, output_file):
            print(f"  ✓ Query complete")
            
            # Parse results
            df_matches = parse_folddisco_output(output_file, TOP_N, uniprot_ids_set)
            
            print(f"  ✓ Found {len(df_matches)} matches in UniProt list")
            
            # Tally the matches
            for _, row in df_matches.iterrows():
                target_uniprot = row['uniprot_id']
                query_summary[label][query_uniprot][target_uniprot] += 1
                
                # Store detailed result
                all_results.append({
                    'dataset': label,
                    'query_pdb': pdb_file,
                    'query_uniprot': query_uniprot,
                    'target_uniprot': target_uniprot,
                    'target_structure': row['id'],
                    'idf_score': row['idf_score'],
                    'min_rmsd': row['min_rmsd'],
                    'max_node_cov': row['max_node_cov'],
                    'uniprot_source': uniprot_txt
                })
        else:
            print(f"  ✗ Query failed")
        
        print()

PROCESSING CONFIGURATION 1/2: solubility_up
Query folder: /home/lead/notebooks/sol_up_pdb_files
UniProt file: /home/lead/notebooks/sol_up_ids.txt
Loaded 274 UniProt IDs

Found 259 query PDB files

[1/259] Processing: AF-Q8CGY6-F1-model_v4.pdb
  ✓ Query complete
  ✓ Found 3 matches in UniProt list

[2/259] Processing: AF-Q99JX3-F1-model_v4.pdb
  ✓ Query complete
  ✓ Found 3 matches in UniProt list

[3/259] Processing: AF-Q8C0L9-F1-model_v4.pdb
  ✓ Query complete
  ✓ Found 3 matches in UniProt list

[4/259] Processing: AF-Q9JK48-F1-model_v4.pdb
  ✓ Query complete
  ✓ Found 5 matches in UniProt list

[5/259] Processing: AF-Q921M7-F1-model_v4.pdb
  ✓ Query complete
  ✓ Found 1 matches in UniProt list

[6/259] Processing: AF-O35344-F1-model_v4.pdb
  ✓ Query complete
  ✓ Found 2 matches in UniProt list

[7/259] Processing: AF-P28665-F1-model_v4.pdb
  ✓ Query complete
  ✓ Found 3 matches in UniProt list

[8/259] Processing: AF-Q9JKV1-F1-model_v4.pdb
  ✓ Query complete
  ✓ Found 2 matches in U

In [35]:
# ============================================================================
# CREATE SUMMARY DATAFRAMES
# ============================================================================

print("\n" + "="*70)
print("Creating summary dataframes...")

# Overall summary
summary_data = []
for dataset, query_dict in query_summary.items():
    for query_uniprot, target_counts in query_dict.items():
        for target_uniprot, count in target_counts.items():
            summary_data.append({
                'dataset': dataset,
                'query_uniprot_id': query_uniprot,
                'target_uniprot_id': target_uniprot,
                'appearance_count': count
            })

df_summary = pd.DataFrame(summary_data)
df_summary = df_summary.sort_values(['dataset', 'query_uniprot_id', 'appearance_count'], 
                                     ascending=[True, True, False])

# Create detailed results dataframe
df_detailed = pd.DataFrame(all_results)

# Save results
summary_output = "folddisco_summary.csv"
detailed_output = "folddisco_detailed_results.csv"

df_summary.to_csv(summary_output, index=False)
df_detailed.to_csv(detailed_output, index=False)

print(f"✓ Summary saved to: {summary_output}")
print(f"✓ Detailed results saved to: {detailed_output}")


Creating summary dataframes...
✓ Summary saved to: folddisco_summary.csv
✓ Detailed results saved to: folddisco_detailed_results.csv


In [36]:
# ============================================================================
# DISPLAY RESULTS
# ============================================================================

print("\n" + "="*70)
print("SUMMARY STATISTICS")
print("="*70)

for config in QUERY_CONFIGS:
    label = config['label']
    dataset_summary = df_summary[df_summary['dataset'] == label]
    dataset_detailed = df_detailed[df_detailed['dataset'] == label]
    
    print(f"\n{label}:")
    print(f"  Total queries processed: {len(dataset_summary['query_uniprot_id'].unique())}")
    print(f"  Total unique target proteins found: {len(dataset_summary['target_uniprot_id'].unique())}")
    print(f"  Total matches recorded: {len(dataset_detailed)}")

print("\n" + "="*70)
print("TOP 10 MOST FREQUENT TARGET PROTEINS (across all datasets)")
print("="*70)
top_targets = df_summary.groupby('target_uniprot_id')['appearance_count'].sum().sort_values(ascending=False).head(10)
print(top_targets)

print("\n" + "="*70)
print("SAMPLE OF SUMMARY DATAFRAME")
print("="*70)
print(df_summary.head(20))

# Return the dataframes for further analysis
print("\n✓ Complete! DataFrames available as: df_summary, df_detailed")


SUMMARY STATISTICS

solubility_up:
  Total queries processed: 259
  Total unique target proteins found: 259
  Total matches recorded: 768

solubility_down:
  Total queries processed: 193
  Total unique target proteins found: 193
  Total matches recorded: 1167

TOP 10 MOST FREQUENT TARGET PROTEINS (across all datasets)
target_uniprot_id
E9Q3L2    174
Q91V24    172
Q80SU7    171
F8VPN4    144
Q5U430    141
Q8K440    119
P70170     98
Q69ZN7     60
O35379     42
Q9JMH9     33
Name: appearance_count, dtype: int64

SAMPLE OF SUMMARY DATAFRAME
              dataset query_uniprot_id target_uniprot_id  appearance_count
1425  solubility_down       A0A075B5M7        A0A075B5M7                 1
1426  solubility_down       A0A075B5M7        A0A075B5N3                 1
1427  solubility_down       A0A075B5M7            P03977                 1
1428  solubility_down       A0A075B5M7            Q62230                 1
1429  solubility_down       A0A075B5M7        A0A075B680                 1
1430 