In [30]:
import pandas as pd
import re
import requests
import math
import time
import os, re

In [31]:
# Load Excel + Extract UniProt IDs

ref_xlsx = "/home/akandalkar/reference list_all detected proteins.xlsx"
df = pd.read_excel(ref_xlsx, header=1)

# Extract UniProt IDs from "Protein Accession #" (e.g. sp|Q61738-2|ITA7_MOUSE → Q61738)
def extract_uniprot(acc):
    if isinstance(acc, str):
        match = re.search(r"\|([A-Z0-9]+)(?:-\d+)?\|", acc)
        if match:
            return match.group(1)
    return None

df["UniProt_ID"] = df["Protein Accession #"].apply(extract_uniprot)

# Drop NAs, get unique IDs
uniprot_ids = df["UniProt_ID"].dropna().unique().tolist()
print(f"Total UniProt IDs extracted: {len(uniprot_ids)}")

Total UniProt IDs extracted: 4350


In [38]:
saranya_folder = "/home/akandalkar/af_mouse_latest"

available_uniprot_ids = set()
for root, _, files in os.walk(saranya_folder):
    for file in files:
        if file.lower().endswith((".pdb.gz", ".cif.gz")):
            match = re.search(r"AF-([A-Z0-9]+)", file)
            if match:
                available_uniprot_ids.add(match.group(1))

print("Available UniProt IDs from folder:", len(available_uniprot_ids))


Available UniProt IDs from folder: 21615


In [39]:
# From your Excel dataframe
df_uniprot_ids = set(df["UniProt_ID"].dropna())

filtered_ids = sorted(df_uniprot_ids.intersection(available_uniprot_ids))
print("Overlap between Excel UniProt IDs and Saranya’s files:", len(filtered_ids))


Overlap between Excel UniProt IDs and Saranya’s files: 3923


In [40]:
with open("ref_uniprot_ids_filtered.txt", "w") as f:
    for uid in filtered_ids:
        f.write(uid + "\n")

print("Saved filtered UniProt IDs to ref_uniprot_ids_filtered.txt")

Saved filtered UniProt IDs to ref_uniprot_ids_filtered.txt


In [41]:
df.head()

Unnamed: 0,Protein Group#,Protein Accession #,Protein Description,GN,UniProt_ID,PDB_IDs
0,SJPG102208.001,sp|A2A432|CUL4B_MOUSE,Cullin-4B OS=Mus musculus GN=Cul4b PE=1 SV=1,Cul4b,A2A432,[]
1,SJPG104379.001,sp|A2A5R2|BIG2_MOUSE,Brefeldin A-inhibited guanine nucleotide-excha...,Arfgef2,A2A5R2,[]
2,SJPG106459.001,sp|A2A791|ZMYM4_MOUSE,Zinc finger MYM-type protein 4 OS=Mus musculus...,Zmym4,A2A791,[]
3,SJPG111204.002,sp|A2A863-2|ITB4_MOUSE,Isoform 2 of Integrin beta-4 OS=Mus musculus G...,Itgb4,A2A863,[]
4,SJPG111204.001,sp|A2A863|ITB4_MOUSE,Integrin beta-4 OS=Mus musculus GN=Itgb4 PE=2 ...,Itgb4,A2A863,[]


In [61]:
folder = "/home/akandalkar/af_mouse_latest"

# Count only files (ignores subdirectories)
num_files = len([f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))])
print("Number of files:", num_files)

Number of files: 43231


# KR Addition. This will move the matched files over to a new directory in lead team folder

In [67]:
# Import required libraries
import os
import shutil
import glob

In [68]:
# Configure your paths here
PDB_DIR = "/home/akandalkar/af_mouse_latest"  # Directory containing your PDB files
UNIPROT_FILE = "ref_uniprot_ids_filtered.txt"  # Your text file with UniProt IDs
OUTPUT_DIR = "ref_pdb_files"  # Output directory for matched files

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [69]:
# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Read UniProt IDs from file
with open(UNIPROT_FILE, 'r') as f:
    uniprot_ids = [line.strip() for line in f if line.strip()]

print(f"Found {len(uniprot_ids)} UniProt IDs to search for\n")

Found 3923 UniProt IDs to search for



In [None]:
# Counter for matches
matched_files = 0
not_found = []

# Search for each UniProt ID
for uniprot_id in uniprot_ids:
    # Create search pattern: *[uniprot_id]*.pdb
    os.system(f"cp {PDB_DIR}/*{uniprot_id}* ref_pdb_files")

In [75]:
#Remove the *cif.gz files and unzip all *pdb.gz files
os.system(f"rm ref_pdb_files/*cif.gz")
os.system(f"gunzip ref_pdb_files/*pdb.gz")

0

In [76]:
# Summary
print(f"\n{'='*60}")
print(f"Summary:")
print(f"Total UniProt IDs searched: {len(uniprot_ids)}")


Summary:
Total UniProt IDs searched: 3923


In [None]:
# Make another sub folder with only the ones from up&down

In [None]:
import pandas as pd
import re

In [77]:
# Configure your paths
EXCEL_FILE = "/home/akandalkar/Demontis example_Up & down proteins for proteome enrichment.xlsx"  # Path to your Excel file
OUTPUT_FILE_SHEET1 = "sol_up_ids.txt"  # Output for first sheet
OUTPUT_FILE_SHEET2 = "sol_down_ids.txt"  # Output for second sheet

In [78]:
def extract_uniprot_id(text):
    """
    Extract UniProt ID from format like 'Apoa2_sp|P09813|APOA2_MOUSE'
    Returns the middle part (P09813)
    """
    if pd.isna(text):
        return None
    
    text = str(text)
    # Match pattern: something|UNIPROT_ID|something
    match = re.search(r'\|([A-Z0-9]+)\|', text)
    if match:
        return match.group(1)
    return None

def process_sheet(df, sheet_name):
    """
    Process a sheet to extract and deduplicate UniProt IDs.
    Keeps the first occurrence (highest placement) of each ID.
    """
    uniprot_ids = []
    seen = set()
    
    # Search through all columns and rows
    for idx, row in df.iterrows():
        for col in df.columns:
            cell_value = row[col]
            uniprot_id = extract_uniprot_id(cell_value)
            
            if uniprot_id and uniprot_id not in seen:
                uniprot_ids.append(uniprot_id)
                seen.add(uniprot_id)
    
    print(f"{sheet_name}:")
    print(f"  Total unique UniProt IDs found: {len(uniprot_ids)}")
    
    return uniprot_ids

In [79]:
# Read Excel file
excel_file = pd.ExcelFile(EXCEL_FILE)
sheet_names = excel_file.sheet_names

print(f"Found {len(sheet_names)} sheets in Excel file")
print(f"Sheet names: {sheet_names}\n")

Found 2 sheets in Excel file
Sheet names: ['up', 'down']



In [80]:
# Process first sheet
df_sheet1 = pd.read_excel(EXCEL_FILE, sheet_name=sheet_names[0])
uniprot_ids_sheet1 = process_sheet(df_sheet1, sheet_names[0])

# Save to text file
with open(OUTPUT_FILE_SHEET1, 'w') as f:
    for uid in uniprot_ids_sheet1:
        f.write(f"{uid}\n")
print(f"  Saved to: {OUTPUT_FILE_SHEET1}\n")

# Process second sheet
df_sheet2 = pd.read_excel(EXCEL_FILE, sheet_name=sheet_names[1])
uniprot_ids_sheet2 = process_sheet(df_sheet2, sheet_names[1])

# Save to text file
with open(OUTPUT_FILE_SHEET2, 'w') as f:
    for uid in uniprot_ids_sheet2:
        f.write(f"{uid}\n")
print(f"  Saved to: {OUTPUT_FILE_SHEET2}\n")

print("="*60)
print("Complete!")
print(f"Sheet 1 ({sheet_names[0]}): {len(uniprot_ids_sheet1)} unique IDs")
print(f"Sheet 2 ({sheet_names[1]}): {len(uniprot_ids_sheet2)} unique IDs")

up:
  Total unique UniProt IDs found: 274
  Saved to: sol_up_ids.txt

down:
  Total unique UniProt IDs found: 227
  Saved to: sol_down_ids.txt

Complete!
Sheet 1 (up): 274 unique IDs
Sheet 2 (down): 227 unique IDs


# we will now make seperate folders for the sol up and down proteins to evaluate using folddisco

In [81]:
# Import required libraries
import os
import shutil
import glob

In [82]:
# Configure your paths here
PDB_DIR = "/home/akandalkar/af_mouse_latest"  # Directory containing your PDB files
UNIPROT_FILE_up = "sol_up_ids.txt"  # Your text file with UniProt IDs
UNIPROT_FILE_down = "sol_down_ids.txt"  # Your text file with UniProt IDs
OUTPUT_DIR_up = "sol_up_pdb_files"  # Output directory for matched files
OUTPUT_DIR_down = "sol_down_pdb_files"  # Output directory for matched files

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR_up, exist_ok=True)
os.makedirs(OUTPUT_DIR_down, exist_ok=True)

In [83]:
# Read UniProt IDs from file
with open(UNIPROT_FILE_up, 'r') as f:
    uniprot_ids_up = [line.strip() for line in f if line.strip()]

print(f"Found {len(uniprot_ids_up)} UniProt IDs to search for\n")

Found 274 UniProt IDs to search for



In [84]:
# Read UniProt IDs from file
with open(UNIPROT_FILE_down, 'r') as f:
    uniprot_ids_down = [line.strip() for line in f if line.strip()]

print(f"Found {len(uniprot_ids_down)} UniProt IDs to search for\n")

Found 227 UniProt IDs to search for



In [85]:
# copy UP prots over missing 15 prot structures
# Counter for matches
matched_files = 0
not_found = []

# Search for each UniProt ID
for uniprot_id in uniprot_ids_up:
    # Create search pattern: *[uniprot_id]*.pdb
    os.system(f"cp {PDB_DIR}/*{uniprot_id}* {OUTPUT_DIR_up}")

cp: cannot stat '/home/akandalkar/af_mouse_latest/*Q91VB8*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*P63300*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*Q61001*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*A2ACG7*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*D3YV81*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*F6UV57*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*B7ZC71*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*F7BC60*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*F8WIV2*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*F7A856*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*E9PWE8*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_

In [86]:
#Remove the *cif.gz files and unzip all *pdb.gz files
os.system(f"rm {OUTPUT_DIR_up}/*cif.gz")
os.system(f"gunzip {OUTPUT_DIR_up}/*pdb.gz")

0

In [87]:
# copy DOWN prots over missing 35 prot structures
# Counter for matches
matched_files = 0
not_found = []

# Search for each UniProt ID
for uniprot_id in uniprot_ids_down:
    # Create search pattern: *[uniprot_id]*.pdb
    os.system(f"cp {PDB_DIR}/*{uniprot_id}* {OUTPUT_DIR_down}")

cp: cannot stat '/home/akandalkar/af_mouse_latest/*Q5SWZ5*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*E9QQ93*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*A2A4P4*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*V9GXW5*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*P01899*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*Q8BKX6*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*Q3UE92*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*E9PUR2*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*P01900*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*F6Z1R4*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_latest/*F6ZGN3*': No such file or directory
cp: cannot stat '/home/akandalkar/af_mouse_

In [88]:
#Remove the *cif.gz files and unzip all *pdb.gz files
os.system(f"rm {OUTPUT_DIR_down}/*cif.gz")
os.system(f"gunzip {OUTPUT_DIR_down}/*pdb.gz")

0