In [1]:
!pip install torch_geometric
!pip install biopython
!pip install networkx

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m61.4/63.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1
Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [15]:
import os
import shutil
import requests
import subprocess
from pathlib import Path
from Bio.PDB import PDBParser, PDBIO
import torch
from torch_geometric.data import Data
import networkx as nx
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import combinations
from pathlib import Path

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Load dataset:

In [4]:
df_fireprot = pd.read_csv("/content/drive/MyDrive/707/data/fireprotdb_results.csv")
df_fireprot.shape

  df_fireprot = pd.read_csv("/content/drive/MyDrive/707/data/fireprotdb_results.csv")


(53445, 35)

Filter missing $\Delta\Delta G$ values:

In [5]:
df_fireprot_w_ddG = df_fireprot[df_fireprot["ddG"].notna()]
df_fireprot_w_ddG.shape

(39177, 35)

Dropping duplicates:

In [6]:
df_fireprot_no_dupl = df_fireprot_w_ddG.drop_duplicates()
df_fireprot_no_dupl.shape

(12131, 35)

Dropping proteins with missing ids:

In [7]:
df_fireprot_w_pnames = df_fireprot_no_dupl[df_fireprot_no_dupl["protein_name"].notna()]
df_fireprot_w_uniprot_id = df_fireprot_w_pnames[df_fireprot_w_pnames["uniprot_id"].notna()]
df_fireprot_w_uniprot_id.shape

(12090, 35)

Sometimes, multiple $\Delta\Delta G$ entries per pair (wt,mutated) (corresponding to different experimental conditions):

In [8]:
subset_cols = ["protein_name","chain","position", "wild_type", "mutation"]
df_groupby = df_fireprot_w_uniprot_id.groupby(subset_cols)
df_fireprot_median_ddG = pd.DataFrame()
exp_ids = []
p_names = []
uniprot_ids = []
pbd_ids = []
chains = []
positions = []
wild_types = []
mutations = []
ddGs = []
sequences = []
is_in_cat_pock = []
is_ess = []
for name, group in tqdm(df_groupby):
  median_ddG_id = group["ddG"].median()
  group["dist_to_median"] = group["ddG"].apply(lambda x: abs(x-median_ddG_id))
  group_sorted = group.sort_values(by="dist_to_median")
  exp_ids.append(group_sorted.iloc[0]["experiment_id"])
  p_names.append(group_sorted.iloc[0]["protein_name"])
  uniprot_ids.append(group_sorted.iloc[0]["uniprot_id"])
  pbd_ids.append(group_sorted.iloc[0]["pdb_id"])
  positions.append(group_sorted.iloc[0]["position"])
  wild_types.append(group_sorted.iloc[0]["wild_type"])
  mutations.append(group_sorted.iloc[0]["mutation"])
  ddGs.append(group_sorted.iloc[0]["ddG"])
  sequences.append(group_sorted.iloc[0]["sequence"])
  is_in_cat_pock.append(group_sorted.iloc[0]["is_in_catalytic_pocket"])
  is_ess.append(group_sorted.iloc[0]["is_essential"])
  chains.append(group_sorted.iloc[0]["chain"])
  # print(group_sorted)

df_fireprot_median_ddG = pd.DataFrame({
    "experiment_id": exp_ids,
    "protein_name": p_names,
    "uniprot_id": uniprot_ids,
    "pdb_id": pbd_ids,
    "chain": chains,
    "position": positions,
    "wild_type": wild_types,
    "mutation": mutations,
    "ddG": ddGs,
    "sequence": sequences,
    "is_in_catalytic_pocket": is_in_cat_pock,
    "is_essential": is_ess
})

100%|██████████| 5086/5086 [00:08<00:00, 595.74it/s]


Include the mutated sequence:

In [9]:
def mutate_sequence(row):
  sequence = list(row["sequence"])
  sequence[row["position"]-1] = row["mutation"]
  return "".join(sequence)

In [10]:
df_fireprot_median_ddG["mt_sequence"] = df_fireprot_median_ddG.apply(mutate_sequence, axis=1)

In [11]:
df_fireprot_median_ddG

Unnamed: 0,experiment_id,protein_name,uniprot_id,pdb_id,chain,position,wild_type,mutation,ddG,sequence,is_in_catalytic_pocket,is_essential,mt_sequence
0,PT017913,10 kDa chaperonin,P0A6F9,1AON|1AON|1AON|1AON|1AON|1AON|1AON|1AON|1AON|1...,A,3,I,C,0.40,MNIRPLHDRVIVKRKEVETKSAGGIVLTGSAAAKSTRGEVLAVGNG...,False,False,MNCRPLHDRVIVKRKEVETKSAGGIVLTGSAAAKSTRGEVLAVGNG...
1,PT017914,10 kDa chaperonin,P0A6F9,1AON|1AON|1AON|1AON|1AON|1AON|1AON|1AON|1AON|1...,A,3,I,W,0.40,MNIRPLHDRVIVKRKEVETKSAGGIVLTGSAAAKSTRGEVLAVGNG...,False,False,MNWRPLHDRVIVKRKEVETKSAGGIVLTGSAAAKSTRGEVLAVGNG...
2,VB01861,10 kDa chaperonin,P0A6F9,1AON|1AON|1AON|1AON|1AON|1AON|1AON|1AON|1AON|1...,A,48,I,W,0.20,MNIRPLHDRVIVKRKEVETKSAGGIVLTGSAAAKSTRGEVLAVGNG...,False,False,MNIRPLHDRVIVKRKEVETKSAGGIVLTGSAAAKSTRGEVLAVGNG...
3,PT017915,10 kDa chaperonin,P0A6F9,1AON|1AON|1AON|1AON|1AON|1AON|1AON|1AON|1AON|1...,A,95,V,C,0.30,MNIRPLHDRVIVKRKEVETKSAGGIVLTGSAAAKSTRGEVLAVGNG...,False,False,MNIRPLHDRVIVKRKEVETKSAGGIVLTGSAAAKSTRGEVLAVGNG...
4,VB05720,30S ribosomal protein S6,P23370,1RIS|1RIS|1RIS|1RIS,A,6,V,A,2.40,MRRYEVNIVLNPNLDQSQLALEKEIIQRALENYGARVEKVEELGLR...,False,False,MRRYEANIVLNPNLDQSQLALEKEIIQRALENYGARVEKVEELGLR...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5081,PT008738,U1 small nuclear ribonucleoprotein A,P09012,1OIA,A,86,Y,F,2.60,MAVPETRPNHTIYINNLNEKIKKDELKKSLYAIFSQFGQILDILVS...,False,False,MAVPETRPNHTIYINNLNEKIKKDELKKSLYAIFSQFGQILDILVS...
5082,PT008739,U1 small nuclear ribonucleoprotein A,P09012,1OIA,A,86,Y,T,2.90,MAVPETRPNHTIYINNLNEKIKKDELKKSLYAIFSQFGQILDILVS...,False,False,MAVPETRPNHTIYINNLNEKIKKDELKKSLYAIFSQFGQILDILVS...
5083,PT018889,Villin-1,P02640,1QQV|1YU5,A,791,H,Y,-1.10,MVELSKKVTGKLDKTTPGIQIWRIENMEMVPVPTKSYGNFYEGDCY...,False,False,MVELSKKVTGKLDKTTPGIQIWRIENMEMVPVPTKSYGNFYEGDCY...
5084,VB00068,cAMP-activated global transcriptional regulato...,P0ACJ8,1G6N|1G6N,A,129,S,A,0.30,MVLGKPQTDPTLEWFLSHCHIHKYPSKSTLIHQGEKAETLYYIVKG...,True,True,MVLGKPQTDPTLEWFLSHCHIHKYPSKSTLIHQGEKAETLYYIVKG...


In [12]:


# Map standard amino acids to one-hot vectors
AA_LIST = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY',
           'HIS', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER',
           'THR', 'TRP', 'TYR', 'VAL']
AA_TO_IDX = {aa: i for i, aa in enumerate(AA_LIST)}

def aa_one_hot(resname):
    vec = [0] * 20
    if resname in AA_TO_IDX:
        vec[AA_TO_IDX[resname]] = 1
    return vec

def pdb_to_graph(pdb_path, distance_threshold=8.0):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("protein", pdb_path)
    model = structure[0]

    nodes = []
    coords = []

    # Extract C-alpha coordinates and residue info
    for chain in model:
        for res in chain:
            if res.get_resname() not in AA_LIST:
                continue
            if 'CA' not in res:
                continue  # skip non-standard residues
            ca = res['CA'].get_coord()
            coords.append(ca)
            nodes.append({
                'feat': aa_one_hot(res.get_resname()),
                'coord': ca
            })

    coords = np.array(coords)
    num_nodes = len(nodes)

    # Create edges based on distance threshold
    edge_index = []
    for i, j in combinations(range(num_nodes), 2):
        dist = np.linalg.norm(coords[i] - coords[j])
        if dist < distance_threshold:
            edge_index.append([i, j])
            edge_index.append([j, i])

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    x = torch.tensor([n['feat'] for n in nodes], dtype=torch.float)
    pos = torch.tensor(coords, dtype=torch.float)

    return Data(x=x, edge_index=edge_index, pos=pos)

In [56]:
TMP_DIR = f"/content/tmp/"
FOLDX_PATH = "/content/foldx/foldx_20251231"
ROTABASE_SRC = "/content/foldx/rotabase.txt"  # Path to rotabase.txt

def process_protein_entry(uniprot_id, mutation_df):

    # os.makedirs(TMP_DIR + uniprot_id, exist_ok=True)
    # Step 0: Copy rotabase.txt

    # os.symlink(ROTABASE_SRC, os.path.join(TMP_DIR, "rotabase.txt"), )

    # Step 1: Download WT PDB
    pdb_url = f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb"
    pdb_path = os.path.join(TMP_DIR, f"{uniprot_id}.pdb")
    pdb_filename = f"{uniprot_id}.pdb"
    r = requests.get(pdb_url)
    with open(pdb_path, 'wb') as f:
        f.write(r.content)

    # Step 2: Repair WT structure with FoldX
    # subprocess.run([
    #     FOLDX_PATH, "--command=RepairPDB", f"--pdb={pdb_filename}", "--output-dir=" + TMP_DIR
    # ])
    # repaired_path = os.path.join(TMP_DIR, f"{uniprot_id}_Repair.pdb")

    # Step 3: Convert WT structure to graph
    # wt_graph = pdb_to_graph(repaired_path)
    wt_graph = pdb_to_graph(pdb_path)

    result = []

    # Step 4: Loop through each mutation
    for _, row in mutation_df.iterrows():
        mutation = row['wild_type'] + row["chain"] + str(row['position']) + row['mutation']
        ddg = row['ddG']

        with open(os.path.join(TMP_DIR, "individual_list.txt"), "w") as f:
            f.write(f"{mutation};\n")

        subprocess.run([
            FOLDX_PATH, "--command=BuildModel",
            # f"--pdb={Path(repaired_path).name}",
            f"--pdb={Path(pdb_path).name}",
            "--mutant-file=individual_list.txt",
            "--output-dir=" + TMP_DIR,
            "--numberOfRuns=1"
        ])

        # mutant_pdb_path = os.path.join(TMP_DIR, f"Repair_{uniprot_id}_1.pdb")
        # if not os.path.exists(mutant_pdb_path):
        #     print(f"[Warning] Mutation {mutation} failed for {uniprot_id}")
        #     continue
        generated_path = os.path.join(TMP_DIR, f"WT_{uniprot_id}_1.pdb")
        renamed_path = os.path.join(TMP_DIR, f"{uniprot_id}_{mutation}.pdb")

        if not os.path.exists(generated_path):
            print(f"[Warning] Mutation {mutation} failed for {uniprot_id}")
            continue
        print(f"Mutation {mutation} succeeded for {uniprot_id}")
        # Rename to avoid overwriting
        os.rename(generated_path, renamed_path)


        mutant_graph = pdb_to_graph(renamed_path)
        metadata_dict = row.to_dict()
        result.append((wt_graph, mutant_graph, ddg, metadata_dict))

        # for f in os.listdir(TMP_DIR):
        #   if f.endswith(".fxout") or f.endswith(".pdb") or f.startswith("indiv"):
        #       os.remove(os.path.join(TMP_DIR, f))

    return result

In [13]:
!mkdir -p /content/foldx
!unzip "/content/drive/MyDrive/BMI_707_Project/foldx_1Linux64_0.zip" -d /content/foldx
!chmod +x /content/foldx

Archive:  /content/drive/MyDrive/BMI_707_Project/foldx_1Linux64_0.zip
 extracting: /content/foldx/yasaraPlugin.zip  
  inflating: /content/foldx/foldx_20251231  
  inflating: /content/foldx/rotabase.txt  


In [78]:
df_temp = df_fireprot_median_ddG[df_fireprot_median_ddG["uniprot_id"] == "Q9REI6"]

In [79]:
uniprot_id = "Q9REI6"
pdb_url = f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb"
pdb_path = os.path.join(TMP_DIR, f"{uniprot_id}.pdb")
pdb_filename = f"{uniprot_id}.pdb"
r = requests.get(pdb_url)
with open(pdb_path, 'wb') as f:
  f.write(r.content)

In [80]:
wt = pdb_to_graph(pdb_path)

In [81]:
row = df_temp.iloc[0]
mutation = row['wild_type'] + row["chain"] + str(row['position']) + row['mutation']
ddg = row['ddG']

with open(os.path.join(TMP_DIR, "individual_list.txt"), "w") as f:
    f.write(f"{mutation};\n")

subprocess.run([
            FOLDX_PATH, "--command=BuildModel",
            # f"--pdb={Path(repaired_path).name}",
            f"--pdb={Path(pdb_path).name}",
            "--mutant-file=individual_list.txt",
            "--output-dir=" + TMP_DIR,
            "--numberOfRuns=1"
        ])

CompletedProcess(args=['/content/foldx/foldx_20251231', '--command=BuildModel', '--pdb=Q9REI6.pdb', '--mutant-file=individual_list.txt', '--output-dir=/content/tmp/', '--numberOfRuns=1'], returncode=0)

In [76]:
graph = pdb_to_graph("/content/tmp/Q9REI6_1.pdb")

In [59]:
g_wt = pdb_to_graph("/content/tmp/Q9REI6.pdb")
g_mut = pdb_to_graph("/content/tmp/Q9REI6_GA130P.pdb")

In [65]:
u = g_wt.x
v = g_mut.x
torch.sum(torch.abs(u - v))

tensor(0.)

In [24]:
df_groupby_protein = df_fireprot_median_ddG.groupby("uniprot_id")
count = 0
all_results = []
os.makedirs(TMP_DIR, exist_ok=True)
shutil.copy(ROTABASE_SRC, os.path.join(TMP_DIR, "rotabase.txt"))
os.chdir(TMP_DIR)
for name, group in tqdm(df_groupby_protein):

  print(name)

  results = process_protein_entry(name, group)
  all_results.extend(results)
  count += 1
  if count > 0:
    break

  0%|          | 0/174 [00:00<?, ?it/s]

O74035



0it [00:00, ?it/s][A
1it [00:01,  1.13s/it][A
2it [00:08,  4.84s/it][A
3it [00:17,  6.74s/it][A
4it [00:28,  8.28s/it][A
5it [00:30,  6.02s/it][A
6it [00:32,  4.80s/it][A
7it [00:34,  3.90s/it][A
8it [00:36,  3.14s/it][A
9it [00:38,  2.77s/it][A
10it [00:39,  2.44s/it][A
11it [00:57,  7.11s/it][A
12it [00:58,  5.33s/it][A
13it [01:30, 13.24s/it][A
14it [01:32,  6.59s/it]
  1%|          | 1/174 [01:32<4:27:13, 92.68s/it]

P00004



0it [00:00, ?it/s][A
1it [00:01,  1.88s/it][A
2it [00:05,  2.76s/it][A
3it [00:09,  3.60s/it][A
4it [00:12,  3.32s/it][A
5it [00:22,  5.48s/it][A
6it [00:30,  6.55s/it][A
7it [00:33,  5.31s/it][A
8it [00:48,  8.47s/it][A
9it [00:51,  6.79s/it][A
10it [01:07,  6.73s/it]
  1%|          | 1/174 [02:40<7:42:11, 160.30s/it]


In [94]:
from Bio.PDB import PDBParser

parser = PDBParser(QUIET=True)
structure = parser.get_structure("AF", "Q9REI6.pdb")
chains = set()

for model in structure:
    for chain in model:
        chains.add(chain.id)

print("Chains found:", chains)

Chains found: {'A'}


In [46]:
for idx in range(len(result)):
  poswt = result[idx][0]
  posmut= result[idx][1]
  print(np.sum([poswt.x - posmut.x]))

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [14]:
#make the full loop
#run on all structures
#not necessary to save files
#just store a meta_data dict in the structure
#get a huge list, need to save it in a dataset object
#clear everything in the folder.