In [4]:
import pandas as pd

# Load the dataset (change path if needed)
df = pd.read_csv("data/fireprotdb_results.csv")

# Preview the data
print(df.head())

  experiment_id             protein_name uniprot_id pdb_id chain  position  \
0      LL000001  Haloalkane dehalogenase     P59336   1CQW     A       245   
1      LL000002  Haloalkane dehalogenase     P59336   1CQW     A        95   
2      LL000004  Haloalkane dehalogenase     P59336   1CQW     A       176   
3      LL000005  Haloalkane dehalogenase     P59336   1CQW     A       171   
4      LL000006  Haloalkane dehalogenase     P59336   1CQW     A       148   

  wild_type mutation  ddG  dTm  ...  technique  technique_details  pH    tm  \
0         V        L  NaN  2.1  ...        NaN                NaN NaN  52.5   
1         L        V  NaN -0.4  ...        NaN                NaN NaN  50.0   
2         C        F  NaN  5.2  ...        NaN                NaN NaN  55.6   
3         G        Q  NaN  3.1  ...        NaN                NaN NaN  53.5   
4         T        L  NaN  1.1  ...        NaN                NaN NaN  51.5   

   notes  publication_doi  publication_pubmed  hsw_job_i

  df = pd.read_csv("data/fireprotdb_results.csv")


In [5]:
print(df.columns)

Index(['experiment_id', 'protein_name', 'uniprot_id', 'pdb_id', 'chain',
       'position', 'wild_type', 'mutation', 'ddG', 'dTm', 'is_curated', 'type',
       'derived_type', 'interpro_families', 'conservation', 'is_essential',
       'correlated_positions', 'is_back_to_consensus', 'secondary_structure',
       'asa', 'is_in_catalytic_pocket', 'is_in_tunnel_bottleneck', 'b_factor',
       'method', 'method_details', 'technique', 'technique_details', 'pH',
       'tm', 'notes', 'publication_doi', 'publication_pubmed', 'hsw_job_id',
       'datasets', 'sequence'],
      dtype='object')


In [8]:
protein_counts_1 = df["protein_name"].value_counts().reset_index()
protein_counts_1

Unnamed: 0,protein_name,count
0,Subtilisin-chymotrypsin inhibitor-2A,12276
1,Tyrosine-protein kinase Fyn,2488
2,Halohydrin dehalogenase,2480
3,ADHA,2404
4,Immunoglobulin G-binding protein G,2220
...,...,...
205,S-adenosylmethionine synthase isoform type-1,1
206,Non-specific lipid-transfer protein,1
207,Tetracycline repressor protein class D,1
208,Hydrolase,1


In [9]:
from Bio.PDB import PDBList, PDBParser

# Download structure from RCSB
pdb_id = "1CQW"  # example: replace with actual PDB ID
pdbl = PDBList()
pdbl.retrieve_pdb_file(pdb_id, file_format="pdb", pdir="./structures")

# Parse the structure
parser = PDBParser()
structure = parser.get_structure(pdb_id, f"./structures/pdb{pdb_id.lower()}.ent")

# Print chain IDs
for model in structure:
    for chain in model:
        print("Chain:", chain.id)

Structure exists: './structures/pdb1cqw.ent' 
Chain: A


In [10]:
df_filtered = df[df['ddG'].notnull()]

df_filtered.shape

(39177, 35)

In [11]:
protein_counts = df_filtered["protein_name"].value_counts().reset_index()
protein_counts

Unnamed: 0,protein_name,count
0,Subtilisin-chymotrypsin inhibitor-2A,11160
1,Immunoglobulin G-binding protein G,2158
2,Tryptophan synthase alpha chain,1915
3,Thermonuclease,1857
4,10 kDa chaperonin,1764
...,...,...
154,Non-specific lipid-transfer protein,1
155,Tetracycline repressor protein class D,1
156,Alpha-amylase,1
157,"Ferredoxin, heterocyst",1


In [31]:
columns_to_keep = [
    'experiment_id',             # for tracking/logging
    'protein_name',              # useful for protein-level filtering
    'uniprot_id',                # for linking external annotations
    'pdb_id',                    # required for structure
    'chain',                     # required to select chain from PDB
    'position',                  # residue index of mutation
    'wild_type',                 # original amino acid
    'mutation',                  # mutated amino acid
    'ddG',                       # target variable
    'sequence',                  # full wild-type sequence
    'is_in_catalytic_pocket',    # core binary feature (100% coverage)
    'is_essential'               # core binary feature (100% coverage)
]
df_subset = df_filtered[columns_to_keep]

df_subset = df_subset.dropna()


In [32]:
df_subset.isna().sum()


experiment_id             0
protein_name              0
uniprot_id                0
pdb_id                    0
chain                     0
position                  0
wild_type                 0
mutation                  0
ddG                       0
sequence                  0
is_in_catalytic_pocket    0
is_essential              0
dtype: int64

In [34]:
print(df_subset.head())


     experiment_id                        protein_name uniprot_id  \
5818      LL000714  Immunoglobulin G-binding protein G     P06654   
5819      LL000714  Immunoglobulin G-binding protein G     P06654   
5820      LL000715  Immunoglobulin G-binding protein G     P06654   
5821      LL000715  Immunoglobulin G-binding protein G     P06654   
5822      LL000716  Immunoglobulin G-binding protein G     P06654   

              pdb_id chain  position wild_type mutation   ddG  \
5818  1PGA|1EM7|2GB1     A         1         M        A -0.14   
5819  1PGA|1EM7|2GB1     A         1         M        A -0.14   
5820  1PGA|1EM7|2GB1     A         1         M        D -0.38   
5821  1PGA|1EM7|2GB1     A         1         M        D -0.38   
5822  1PGA|1EM7|2GB1     A         1         M        E -0.64   

                                               sequence  \
5818  MEKEKKVKYFLRKSAFGLASVSAAFLVGSTVFAVDSPIEDTPIIRN...   
5819  MEKEKKVKYFLRKSAFGLASVSAAFLVGSTVFAVDSPIEDTPIIRN...   
5820  MEKEKKVKYFL

In [35]:
print(df_subset.columns)

Index(['experiment_id', 'protein_name', 'uniprot_id', 'pdb_id', 'chain',
       'position', 'wild_type', 'mutation', 'ddG', 'sequence',
       'is_in_catalytic_pocket', 'is_essential'],
      dtype='object')


In [14]:
##### Parse Structure and Get Residue Coordinates

from Bio.PDB import PDBParser
import numpy as np

def extract_residue_coords(pdb_file, chain_id):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('protein', pdb_file)
    model = structure[0]  # first model

    coords = {}
    for residue in model[chain_id]:
        if 'CA' in residue:  # alpha carbon only
            res_id = residue.get_id()[1]
            coords[res_id] = residue['CA'].get_coord()

    return coords


In [42]:
!pip install networkx
!pip install scipy
!pip install matplotlib
!pip install torch
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
Collecting aiohttp (from torch_geometric)
  Downloading aiohttp-3.11.17-cp313-cp313-macosx_11_0_arm64.whl.metadata (7.7 kB)
Collecting requests (from torch_geometric)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm (from torch_geometric)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->torch_geometric)
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->torch_geometric)
  Downloading aiosignal-1.3.2-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting attrs>=17.3.0 (from aiohttp->torch_geometric)
  Downloading attrs-25.3.0-py3-none-any.whl.metadata (10 kB)
Collecting frozenlist>=1.1.1 (from aiohttp->torch_geometric)
  Downloading frozenlist-1.6.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (16 kB)
Collecting multidict<7.0,>=4.5 (from aiohtt

In [19]:
##### Create Graph Edges Based on Distance

import networkx as nx
from scipy.spatial.distance import euclidean

def build_protein_graph(coords, distance_threshold=8.0):
    G = nx.Graph()
    residue_ids = list(coords.keys())

    # Add nodes
    for res_id in residue_ids:
        G.add_node(res_id, pos=coords[res_id])

    # Add edges based on distance
    for i, id1 in enumerate(residue_ids):
        for id2 in residue_ids[i+1:]:
            dist = euclidean(coords[id1], coords[id2])
            if dist <= distance_threshold:
                G.add_edge(id1, id2, distance=dist)

    return G


In [22]:
##### Construct graph from PDB
def construct_graph_from_pdb(pdb_file, chain_id, distance_threshold=8.0):
    coords = extract_residue_coords(pdb_file, chain_id)
    graph = build_protein_graph(coords, distance_threshold)
    return graph

import matplotlib.pyplot as plt

def plot_graph_2d(graph):
    pos = {n: graph.nodes[n]['pos'][:2] for n in graph.nodes}
    nx.draw(graph, pos, node_size=50, with_labels=True)
    plt.show()


Matplotlib is building the font cache; this may take a moment.


In [56]:
##### Add node features and one hot encode the amino acids

import pandas as pd

def one_hot_encode_aa(aa):
    aa_list = list("ACDEFGHIKLMNPQRSTVWY")  # standard 20 amino acids
    return [int(aa == x) for x in aa_list]

def enrich_graph_with_features(graph, df, sequence_length):
    for _, row in df.iterrows():
        pos = int(row['position'])
        if pos in graph.nodes:
            graph.nodes[pos]['aa_type'] = one_hot_encode_aa(row['wild_type'])
            graph.nodes[pos]['is_catalytic'] = int(row['is_in_catalytic_pocket'])
            graph.nodes[pos]['is_essential'] = int(row['is_essential'])
            graph.nodes[pos]['relative_pos'] = pos / sequence_length
        else:
            print(f"⚠️ Residue position {pos} not found in structure.")
    return graph




In [43]:
##### Convert a Graph to a PyTorch Geometric Data Object

import torch
from torch_geometric.data import Data

def convert_nx_to_pyg(graph, ddG_value):
    # Extract node features
    features = []
    node_id_map = {n: i for i, n in enumerate(graph.nodes)}  # map from pos to 0-indexed ID
    for n in graph.nodes:
        feat = graph.nodes[n]['aa_type'] + [graph.nodes[n]['is_catalytic'], graph.nodes[n]['is_essential'], graph.nodes[n]['relative_pos']]
        features.append(feat)
    x = torch.tensor(features, dtype=torch.float)

    # Build edge index
    edge_index = []
    edge_attr = []
    for u, v, attrs in graph.edges(data=True):
        edge_index.append([node_id_map[u], node_id_map[v]])
        edge_index.append([node_id_map[v], node_id_map[u]])  # undirected
        edge_attr.append([attrs['distance']])
        edge_attr.append([attrs['distance']])
    
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(edge_attr, dtype=torch.float)

    # Target
    y = torch.tensor([ddG_value], dtype=torch.float)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)


In [46]:
##### Build GNN Encoder

import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool

class ProteinGNNEncoder(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super().__init__()
        self.conv1 = GCNConv(in_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.readout = global_mean_pool  # can swap to max or attention
        self.mlp = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim)
        )

    def forward(self, x, edge_index, batch):
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = self.readout(x, batch)
        return self.mlp(x)


In [None]:
print(df.columns)

Index(['experiment_id', 'protein_name', 'uniprot_id', 'pdb_id', 'chain',
       'position', 'wild_type', 'mutation', 'ddG', 'dTm', 'is_curated', 'type',
       'derived_type', 'interpro_families', 'conservation', 'is_essential',
       'correlated_positions', 'is_back_to_consensus', 'secondary_structure',
       'asa', 'is_in_catalytic_pocket', 'is_in_tunnel_bottleneck', 'b_factor',
       'method', 'method_details', 'technique', 'technique_details', 'pH',
       'tm', 'notes', 'publication_doi', 'publication_pubmed', 'hsw_job_id',
       'datasets', 'sequence'],
      dtype='object')
experiment_id               object
protein_name                object
uniprot_id                  object
pdb_id                      object
chain                       object
position                     int64
wild_type                   object
mutation                    object
ddG                        float64
dTm                        float64
is_curated                    bool
type                   

In [51]:
print(df.dtypes)


experiment_id               object
protein_name                object
uniprot_id                  object
pdb_id                      object
chain                       object
position                     int64
wild_type                   object
mutation                    object
ddG                        float64
dTm                        float64
is_curated                    bool
type                       float64
derived_type               float64
interpro_families           object
conservation               float64
is_essential                  bool
correlated_positions       float64
is_back_to_consensus          bool
secondary_structure         object
asa                        float64
is_in_catalytic_pocket        bool
is_in_tunnel_bottleneck       bool
b_factor                   float64
method                      object
method_details              object
technique                   object
technique_details           object
pH                         float64
tm                  

In [None]:
print(df.isnull().sum())

In [53]:
from Bio.PDB import PDBList

def fetch_pdb_on_demand(pdb_id, save_dir="pdbs"):
    pdbl = PDBList()
    pdb_id = pdb_id.lower()
    os.makedirs(save_dir, exist_ok=True)
    filepath = pdbl.retrieve_pdb_file(pdb_id, pdir=save_dir, file_format='pdb')

    # Bio.PDB stores as "pdbXXXX.ent" — convert to usable .pdb path
    actual_pdb_path = os.path.join(save_dir, f"{pdb_id}.pdb")
    if not os.path.exists(actual_pdb_path):
        os.rename(filepath, actual_pdb_path)
    return actual_pdb_path


In [None]:
print(df.head())

In [60]:
import os
import torch
import pandas as pd
from Bio.PDB import PDBList
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, global_mean_pool
import torch.nn as nn
import torch.nn.functional as F

# ===============================
# PDB Downloader
# ===============================
def fetch_pdb_on_demand(pdb_id, save_dir="pdbs"):
    pdbl = PDBList()
    pdb_id = pdb_id.lower()
    os.makedirs(save_dir, exist_ok=True)
    filepath = pdbl.retrieve_pdb_file(pdb_id, pdir=save_dir, file_format='pdb')

    # Bio.PDB stores as pdbXXXX.ent — rename to something easier
    new_path = os.path.join(save_dir, f"{pdb_id}.pdb")
    if not os.path.exists(new_path):
        os.rename(filepath, new_path)
    return new_path

# ===============================
# GNN Encoder
# ===============================
class ProteinGNNEncoder(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super().__init__()
        self.conv1 = GCNConv(in_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.readout = global_mean_pool
        self.mlp = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim)
        )

    def forward(self, x, edge_index, batch):
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = self.readout(x, batch)
        return self.mlp(x)

# ===============================
# One-hot AA Encoder
# ===============================
def one_hot_encode_aa(aa):
    aa_list = list("ACDEFGHIKLMNPQRSTVWY")
    return [int(aa == x) for x in aa_list]

# ===============================
# Feature Enrichment
# ===============================
def enrich_graph_with_features(graph, df, sequence_length):
    for _, row in df.iterrows():
        pos = int(row['position'])
        if pos in graph.nodes:
            graph.nodes[pos]['aa_type'] = one_hot_encode_aa(row['wild_type'])
            graph.nodes[pos]['is_catalytic'] = int(row['is_in_catalytic_pocket'])
            graph.nodes[pos]['is_essential'] = int(row['is_essential'])
            graph.nodes[pos]['relative_pos'] = pos / sequence_length
    return graph

# ===============================
# Convert to PyG Graph
# ===============================
def convert_nx_to_pyg(graph, ddG_value):
    features = []
    node_id_map = {}
    filtered_nodes = []
    
    # Only keep nodes with full feature annotations
    for i, n in enumerate(graph.nodes):
        node_data = graph.nodes[n]
        if all(k in node_data for k in ['aa_type', 'is_catalytic', 'is_essential', 'relative_pos']):
            node_id_map[n] = len(filtered_nodes)
            filtered_nodes.append(n)
            feat = node_data['aa_type'] + [
                node_data['is_catalytic'],
                node_data['is_essential'],
                node_data['relative_pos']
            ]
            features.append(feat)

    if len(features) == 0:
        raise ValueError("❌ No usable nodes with complete features in this graph.")

    x = torch.tensor(features, dtype=torch.float)

    # Rebuild edge_index and edge_attr for filtered nodes
    edge_index = []
    edge_attr = []
    for u, v, attrs in graph.edges(data=True):
        if u in node_id_map and v in node_id_map:
            edge_index.append([node_id_map[u], node_id_map[v]])
            edge_index.append([node_id_map[v], node_id_map[u]])
            edge_attr.append([attrs['distance']])
            edge_attr.append([attrs['distance']])
    
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(edge_attr, dtype=torch.float)

    y = torch.tensor([ddG_value], dtype=torch.float)
    
    if edge_index.shape[1] == 0:
        raise ValueError("❌ Graph has no valid edges after filtering. Cannot proceed.")

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)


# ===============================
# Single Example Test
# ===============================
# Assume you already have df loaded and cleaned
row = df.iloc[0:1]

sequence_length = len(row['sequence'].values[0])
ddG = float(row['ddG'].values[0])
pdb_id = row['pdb_id'].values[0].split('|')[0]
chain_id = row['chain'].values[0]

# Build full graph pipeline
pdb_file = fetch_pdb_on_demand(pdb_id)
G = construct_graph_from_pdb(pdb_file, chain_id)
G = enrich_graph_with_features(G, row, sequence_length)
data = convert_nx_to_pyg(G, ddG)

# Assign batch for PyG (single graph = all 0s)
data.batch = torch.zeros(data.num_nodes, dtype=torch.long)

# Pass through GNN
model = ProteinGNNEncoder(in_dim=23, hidden_dim=64, out_dim=128)
embedding = model(data.x, data.edge_index, data.batch)

print("✅ Protein embedding shape:", embedding.shape)


Structure exists: 'pdbs/pdb1cqw.ent' 


IndexError: tuple index out of range