In [20]:
import pandas as pd

# Load the dataset (change path if needed).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the recorded output of this cell ends with an echoed
# `pd.read_csv(...)` source line, which looks like the tail of a warning
# (presumably DtypeWarning) -- confirm it disappears after a re-run.
df = pd.read_csv("data/fireprotdb_results.csv", low_memory=False)

# Preview the first rows of the table
print(df.head())

  experiment_id             protein_name uniprot_id pdb_id chain  position  \
0      LL000001  Haloalkane dehalogenase     P59336   1CQW     A       245   
1      LL000002  Haloalkane dehalogenase     P59336   1CQW     A        95   
2      LL000004  Haloalkane dehalogenase     P59336   1CQW     A       176   
3      LL000005  Haloalkane dehalogenase     P59336   1CQW     A       171   
4      LL000006  Haloalkane dehalogenase     P59336   1CQW     A       148   

  wild_type mutation  ddG  dTm  ...  technique  technique_details  pH    tm  \
0         V        L  NaN  2.1  ...        NaN                NaN NaN  52.5   
1         L        V  NaN -0.4  ...        NaN                NaN NaN  50.0   
2         C        F  NaN  5.2  ...        NaN                NaN NaN  55.6   
3         G        Q  NaN  3.1  ...        NaN                NaN NaN  53.5   
4         T        L  NaN  1.1  ...        NaN                NaN NaN  51.5   

   notes  publication_doi  publication_pubmed  hsw_job_i

  df = pd.read_csv("data/fireprotdb_results.csv")


In [21]:
# Show the full column index of the loaded table.
column_index = df.columns
print(column_index)

Index(['experiment_id', 'protein_name', 'uniprot_id', 'pdb_id', 'chain',
       'position', 'wild_type', 'mutation', 'ddG', 'dTm', 'is_curated', 'type',
       'derived_type', 'interpro_families', 'conservation', 'is_essential',
       'correlated_positions', 'is_back_to_consensus', 'secondary_structure',
       'asa', 'is_in_catalytic_pocket', 'is_in_tunnel_bottleneck', 'b_factor',
       'method', 'method_details', 'technique', 'technique_details', 'pH',
       'tm', 'notes', 'publication_doi', 'publication_pubmed', 'hsw_job_id',
       'datasets', 'sequence'],
      dtype='object')


In [22]:
# Table dimensions as (rows, columns).
print(df.shape)

# Number of records per protein name, turned into a regular two-column frame.
name_frequencies = df["protein_name"].value_counts()
protein_counts_1 = name_frequencies.reset_index()
protein_counts_1

(53445, 35)


Unnamed: 0,protein_name,count
0,Subtilisin-chymotrypsin inhibitor-2A,12276
1,Tyrosine-protein kinase Fyn,2488
2,Halohydrin dehalogenase,2480
3,ADHA,2404
4,Immunoglobulin G-binding protein G,2220
...,...,...
205,S-adenosylmethionine synthase isoform type-1,1
206,Non-specific lipid-transfer protein,1
207,Tetracycline repressor protein class D,1
208,Hydrolase,1


In [23]:
from Bio.PDB import PDBList, PDBParser

# Fetch the structure file from RCSB into ./structures
pdb_id = "1CQW"  # example: replace with actual PDB ID
pdbl = PDBList()
pdbl.retrieve_pdb_file(pdb_id, pdir="./structures", file_format="pdb")

# Parse the file that the download step placed on disk
structure_path = f"./structures/pdb{pdb_id.lower()}.ent"
parser = PDBParser()
structure = parser.get_structure(pdb_id, structure_path)

# Report the ID of every chain in every model
chain_ids = [chain.id for model in structure for chain in model]
for chain_id in chain_ids:
    print("Chain:", chain_id)

Structure exists: './structures/pdb1cqw.ent' 
Chain: A


In [24]:
# Keep only the rows that carry a ddG value.
has_ddg = df["ddG"].notna()
df_filtered = df[has_ddg]

df_filtered.shape

(39177, 35)

In [25]:
# Records per protein name within the ddG-only subset (df_filtered).
ddg_name_frequencies = df_filtered["protein_name"].value_counts()
protein_counts = ddg_name_frequencies.reset_index()
protein_counts

Unnamed: 0,protein_name,count
0,Subtilisin-chymotrypsin inhibitor-2A,11160
1,Immunoglobulin G-binding protein G,2158
2,Tryptophan synthase alpha chain,1915
3,Thermonuclease,1857
4,10 kDa chaperonin,1764
...,...,...
154,Non-specific lipid-transfer protein,1
155,Tetracycline repressor protein class D,1
156,Alpha-amylase,1
157,"Ferredoxin, heterocyst",1


In [26]:
# Columns carried into df_subset, in this order.
columns_to_keep = [
    "experiment_id",
    "protein_name",
    "uniprot_id",
    "pdb_id",
    "chain",
    "position",
    "wild_type",
    "mutation",
    "ddG",
    "sequence",
    "secondary_structure",
    "asa",
    "b_factor",
    "is_in_catalytic_pocket",
    "conservation",
    "is_essential",
]
df_subset = df_filtered[columns_to_keep]

In [27]:
# Count the missing values in each retained column.
missing_per_column = df_subset.isna().sum()
missing_per_column

experiment_id                0
protein_name                31
uniprot_id                  41
pdb_id                       0
chain                        0
position                     0
wild_type                    0
mutation                     0
ddG                          0
sequence                     0
secondary_structure       1836
asa                       1836
b_factor                  2561
is_in_catalytic_pocket       0
conservation              2911
is_essential                 0
dtype: int64

In [19]:
# Inspect the rows that have no protein name.
# NOTE(review): this cell's recorded execution count (19) is lower than the
# cells before it, and its stored output shows fewer columns than
# columns_to_keep currently lists -- re-run with Restart Kernel -> Run All.
name_missing = df_subset["protein_name"].isna()
df_subset[name_missing]

Unnamed: 0,experiment_id,protein_name,uniprot_id,pdb_id,chain,position,wild_type,mutation,ddG,sequence
38362,VB01224,,,2IMM,A,106,A,G,0.0,DIVMTQSPSSLSVSAGERVTMSCKSSQSLLNSGNQKNFLAWYQQKP...
38367,VB01229,,,2IMM,A,15,A,L,-1.4,DIVMTQSPSSLSVSAGERVTMSCKSSQSLLNSGNQKNFLAWYQQKP...
39677,VB01375,,,2IMM,A,38,F,Y,-0.4,DIVMTQSPSSLSVSAGERVTMSCKSSQSLLNSGNQKNFLAWYQQKP...
40536,VB01496,,,2IMM,A,24,K,R,0.2,DIVMTQSPSSLSVSAGERVTMSCKSSQSLLNSGNQKNFLAWYQQKP...
40769,VB01527,,,2IMM,A,112,L,I,-0.4,DIVMTQSPSSLSVSAGERVTMSCKSSQSLLNSGNQKNFLAWYQQKP...
40832,VB01538,,,2IMM,A,21,M,I,-0.2,DIVMTQSPSSLSVSAGERVTMSCKSSQSLLNSGNQKNFLAWYQQKP...
40833,VB01539,,,2IMM,A,21,M,L,0.3,DIVMTQSPSSLSVSAGERVTMSCKSSQSLLNSGNQKNFLAWYQQKP...
40918,VB01551,,,2IMM,A,96,N,Q,-1.1,DIVMTQSPSSLSVSAGERVTMSCKSSQSLLNSGNQKNFLAWYQQKP...
41420,VB01593,,,2IMM,A,49,P,S,0.2,DIVMTQSPSSLSVSAGERVTMSCKSSQSLLNSGNQKNFLAWYQQKP...
41461,VB01599,,,2IMM,A,85,Q,E,0.4,DIVMTQSPSSLSVSAGERVTMSCKSSQSLLNSGNQKNFLAWYQQKP...
