# Protein Folding Principles Dataset

In [None]:
import sys
import os
# Resolve the parent of the current working directory and put it at the front
# of the import search path (only once), presumably so the project's packages
# can be imported from this notebook.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), "..")
)
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

In [2]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs so that edits to source files take effect without a
# kernel restart. (Comments stay on their own lines: text after a line magic
# would be parsed as part of its arguments.)
%load_ext autoreload
%autoreload 2

### Test PDB crawler

In [29]:
from dataset.pisces import fetch_pisces_table, get_filtered_pdb_codes

# Retrieve the PISCES table, then reduce it to a list of PDB codes using the
# project's filter helper.
pisces_df = fetch_pisces_table()
pdb_ids = get_filtered_pdb_codes(pisces_df)

# Summarise the result: total count on the first line, the first ten codes on
# the second.
print(
    f"Found {len(pdb_ids)} high-quality structures with pdb codes.",
    pdb_ids[:10],
    sep="\n",
)


Detected PISCES columns: ['PDBchain', 'len', 'method', 'resol', 'rfac', 'freerfac']
Found 11116 high-quality structures with pdb codes.
['5D8V', '5NW3', '1UCS', '3X2M', '2VB1', '1US0', '6E6O', '6S2M', '1R6J', '4REK']


### Test beta-beta pattern finder

In [30]:
from dataset.io_utils import fetch_mmcif_file, run_dssp_on_mmcif
from dataset.motif_logic import detect_hairpins_and_chirality
import pandas as pd
import os

# --- Configuration -----------------------------------------------------------
# Location of the mkdssp executable. Set the MKDSSP_PATH environment variable
# to override it; the fallback is the conda path this notebook was written with.
DSSP_EXE = os.environ.get("MKDSSP_PATH", "/opt/anaconda3/envs/folding-dssp/bin/mkdssp")
# How many entries from the front of the PDB code list to process in this test.
MAX_STRUCTURES = 100

# Restrict the run to the first MAX_STRUCTURES codes. For a quick manual check,
# substitute a short explicit list instead, e.g. ["8GBS", "7R1C"].
# NOTE: this rebinds `pdb_ids`, so re-run the cell that builds the full list
# before raising MAX_STRUCTURES, otherwise the slice sees the truncated list.
pdb_ids = pdb_ids[:MAX_STRUCTURES]
results = []

for pdb_id in pdb_ids:
    pdb_path = fetch_mmcif_file(pdb_id)
    try:
        model, dssp_df = run_dssp_on_mmcif(pdb_path, dssp_exe=DSSP_EXE)
        hp_df = detect_hairpins_and_chirality(model, dssp_df)
        results.append(hp_df)
    finally:
        # Always remove the file, even if DSSP or parsing fails.
        # Any exception still propagates and stops the loop after cleanup.
        if os.path.exists(pdb_path):
            os.remove(pdb_path)

# pd.concat raises ValueError when given an empty list, so fall back to an
# empty frame (no columns) when there was nothing to process.
all_hairpins = pd.concat(results, ignore_index=True) if results else pd.DataFrame()
all_hairpins.head(200)

Unnamed: 0,PDB,Chain,strand1_start,strand1_end,strand2_start,strand2_end,loop_len,handedness
0,5d8v,A,49,55,58,61,2,L
1,5d8v,A,58,61,68,70,6,L
2,5nw3,A,2,5,11,13,5,L
3,3x2m,A,57,64,79,87,14,L
4,3x2m,A,79,87,111,116,23,L
...,...,...,...,...,...,...,...,...
195,4y9w,A,148,152,160,165,7,L
196,4y9w,A,160,165,167,169,1,L
197,4y9w,A,178,181,189,197,7,L
198,4y9w,A,189,197,200,210,2,R
