"""
Dataset source:
Tubiana et al. (2022), PLOS Computational Biology
Loaded directly from original ZIP archive (unmodified).
"""


In [10]:
import sys
from pathlib import Path
import pandas as pd

PROJECT_ROOT = Path("..").resolve()
print("PROJECT_ROOT:", PROJECT_ROOT)
print("Contents:", list(PROJECT_ROOT.iterdir()))

OUTPUT_DIR = Path("../data/derived")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

print("\nPython path:")
for p in sys.path:
    print(p)

PROJECT_ROOT: /home/user_stel/project_thesis
Contents: [PosixPath('/home/user_stel/project_thesis/.vscode'), PosixPath('/home/user_stel/project_thesis/results'), PosixPath('/home/user_stel/project_thesis/thesis_code'), PosixPath('/home/user_stel/project_thesis/data'), PosixPath('/home/user_stel/project_thesis/figures'), PosixPath('/home/user_stel/project_thesis/notebooks'), PosixPath('/home/user_stel/project_thesis/external'), PosixPath('/home/user_stel/project_thesis/.git')]

Python path:
/home/user_stel/miniconda3/lib/python312.zip
/home/user_stel/miniconda3/lib/python3.12
/home/user_stel/miniconda3/lib/python3.12/lib-dynload

/home/user_stel/miniconda3/lib/python3.12/site-packages
/home/user_stel/AISB/tubiana_etal_2022
/home/user_stel/project_thesis


# Load Dataset

In [2]:
ZIP_PATH = Path(
    "../external/tubiana_etal_2022/Ressources/datasets/S2 File.csv.zip"
)

df = pd.read_csv(ZIP_PATH, compression="zip")

df.head()


Unnamed: 0,domain,cathpdb,pdb,uniprot_acc,uniprot_id,residue_name,IBS,chain_id,residue_number,b_factor,...,S35,S60,S95,S100,uniref50,uniref90,uniref100,origin,location,taxon
0,PH,2m14A00,2M14,P32776,TFB1_YEAST,ASN,False,A,19,0.0,...,123,123.1,123.1.1,123.1.1.1,P32776,P32776,P32776,YEAST,['Nucleus'],Eukaryota/Fungi
1,PH,2m14A00,2M14,P32776,TFB1_YEAST,ASN,False,A,75,0.0,...,123,123.1,123.1.1,123.1.1.1,P32776,P32776,P32776,YEAST,['Nucleus'],Eukaryota/Fungi
2,PH,2m14A00,2M14,P32776,TFB1_YEAST,ASN,False,A,78,0.0,...,123,123.1,123.1.1,123.1.1.1,P32776,P32776,P32776,YEAST,['Nucleus'],Eukaryota/Fungi
3,PH,2m14A00,2M14,P32776,TFB1_YEAST,ASN,False,A,92,0.0,...,123,123.1,123.1.1,123.1.1.1,P32776,P32776,P32776,YEAST,['Nucleus'],Eukaryota/Fungi
4,PH,2m14A00,2M14,P32776,TFB1_YEAST,ASN,False,A,93,0.0,...,123,123.1,123.1.1,123.1.1.1,P32776,P32776,P32776,YEAST,['Nucleus'],Eukaryota/Fungi


In [6]:
df.shape
df.columns
df["IBS"].value_counts()


IBS
False    156007
True      27883
Name: count, dtype: int64

In [7]:
print(df.columns)

Index(['domain', 'cathpdb', 'pdb', 'uniprot_acc', 'uniprot_id', 'residue_name',
       'IBS', 'chain_id', 'residue_number', 'b_factor', 'sec_struc',
       'sec_struc_full', 'prot_block', 'data_type', 'Experimental Method',
       'resolution', 'RSA_total_freesasa_tien', 'convhull_vertex',
       'protrusion', 'is_hydrophobic_protrusion', 'is_co_insertable',
       'neighboursList', 'density', 'exposed', 'S35', 'S60', 'S95', 'S100',
       'uniref50', 'uniref90', 'uniref100', 'origin', 'location', 'taxon'],
      dtype='object')


In [3]:
FEATURE_COLUMNS = [
    "residue_name",
    "sec_struc",
    "RSA_total_freesasa_tien",
    "exposed",
    "convhull_vertex",
    "protrusion",
    "is_hydrophobic_protrusion",
    "density",
]

TARGET = "IBS"

# Feature Engineering

In [3]:
from thesis_code.feature_engineering.build_features import build_feature_table

df_feat = build_feature_table(df)
print(df_feat.columns)

Index(['domain', 'cathpdb', 'pdb', 'uniprot_acc', 'uniprot_id', 'residue_name',
       'IBS', 'chain_id', 'residue_number', 'b_factor', 'sec_struc',
       'sec_struc_full', 'prot_block', 'data_type', 'Experimental Method',
       'resolution', 'RSA_total_freesasa_tien', 'convhull_vertex',
       'protrusion', 'is_hydrophobic_protrusion', 'is_co_insertable',
       'neighboursList', 'density', 'exposed', 'S35', 'S60', 'S95', 'S100',
       'uniref50', 'uniref90', 'uniref100', 'origin', 'location', 'taxon',
       'neighbors', 'n_neighbors', 'neighbor_frac_exposed',
       'neighbor_frac_hydrophobic', 'neighbor_frac_charged',
       'neighbor_frac_polar', 'neighbor_frac_aromatic', 'neighbor_frac_small',
       'neighbor_mean_RSA'],
      dtype='object')


### Save the feature-engineered full dataset

In [11]:
df_feat.to_parquet(
    OUTPUT_DIR / "df_feat_context_v1.parquet",
    index=False
)

In [9]:
df_ml = df_feat[df_feat["exposed"] == True]

df_ml.attrs["feature_version"] = "v1_context"

print(df_ml.columns)

Index(['domain', 'cathpdb', 'pdb', 'uniprot_acc', 'uniprot_id', 'residue_name',
       'IBS', 'chain_id', 'residue_number', 'b_factor', 'sec_struc',
       'sec_struc_full', 'prot_block', 'data_type', 'Experimental Method',
       'resolution', 'RSA_total_freesasa_tien', 'convhull_vertex',
       'protrusion', 'is_hydrophobic_protrusion', 'is_co_insertable',
       'neighboursList', 'density', 'exposed', 'S35', 'S60', 'S95', 'S100',
       'uniref50', 'uniref90', 'uniref100', 'origin', 'location', 'taxon',
       'neighbors', 'n_neighbors', 'neighbor_frac_exposed',
       'neighbor_frac_hydrophobic', 'neighbor_frac_charged',
       'neighbor_frac_polar', 'neighbor_frac_aromatic', 'neighbor_frac_small',
       'neighbor_mean_RSA'],
      dtype='object')


### Save the ML dataset 

In [12]:
df_ml.to_parquet(
    OUTPUT_DIR / "df_ml_context_v1.parquet",
    index=False
)

### Save metadata

In [13]:
import json

metadata = {
    "feature_version": df_ml.attrs.get("feature_version"),
    "description": "Context-aware features, exposed residues only",
    "source": "Tubiana et al. dataset",
}

with open(OUTPUT_DIR / "df_ml_context_v1_metadata.json", "w") as f:
    json.dump(metadata, f, indent=2)


In [5]:
CATEGORICAL_FEATURES = [
    "residue_name",
    "sec_struc",
]

NUMERIC_FEATURES = [
    "RSA_total_freesasa_tien",
    "convhull_vertex",
    "protrusion",
    "is_hydrophobic_protrusion",
    "is_co_insertable",
    "density",
    "n_neighbors",
    "neighbor_frac_exposed",
    "neighbor_frac_hydrophobic",
    "neighbor_frac_charged",
    "neighbor_frac_polar",
    "neighbor_frac_aromatic",
    "neighbor_frac_small",
    "neighbor_mean_RSA",
]

TARGET = "IBS"

In [6]:
DROP_COLUMNS = [
    "domain", "cathpdb", "pdb", "chain_id", "residue_number",
    "uniprot_acc", "uniprot_id",
    "S35", "S60", "S95", "S100",
    "uniref50", "uniref90", "uniref100",
    "origin", "location", "taxon",
    "neighboursList", "neighbors",
    "data_type", "Experimental Method", "resolution",
]


In [None]:
df_feat = pd.read_parquet("../data/derived/df_feat_context_v1.parquet")
df_ml = pd.read_parquet("../data/derived/df_ml_context_v1.parquet")
