# Download Dataset
The first step is to download the dataset, extract the sample metadata, and drop constant columns (they carry no information and only slow down computations).

In [24]:
import GEOparse
import pandas as pd

# Step 1: Fetch the GSE96058 series with GEOparse; files are kept under destdir
gse = GEOparse.get_GEO(
    "GSE96058",
    destdir="/Users/sarah/Code/bioinformatics-tool/backend/data",
    how="full",
)

# Step 2: Sample-level (GSM) metadata table
clinical_df = gse.phenotype_data
n_before = len(clinical_df.columns)

# Keep only the columns that hold more than one distinct value; with
# dropna=False a missing value counts as a distinct value of its own.
varying = clinical_df.nunique(dropna=False) > 1
clinical_df = clinical_df.loc[:, varying].copy()
n_after = len(clinical_df.columns)

print(f"Dropped {n_before - n_after} constant columns. Remaining columns: {n_after}")

# Use the 'title' column as the row index
clinical_df = clinical_df.set_index("title")


30-Jun-2025 18:14:15 INFO GEOparse - File already exist: using local version.
30-Jun-2025 18:14:15 INFO GEOparse - Parsing /Users/sarah/Code/bioinformatics-tool/backend/data/GSE96058_family.soft.gz: 
30-Jun-2025 18:14:15 DEBUG GEOparse - DATABASE: GeoMiame
30-Jun-2025 18:14:15 DEBUG GEOparse - SERIES: GSE96058
30-Jun-2025 18:14:15 DEBUG GEOparse - PLATFORM: GPL11154
30-Jun-2025 18:14:15 DEBUG GEOparse - PLATFORM: GPL18573
30-Jun-2025 18:14:15 DEBUG GEOparse - SAMPLE: GSM2528079
30-Jun-2025 18:14:15 DEBUG GEOparse - SAMPLE: GSM2528080
30-Jun-2025 18:14:15 DEBUG GEOparse - SAMPLE: GSM2528081
30-Jun-2025 18:14:15 DEBUG GEOparse - SAMPLE: GSM2528082
30-Jun-2025 18:14:15 DEBUG GEOparse - SAMPLE: GSM2528083
30-Jun-2025 18:14:15 DEBUG GEOparse - SAMPLE: GSM2528084
30-Jun-2025 18:14:15 DEBUG GEOparse - SAMPLE: GSM2528085
30-Jun-2025 18:14:15 DEBUG GEOparse - SAMPLE: GSM2528086
30-Jun-2025 18:14:15 DEBUG GEOparse - SAMPLE: GSM2528087
30-Jun-2025 18:14:15 DEBUG GEOparse - SAMPLE: GSM2528088
30-J

Dropped 26 constant columns. Remaining columns: 31


A first look at the data showed that missing values are encoded as the string "NA" in several columns, so we replace it with np.nan globally.

In [25]:
import numpy as np

# Turn the literal string "NA" into a real missing value everywhere in the frame
clinical_df = clinical_df.replace("NA", np.nan)


In [26]:
# List the remaining column names, one per line, in frame order
column_names = clinical_df.columns.tolist()
if column_names:
    print(*column_names, sep="\n")


geo_accession
last_update_date
characteristics_ch1.0.scan-b external id
characteristics_ch1.1.instrument model
characteristics_ch1.2.age at diagnosis
characteristics_ch1.3.tumor size
characteristics_ch1.4.lymph node group
characteristics_ch1.5.lymph node status
characteristics_ch1.6.er status
characteristics_ch1.7.pgr status
characteristics_ch1.8.her2 status
characteristics_ch1.9.ki67 status
characteristics_ch1.10.nhg
characteristics_ch1.11.er prediction mgc
characteristics_ch1.12.pgr prediction mgc
characteristics_ch1.13.her2 prediction mgc
characteristics_ch1.14.ki67 prediction mgc
characteristics_ch1.15.nhg prediction mgc
characteristics_ch1.16.er prediction sgc
characteristics_ch1.17.pgr prediction sgc
characteristics_ch1.18.her2 prediction sgc
characteristics_ch1.19.ki67 prediction sgc
characteristics_ch1.20.pam50 subtype
characteristics_ch1.21.overall survival days
characteristics_ch1.22.overall survival event
characteristics_ch1.23.endocrine treated
characteristics_ch1.24.chemo 

# Data Preprocessing

Preprocessing of the features that remain:
- Numeric Features: leave as is, possibly standardize later
- Binary Features: map to 0/1
- Ordinal Categorical (e.g. nhg): map to 1,2,3
- Nominal categorical (e.g. pam50 subtype): one-hot encode -> so we don't introduce an artificial order


In [27]:

# Strip the 'characteristics_ch1.<n>.' prefix so columns can be referenced by
# their plain feature name (e.g. 'age at diagnosis').
stripped_names = clinical_df.columns.str.replace(r'^characteristics_ch1\.\d+\.', '', regex=True)

# set_axis returns a new frame, so clinical_df itself keeps its original labels
clinical_df_cleaned = clinical_df.set_axis(stripped_names, axis=1)


## Columns to Drop
- identifiers
- duplicates

In [28]:
# Identifier / duplicate columns that carry no analytical information
cols_to_drop = ['sample_id', 'geo_accession', 'scan-b external id', 'relation']

# errors='ignore' skips labels that are not present instead of raising
clinical_df_cleaned = clinical_df_cleaned.drop(columns=cols_to_drop, errors='ignore')

### Instrument Model
This is a feature that would be excluded before applying ML but could be interesting during EDA (e.g. to detect batch effects), which is why I want to keep it for now. However, three columns appear to carry the same information ('instrument model', 'platform_id' and 'instrument_model'), so I check whether every 'instrument model' value maps to exactly one value in each of the other two columns; if so, I drop those two and retain only 'instrument model'.

In [29]:
# For every 'instrument model', count the distinct 'platform_id' values it
# co-occurs with (rows missing either value are ignored). Any count above 1
# would mean the two columns do not carry the same information.
pair_cols = ['instrument model', 'platform_id']
instrument_pairs = clinical_df_cleaned.loc[:, pair_cols].dropna()
mapping_check = instrument_pairs.groupby(pair_cols[0])[pair_cols[1]].nunique()
print(mapping_check[mapping_check.gt(1)])


Series([], Name: platform_id, dtype: int64)


In [30]:
# Same consistency check against 'instrument_model': count the distinct
# 'instrument_model' values seen per 'instrument model' (rows missing either
# value are ignored) and show only the groups with more than one.
pair_cols = ['instrument model', 'instrument_model']
instrument_pairs = clinical_df_cleaned.loc[:, pair_cols].dropna()
mapping_check = instrument_pairs.groupby(pair_cols[0])[pair_cols[1]].nunique()
print(mapping_check[mapping_check.gt(1)])


Series([], Name: instrument_model, dtype: int64)


Since none map to more than one platform id / model, I can drop two cols.

In [31]:
# Both columns are determined by 'instrument model' (checked above), so only
# 'instrument model' is retained.
redundant_cols = ['platform_id', 'instrument_model']
clinical_df_cleaned = clinical_df_cleaned.drop(redundant_cols, axis=1)

## Numeric Columns
no preprocessing necessary apart from transformation to numeric

In [32]:
# Features that only need conversion to a numeric dtype
numeric_cols = ['age at diagnosis', 'tumor size', 'overall survival days']


In [33]:
# Convert each numeric feature; errors='coerce' turns unparseable entries into
# NaN. A failure on one column is reported and the loop moves on to the next.
for col in numeric_cols:
    try:
        converted = pd.to_numeric(clinical_df_cleaned[col], errors='coerce')
        clinical_df_cleaned[col] = converted
    except Exception as e:
        print(f"Couldn't convert {col}: {e}")


## Binary Columns
Binary columns are encoded in different ways. They should all numerically be 0 or 1 or NaN after preprocessing.

In [34]:
# Encode lymph node status
clinical_df_cleaned['lymph node status'] = clinical_df_cleaned['lymph node status'].map({
    'NodeNegative': 0,
    'NodePositive': 1
})


In [35]:
import numpy as np

# Features that must end up as 0.0 / 1.0 / NaN
binary_cols = [
    'er status', 'pgr status', 'her2 status', 'ki67 status',
    'er prediction mgc', 'pgr prediction mgc', 'her2 prediction mgc', 'ki67 prediction mgc',
    'er prediction sgc', 'pgr prediction sgc', 'her2 prediction sgc', 'ki67 prediction sgc',
    'overall survival event', 'endocrine treated', 'chemo treated',
]

# Replace 'NA' strings with np.nan and cast to float, all binary columns at once
clinical_df_cleaned[binary_cols] = (
    clinical_df_cleaned[binary_cols]
    .replace('NA', np.nan)
    .astype(float)
)


## Ordinal Columns
Label encode -> preserve order

To create a mapping, the different unique values are examined.

In [36]:
# Collect the distinct values of each ordinal feature, then report them
nhg_values = clinical_df_cleaned['nhg'].unique()
nhg_pred_values = clinical_df_cleaned['nhg prediction mgc'].unique()
node_group_values = clinical_df_cleaned['lymph node group'].unique()

print(f"Nottingham Histologic Grades: {nhg_values}")
print(f"Nottingham Histologic Grades Prediction: {nhg_pred_values}")
print(f"Lymph Node Group: {node_group_values}")

Nottingham Histologic Grades: ['G3' 'G2' 'G1' nan]
Nottingham Histologic Grades Prediction: ['G3' 'G2']
Lymph Node Group: ['NodeNegative' '1to3' '4toX' 'SubMicroMet' nan]


Comment: interesting that the NHG Prediction MGC only has grades 2 and 3!

### Lymph Node Group
Not sure whether to one-hot encode or label encode: there is an implicit order (NodeNegative < SubMicroMet < 1to3 < 4toX), but since the boundaries are fuzzy (SubMicroMet), this could also be a problem. Solution: for now, use label encoding, and potentially go back and one-hot encode later if using a NN.

In [37]:
# Integer codes per ordinal feature, following the order of the categories
ordinal_map = {
    'nhg': {'G1': 1, 'G2': 2, 'G3': 3},
    'nhg prediction mgc': {'G2': 2, 'G3': 3},
    'lymph node group': {'NodeNegative': 0, 'SubMicroMet': 1, '1to3': 2, '4toX': 3},
}

for col, mapping in ordinal_map.items():
    # 'NA' -> NaN first, then translate labels to codes; labels that are not
    # keys of the mapping also come out as NaN.
    clinical_df_cleaned[col] = clinical_df_cleaned[col].replace('NA', np.nan).map(mapping)


## Non-Ordinal Columns
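One-hot encode. Note that `pd.get_dummies(..., columns=...)` replaces the listed columns with their indicator columns, so the originals are not kept in `clinical_df_cleaned`; the unencoded values are still available in `clinical_df` if they are needed for analyses.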

In [38]:
# Nominal features: expand each into 0/1 indicator columns named
# '<column>__<value>'. The listed source columns are replaced by their indicators.
cols_to_one_hot = ['last_update_date', 'instrument model', 'pam50 subtype']
clinical_df_cleaned = pd.get_dummies(
    clinical_df_cleaned,
    columns=cols_to_one_hot,
    prefix_sep='__',
    dtype=int,
)

# Flag Columns
I want to flag columns I will not be using for ML training but would still like to keep for EDA (since they might give useful information).
- identifiers
- technical metadata
- prediction outputs from papers

In [39]:
# Columns kept for EDA but not used for ML training: technical metadata (the
# one-hot 'last_update_date' / 'instrument model' indicators) and the
# prediction outputs that ship with the dataset.
id_columns = [
    'last_update_date__Mar 12 2018',
    'last_update_date__May 04 2022',
    'instrument model__HiSeq 2000',
    'instrument model__NextSeq 500',
    "er prediction mgc",
    "pgr prediction mgc",
    "her2 prediction mgc",
    "ki67 prediction mgc",
    "nhg prediction mgc",
    "er prediction sgc",
    "pgr prediction sgc",
    "her2 prediction sgc",
    "ki67 prediction sgc"
]

# Collect the prediction columns from the *cleaned* frame. clinical_df still
# carries the 'characteristics_ch1.<n>.' prefix on these columns, so names taken
# from it do not exist in clinical_df_cleaned and cannot be used to select or
# drop columns there.
prediction_columns = [
    col for col in clinical_df_cleaned.columns if 'prediction' in col.lower()
]

# id_columns already lists the prediction columns, so de-duplicate while
# preserving order (dict keys keep insertion order).
excluded_columns = list(dict.fromkeys(id_columns + prediction_columns))

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv

# Load environment variables from the analysis .env file.
# NOTE(review): this absolute path is machine-specific; consider resolving it
# relative to the repository so the notebook runs on other machines.
load_dotenv(dotenv_path="/Users/sarah/Code/bioinformatics-tool/analysis/.env")

# Resolve the output location up front: os.getenv returns None for an unset
# variable, and DataFrame.to_csv(None) does not write a file (it returns the
# CSV text instead), so the export would silently not happen.
clinical_csv_path = os.getenv("CLINICAL")
if not clinical_csv_path:
    raise RuntimeError(
        "Environment variable 'CLINICAL' is not set; cannot write the cleaned clinical data."
    )

clinical_df_cleaned.to_csv(clinical_csv_path)
