In [8]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [9]:
# Gene-expression table (tab-separated; read_table defaults to a tab delimiter)
df_gen = pd.read_table('data/EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena')

# Survival / clinical metadata table (tab-separated as well)
df_meta = pd.read_table('data/Survival_SupplementalTable_S1_20171025_xena_sp')

In [10]:
# Promote the 'sample' column to the row index when the file provides it.
if 'sample' in df_gen.columns:
    df_gen = df_gen.set_index('sample')

# Where a row label repeats, retain only its first occurrence
# (keep='first' is the default behaviour of Index.duplicated).
df_gen = df_gen.loc[~df_gen.index.duplicated()]

print(f"Genes after removing duplicates: {len(df_gen)}")

# Transpose so the former columns become rows, then turn the row labels
# into an ordinary 'sample' column that can be used as a merge key.
df_gen_T = df_gen.T
df_gen_T.index.name = 'sample'
df_gen_T = df_gen_T.reset_index()

Genes after removing duplicates: 20530


In [11]:
# Metadata columns carried forward; every other df_meta column is left out.
cols_to_keep = [
    # merge key
    'sample',
    # survival targets
    'OS', 'OS.time',
    # clinical / pathology
    'cancer type abbreviation', 'ajcc_pathologic_tumor_stage', 'histological_type',
    # demographics
    'age_at_initial_pathologic_diagnosis', 'gender', 'race',
]

# Independent copy, so later edits never write back into df_meta.
df_meta_clean = df_meta.loc[:, cols_to_keep].copy()

In [12]:
# MERGE & ALIGN
# how='inner' keeps only rows whose 'sample' key occurs in both frames.
df_final = df_meta_clean.merge(df_gen_T, how='inner', on='sample')

print(f"Merged dimensions: {df_final.shape}")

Merged dimensions: (11014, 20539)


In [13]:
# CLEANING & IMPUTATION

# Drop rows where TARGET variables are missing (cannot train without labels)
df_final = df_final.dropna(subset=['OS', 'OS.time'])

# Handle Genes: fill NA with 0.
# NOTE(review): this assumes NA means "no expression detected" — confirm
# against the documentation of the expression dataset.
#
# Gene columns are selected by NAME (every column that is not one of the kept
# metadata columns) rather than by position, so the selection does not depend
# on the column order produced by the merge. Index.drop raises a KeyError if
# any metadata column is missing, which surfaces a broken merge immediately.
gene_cols = df_final.columns.drop(cols_to_keep)
df_final[gene_cols] = df_final[gene_cols].fillna(0)

# Handle Clinical Features: a missing tumor stage becomes the explicit
# category "Unknown"
df_final['ajcc_pathologic_tumor_stage'] = df_final['ajcc_pathologic_tumor_stage'].fillna('Unknown')

In [14]:
# DIMENSIONALITY REDUCTION

# 1. Standardize the gene columns; PCA is sensitive to feature scale.
print(" - Step 1: Normalizing gene expression data...")
scaler = StandardScaler()
X_genes = df_final[gene_cols]
X_genes_scaled = scaler.fit_transform(X_genes)

# 2. Project onto principal components.
# A float n_components between 0 and 1 asks PCA to keep as many components as
# are needed to explain that fraction of the variance (here 95%).
print(" - Step 2: Running PCA...")
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_genes_scaled)

print(f"   Original Features: {X_genes.shape[1]}")
print(f"   Reduced to PCA Components: {X_pca.shape[1]}")

# 3. Wrap the component matrix in a DataFrame that shares df_final's row
# index, with columns named 'PCA_1', 'PCA_2', ...
pca_cols = [f'PCA_{k}' for k in range(1, X_pca.shape[1] + 1)]
df_pca = pd.DataFrame(data=X_pca, index=df_final.index, columns=pca_cols)

# 4. Metadata columns side by side with the PCA columns; the original gene
# columns are not carried over.
df_final_pca = pd.concat([df_final[cols_to_keep], df_pca], axis='columns')

print(f"New Data Shape: {df_final_pca.shape}")

# From here on df_final refers to the reduced frame.
# NOTE: this rebinding means the cell cannot be re-run without first
# re-running the cells that build the gene-level df_final.
df_final = df_final_pca

 - Step 1: Normalizing gene expression data...
 - Step 2: Running PCA...
   Original Features: 20530
   Reduced to PCA Components: 3390
New Data Shape: (10952, 3399)


In [15]:
# ==========================================
# ENCODE CLINICAL FEATURES
# ==========================================

# --- Age: impute missing values with the column median ---
df_final['age_at_initial_pathologic_diagnosis'] = (
    df_final['age_at_initial_pathologic_diagnosis']
    .fillna(df_final['age_at_initial_pathologic_diagnosis'].median())
)

# --- A. Standardize the numerical feature into a new 'age_scaled' column ---
scaler_age = StandardScaler()
df_final['age_scaled'] = scaler_age.fit_transform(
    df_final[['age_at_initial_pathologic_diagnosis']]
)

# --- B. Integer-encode the categorical features ---
categorical_cols = [
    'cancer type abbreviation',
    'ajcc_pathologic_tumor_stage',
    'histological_type',
    'gender',
    'race',
]

# One fitted LabelEncoder per column, keyed by the source column name,
# so the integer codes can be mapped back to labels later.
encoders = {}

for col in categorical_cols:
    # Missing categories become the explicit label "Unknown".
    df_final[col] = df_final[col].fillna("Unknown")

    # Encode the string form of the column into '<col>_encoded'.
    le = LabelEncoder()
    df_final[f'{col}_encoded'] = le.fit_transform(df_final[col].astype(str))
    encoders[col] = le

# ==========================================
# FINALIZE DATAFRAME
# ==========================================
# PCA feature columns: every column whose name contains 'PCA_'.
pca_cols = [name for name in df_final.columns if 'PCA_' in name]

# Scaled age followed by the encoded version of each categorical column,
# in the same order as categorical_cols.
clinical_features = ['age_scaled'] + [f'{name}_encoded' for name in categorical_cols]

targets = ['OS', 'OS.time']

# Column order of the output: key, targets, clinical features, PCA features.
final_columns = ['sample', *targets, *clinical_features, *pca_cols]
df_ready = df_final.loc[:, final_columns].copy()

print(f"Final Data Shape: {df_ready.shape}")


Final Data Shape: (10952, 3399)


In [16]:
# Write the prepared table to CSV; index=False omits the row index.
output_filename = 'data/processed_pancan_pca.csv'
df_ready.to_csv(path_or_buf=output_filename, index=False)

print("Preprocessing Pipeline Complete!")

Preprocessing Pipeline Complete!
