In [None]:
import pandas as pd

from sklearn.decomposition import PCA

In [None]:
# Number of variables kept per omics table by the dimensionality-reduction cells:
# the count of top-variance RNA genes and proteins that are retained, and the number
# of PCA components computed for the gene-level (CNA) table. The value is also
# embedded in the output file names (e.g. reduced_rnaseq_<n_variables>.csv).
n_variables = 30

In [None]:
from pathlib import Path

# Notebooks have no __file__, so the kernel's current working directory stands in
# for the notebook's own location.
SCRIPT_DIR = Path.cwd().resolve()
DATA_ROOT  = SCRIPT_DIR.joinpath("..", "datasets_TCGA").resolve()
MERGE_DIR  = DATA_ROOT / "merged"
# Create the merged directory (and any missing parents) if it is not there yet.
MERGE_DIR.mkdir(parents=True, exist_ok=True)

# Read the three merged omics tables, in order: RNA-seq (QC'd), RPPA protein, CNA.
merged_rna, merged_prot, merged_dna = (
    pd.read_csv(MERGE_DIR / csv_name)
    for csv_name in ("merged_rnaseq_QC.csv", "merged_rppa.csv", "merged_cna.csv")
)

In [None]:
# Report the (rows, columns) of each freshly loaded table on a single line.
shape_report = [
    f"RNA shape: {merged_rna.shape}",
    f"Protein shape: {merged_prot.shape}",
    f"DNA shape: {merged_dna.shape}",
]
print(*shape_report)

In [None]:
### OPTIONAL FILTERING: keep only genes with gene type 'protein_coding' ###
# NOTE: this filter is currently disabled. Everything between the triple quotes is a
# bare string literal, so none of it executes (as the last expression of the cell,
# the string itself is simply echoed as the cell output).
# Before enabling it, note that:
#   * `rnaseq` and `gene_level` are only assigned in the next cell (sample filtering),
#     so this code would have to run after that cell;
#   * it reads '../datasets/gene_types.csv', not the '../datasets_TCGA' directory used
#     by the rest of the notebook -- TODO confirm the intended path;
#   * it assumes every protein-coding gene_id exists as a column in both tables,
#     otherwise the column selection raises a KeyError -- TODO confirm.
'''
# get list of genes that have protein coding gene type 
gene_types = pd.read_csv('../datasets/gene_types.csv')
protein_coding_genes = gene_types[gene_types['gene_type'] == 'protein_coding']['gene_id']

# Filter the columns
filtered_columns = ['sample_id'] + protein_coding_genes.tolist()

rnaseq = rnaseq[filtered_columns]
gene_level = gene_level[filtered_columns]
'''

In [None]:
# Keep only samples that have 2 or more other omics: every sample_id listed in the
# "removed <2 omics" summary file is dropped from each table.
# The file is located through DATA_ROOT (like the other inputs) instead of a second,
# hard-coded relative path.
to_delete = pd.read_csv(DATA_ROOT / "summary_removed_<2_omics_TCGA.tsv", sep='\t')
to_delete_ids = to_delete['sample_id'].astype(str).values


def _drop_listed_samples(omics_df, ids_to_drop):
    """Return `omics_df` without the rows whose sample_id is in `ids_to_drop`.

    sample_id is compared as a string so that numeric and textual IDs match.
    The result gets a fresh 0..n-1 index.
    """
    keep_mask = ~omics_df['sample_id'].astype(str).isin(ids_to_drop)
    return omics_df[keep_mask].reset_index(drop=True)


rnaseq = _drop_listed_samples(merged_rna, to_delete_ids)
protein = _drop_listed_samples(merged_prot, to_delete_ids)
gene_level = _drop_listed_samples(merged_dna, to_delete_ids)
# NOTE: the CNA table is called `gene_level` only inside this notebook; it is saved
# under a different name (reduced_cna*.csv) because the "gene level" label is
# biologically wrong.

In [None]:
# Count, for each omics table, how many columns contain at least one missing value.
# NOTE (G): the "gene level" label is a bit confusing, since one would think of RNA
# directly; here it refers to the table loaded from merged_cna.csv.
missing_value_checks = (
    ("Number of rnaseq columns with missing values:", rnaseq),
    ("Number of protein columns with missing values:", protein),
    ("Number of gene level columns with missing values:", gene_level),
)

for report_label, omics_table in missing_value_checks:
    # Per-column NaN tally, then the number of columns whose tally is non-zero.
    nan_count_by_column = omics_table.isna().sum()
    columns_with_nan = (nan_count_by_column > 0).sum()
    print(report_label, columns_with_nan)

In [None]:
# Remove every column that holds at least one missing value, in all three tables.
rnaseq, protein, gene_level = (
    omics_table.dropna(axis="columns")
    for omics_table in (rnaseq, protein, gene_level)
)

# Show the resulting (rows, columns) of each table.
for shape_label, omics_table in (
    ("rnaseq shape:", rnaseq),
    ("protein shape:", protein),
    ("gene level shape:", gene_level),
):
    print(shape_label, omics_table.shape)

In [None]:
# TODO: Discuss the harsh drop of DNA CNV.

In [None]:
# Write the sample-filtered tables (columns with missing values already removed)
# into MERGE_DIR under their "reduced_*" names, without the pandas index column.
filtered_outputs = (
    (rnaseq, "reduced_rnaseq.csv"),
    (protein, "reduced_rppa.csv"),
    (gene_level, "reduced_cna.csv"),
)

for omics_table, out_name in filtered_outputs:
    omics_table.to_csv(MERGE_DIR / out_name, index=False)

## Dimensionality Reduction

In [None]:
# RNA data: keep the `n_variables` genes with the highest variance across samples.

# Variance of every gene column ('sample_id' is an identifier, not a feature)
variances = rnaseq.drop(columns=['sample_id']).var()

# nlargest returns the labels ordered by decreasing variance and ranks NaN variances
# last. This replaces the positional argsort round-trip, whose handling of NaN
# (a -1 sentinel) is deprecated and could silently select the wrong column.
rna_to_keep_columns = variances.nlargest(n_variables).index

# Add 'sample_id' back as the first column and subset the dataframe
columns_to_keep = ['sample_id'] + list(rna_to_keep_columns)
rnaseq_reduced = rnaseq[columns_to_keep]

In [None]:
# Protein data: keep the `n_variables` proteins with the highest variance across samples.

# Variance of every protein column ('sample_id' is an identifier, not a feature)
variances = protein.drop(columns=['sample_id']).var()

# nlargest returns the labels ordered by decreasing variance and ranks NaN variances
# last. This replaces the positional argsort round-trip, whose handling of NaN
# (a -1 sentinel) is deprecated and could silently select the wrong column.
proteins_to_keep_columns = variances.nlargest(n_variables).index

# Add 'sample_id' back as the first column and subset the dataframe
columns_to_keep = ['sample_id'] + list(proteins_to_keep_columns)
protein_reduced = protein[columns_to_keep]

In [None]:
# Gene level data (CNA table): project all features onto `n_variables` principal components.

# Separate features and sample_id
features = gene_level.drop(columns=['sample_id'])
sample_ids = gene_level['sample_id']

# Apply PCA to reduce to `n_variables` components.
# random_state is fixed because, for large inputs, the default svd_solver='auto' can
# pick the randomized solver, whose output would otherwise change between runs.
pca = PCA(n_components=n_variables, random_state=0)
pca_components = pca.fit_transform(features)

# Create a DataFrame with PCA components and sample_id.
# Column names follow the actual number of returned components, and sample_id is
# attached positionally so the result does not depend on gene_level's index labels.
pca_df = pd.DataFrame(
    pca_components,
    columns=[f'PC{i+1}' for i in range(pca_components.shape[1])],
)
pca_df.insert(0, 'sample_id', sample_ids.to_numpy())
gene_level_reduced = pca_df

In [None]:
# Write the dimensionality-reduced tables into MERGE_DIR; the file names carry
# `n_variables` so runs with different settings do not overwrite each other.
reduced_outputs = (
    (rnaseq_reduced, f"reduced_rnaseq_{n_variables}.csv"),
    (protein_reduced, f"reduced_rppa_{n_variables}.csv"),
    (gene_level_reduced, f"reduced_cna_{n_variables}.csv"),
)

for reduced_table, out_name in reduced_outputs:
    reduced_table.to_csv(MERGE_DIR / out_name, index=False)
