In [22]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the expression matrix from CSV; the first CSV column is used as the row index.
# NOTE(review): this is an absolute, machine-specific path — consider deriving it
# from a configurable data directory so the notebook runs on other machines.
expression_csv = '/Users/sarah/Code/bioinformatics-tool/data/GSE96058_gene_expression_3273_samples_and_136_replicates_transformed.csv'
em = pd.read_csv(expression_csv, index_col=0)

In [23]:
# Flip the matrix so that rows become columns and vice versa; the message below
# reports the resulting orientation as (samples x genes).
em_T = em.transpose()
print(f"Initial shape (samples x genes): {em_T.shape}")

Initial shape (samples x genes): (3409, 30865)


Drop constant columns (genes with no variation across samples) and genes with very low expression (expressed in fewer than 1% of samples).

In [24]:
import numpy as np

# --- Step 1: remove constant columns -------------------------------------
# A column is "constant" when it holds at most one distinct value across all rows.
# (DataFrame.nunique() ignores NaN by default, so an all-NaN column counts as constant.)
distinct_per_col = em_T.nunique()
constant_cols = distinct_per_col[distinct_per_col <= 1].index
cleaned_em_T = em_T.drop(constant_cols, axis="columns")
print(f"Dropping {len(constant_cols)} constant columns.")

# --- Step 2: remove low-expression columns -------------------------------
# A value counts as "expressed" when it differs from the pseudo-zero level by more
# than `tolerance`. Columns expressed in fewer than 1% of the rows are dropped.
tolerance = 1e-2
pseudo_zero = np.log2(0.1)  # ≈ -3.32; presumably the log2 of a 0.1 floor value — TODO confirm
min_samples = int(0.01 * len(cleaned_em_T))

# Per column: number of rows whose value is farther than `tolerance` from pseudo_zero.
# NaN entries compare as False and therefore count as "not expressed".
expressed = cleaned_em_T.sub(pseudo_zero).abs().gt(tolerance).sum(axis=0)
low_expression_cols = expressed.loc[expressed.lt(min_samples)].index

print(f"Dropping {len(low_expression_cols)} low-expression columns (not equal to {pseudo_zero:.2f} in < {min_samples} samples).")

filtered_em_T = cleaned_em_T.drop(low_expression_cols, axis="columns")

Dropping 2236 constant columns.
Dropping 2479 low-expression columns (not equal to -3.32 in < 34 samples).


Apply median-centering per column: subtract each column's median from every value in that column.

In [25]:
# Median-center every column: compute each column's median down the rows, then
# subtract it from all values in that column (the medians are aligned on column labels).
column_medians = filtered_em_T.median(axis=0)
median_centered = filtered_em_T.sub(column_medians, axis="columns")

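Applying z-scoring per column. Columns whose standard deviation is zero are dropped first, because dividing by a zero standard deviation would give undefined z-scores.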

In [26]:
# Remove columns whose standard deviation is exactly zero after median-centering.
# ddof=0 selects the population standard deviation (divide by N rather than N - 1).
stds = median_centered.std(axis=0, ddof=0)
zero_std_cols = stds.loc[stds.eq(0)].index

print(f"Dropping {len(zero_std_cols)} columns with zero standard deviation after centering.")

# Note: this rebinds `median_centered` to the reduced frame, so cells that follow
# see the matrix without the zero-variance columns.
median_centered = median_centered.drop(zero_std_cols, axis="columns")


Dropping 2 columns with zero standard deviation after centering.


In [27]:
# Persist the median-centered matrix (row index and header included) as CSV.
# NOTE(review): absolute, machine-specific output path — consider a configurable data directory.
median_centered_csv = '/Users/sarah/Code/bioinformatics-tool/data/GSE96058_median_centered.csv'
median_centered.to_csv(median_centered_csv)

In [19]:
# Z-score each column: subtract the column mean, then divide by the column's
# population standard deviation (ddof=0). Both statistics are computed from
# `median_centered` and aligned on column labels.
column_means = median_centered.mean(axis=0)
column_stds = median_centered.std(axis=0, ddof=0)
zscored = median_centered.sub(column_means, axis="columns").div(column_stds, axis="columns")


In [20]:
# Sanity check: count missing values per column of the z-scored matrix and list
# only the columns that contain at least one (an empty Series means no NaNs).
nan_counts = zscored.isna().sum()
na_cols = nan_counts[nan_counts > 0]
print(na_cols)


Series([], dtype: int64)


In [None]:
# Optional export of the z-scored matrix; intentionally left disabled in this notebook.
# Uncomment the line below to write `zscored` to CSV.
# zscored.to_csv("/Users/sarah/Code/bioinformatics-tool/data/GSE96058_zscored.csv")