In [5]:
import pandas as pd
import numpy as np
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler, QuantileTransformer, RobustScaler, PowerTransformer

# Function to check and replace inf or NaN with a large finite number
def check_and_replace_infs(df, replacement_value=1e6):
    """Replace non-finite entries of ``df`` in place with finite sentinels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sanitise. It is modified in place.
    replacement_value : float, default 1e6
        Magnitude of the sentinel. ``+inf`` and ``NaN`` become
        ``replacement_value``; ``-inf`` becomes ``-replacement_value`` so that
        the sign of an overflowed value is not flipped.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call chaining.
    """
    df.replace(np.inf, replacement_value, inplace=True)
    # Keep negative infinities negative; mapping them to the positive sentinel
    # would turn the smallest values of a column into its largest.
    df.replace(-np.inf, -replacement_value, inplace=True)
    df.fillna(replacement_value, inplace=True)
    return df

# Load the gene expression data
# (the path is relative to the notebook's working directory; every
# transformation below takes this frame as its input)
data = pd.read_csv("data/ml_data/RR1-combined.csv")

# Function to apply transformations and append to dataframe
def transform_and_append(df, transformation, transformation_name, apply_to_whole_df=False):
    """Return a transformed copy of ``df`` whose rows are tagged with the transformation name.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Must contain an ``'Environment'`` string column; it is
        not modified.
    transformation : callable
        With ``apply_to_whole_df=False`` it is called once per numeric column
        (via ``DataFrame.apply``). With ``apply_to_whole_df=True`` it is
        called once with the frame of all numeric columns and should return
        a DataFrame (or a 2-D array of the same shape).
    transformation_name : str
        Suffix appended to every ``'Environment'`` value as ``_<name>``.
    apply_to_whole_df : bool, default False
        Selects between the per-column and whole-frame calling conventions.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with numeric columns replaced by their transformed
        values (cast back to the original column dtypes) and the
        ``'Environment'`` labels suffixed.
    """
    transformed_data = df.copy()
    numeric = transformed_data.select_dtypes(include=[np.number])

    if apply_to_whole_df:
        transformed_numeric = transformation(numeric)
        if not isinstance(transformed_numeric, pd.DataFrame):
            # Plain arrays (e.g. straight from an sklearn transformer) carry
            # no labels; rows/columns are positional, so reuse the input's.
            transformed_numeric = pd.DataFrame(
                np.asarray(transformed_numeric),
                index=numeric.index,
                columns=numeric.columns,
            )
    else:
        transformed_numeric = numeric.apply(transformation)

    # Replace NaN/inf *before* casting: casting NaN or inf to an integer dtype
    # raises, so the original cast-then-sanitise order failed on int columns.
    transformed_numeric = check_and_replace_infs(transformed_numeric)

    # Cast transformed data to the same type as the original data.
    # NOTE(review): for integer source columns this truncates the transformed
    # values; confirm that is intended if such columns exist.
    transformed_numeric = transformed_numeric.astype(
        {col: transformed_data[col].dtype for col in transformed_numeric.columns}
    )

    # update() aligns on index and column labels.
    transformed_data.update(transformed_numeric)
    transformed_data['Environment'] = transformed_data['Environment'] + f'_{transformation_name}'
    return transformed_data

# Apply log transformation
# (np.log1p, i.e. log(1 + x), on each numeric column; rows are tagged '_log')
log_transformed = transform_and_append(data, np.log1p, 'log')

# Apply sqrt transformation
# (element-wise square root of each numeric column; rows are tagged '_sqrt')
sqrt_transformed = transform_and_append(data, np.sqrt, 'sqrt')

# Apply Box-Cox transformation
def boxcox_transform(x):
    """Box-Cox transform a 1-D numeric vector with a fitted lambda.

    The input is first shifted so that its minimum becomes 1, because
    ``scipy.stats.boxcox`` only accepts strictly positive data.

    Parameters
    ----------
    x : array-like or pandas.Series
        Numeric values (one column when used through ``DataFrame.apply``).

    Returns
    -------
    numpy.ndarray
        Transformed values, same length as ``x``. A constant input returns
        all zeros: Box-Cox cannot fit a lambda to constant data, and the
        shifted values are all ~1, which maps to 0 for any lambda.
    """
    x_positive = x + 1 - np.min(x)
    shifted = np.asarray(x_positive, dtype=float)

    # scipy raises "Data must not be constant" here; short-circuit instead so
    # one flat column does not abort the whole per-column apply.
    if shifted.size > 0 and np.all(shifted == shifted.flat[0]):
        return np.zeros(shifted.shape)

    transformed, _ = boxcox(x_positive)
    return transformed

# Box-Cox is fitted independently per numeric column; rows are tagged '_boxcox'.
boxcox_transformed = transform_and_append(data, boxcox_transform, 'boxcox')

# Apply Centered Log Ratio (CLR) transformation
def clr_transform(x):
    """Centered log-ratio of a numeric vector, using a pseudocount of 1.

    Each entry is shifted by 1, divided by the geometric mean of the shifted
    vector, and log-transformed.

    Parameters
    ----------
    x : numpy.ndarray or pandas.Series
        Numeric values.

    Returns
    -------
    Same container type as ``x`` holding ``log((x + 1) / gm(x + 1))``.
    """
    shifted = x + 1  # pseudocount of 1 keeps log() finite for zero entries
    geometric_mean = np.exp(np.mean(np.log(shifted)))
    return np.log(shifted / geometric_mean)

# clr_transform is invoked once per numeric column (per-column mode of
# transform_and_append); rows are tagged '_clr'.
clr_transformed = transform_and_append(data, clr_transform, 'clr')

# Median of Ratios (DESeq2) transformation approximation
def mr_transform(df):
    """Normalise numeric columns with a median-of-ratios size factor per row.

    Steps, treating rows as samples and numeric columns as features:

    1. reference per feature = geometric mean over rows of ``value + 1``;
    2. size factor per row = median over features of ``value / reference``;
    3. every row is divided by its own size factor.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; not modified. Non-numeric columns are carried through
        unchanged.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with normalised numeric columns, cast back to their
        original dtypes. NaN/inf results (e.g. from a zero size factor) are
        replaced by ``check_and_replace_infs``.
    """
    transformed_data = df.copy()
    counts = transformed_data.select_dtypes(include=[np.number])

    geometric_means = np.exp(np.mean(np.log(counts + 1), axis=0))  # Add 1 to avoid log(0)

    # Size factors belong to samples (rows): take the median across features
    # (axis=1) and divide row-wise (axis=0). Taking the median with axis=0
    # yields one factor per feature, which rescales features rather than
    # normalising samples.
    size_factors = counts.divide(geometric_means, axis=1).median(axis=1)
    normalized_counts = counts.divide(size_factors, axis=0)

    # Sanitise before casting: NaN/inf cannot be cast to integer dtypes.
    normalized_counts = check_and_replace_infs(normalized_counts)

    # Cast normalized counts to the same type as the original data
    normalized_counts = normalized_counts.astype(
        {col: transformed_data[col].dtype for col in normalized_counts.columns}
    )

    transformed_data.update(normalized_counts)
    return transformed_data

# mr_transform leaves 'Environment' untouched, so the '_mr' suffix is added
# here, starting from the original labels in `data`.
mr_transformed = mr_transform(data)
mr_transformed['Environment'] = data['Environment'] + '_mr'

# Apply Z-Score Normalization
def zscore_transform(x):
    """Standardise a numeric vector to zero mean and unit standard deviation.

    Parameters
    ----------
    x : numpy.ndarray or pandas.Series
        Numeric values.

    Returns
    -------
    Same container type as ``x`` holding ``(x - mean) / std`` with ``np.std``
    as the scale. A constant input (standard deviation exactly 0) returns the
    centred values, i.e. all zeros, instead of the NaNs that 0/0 would give.
    """
    centered = x - np.mean(x)
    scale = np.std(x)
    # Only guard the scalar case; a NaN scale falls through unchanged.
    if np.ndim(scale) == 0 and scale == 0:
        return centered
    return centered / scale

# Each numeric column is standardised on its own; rows are tagged '_zscore'.
zscore_transformed = transform_and_append(data, zscore_transform, 'zscore')

# Apply Quantile Transformation
def quantile_transform(df):
    """Map every numeric column of ``df`` onto a normal distribution by quantiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; only its numeric columns are used.

    Returns
    -------
    pandas.DataFrame
        Transformed numeric columns with the same row index and column labels
        as the numeric part of ``df``; NaN/inf are replaced by
        ``check_and_replace_infs``.
    """
    numeric = df.select_dtypes(include=[np.number])

    # sklearn clips n_quantiles to the sample count anyway (with a warning);
    # doing it explicitly gives the same fit without the warning.
    n_quantiles = max(1, min(1000, len(numeric)))
    qt = QuantileTransformer(
        n_quantiles=n_quantiles, output_distribution='normal', random_state=0
    )
    transformed = qt.fit_transform(numeric)

    # Keep the input's index: callers merge the result back with the
    # index-aligned DataFrame.update, which would misalign on a fresh
    # RangeIndex if `df` is not 0..n-1 indexed.
    transformed_df = pd.DataFrame(transformed, index=numeric.index, columns=numeric.columns)
    transformed_df = check_and_replace_infs(transformed_df)
    return transformed_df

# Whole-frame mode: quantile_transform receives all numeric columns at once
# (apply_to_whole_df=True); rows are tagged '_quantile'.
quantile_transformed = transform_and_append(data, quantile_transform, 'quantile', apply_to_whole_df=True)

# Apply Min-Max Scaling
def minmax_transform(x):
    """Rescale a numeric vector linearly so its minimum is 0 and maximum is 1.

    Parameters
    ----------
    x : numpy.ndarray or pandas.Series
        Numeric values.

    Returns
    -------
    Same container type as ``x`` holding ``(x - min) / (max - min)``. A
    constant input (range exactly 0) returns all zeros instead of the NaNs
    that 0/0 would give.
    """
    lower = np.min(x)
    span = np.max(x) - lower
    shifted = x - lower
    # Only guard the scalar case; a NaN span falls through unchanged.
    if np.ndim(span) == 0 and span == 0:
        # Float output keeps the result dtype consistent with the scaled path.
        return shifted.astype(float)
    return shifted / span

# Each numeric column is rescaled on its own; rows are tagged '_minmax'.
minmax_transformed = transform_and_append(data, minmax_transform, 'minmax')

# Apply Robust Scaling
def robust_transform(df):
    """Scale every numeric column of ``df`` with scikit-learn's ``RobustScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; only its numeric columns are used.

    Returns
    -------
    pandas.DataFrame
        Scaled numeric columns with the same row index and column labels as
        the numeric part of ``df``; NaN/inf are replaced by
        ``check_and_replace_infs``.
    """
    numeric = df.select_dtypes(include=[np.number])
    rs = RobustScaler()
    transformed = rs.fit_transform(numeric)

    # Keep the input's index: callers merge the result back with the
    # index-aligned DataFrame.update, which would misalign on a fresh
    # RangeIndex if `df` is not 0..n-1 indexed.
    transformed_df = pd.DataFrame(transformed, index=numeric.index, columns=numeric.columns)
    transformed_df = check_and_replace_infs(transformed_df)
    return transformed_df

# Whole-frame mode: robust_transform receives all numeric columns at once
# (apply_to_whole_df=True); rows are tagged '_robust'.
robust_transformed = transform_and_append(data, robust_transform, 'robust', apply_to_whole_df=True)

# Apply Yeo-Johnson Transformation
def yeo_johnson_transform(df):
    """Apply a Yeo-Johnson power transform to every numeric column of ``df``.

    Uses scikit-learn's ``PowerTransformer(method='yeo-johnson')`` fitted on
    the numeric columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; only its numeric columns are used.

    Returns
    -------
    pandas.DataFrame
        Transformed numeric columns with the same row index and column labels
        as the numeric part of ``df``; NaN/inf are replaced by
        ``check_and_replace_infs``.
    """
    numeric = df.select_dtypes(include=[np.number])
    pt = PowerTransformer(method='yeo-johnson')
    transformed = pt.fit_transform(numeric)

    # Keep the input's index: callers merge the result back with the
    # index-aligned DataFrame.update, which would misalign on a fresh
    # RangeIndex if `df` is not 0..n-1 indexed.
    transformed_df = pd.DataFrame(transformed, index=numeric.index, columns=numeric.columns)
    transformed_df = check_and_replace_infs(transformed_df)
    return transformed_df

# Whole-frame mode: yeo_johnson_transform receives all numeric columns at once
# (apply_to_whole_df=True); rows are tagged '_yeojohnson'.
yeo_johnson_transformed = transform_and_append(data, yeo_johnson_transform, 'yeojohnson', apply_to_whole_df=True)

# Combine all transformed data
# The untouched table comes first, followed by one full copy per
# transformation; the copies are told apart by their 'Environment' suffix.
frames_to_stack = [
    data,
    log_transformed,
    sqrt_transformed,
    boxcox_transformed,
    clr_transformed,
    mr_transformed,
    zscore_transformed,
    quantile_transformed,
    minmax_transformed,
    robust_transformed,
    yeo_johnson_transformed,
]
# Row-wise stack; each frame keeps its own index labels, so labels repeat.
transformed_data = pd.concat(frames_to_stack, axis=0)

# Save the transformed data
# (index=False: the repeated row labels are not written to the CSV)
output_file_path = 'data/ml_data/RR1-augmented.csv'
transformed_data.to_csv(output_file_path, index=False)

# Last expression of the cell: echoes the path that was written.
output_file_path



'data/ml_data/RR1-augmented.csv'

In [6]:
# Display the combined frame; the notebook renders a truncated preview of its rows and columns.
transformed_data

Unnamed: 0,Sample name,Environment,ORO Positivity (%),ENSMUSG00000000094,ENSMUSG00000000365,ENSMUSG00000000402,ENSMUSG00000000416,ENSMUSG00000000562,ENSMUSG00000000627,ENSMUSG00000000794,...,ENSMUSG00000116378,ENSMUSG00000116461,ENSMUSG00000116594,ENSMUSG00000116780,ENSMUSG00000117081,ENSMUSG00000117286,ENSMUSG00000117748,ENSMUSG00000117874,Dataset_pheno,Environment_pheno
0,Mmus_C57-6T_LVR_BSL_Rep1_B1,BSL,23.800000,2.976790,0.000000,3.113837,8.321191,1.331374,1.565405,0.000000,...,112.098170,0.000000,4.105522,11.228127,26.986622,1.783415,0.000000,0.000000,OSD-47,BSL
1,Mmus_C57-6T_LVR_BSL_Rep2_B2,BSL,19.790000,0.000000,3.716503,0.951380,9.696798,3.661008,10.466050,2.472985,...,146.512546,0.000000,0.000000,0.000000,6.659667,1.340699,0.000000,0.000000,OSD-47,BSL
2,Mmus_C57-6T_LVR_BSL_Rep3_B3,BSL,17.440000,4.184136,7.202477,1.025815,7.115434,1.315814,0.773555,1.777646,...,27.697014,11.283960,3.477891,0.000000,4.103265,1.445594,0.000000,0.000000,OSD-47,BSL
3,Mmus_C57-6T_LVR_GC_Rep1_G2,GC,14.490000,0.000000,0.000000,0.000000,11.822939,1.588528,10.343451,0.871708,...,132.800298,0.000000,4.547898,2.468616,0.000000,0.000000,0.000000,0.000000,OSD-47,GC
4,Mmus_C57-6T_LVR_GC_Rep2_G3,GC,13.960000,0.000000,6.221168,3.883461,54.709471,0.902846,2.928471,3.364846,...,137.862922,0.000000,6.009950,0.000000,0.000000,1.368158,20.267246,11.650397,OSD-47,GC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,Mmus_BAL-TAL_LVR_FLT_Rep2_F2,FLT_yeojohnson,0.989206,1.267120,-1.152028,1.299570,1.363421,0.391793,-0.441396,0.620940,...,0.771033,0.544072,-1.022884,1.377208,-0.715441,-1.018865,-0.101824,-0.745373,OSD-137,FLT
37,Mmus_BAL-TAL_LVR_FLT_Rep3_F3,FLT_yeojohnson,0.991991,1.089003,1.235104,0.571185,0.111074,0.531185,0.115854,-1.119308,...,-0.816520,-1.225575,1.454515,-1.091593,-0.715441,1.729635,0.127577,1.554018,OSD-137,FLT
38,Mmus_BAL-TAL_LVR_FLT_Rep4_F4,FLT_yeojohnson,-0.169529,-1.213736,0.617136,0.736319,-0.337692,-1.176901,-0.488421,-1.119308,...,-0.816520,0.008826,-1.022884,0.327997,-0.715441,1.525339,-0.073070,-0.745373,OSD-137,FLT
39,Mmus_BAL-TAL_LVR_FLT_Rep5_F5,FLT_yeojohnson,-0.183531,-1.213736,-1.152028,1.194676,0.409437,1.053048,1.104045,-0.251469,...,-0.816520,0.647063,-1.022884,1.026602,-0.715441,0.421221,-0.237947,-0.745373,OSD-137,FLT
