In [185]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

sklearn.set_config(transform_output="pandas")
import wandb
import tensorflow as tf
from tensorflow import keras

# tf.debugging.set_log_device_placement(True)

In [186]:
file_name = "gene_counts_NN_55_training.csv"

data = pd.read_csv(file_name)
data_features = data.drop(columns=["sample"])
target = data["sample"]

In [187]:
from sklearn.base import BaseEstimator, TransformerMixin


# Writing data normalisetion transformer
class Data_normalizer_DESeq2(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Ensure all values are non-negative
        data = X.applymap(lambda x: max(x, 0))

        # Take the log
        log_data = np.log1p(data)

        # Calculate the pseudo-reference sample for each gene
        log_data["pseudo_reference"] = log_data.mean(axis=1)

        # Filter out genes with -Inf as their average
        filtered_log_data = log_data[log_data["pseudo_reference"] != float("-inf")]

        # Subtract the gene pseudo-references from log counts
        ratio_data = filtered_log_data.iloc[:, :-1].sub(
            filtered_log_data["pseudo_reference"], axis=0
        )

        # Find the median of the ratios for each sample
        sample_medians = ratio_data.median(axis=0)

        # Convert medians to scaling factors
        scaling_factors = np.exp(sample_medians)

        # Divide the original counts by the scaling factors
        X = data.div(scaling_factors)

        return X

In [195]:
from sklearn.model_selection import train_test_split

split_params = {
    "test_size": 0.2,
    "random_state": 0,
    "shuffle": True,
}


X_train, X_test, y_train, y_test = train_test_split(
    data_features,
    target,
    **split_params,
    stratify=target,
)

In [189]:
from sklearn.pipeline import Pipeline
from category_encoders import OneHotEncoder

label_transformer = Pipeline(
    [
        (
            "one_hot_encoder",
            OneHotEncoder(),
        ),
    ]
)

y_train = label_transformer.fit_transform(y_train)
y_test = label_transformer.transform(y_test)

In [191]:
X_train.shape

(44, 19992)

In [103]:
from feature_engine.selection import DropConstantFeatures

dcf = DropConstantFeatures()
dcf.fit(X_train)

In [104]:
len(dcf.features_to_drop_)

1926

In [110]:
X_train["CD5L"].unique()

array([0.])

In [111]:
print("Number of variables before removing constant: ", X_train.shape[1])

X_train = dcf.transform(X_train)
X_test = dcf.transform(X_test)

print("Number of variables after removing constant: ", X_train.shape[1])

Number of variables before removing constant:  19992
Number of variables after removing constant:  18066


In [155]:
quasi_constant = DropConstantFeatures(tol=0.97)

# find quasi-constant features in the train set
quasi_constant.fit(X_train)

In [154]:
len(quasi_constant.features_to_drop_)

2567

In [149]:
quasi_constant.features_to_drop_[-2:]

['ASIC2', 'GALNT17']

In [148]:
X_train["GALNT17"].value_counts() / len(X_train)

GALNT17
0.0    1.0
Name: count, dtype: float64

In [150]:
X_train["ASIC2"].value_counts() / len(X_train)

ASIC2
0.0    1.0
Name: count, dtype: float64

In [151]:
# remove quasi-constant features - transform method

print("Number of variables before removing quasi-constant: ", X_train.shape[1])

X_train = quasi_constant.transform(X_train)
X_test = quasi_constant.transform(X_test)

print("Number of variables after removing quasi-constant: ", X_train.shape[1])

Number of variables before removing quasi-constant:  19992
Number of variables after removing quasi-constant:  17425


In [156]:
quasi_constant = DropConstantFeatures(tol=0.95)

# find quasi-constant features in the train set
quasi_constant.fit(X_train)

In [160]:
len(quasi_constant.features_to_drop_)

2947

In [161]:
quasi_constant.features_to_drop_[-2:]

['ASIC2', 'GALNT17']

In [165]:
quasi_constant.features_to_drop_[500:]

['SNCAIP',
 'MC5R',
 'MDGA2',
 'ACTRT1',
 'TSPY3',
 'NLRP8',
 'KRT6A',
 'KISS1',
 'FAM90A10P',
 'MASP1',
 'AL845331.2',
 'AC136352.5',
 'CGB1',
 'SPANXN1',
 'SYCN',
 'IRX1',
 'DEFA6',
 'RBMY1A1',
 'OR10X1',
 'CCDC169-SOHLH2',
 'TSPAN8',
 'CR933783.1',
 'TTLL6',
 'OR52L1',
 'APOBEC4',
 'SSU72P5',
 'TTC29',
 'TM4SF18',
 'SNAI2',
 'AC256223.1',
 'FGF14',
 'KCNK2',
 'MAGEA3',
 'DEFB127',
 'UCMA',
 'GDF3',
 'TBX22',
 'SYT10',
 'SSX4',
 'GABRB1',
 'ATP6V1G3',
 'PCDHA12',
 'SOX9',
 'NPY',
 'F2',
 'OR4A5',
 'DMRTB1',
 'RBP3',
 'HOXC9',
 'ZNF648',
 'MOBP',
 'TEX46',
 'H4C13',
 'DCAF8L1',
 'PPP1R1C',
 'C10orf113',
 'CDH10',
 'FAM181A',
 'MAGEC2',
 'PLSCR5',
 'TBC1D21',
 'CT476828.18',
 'KIF4B',
 'OR10G7',
 'ACTRT2',
 'AC270285.1',
 'PCDHB16',
 'KRBOX1',
 'GUCA2B',
 'SPANXN4',
 'GNG13',
 'AC004080.3',
 'SSX2B',
 'TEX49',
 'AL603764.2',
 'FSHR',
 'LRRC3B',
 'SMCO2',
 'OR10G6',
 'AC275546.2',
 'LCE2A',
 'PNLIPRP1',
 'MG910335.3',
 'NKX1-1',
 'FBP2',
 'AC270273.1',
 'AQP5',
 'SPEM1',
 'PCDHA7',
 'TY

In [168]:
for value in quasi_constant.features_to_drop_:
    print(X_train[value].value_counts() / len(X_train))

UGT1A5
0.0    1.0
Name: count, dtype: float64
SLC2A7
0.0    1.0
Name: count, dtype: float64
FABP12
0.0    1.0
Name: count, dtype: float64
OR8G5
0.0    0.977273
1.0    0.022727
Name: count, dtype: float64
SLC22A24
0.0    0.977273
1.0    0.022727
Name: count, dtype: float64
SFTPA1
0.0    0.977273
1.0    0.022727
Name: count, dtype: float64
PLA2G2A
0.0    1.0
Name: count, dtype: float64
LINC02203
0.0    1.0
Name: count, dtype: float64
PSG4
0.0    1.0
Name: count, dtype: float64
CPSF4L
0.0    1.0
Name: count, dtype: float64
KLK9
0.0    1.0
Name: count, dtype: float64
GABRA4
0.0    1.0
Name: count, dtype: float64
TMEM95
0.0    0.977273
2.0    0.022727
Name: count, dtype: float64
LYZL4
0.0    1.0
Name: count, dtype: float64
H3Y1
0.0    1.0
Name: count, dtype: float64
OMP
0.0    0.954545
1.0    0.045455
Name: count, dtype: float64
USP17L24
0.0    0.954545
1.0    0.045455
Name: count, dtype: float64
PRKACG
0.0    1.0
Name: count, dtype: float64
PCDHAC1
0.0    0.954545
1.0    0.045455
Name: cou

In [None]:
# remove quasi-constant features - transform method

print("Number of variables before removing quasi-constant: ", X_train.shape[1])

X_train = quasi_constant.transform(X_train)
X_test = quasi_constant.transform(X_test)

print("Number of variables after removing quasi-constant: ", X_train.shape[1])

Number of variables before removing quasi-constant:  19992
Number of variables after removing quasi-constant:  17425


In [174]:
dn = Data_normalizer_DESeq2()
X_train = dn.fit_transform(X_train)

dcf = DropConstantFeatures()
dcf.fit(X_train)

In [175]:
for value in dcf.features_to_drop_:
    print(X_train[value].value_counts() / len(X_train))

UGT1A5
0.0    1.0
Name: count, dtype: float64
SLC2A7
0.0    1.0
Name: count, dtype: float64
FABP12
0.0    1.0
Name: count, dtype: float64
PLA2G2A
0.0    1.0
Name: count, dtype: float64
LINC02203
0.0    1.0
Name: count, dtype: float64
PSG4
0.0    1.0
Name: count, dtype: float64
CPSF4L
0.0    1.0
Name: count, dtype: float64
KLK9
0.0    1.0
Name: count, dtype: float64
GABRA4
0.0    1.0
Name: count, dtype: float64
LYZL4
0.0    1.0
Name: count, dtype: float64
H3Y1
0.0    1.0
Name: count, dtype: float64
PRKACG
0.0    1.0
Name: count, dtype: float64
HSFY2
0.0    1.0
Name: count, dtype: float64
DUSP13
0.0    1.0
Name: count, dtype: float64
AC275401.1
0.0    1.0
Name: count, dtype: float64
PRAMEF19
0.0    1.0
Name: count, dtype: float64
AL662884.4
0.0    1.0
Name: count, dtype: float64
NYX
0.0    1.0
Name: count, dtype: float64
NMRK2
0.0    1.0
Name: count, dtype: float64
CCKAR
0.0    1.0
Name: count, dtype: float64
CBLN2
0.0    1.0
Name: count, dtype: float64
RIIAD1
0.0    1.0
Name: count, dty

In [177]:
from feature_engine.selection import DropConstantFeatures


dn = Data_normalizer_DESeq2()
X_train = dn.fit_transform(X_train)

dcf = DropConstantFeatures(tol=0.95)
dcf.fit(X_train)

In [178]:
for value in dcf.features_to_drop_:
    print(X_train[value].value_counts() / len(X_train))

UGT1A5
0.0    1.0
Name: count, dtype: float64
SLC2A7
0.0    1.0
Name: count, dtype: float64
FABP12
0.0    1.0
Name: count, dtype: float64
OR8G5
0.000000     0.977273
22.707825    0.022727
Name: count, dtype: float64
SLC22A24
0.000000     0.977273
22.707825    0.022727
Name: count, dtype: float64
SFTPA1
0.000000     0.977273
23.199461    0.022727
Name: count, dtype: float64
PLA2G2A
0.0    1.0
Name: count, dtype: float64
LINC02203
0.0    1.0
Name: count, dtype: float64
PSG4
0.0    1.0
Name: count, dtype: float64
CPSF4L
0.0    1.0
Name: count, dtype: float64
KLK9
0.0    1.0
Name: count, dtype: float64
GABRA4
0.0    1.0
Name: count, dtype: float64
TMEM95
0.000000     0.977273
45.415649    0.022727
Name: count, dtype: float64
LYZL4
0.0    1.0
Name: count, dtype: float64
H3Y1
0.0    1.0
Name: count, dtype: float64
OMP
0.000000     0.954545
22.707825    0.045455
Name: count, dtype: float64
USP17L24
0.000000     0.954545
22.612213    0.045455
Name: count, dtype: float64
PRKACG
0.0    1.0
Name:

95% tolerance is too low for DESeq2 filtered counts, will use 100% to get rid of all zeros.

In [184]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from feature_engine.selection import DropConstantFeatures

feature_transformer = Pipeline(
    [
        ("DESeq2normalizer", Data_normalizer_DESeq2()),
        ("constant_feature_drop", DropConstantFeatures()),
        ("minmaxscaler", MinMaxScaler()),
    ]
)

In [183]:
feature_transformer