# Neural network for classification of contaminants with MAT

## 1. Formulate/outline the problem: classification

Simple neural network for classification of the contaminants using MAT transcriptomes


In [None]:
import os

# Hide every CUDA device (set before TensorFlow is imported) so that the
# notebook runs on the CPU only.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [None]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import wandb
import tensorflow as tf
from tensorflow import keras

In [None]:
# CSV file holding the training table that is loaded into `data`.
file_name = "gene_counts_NN_training.csv"

In [None]:
# Load the whole table; the "sample" column is used as the class label and
# every other column as a feature.
data = pd.read_csv(file_name)

## 2. Identify inputs and outputs

In [None]:
# Number of rows available for each class label.
data["sample"].value_counts()

## 3. Prepare data

In [None]:
# Separate the class label column from the feature columns.
target = data["sample"]
data_features = data.drop("sample", axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. The split is shuffled with a fixed
# seed (reproducible) and stratified on the label so both parts keep similar
# class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    data_features, target, test_size=0.2, random_state=0, shuffle=True, stratify=target
)

In [None]:
# Class counts in the training split and in the test split.
y_train.value_counts(), y_test.value_counts()

In [None]:
def to_normalize_DESeq2_style(data):
    """Normalize a count table with a median-of-ratios scheme (DESeq2 style).

    The table is interpreted with one gene per row and one sample per
    column: a pseudo-reference is computed for every row, and one scaling
    factor is derived for every column. Callers whose table is laid out the
    other way round must transpose it before and after the call.

    Logs are taken with ``log1p`` (log(1 + x)), so zero counts map to 0
    instead of ``-inf`` and no row ever has to be discarded.

    Args:
        data: pandas DataFrame of numeric counts. Negative entries are
            treated as 0. The input frame is not modified.

    Returns:
        A DataFrame with the same index and columns as ``data`` in which
        every column has been divided by its scaling factor.
    """
    # Ensure all values are non-negative (vectorised; NaN stays NaN)
    data = data.clip(lower=0)

    # Take the log (log1p keeps zero counts finite)
    log_data = np.log1p(data)

    # Pseudo-reference of each row: its mean log value across all columns
    pseudo_reference = log_data.mean(axis=1)

    # Subtract the row pseudo-references from the log counts
    ratio_data = log_data.sub(pseudo_reference, axis=0)

    # Median of the log ratios within each column
    sample_medians = ratio_data.median(axis=0)

    # Convert medians back from log space to multiplicative scaling factors
    scaling_factors = np.exp(sample_medians)

    # Divide the (clipped) counts of each column by that column's factor
    return data.div(scaling_factors)


# X_train / X_test hold one sample per row and one gene per column, whereas
# to_normalize_DESeq2_style() derives one scaling factor per *column*.
# Transpose on the way in and back out so that every sample (not every gene)
# receives its own size factor.
X_train = to_normalize_DESeq2_style(X_train.T).T

X_test = to_normalize_DESeq2_style(X_test.T).T

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the class names from the training labels and encode them as integers.
le = LabelEncoder()
le.fit(y_train)
encoded_labels = le.transform(y_train)

# Map every class name to its integer code. The codes must come from the
# classes themselves: zipping `le.classes_` with `encoded_labels` would pair
# each class name with the code of an arbitrary training row.
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

In [None]:
# Display the class-name -> integer-code mapping.
le_name_mapping

In [None]:
from sklearn.feature_selection import chi2, SelectKBest

# Score every feature against the encoded class labels with the chi-squared
# statistic and keep the 500 best-scoring ones.
fs = SelectKBest(chi2, k=500)

# Fit the selector on the training data only, then reduce the training matrix
fs.fit(X_train, encoded_labels)
X_train = fs.transform(X_train)

In [None]:
from sklearn.decomposition import PCA

# Project the selected training features onto two principal components.
n_components = 2
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train)

# Scatter plot of the two components, coloured by encoded class label
plt.scatter(*X_train_pca.T, c=encoded_labels, cmap="Set1")
plt.title("PCA Visualization of Selected Features")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.colorbar(label="Class Label")
plt.show()

In [None]:
# Reduce the test feature matrix with the selector that was fitted on the
# training data, so both matrices keep the same features.
X_test = fs.transform(X_test)

In [None]:
# One-hot encode the labels (one 0/1 column per class).
y_train = pd.get_dummies(y_train, dtype=int)

# Align the test columns to the training columns: a class that is missing
# from the test split still gets an all-zero column, and the column order is
# guaranteed to match the network's output units.
y_test = pd.get_dummies(y_test, dtype=int).reindex(
    columns=y_train.columns, fill_value=0
)

In [None]:
# Shapes of the feature matrices after feature selection.
X_train.shape, X_test.shape

In [None]:
# Shapes of the one-hot label tables.
y_train.shape, y_test.shape

## 4. Build an architecture from scratch or choose a pretrained model

In [None]:
# Reset Keras' global state and fix the random seed before building the model.
keras.backend.clear_session()
keras.utils.set_random_seed(2)

In [None]:
# W&B picks the notebook name up from the WANDB_NOTEBOOK_NAME *environment
# variable*; a plain Python variable is never seen by the library, so the
# value is exported as well.
WANDB_NOTEBOOK_NAME = "20230829 NN with Wandb.ipynb"
os.environ["WANDB_NOTEBOOK_NAME"] = WANDB_NOTEBOOK_NAME

In [None]:
# Default hyperparameters handed to wandb.init(config=...) in create_nn().
default_config = dict(
    hidden_units=56,
    dropout_rates=0.2,
    batch_size=15,
    activation="relu",
    weights_limit=0.01,
    epochs=300,
    learning_rate=0.001,
    loss="categorical_crossentropy",
)

In [None]:
# Start a W&B run in the "NN-MATseq" project.
# NOTE(review): create_nn() calls wandb.init() again without a project name;
# confirm whether a second init is intended.
run = wandb.init(project="NN-MATseq")

## 6. Train model

In [None]:
# Stop training once the monitored quantity "loss" has not improved for
# 3 consecutive epochs.
early_stop = keras.callbacks.EarlyStopping(monitor="loss", patience=3)

In [None]:
from wandb.keras import WandbCallback


def create_nn():
    """Build, compile and train the classifier inside a W&B run.

    Hyperparameters are read from the config of the run opened with
    ``default_config``. The network is: unit normalisation, two
    L2-regularised dense layers each followed by dropout, and a softmax
    output with one unit per label column.

    The function relies on the notebook globals ``X_train``, ``y_train``,
    ``X_test``, ``y_test``, ``early_stop`` and ``default_config``.

    Returns:
        The trained ``keras.Model``.
    """
    with wandb.init(config=default_config) as run:
        # Read every hyperparameter from the run config.
        cfg = run.config
        hidden_units = cfg.hidden_units
        dropout_rates = cfg.dropout_rates
        batch_size = cfg.batch_size
        activation = cfg.activation
        weights_limit = cfg.weights_limit
        epochs = cfg.epochs
        learning_rate = cfg.learning_rate
        loss = cfg.loss

        # Derive layer sizes from the data instead of hard-coding them
        n_features = X_train.shape[1]
        n_classes = y_train.shape[1]

        # `shape` must be a tuple describing one sample (without batch axis)
        inputs = keras.Input(shape=(n_features,))
        x = keras.layers.UnitNormalization()(inputs)

        # Two identical hidden blocks: Dense (L2-regularised) + Dropout
        for _ in range(2):
            x = keras.layers.Dense(
                hidden_units,
                activation=activation,
                kernel_regularizer=keras.regularizers.L2(weights_limit),
            )(x)
            x = keras.layers.Dropout(dropout_rates)(x)

        outputs = keras.layers.Dense(n_classes, activation="softmax")(x)

        model = keras.Model(inputs=inputs, outputs=outputs, name="small_NN")

        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
            loss=loss,
            metrics=[
                "accuracy",
                keras.metrics.AUC(name="auc"),
            ],
        )

        # batch_size is passed explicitly; otherwise Keras silently falls
        # back to its own default and the configured value is ignored.
        model.fit(
            X_train,
            y_train,
            batch_size=batch_size,
            epochs=epochs,
            callbacks=[WandbCallback(), early_stop],
            validation_data=(X_test, y_test),
            verbose=1,
        )

    # Hand the trained model back so the caller can inspect or reuse it
    return model


# Build and train the network, then print its layer summary.
model = create_nn()
model.summary()

In [None]:
# Close whichever W&B run is still active. `wandb.run` is None once a run has
# already been finished (e.g. by a `with wandb.init(...)` block), in which
# case `wandb.run.finish()` would raise; `wandb.finish()` is a no-op then.
wandb.finish()