# Neural network for classification of contaminants with MAT

## 1. Formulate/outline the problem: classification

Simple neural network for classification of the contaminants using MAT transcriptomes


In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from tensorflow import keras

In [None]:
file_name = "gene_counts_NN_training.csv"

In [120]:
data = pd.read_csv(file_name)

## 2. Identify inputs and outputs

In [121]:
data.head()

Unnamed: 0,sample,HUNK,KLHL8,ZNF576,UGT1A5,FCF1,SLC2A7,FABP12,TAF12,GLRX,...,ASIC2,TSN,EVA1A,NFATC1,PORCN,GALNT17,DEPDC4,REL,RPL23,AHSP
0,R848,1.0,282.0,84.0,0.0,500.0,0.0,0.0,124.0,281.0,...,0.0,722.0,0.0,688.0,45.0,0.0,0.0,12160.0,6042.0,1.0
1,CRX-527,0.0,35.0,5.0,0.0,63.0,0.0,0.0,14.0,20.0,...,0.0,55.0,0.0,94.0,2.0,0.0,0.0,484.0,746.0,0.0
2,CL-307,3.0,152.0,41.0,0.0,293.0,0.0,0.0,88.0,154.0,...,0.0,462.0,0.0,495.0,38.0,0.0,0.0,4353.0,3391.0,2.0
3,Pam3,0.0,55.0,28.0,0.0,152.0,0.0,0.0,32.0,60.0,...,0.0,204.0,0.0,234.0,10.0,0.0,0.0,1350.0,2014.0,1.0
4,Pam3,0.0,115.0,32.0,0.0,235.0,0.0,0.0,68.0,71.0,...,0.0,375.0,0.0,409.0,23.0,0.0,0.0,3349.0,2760.0,0.0


## 3. Prepare data

In [128]:
encoding = array(data["sample"])
encoding.reshape(-1, 1)

NameError: name 'array' is not defined

In [126]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown="ignore")

enc.fit(data["sample"])

ValueError: Expected 2D array, got 1D array instead:
array=['R848' 'CRX-527' 'CL-307' 'Pam3' 'Pam3' 'CRX-527' 'Fla-PA' 'LPS' 'CL-307'
 'CL-307' 'PGN' 'R848' 'LPS' 'IMDM' 'LTA' 'R848' 'IMDM' 'CRX-527' 'PGN'
 'LTA' 'PGN' 'LPS' 'IMDM' 'CL-307' 'IMDM' 'R848' 'PGN' 'CRX-527' 'CRX-527'
 'Fla-PA' 'CL-307' 'Pam3' 'LPS' 'LTA' 'R848' 'PGN' 'Pam3' 'Fla-PA' 'Pam3'
 'IMDM' 'LPS'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [104]:
target = pd.get_dummies(data["sample"], dtype=int)
target.head()

Unnamed: 0,CL-307,CRX-527,Fla-PA,IMDM,LPS,LTA,PGN,Pam3,R848
0,0,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,1,0


In [102]:
data_features = data.drop(columns=["sample"])

In [105]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data_features, target, test_size=0.2, random_state=0, shuffle=True, stratify=target
)

In [None]:
X_train.shape, X_test.shape

In [106]:
y_train = to_categorical(y_train, num_classes=9)
y_test = to_categorical(y_test, num_classes=9)

In [119]:
y_train.shape, y_test.shape

((32, 9, 9), (9, 9, 9))

In [107]:
import numpy as np


def to_normalize_DESeq2_style(data):
    # Ensure all values are non-negative
    data = data.applymap(lambda x: max(x, 0))

    # Take the log
    log_data = np.log1p(data)

    # Calculate the pseudo-reference sample for each gene
    log_data["pseudo_reference"] = log_data.mean(axis=1)

    # Filter out genes with -Inf as their average
    filtered_log_data = log_data[log_data["pseudo_reference"] != float("-inf")]

    # Subtract the gene pseudo-references from log counts
    ratio_data = filtered_log_data.iloc[:, :-1].sub(
        filtered_log_data["pseudo_reference"], axis=0
    )

    # Find the median of the ratios for each sample
    sample_medians = ratio_data.median(axis=0)

    # Convert medians to scaling factors
    scaling_factors = np.exp(sample_medians)

    # Divide the original counts by the scaling factors
    manually_normalized = data.div(scaling_factors)

    return manually_normalized


X_train = to_normalize_DESeq2_style(X_train)

X_test = to_normalize_DESeq2_style(X_test)

## 4. Build an architecture from scratch or choose a pretrained model

In [None]:
keras.backend.clear_session()
keras.utils.set_random_seed(2)

In [108]:
# create our model
def create_nn():
    inputs = keras.Input(shape=X_train.shape[1])
    x = keras.layers.Dense(10, activation="relu")(inputs)
    outputs = keras.layers.Dense(9, activation="softmax")(x)

    model = keras.Model(inputs=inputs, outputs=outputs, name="small_NN")
    return model


model = create_nn()
model.summary()

Model: "small_NN"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 19992)]           0         
                                                                 
 dense_4 (Dense)             (None, 10)                199930    
                                                                 
 dense_5 (Dense)             (None, 9)                 99        
                                                                 
Total params: 200,029
Trainable params: 200,029
Non-trainable params: 0
_________________________________________________________________


## 5. Choose a loss function and optimizer

In [109]:
def compile_model(model):
    model.compile(
        optimizer="adam",
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[keras.metrics.RootMeanSquaredError()],
    )


compile_model(model)

In [111]:
model.compile(optimizer="adam", loss=keras.losses.CategoricalCrossentropy())

In [115]:
def compile_model(model):
    model.compile(
        optimizer="adam",
        loss=CategoricalCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )


compile_model(model)

## 6. Train model

In [117]:
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10


ValueError: in user code:

    File "/home/t.afanasyeva/mambaforge/envs/MATseq/lib/python3.10/site-packages/keras/engine/training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "/home/t.afanasyeva/mambaforge/envs/MATseq/lib/python3.10/site-packages/keras/engine/training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/t.afanasyeva/mambaforge/envs/MATseq/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "/home/t.afanasyeva/mambaforge/envs/MATseq/lib/python3.10/site-packages/keras/engine/training.py", line 1051, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/home/t.afanasyeva/mambaforge/envs/MATseq/lib/python3.10/site-packages/keras/engine/training.py", line 1109, in compute_loss
        return self.compiled_loss(
    File "/home/t.afanasyeva/mambaforge/envs/MATseq/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/home/t.afanasyeva/mambaforge/envs/MATseq/lib/python3.10/site-packages/keras/losses.py", line 142, in __call__
        losses = call_fn(y_true, y_pred)
    File "/home/t.afanasyeva/mambaforge/envs/MATseq/lib/python3.10/site-packages/keras/losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/home/t.afanasyeva/mambaforge/envs/MATseq/lib/python3.10/site-packages/keras/losses.py", line 1984, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/home/t.afanasyeva/mambaforge/envs/MATseq/lib/python3.10/site-packages/keras/backend.py", line 5559, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (32, 9, 9) and (32, 9) are incompatible


## 8. Measure performance

In [91]:
def plot_history(history, metrics):
    history_df = pd.DataFrame.from_dict(history.history)
    sns.lineplot(data=history_df[metrics])
    plt.xlabel("epochs")
    plt.ylabel("metric")

In [92]:
plot_history(history, ["accuracy", "val_accuracy"])

KeyError: "None of [Index(['accuracy', 'val_accuracy'], dtype='object')] are in the [columns]"

In [None]:
plot_history(history, ["loss", "val_loss"])