In [1]:
# standard libraries
import math
import os
import tempfile
import json
from pathlib import Path
import pickle

# standard scientific libraries
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from numpy import asarray, save, load
import pandas as pd
import seaborn as sns

# scikit-learn
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

# tensorflow
import tensorflow as tf
import tensorflow_addons as tfa
import keras
from keras.models import Sequential
from keras.layers import Dense,Conv1D, Conv2D, MaxPooling2D, Dropout, Flatten, Input, MaxPooling1D
from keras.optimizers import RMSprop
from keras.utils import to_categorical

2024-05-01 23:52:26.857027: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.11.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or t

TODO
- fragment length (8-14)
- classifier vs. single-sequence (GNRA)
- with/without decoys

ADD VALIDATION SET

# Model

In [2]:
# Config (hyperparameters, metrics, etc.)

DATA_DIR = Path("../data_generation/training_data/gnra_8_80_T/")
# Dataset name = last component of DATA_DIR. Path.name does not depend on the
# path separator, unlike splitting str(DATA_DIR) on "/".
SET_NAME = DATA_DIR.name
# Location under results/ where the trained model and its history are written.
RESULTS_FILE = Path(f"results/{SET_NAME}")
EPOCHS = 20
BATCH_SIZE = 16
NUM_CLASSES = 2

# Fix the random seeds so that weight initialisation, dropout and batch
# shuffling are repeatable across "Restart & Run All".
SEED = 42
keras.utils.set_random_seed(SEED)

# Metrics tracked during training and validation.
# NOTE(review): labels are one-hot encoded over NUM_CLASSES columns, so the
# threshold-based metrics below (tp/fp/tn/fn, precision, recall, auc, prc)
# presumably pool both columns instead of scoring only the positive class.
# Confirm this is intended (Precision/Recall accept a `class_id` argument).
METRICS = [
      keras.metrics.TruePositives(name = 'tp'),
      keras.metrics.FalsePositives(name = 'fp'),
      keras.metrics.TrueNegatives(name = 'tn'),
      keras.metrics.FalseNegatives(name = 'fn'),
      # keras.metrics.BinaryAccuracy(name = 'accuracy'), # TODO remove this?
      keras.metrics.CategoricalAccuracy(name='accuracy'),
      keras.metrics.Precision(name = 'precision'),
      keras.metrics.Recall(name = 'recall'),
      keras.metrics.AUC(name = 'auc'),
      keras.metrics.AUC(name = 'prc', curve = 'PR'), # precision-recall curve
      tfa.metrics.F1Score(name = 'f1', num_classes = NUM_CLASSES),
      tfa.metrics.MatthewsCorrelationCoefficient(name = 'mcc', num_classes = NUM_CLASSES)
]

2024-05-01 23:52:31.118308: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13762 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:86:00.0, compute capability: 7.5


In [3]:
# Load dataset

def load_split(split, data_dir=DATA_DIR, num_classes=NUM_CLASSES):
    """Load one dataset split and return ``(features, one_hot_labels)``.

    Reads ``<split>_matrices.npz`` (array stored under the key ``'arr_0'``)
    and ``<split>_labels.npy`` from ``data_dir``.

    Args:
        split: Split prefix used in the file names ("train", "test" or "dev").
        data_dir: Directory containing the split files.
        num_classes: Number of columns of the one-hot encoded labels.

    Returns:
        Tuple ``(x, y)``: the stacked feature array and the one-hot labels.

    Raises:
        ValueError: If the number of samples and labels differ.
    """
    # SECURITY: allow_pickle=True lets NumPy unpickle object arrays, which can
    # execute arbitrary code. Only load files produced by our own pipeline.
    # The context manager closes the .npz archive once the data is in memory.
    with np.load(data_dir / f"{split}_matrices.npz", allow_pickle=True) as matrices:
        features = np.stack(matrices['arr_0'], axis=0)

    raw_labels = np.load(data_dir / f"{split}_labels.npy", allow_pickle=True)
    # One-hot encode the integer class labels for the softmax output layer.
    labels = to_categorical(raw_labels, num_classes=num_classes)

    if len(features) != len(labels):
        raise ValueError(
            f"{split}: {len(features)} samples but {len(labels)} labels")
    return features, labels


x_train, y_train = load_split("train")
x_test, y_test = load_split("test")
x_dev, y_dev = load_split("dev")

#! Merge dev and test sets (if validation set not used)
# x_test = np.concatenate((x_test, x_dev), axis=0)
# y_test = np.concatenate((y_test, y_dev), axis=0)

# Shape of a single sample; used as the model's input shape.
INPUT_SHAPE = x_train.shape[1:]

print("training features shape:", x_train.shape)
print("training labels shape:", y_train.shape)

print("\ntesting features shape:", x_test.shape)
print("testing labels shape:", y_test.shape)

print("\ndev (validation) features shape:", x_dev.shape)
print("dev (validation) labels shape:", y_dev.shape)

print("\ninput shape:", INPUT_SHAPE)

training features shape: (33084, 8, 5)
training labels shape: (33084, 2)

testing features shape: (4136, 8, 5)
testing labels shape: (4136, 2)

dev (validation) features shape: (4135, 8, 5)
dev (validation) labels shape: (4135, 2)

input shape: (8, 5)


In [4]:
def make_model(metrics=METRICS, output_bias=None, input_shape=INPUT_SHAPE, pool_size = 2):
    """Build and compile the 1D-CNN classifier.

    Architecture: two Conv1D -> MaxPooling1D -> Dropout stages, a dense hidden
    layer and a softmax output with NUM_CLASSES units. The model is compiled
    with Adam (learning rate 1e-3) and categorical cross-entropy.

    Args:
        metrics: Keras metrics passed to ``model.compile``.
        output_bias: Optional initial bias of the output layer (wrapped in a
            ``Constant`` initializer). ``None`` initialises the bias to zeros.
        input_shape: Shape of a single sample as given to the first Conv1D
            layer, e.g. ``(8, 5)`` for this dataset.
        pool_size: Window size of both max-pooling layers.

    Returns:
        The compiled ``Sequential`` model.
    """
    # The custom bias applies to the output layer only. Passing None through
    # as `bias_initializer` would bypass Dense's default, so use an explicit
    # 'zeros' when no bias is requested.
    if output_bias is None:
        output_bias_initializer = 'zeros'
    else:
        output_bias_initializer = tf.keras.initializers.Constant(output_bias)

    # Width of the hidden layer, derived from the `input_shape` argument
    # (not the global INPUT_SHAPE) so that custom shapes are honoured.
    hidden_units = int(input_shape[0] / pool_size) * 32

    model = Sequential([
        # First convolution
        Conv1D(16, 3, strides=1, activation='relu', padding='same', # TODO set as 64?
               input_shape = input_shape,
               kernel_initializer = 'he_normal',
               bias_initializer = 'zeros'),
        MaxPooling1D(pool_size = pool_size, strides = 2), # TODO set strides?
        Dropout(0.2),

        # Second convolution
        Conv1D(32, 3, strides = 1, activation = 'relu', padding = 'same',
               kernel_initializer = 'he_normal',
               bias_initializer = 'zeros'),
        MaxPooling1D(pool_size = pool_size, strides = 2),
        Dropout(0.2),
        Flatten(),

        # Neuron hidden layer
        Dense(hidden_units, activation = 'relu', kernel_initializer='he_normal', bias_initializer = 'zeros'),
        Dropout(0.2),

        # Output layer: softmax over NUM_CLASSES one-hot classes
        # (a single sigmoid unit, Dense(1, activation='sigmoid'), was the binary alternative).
        Dense(NUM_CLASSES, activation='softmax', bias_initializer=output_bias_initializer, kernel_initializer='glorot_uniform')
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate = 1e-3),  # optimizer=RMSprop(lr=0.001),
        loss=keras.losses.CategoricalCrossentropy(),
        # loss=keras.losses.BinaryCrossentropy(), # TODO remove?
        metrics=metrics)

    return model

In [5]:
# Calculate class weight

# 'balanced' weights each class inversely to its frequency in the training
# labels, so the under-represented class contributes more to the loss.
y_integers = np.argmax(y_train, axis=1)  # one-hot rows -> integer class labels
classes = np.unique(y_integers)
class_weights = np.round(class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y_integers), 2)

# Key each weight by its actual class label rather than by its position, so
# the mapping stays correct even if a class is missing from the training set.
d_class_weights = {int(label): float(weight) for label, weight in zip(classes, class_weights)}

print(d_class_weights)

{0: 0.54, 1: 7.23}


In [6]:
# Train with class weights

model = make_model()

# Training options: validate on the dev split after every epoch and weight
# the loss per class with the weights computed above.
fit_options = dict(
    validation_data = (x_dev, y_dev),
    batch_size = BATCH_SIZE,
    epochs = EPOCHS,
    class_weight = d_class_weights,
)

history = model.fit(x_train, y_train, **fit_options)

Epoch 1/20
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


2024-05-01 23:52:36.030706: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8401


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [7]:
# Persist the trained model under RESULTS_FILE (replacing any previous save)
model.save(RESULTS_FILE, overwrite=True)

# Pickle the per-epoch metric history next to the saved model
history_path = RESULTS_FILE / "history"
with history_path.open('wb') as history_file:
    pickle.dump(history.history, history_file)



INFO:tensorflow:Assets written to: results/gnra_8_80_T/assets


INFO:tensorflow:Assets written to: results/gnra_8_80_T/assets
