In [1]:
# standard libraries
import math
import os
import tempfile
import json
from pathlib import Path
import pickle

# standard scientific libraries
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from numpy import asarray, save, load
import pandas as pd
import seaborn as sns

# scikit-learn
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

# tensorflow
import tensorflow as tf
import tensorflow_addons as tfa
import keras
from keras.models import Sequential
from keras.layers import Dense,Conv1D, Conv2D, MaxPooling2D, Dropout, Flatten, Input, MaxPooling1D
from keras.optimizers import RMSprop
from keras.utils import to_categorical

2024-08-11 18:23:55.645983: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.11.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or t

# Model

In [2]:
def make_model(metrics, num_classes, input_shape, output_bias=None, pool_size = 2):
    """Build and compile a small 1D convolutional softmax classifier.

    Architecture: two ``Conv1D`` stages (16 then 32 filters, kernel size 3,
    'same' padding), each followed by ``MaxPooling1D`` and ``Dropout(0.2)``;
    then ``Flatten``, one ReLU ``Dense`` hidden layer with dropout, and a
    softmax ``Dense`` output layer with ``num_classes`` units. The model is
    compiled with Adam (learning rate 1e-3) and categorical cross-entropy.

    Args:
        metrics: List of Keras metrics forwarded to ``model.compile``.
        num_classes: Number of units in the softmax output layer.
        input_shape: Shape of a single sample (without the batch dimension).
            ``input_shape[0]`` is also used to size the hidden layer.
        output_bias: Optional initial value for the bias of the *output*
            layer only. When ``None`` the output bias starts at zero.
        pool_size: Pool size used by both ``MaxPooling1D`` layers.

    Returns:
        The compiled ``Sequential`` model.
    """
    # The requested bias applies to the output layer only. Fall back to an
    # explicit 'zeros' initializer so that a missing `output_bias` never hands
    # `None` to Keras as the initializer.
    if output_bias is not None:
        output_bias_initializer = tf.keras.initializers.Constant(output_bias)
    else:
        output_bias_initializer = 'zeros'

    # Width of the hidden layer, derived from the input length and pool size.
    hidden_units = int(input_shape[0]/pool_size) * 32

    model = Sequential([
        # First convolution
        Conv1D(16, 3, strides=1, activation='relu', padding='same', # TODO set as 64?
               input_shape = input_shape,
               kernel_initializer = 'he_normal',
               bias_initializer = 'zeros'),
        MaxPooling1D(pool_size = pool_size, strides = 2), # TODO set strides?
        Dropout(0.2),

        # Second convolution
        Conv1D(32, 3, strides = 1, activation = 'relu', padding = 'same',
               kernel_initializer = 'he_normal',
               bias_initializer = 'zeros'),
        MaxPooling1D(pool_size = pool_size, strides = 2),
        Dropout(0.2),
        Flatten(),

        # Hidden layer: its bias always starts at zero, like the conv layers;
        # `output_bias` must not leak into this layer.
        Dense(hidden_units, activation = 'relu',
              kernel_initializer = 'he_normal',
              bias_initializer = 'zeros'),
        Dropout(0.2),

        # Output layer: softmax over `num_classes` for categorical
        # (one-hot) classification.
        Dense(num_classes, activation = 'softmax',
              kernel_initializer = 'glorot_uniform',
              bias_initializer = output_bias_initializer)
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate = 1e-3),
        loss=keras.losses.CategoricalCrossentropy(),
        metrics=metrics)

    return model

In [3]:
def _load_split(data_dir, split, num_classes):
    """Load the features and one-hot labels of one data split.

    Reads ``<split>_matrices.npz`` (array stored under key ``'arr_0'``) and
    ``<split>_labels.npy`` from ``data_dir``.

    Returns:
        Tuple ``(x, y)``: the stacked feature array and the labels converted
        with ``to_categorical`` to ``num_classes`` columns.
    """
    # NOTE(security): allow_pickle=True unpickles object arrays, which can
    # execute arbitrary code. Only load files generated by a trusted pipeline.
    # The context manager closes the archive's file handle once it is read.
    with np.load(data_dir / f"{split}_matrices.npz", allow_pickle=True) as archive:
        x = np.stack(archive['arr_0'], axis=0)
    y = np.load(data_dir / f"{split}_labels.npy", allow_pickle=True)
    return x, to_categorical(y, num_classes=num_classes)


def train(DATA_DIR, NUM_CLASSES, EPOCHS = 20, BATCH_SIZE = 16, results_dir = None):
    """Train the CNN on one data set and save the model and its history.

    Loads the ``train`` and ``dev`` splits from ``DATA_DIR``, computes
    balanced class weights from the training labels, fits the model built by
    ``make_model`` and writes the results to ``<results_dir>/<set name>``
    (the saved model, plus a pickled ``history.history`` dict in a file
    named ``history`` inside that directory).

    Args:
        DATA_DIR: Directory containing ``train_matrices.npz``,
            ``train_labels.npy``, ``dev_matrices.npz`` and ``dev_labels.npy``.
        NUM_CLASSES: Number of classes used for one-hot encoding, the output
            layer and the multi-class metrics.
        EPOCHS: Number of training epochs.
        BATCH_SIZE: Mini-batch size.
        results_dir: Directory under which results are stored. When ``None``
            the module-level ``RESULTS_DIR`` is used, which must then be
            defined before calling (otherwise a ``NameError`` is raised).
    """
    DATA_DIR = Path(DATA_DIR)
    # Path.name is separator-agnostic, unlike splitting the string on "/".
    SET_NAME = DATA_DIR.name
    if results_dir is None:
        results_dir = RESULTS_DIR
    RESULTS_FILE = Path(results_dir) / SET_NAME

    METRICS = [
        # TODO is this correct? something is wrong
        # NOTE(review): with one-hot labels the threshold-based metrics below
        # (tp/fp/tn/fn, precision, recall) appear to be accumulated over every
        # output unit, so each sample would contribute once per class. Confirm,
        # and consider `class_id=` if per-class values are wanted.
        keras.metrics.TruePositives(name = 'tp'),
        keras.metrics.FalsePositives(name = 'fp'),
        keras.metrics.TrueNegatives(name = 'tn'),
        keras.metrics.FalseNegatives(name = 'fn'),
        keras.metrics.CategoricalAccuracy(name='accuracy'),
        keras.metrics.Precision(name = 'precision'),
        keras.metrics.Recall(name = 'recall'),
        keras.metrics.AUC(name = 'auc', curve='roc'),
        keras.metrics.AUC(name = 'prc', curve = 'PR'), # precision-recall curve
        tfa.metrics.F1Score(name = 'f1', num_classes = NUM_CLASSES),
        tfa.metrics.MatthewsCorrelationCoefficient(name = 'mcc', num_classes = NUM_CLASSES)
    ]

    # Load dataset
    x_train, y_train = _load_split(DATA_DIR, "train", NUM_CLASSES)
    x_dev, y_dev = _load_split(DATA_DIR, "dev", NUM_CLASSES)

    print("Training features shape:", x_train.shape)
    print("Training labels shape:", y_train.shape)

    print("\nDev (validation) features shape:", x_dev.shape)
    print("Dev (validation) labels shape:", y_dev.shape)

    print("\nInput shape:", x_train.shape[1:])
    print()

    INPUT_SHAPE = x_train.shape[1:]

    # Calculate class weights ('balanced': inversely proportional to class
    # frequency in the training labels), rounded to two decimals.
    y_integers = np.argmax(y_train, axis=1)
    present_classes = np.unique(y_integers)
    balanced_weights = np.round(
        class_weight.compute_class_weight(
            class_weight='balanced', classes=present_classes, y=y_integers),
        2)

    # Key every weight by its actual class label rather than by its position
    # in `present_classes`; the two differ when a class has no training
    # samples. Absent classes get a neutral weight of 1.0 so the dict covers
    # the contiguous labels 0..NUM_CLASSES-1.
    d_class_weights = {label: 1.0 for label in range(NUM_CLASSES)}
    for label, weight in zip(present_classes, balanced_weights):
        d_class_weights[int(label)] = float(weight)

    # Train with class weights
    model = make_model(METRICS, NUM_CLASSES, INPUT_SHAPE)

    history = model.fit(
        x_train,
        y_train,
        validation_data = (x_dev, y_dev),
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        class_weight=d_class_weights)

    # Make sure the target directory exists before anything is written to it.
    RESULTS_FILE.mkdir(parents=True, exist_ok=True)

    # Save model
    model.save(RESULTS_FILE, overwrite=True)

    # Save history (stored inside the model's results directory)
    with open(RESULTS_FILE/"history", 'wb') as file:
        pickle.dump(history.history, file)

In [4]:
# Driver cell: train one binary model per data set found under PATH.
# `os` and `Path` come from the import cell at the top of the notebook.
cwd = os.getcwd()
print(cwd)
PATH = Path("../data_generation/training_data/homology_reduced/")

RESULTS_DIR = Path("results/ANN/homology_reduced/")

# PATH is relative to the kernel's working directory. Fail early with the
# resolved location so a wrong working directory is easy to diagnose.
if not PATH.is_dir():
    raise FileNotFoundError(
        f"Training data directory not found: {PATH.resolve()} "
        f"(relative path '{PATH}' resolved from working directory '{cwd}')")

# Sorted so the training order is the same on every run.
for data_dir in sorted(PATH.iterdir()):
    set_name = data_dir.name

    # Skip sets whose name ends in "U", the "gnravall" sets, and plain files.
    if set_name.endswith("U") or "gnravall" in set_name or not data_dir.is_dir():
        continue

    if "clusters" in set_name:
        # Cluster sets are currently not trained.
        # To enable them: train(data_dir, 24)
        continue

    train(data_dir, 2)

/mimer/NOBACKUP/groups/naiss2024-5-16/deep-learning-RNA-structure-prediction-sam/tloop_prediction


FileNotFoundError: [Errno 2] No such file or directory: '../data_generation/training_data/homology_reduced'