In [None]:
# Upgrade TensorFlow in the running environment to the newest release pip can resolve.
# NOTE(review): no version is pinned, so the installed release (and therefore the
# behaviour of the cells below) depends on when the notebook is executed.
!pip install --upgrade tensorflow

In [None]:
import tensorflow as tf

In [None]:
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from glob import glob
import seaborn as sns
from PIL import Image
np.random.seed(42)
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
import itertools


import keras
from keras.utils.np_utils import to_categorical # used for converting labels to one-hot-encoding
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from keras import backend as K
from keras.layers.normalization import BatchNormalization
from keras.utils.np_utils import to_categorical # convert to one-hot-encoding
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau, Callback
from keras.wrappers.scikit_learn import KerasClassifier
from keras.applications.resnet50 import ResNet50
from keras import backend as K 


import csv
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, balanced_accuracy_score

In [None]:
# Show the files available in the training-data directory.
# NOTE(review): this uses a path relative to the working directory, whereas the
# loading cell further down builds absolute paths from `root`.
os.listdir('../input/data/data/NR-ER-train')

In [None]:
# Read the label tables of both splits into DataFrames. Passing `names=` makes
# pandas treat every line as data and label the two columns "names" and "label".
smiles_train, smiles_test = (
    pd.read_csv(csv_path, names=["names", "label"])
    for csv_path in (
        '../input/data/data/NR-ER-train/names_labels.csv',
        '../input/data/data/NR-ER-test/names_labels.csv',
    )
)

In [None]:
# Absolute locations of the one-hot feature files (.npy) and label files (.csv)
# for the train and test splits, all rooted at `root`.
root = '/kaggle/input/data'
path_train_names = f'{root}/data/NR-ER-train/names_onehots.npy'
path_train_labels = f'{root}/data/NR-ER-train/names_labels.csv'
path_test_names = f'{root}/data/NR-ER-test/names_onehots.npy'
path_test_labels = f'{root}/data/NR-ER-test/names_labels.csv'

# Read labels from a csv file into a one-hot encoded array
def construct_labels(path_to_file):
    """Read binary class labels from a CSV file and one-hot encode them.

    The label is taken from the second column of every row: ``0`` becomes
    ``[1, 0]`` and ``1`` becomes ``[0, 1]``. Completely blank lines are skipped.

    Args:
        path_to_file: Path of a comma-separated file whose second column holds
            the integer class label.

    Returns:
        ``np.ndarray`` with one ``[1, 0]`` / ``[0, 1]`` row per labelled CSV row
        (an empty array when the file contains no rows).

    Raises:
        ValueError: If a label is not an integer, or is an integer other than
            0 or 1. Such rows used to be dropped silently, which left the label
            array shorter than (and misaligned with) the feature array.
        IndexError: If a non-blank row has fewer than two columns.
    """
    one_hot = {0: [1, 0], 1: [0, 1]}
    labels = []
    # The csv module documentation asks for newline='' so that the reader, not
    # the file object, decides how line endings are interpreted.
    with open(path_to_file, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for line_no, row in enumerate(csv_reader, start=1):
            if not row:
                # csv.reader yields [] for a blank line (e.g. a trailing one).
                continue
            label = int(row[1])
            if label not in one_hot:
                raise ValueError(
                    f"{path_to_file}: row {line_no} has unsupported label "
                    f"{row[1]!r}; expected 0 or 1"
                )
            labels.append(one_hot[label])

    return np.asarray(labels)

# Load the one-hot encoded inputs into a float array
def construct_names(path_to_file):
    """Load the ``'onehots'`` entry of a pickled ``.npy`` dictionary as float64.

    Args:
        path_to_file: Path of a ``.npy`` file that stores a dictionary with an
            ``'onehots'`` entry.

    Returns:
        ``np.ndarray`` of dtype ``float64`` built from the ``'onehots'`` entry.

    Raises:
        KeyError: If the stored dictionary has no ``'onehots'`` entry. Without
            this check the missing value (``None``) was converted into a NaN
            scalar and the problem only surfaced much later.
    """
    # SECURITY NOTE: allow_pickle=True makes np.load unpickle arbitrary Python
    # objects; only point this at files from a trusted source.
    payload = np.load(path_to_file, allow_pickle=True).tolist()
    onehots = payload.get('onehots')
    if onehots is None:
        raise KeyError(f"'onehots' entry not found in {path_to_file}")
    return np.asarray(onehots).astype(np.float64)

In [None]:
# Targets: one-hot label arrays for the train and test splits.
y_train, y_test = (
    construct_labels(label_path)
    for label_path in (path_train_labels, path_test_labels)
)

# Inputs: float64 one-hot feature arrays for the same two splits.
X_train, X_test = (
    construct_names(onehot_path)
    for onehot_path in (path_train_names, path_test_names)
)

In [None]:
# Hold out 20% of the training samples as a validation set; random_state is fixed
# so the same samples are selected on every fresh run.
# NOTE(review): X_train / y_train are rebound to the reduced training part, so
# re-running this cell without reloading the data splits the remainder again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=11)

In [None]:
# Swap axes 1 and 2 of every feature array (axis 0, the sample axis, stays put).
# NOTE(review): the arrays are rebound in place, so running this cell a second
# time swaps the axes back.
X_train, X_val, X_test = (
    features.transpose(0, 2, 1) for features in (X_train, X_val, X_test)
)

# One shape per line: features (train, val, test) followed by labels (train, val, test).
print(
    *(arr.shape for arr in (X_train, X_val, X_test, y_train, y_val, y_test)),
    sep="\n",
)

In [None]:
# Class distribution of the training split: decode the one-hot rows to class
# ids, draw a histogram, and display the per-class sample counts.
train_df = pd.DataFrame(y_train.argmax(axis=-1))
train_df.hist()
train_df[0].value_counts()

In [None]:
# Same check for the validation split: class ids recovered from the one-hot
# labels, their histogram, and the number of samples per class.
val_df = pd.DataFrame(y_val.argmax(axis=-1))
val_df.hist()
val_df[0].value_counts()

In [None]:
# Per-class weights for the training labels using scikit-learn's 'balanced'
# heuristic (n_samples / (n_classes * class_count)), later passed to model.fit.
#
# Two fixes compared with the earlier version of this cell:
#   * the function is imported by name, so assigning the result to `class_weight`
#     no longer overwrites the module the function came from and the cell can be
#     re-run without an AttributeError;
#   * the arguments are passed by keyword, which current scikit-learn releases
#     require (they are keyword-only there) and older releases also accept.
from sklearn.utils.class_weight import compute_class_weight

train_class_ids = np.argmax(y_train, axis=-1)
class_weight = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_class_ids),
    y=train_class_ids,
)

## SpliceAI [1]

[1] Jaganathan et al., "Predicting Splicing from Primary Sequence with Deep Learning", Cell (2019): https://www.sciencedirect.com/science/article/pii/S0092867418316295

In [None]:
class ResBlock(tf.keras.layers.Layer):
    """Residual block made of two BatchNorm -> ReLU -> Conv1D stages plus a skip connection.

    Both convolutions use ``padding="same"``, so the sequence length is kept and
    the block output can be added to the block input. Because of that addition
    the input's channel dimension has to be compatible with ``N`` (the models in
    this notebook always feed 32-channel tensors into 32-filter blocks).

    Args:
        N: Number of filters of both convolutions.
        W: Kernel size of both convolutions.
        D: Dilation rate of both convolutions.
        **kwargs: Standard layer arguments (``name``, ``dtype``, ...) forwarded
            to ``tf.keras.layers.Layer``.
    """

    def __init__(self, N, W, D, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can describe how to rebuild the block.
        self.filters = N
        self.kernel_size = W
        self.dilation_rate = D

        self.BN_1 = tf.keras.layers.BatchNormalization()
        self.BN_2 = tf.keras.layers.BatchNormalization()
        self.conv_1 = tf.keras.layers.Conv1D(N, W, dilation_rate=D, padding="same")
        self.conv_2 = tf.keras.layers.Conv1D(N, W, dilation_rate=D, padding="same")

    def call(self, inputs, training=None):
        """Apply both stages to ``inputs`` and add the untouched input back.

        Args:
            inputs: Input tensor of the block.
            training: Forwarded to the batch-normalisation layers so they can
                distinguish training from inference.

        Returns:
            Tensor ``conv_2(relu(BN_2(conv_1(relu(BN_1(inputs)))))) + inputs``.
        """
        # `training` is passed by keyword so it cannot be mistaken for another
        # positional argument of the wrapped layer.
        x = self.BN_1(inputs, training=training)
        x = tf.keras.activations.relu(x)
        x = self.conv_1(x)
        x = self.BN_2(x, training=training)
        x = tf.keras.activations.relu(x)
        x = self.conv_2(x)

        return x + inputs

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from its config."""
        config = super().get_config()
        config.update({
            "N": self.filters,
            "W": self.kernel_size,
            "D": self.dilation_rate,
        })
        return config


class SpliceAI80(tf.keras.Model):
    """Small variant: a kernel-size-1 stem, four ResBlock(32, 11, 1) and a cropped per-position output.

    The stem output feeds two paths: the residual blocks followed by ``conv_3``,
    and a direct ``conv_2`` shortcut. Their sum is cropped along the sequence
    axis (41 positions at the start, 40 at the end), reduced to one channel by
    ``conv_4`` and passed through a sigmoid.

    This class is defined here but not instantiated in the visible cells.
    """

    def __init__(self):
        super().__init__()
        # Kernel-size-1 convolutions: stem, shortcut, post-block mixing, 1-channel head.
        self.conv_1 = tf.keras.layers.Conv1D(32, 1, dilation_rate=1)
        self.conv_2 = tf.keras.layers.Conv1D(32, 1, dilation_rate=1)
        self.conv_3 = tf.keras.layers.Conv1D(32, 1, dilation_rate=1)
        self.conv_4 = tf.keras.layers.Conv1D(1, 1, dilation_rate=1)

        # Residual trunk: 32 filters, kernel size 11, dilation 1.
        self.block_1 = ResBlock(32, 11, 1)
        self.block_2 = ResBlock(32, 11, 1)
        self.block_3 = ResBlock(32, 11, 1)
        self.block_4 = ResBlock(32, 11, 1)

        self.crop = tf.keras.layers.Cropping1D(cropping=(41, 40))

    def call(self, inputs):
        """Return the sigmoid output for ``inputs`` after cropping the sequence axis."""
        stem = self.conv_1(inputs)

        # Trunk: the four residual blocks in order, then a kernel-size-1 convolution.
        trunk = stem
        for block in (self.block_1, self.block_2, self.block_3, self.block_4):
            trunk = block(trunk)
        trunk = self.conv_3(trunk)

        # Shortcut taken straight from the stem output.
        shortcut = self.conv_2(stem)

        # Merge both paths, drop the flanking positions, project to one channel.
        merged = self.crop(trunk + shortcut)
        return tf.keras.activations.sigmoid(self.conv_4(merged))


class SpliceAI400(tf.keras.Model):
    """Two-class sequence classifier: eight residual blocks with three summed skip paths.

    After the kernel-size-1 stem (``conv_1``) the tensor passes through four
    ResBlock(32, 11, 1) and then four ResBlock(32, 11, 4). Three 32-channel
    tensors are summed: ``conv_2`` of the stem output, ``conv_3`` of the output
    of block 4, and ``conv_4`` of the output of block 8. The sum is averaged
    over the sequence axis and mapped to two softmax probabilities.
    """

    def __init__(self):
        super().__init__()
        # Kernel-size-1 convolutions: stem (conv_1) and one per summed path.
        self.conv_1 = tf.keras.layers.Conv1D(32, 1, dilation_rate=1)
        self.conv_2 = tf.keras.layers.Conv1D(32, 1, dilation_rate=1)
        self.conv_3 = tf.keras.layers.Conv1D(32, 1, dilation_rate=1)
        self.conv_4 = tf.keras.layers.Conv1D(32, 1, dilation_rate=1)

        # First group of residual blocks: dilation 1.
        self.block_1 = ResBlock(32, 11, 1)
        self.block_2 = ResBlock(32, 11, 1)
        self.block_3 = ResBlock(32, 11, 1)
        self.block_4 = ResBlock(32, 11, 1)

        # Second group of residual blocks: dilation 4.
        self.block_5 = ResBlock(32, 11, 4)
        self.block_6 = ResBlock(32, 11, 4)
        self.block_7 = ResBlock(32, 11, 4)
        self.block_8 = ResBlock(32, 11, 4)

        # Classification head.
        self.pool = tf.keras.layers.GlobalAveragePooling1D()
        self.fc = tf.keras.layers.Dense(2)

    def call(self, inputs):
        """Return the softmax class probabilities (two values per sample) for ``inputs``."""
        stem = self.conv_1(inputs)
        skip_stem = self.conv_2(stem)

        # Dilation-1 group; its output also feeds the second skip path.
        feats = stem
        for block in (self.block_1, self.block_2, self.block_3, self.block_4):
            feats = block(feats)
        skip_mid = self.conv_3(feats)

        # Dilation-4 group, followed by the last kernel-size-1 convolution.
        for block in (self.block_5, self.block_6, self.block_7, self.block_8):
            feats = block(feats)
        feats = self.conv_4(feats)

        # Sum the three paths, pool over the sequence axis, classify.
        merged = feats + skip_stem + skip_mid
        logits = self.fc(self.pool(merged))
        return tf.keras.activations.softmax(logits)

In [None]:
# Instantiate the SpliceAI400 classifier; this is the model that the following
# cells compile, train and evaluate.
model = SpliceAI400()

In [None]:

# Categorical cross-entropy matches the one-hot labels and the softmax output.
model.compile(loss="categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), metrics=['accuracy'])

# Push one random sample with the training feature shape through the model so
# its layers are built before summary() is called.
input_shape = (1, *X_train.shape[1:])
x = tf.random.normal(input_shape)
model(x)

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

# for this to succeed run `brew install graphviz && pip install pydot_ng`
tf.keras.utils.plot_model(
    model,
    to_file='model.png',
    show_shapes=False,
    show_layer_names=True,
    rankdir='TB',
)

In [None]:
# Training hyper-parameters handed to model.fit: number of epochs and mini-batch size.
epochs, batch_size = 30, 256

In [None]:
class Balanced_Accuracy(tf.keras.callbacks.Callback):
    """Keras callback that reports balanced accuracy on a held-out set after every epoch.

    Args:
        val_data: Tuple ``(x_val, y_val)``; ``y_val`` holds one-hot labels.
        batch_size: Batch size used when predicting on ``x_val``.

    After training, ``self._data`` holds one
    ``{'val_balanced_accuracy': score}`` dict per epoch.
    """

    def __init__(self, val_data, batch_size = 128):
        super().__init__()
        self.validation_data = val_data
        self.batch_size = batch_size

    def on_train_begin(self, logs=None):
        """Reset the per-epoch score history at the start of every fit() call."""
        self._data = []

    def on_epoch_end(self, epoch, logs=None):
        """Score the current model on the validation data, then print and record the result."""
        xVal, yVal = self.validation_data

        # The configured batch size was previously stored but never used.
        val_prob = self.model.predict(xVal, batch_size=self.batch_size, verbose=0)
        # argmax along axis 1 already yields a 1-D vector of class ids; the
        # former np.squeeze() would have collapsed a single-sample result to 0-d.
        val_pred = np.argmax(val_prob, axis=1)
        val_true = np.argmax(yVal, axis=1)

        _val_ba = balanced_accuracy_score(val_true, val_pred)

        print('val balanced accuracy: ', _val_ba)
        self._data.append({'val_balanced_accuracy': _val_ba})
        if logs is not None:
            # Expose the score to any callback that receives this same logs
            # dict afterwards (e.g. for monitoring).
            logs['val_balanced_accuracy'] = _val_ba

# Callback instance handed to model.fit; it evaluates the (X_val, y_val) split
# at the end of each epoch.
balanced_accuracy = Balanced_Accuracy((X_val, y_val), batch_size = batch_size)

In [None]:
# Keras expects class weights as a {class_index: weight} mapping.
class_weights = {i: class_weight[i] for i in range(2)}
history = model.fit(x= X_train, y=y_train, validation_data=(X_val, y_val), class_weight= class_weights,
                    epochs= epochs, batch_size= batch_size, verbose=1, 
                    callbacks=[balanced_accuracy]
                   )
 
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# The second curve is val_loss, computed on (X_val, y_val); the test split is
# not touched during training, so labelling it 'test' was misleading.
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
from sklearn.metrics import recall_score

# Predicted class probabilities for the validation split.
pred_val = model.predict(X_val)

# Decode one-hot targets and probabilities to class ids once, then reuse them.
val_true_ids = np.argmax(y_val, axis=-1)
val_pred_ids = np.argmax(pred_val, axis=-1)

# Balanced accuracy, recall of class 1 (the default pos_label), recall of class 0.
print(balanced_accuracy_score(val_true_ids, val_pred_ids))
print(recall_score(val_true_ids, val_pred_ids))
print(recall_score(val_true_ids, val_pred_ids, pos_label=0))

In [None]:
# Predicted class probabilities for the test split (one softmax row per sample).
y_pred = model.predict(X_test)

In [None]:
# Decode one-hot test targets and predicted probabilities to class ids.
test_true_ids = np.argmax(y_test, axis=-1)
test_pred_ids = np.argmax(y_pred, axis=-1)

# Confusion matrix (rows: true class, columns: predicted class) and balanced accuracy.
print(confusion_matrix(test_true_ids, test_pred_ids))
print(balanced_accuracy_score(test_true_ids, test_pred_ids))