In [None]:
import gc
import os
import random
from contextlib import contextmanager
from glob import glob
from pathlib import Path
from time import time

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import layers as L
from tqdm.notebook import tqdm

In [None]:
# List the contents of the input directory (`ll` is an IPython shell alias, so this cell is notebook-only).
ll ../input

In [None]:
DEBUG = True  # True: the main loop stops after one sequence, one fold and one experiment. Set False for the full run (make sure the GPU is ON).

In [None]:
# Directory layout, resolved relative to the notebook's working directory.
ROOT = Path.cwd().parent
INPUT_ROOT = ROOT / "input"

# Dataset holding the original sequences plus one re-folded copy per tool.
AUG_DATASET_ROOT = INPUT_ROOT / "stanford-covid-vaccine-new-sequences-augmentation"
ORG_ROOT = AUG_DATASET_ROOT / "original"

# Cached features and the per-experiment CSVs are written here.
OUTPUT_ROOT = ROOT / "working" / "output"
# parents=True so the cell also works when "working" does not exist yet.
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

# Order matters: get_feature() stacks the bpps matrices in this order after
# the original one, matching the order in which `test` is concatenated.
AUG_ROOTS = [
    AUG_DATASET_ROOT / folder for folder in ("contrafold", "vienna_2", "rnasoft")
]

# One sub-directory per experiment, each holding the fold checkpoints.
MODEL_ROOT = INPUT_ROOT / "stanford-covid-vaccine-onodera-models"


In [None]:
# Target columns; the models emit one output per entry of COLS_EVALUATE.
COLS_TARGET = ["reactivity", "deg_Mg_pH10", "deg_pH10", "deg_Mg_50C", "deg_50C"]
COLS_EVALUATE = ["reactivity", "deg_Mg_pH10", "deg_pH10", "deg_Mg_50C", "deg_50C"]


# One entry per experiment: [expXXX, [loop, structure, cat_channel]]
#   loop / structure -> passed to get_input() to select the node features
#   cat_channel      -> get_feature() merges the augmented views into channels
PARAMS_FEATURE = [
    ["exp201", [1, 0, 0]],
    ["exp202", [1, 0, 0]],
    ["exp203", [1, 1, 0]],
    ["exp204", [1, 1, 1]],
    ["exp205", [0, 1, 0]],
    ["exp206", [0, 1, 1]],
    ["exp207", [1, 0, 1]],
    ["exp208", [1, 0, 1]],
    ["exp209", [1, 1, 0]],
    ["exp210", [1, 1, 0]],
    ["exp211", [1, 1, 0]],
    ["exp212", [1, 1, 0]],
]

test = pd.read_json(ORG_ROOT / "dataset.json", lines=True)

# For every augmentation, keep the original rows but swap in that tool's
# sequence / structure / loop-type columns, then stack all copies under the
# original frame (original first, augmentations in AUG_ROOTS order).
cols = ["sequence", "structure", "predicted_loop_type"]
test_without_cols = test.drop(cols, axis=1)
li_te = [
    pd.merge(
        test_without_cols,
        pd.read_json(aug_root / "dataset.json", lines=True)[["id"] + cols],
        how="left",
        on="id",
    )
    for aug_root in AUG_ROOTS
]

test = pd.concat([test] + li_te, ignore_index=True)

# A left merge leaves NaN where an augmentation lacks an id; treat that as fatal.
if test[cols].isnull().sum().sum() != 0:
    raise Exception("Merge was not success")
print("test.shape:", test.shape)



In [None]:
# =============================================================================
# def
# =============================================================================
def get_structure_adj(test, ixs):
    """Build base-pair adjacency matrices from dot-bracket structures.

    Args:
        test: DataFrame with ``seq_length``, ``structure`` (dot-bracket string)
            and ``sequence`` columns.
        ixs: positional row indices (used with ``.iloc``) to process.

    Returns:
        float64 array of shape ``(len(ixs), seq_length, seq_length, 1)`` where
        entry ``[n, a, b, 0]`` is 1 when positions ``a`` and ``b`` of row
        ``ixs[n]`` are paired by matching brackets, else 0.

    Raises:
        KeyError: a bracket pair joins two bases that are not one of
            A-U, C-G or G-U (in either order).
        IndexError: a ``)`` has no matching ``(``.
    """
    allowed_pairs = {
        ("A", "U"), ("U", "A"),
        ("C", "G"), ("G", "C"),
        ("U", "G"), ("G", "U"),
    }

    Ss = []
    for row in ixs:
        seq_length = test["seq_length"].iloc[row]
        structure = test["structure"].iloc[row]
        sequence = test["sequence"].iloc[row]

        # A single matrix is enough: the per-pair-type matrices were only
        # ever summed together.
        pair_adj = np.zeros([seq_length, seq_length])
        open_positions = []  # stack of unmatched "(" positions
        for pos in range(seq_length):
            symbol = structure[pos]
            if symbol == "(":
                open_positions.append(pos)
            elif symbol == ")":
                start = open_positions.pop()
                pair = (sequence[start], sequence[pos])
                if pair not in allowed_pairs:
                    # Same failure mode as the former per-pair dictionary lookup.
                    raise KeyError(pair)
                pair_adj[start, pos] = 1
                pair_adj[pos, start] = 1

        Ss.append(pair_adj[:, :, None])

    Ss = np.array(Ss)
    return Ss


def get_distance_matrix(As):
    """Return inverse positional-distance features for a batch of matrices.

    Only the shape of ``As`` is used: ``len(As)`` gives the batch size and
    ``As.shape[1]`` the sequence length ``L``.

    Args:
        As: array whose first axis is the batch and second axis the length.

    Returns:
        float64 array of shape ``(len(As), L, L, 3)``. With
        ``d = 1 / (|a - b| + 1)``, the three channels are ``d``, ``d ** 2``
        and ``d ** 4``. The same ``(L, L, 3)`` block is repeated for every
        batch element.
    """
    positions = np.arange(As.shape[1])
    inv_dist = 1 / (np.abs(positions[:, None] - positions[None, :]) + 1)

    # Raise to the powers once on the (L, L) matrix, then repeat along the batch.
    channels = np.stack([inv_dist ** power for power in (1, 2, 4)], axis=2)
    return np.repeat(channels[None, :, :, :], len(As), axis=0)


## sequence
def return_ohe(n, i):
    """Return a length-``n`` list of zeros with a single 1 at index ``i``."""
    one_hot = [0 for _ in range(n)]
    one_hot[i] = 1
    return one_hot


def _one_hot_strings(strings, vocab):
    """One-hot encode equal-length strings into an (n, length, len(vocab)) int array."""
    index_of = {symbol: k for k, symbol in enumerate(vocab)}
    eye = np.eye(len(vocab), dtype=np.int64)
    # Unknown symbols raise KeyError from the dictionary lookup.
    return np.stack([eye[[index_of[ch] for ch in s]] for s in strings])


def get_input(test, ixs, codes):
    """Build the per-position node feature tensor for the selected rows.

    The features are, in channel order: one-hot sequence (A, G, C, U),
    optionally one-hot predicted loop type (S, M, I, B, H, E, X), optionally
    one-hot structure symbol (., (, )), followed by one indicator channel per
    listed combination of those one-hot features.

    Args:
        test: DataFrame with a ``sequence`` column, plus
            ``predicted_loop_type`` and/or ``structure`` when selected.
        ixs: positional row indices (used with ``.iloc``).
        codes: sequence whose first two items are the ``loop`` and
            ``structure`` switches (1 = include). Lists, tuples and arrays
            are accepted. Further items are ignored here.

    Returns:
        float32 array of shape ``(len(ixs), seq_length, n_channels)``.

    Raises:
        ValueError: the (loop, structure) combination has no interaction
            table, i.e. it is not (1, 0), (1, 1) or (0, 1).
        KeyError: a string contains a symbol outside its vocabulary.
    """
    # Each value is the bit pattern sum(channel_value * 2 ** channel_index) of
    # one feature combination. The order defines the channel order, so these
    # lists are kept exactly as they were.
    interaction_codes = {
        (1, 0): [17, 18, 20, 24, 33, 34, 36, 40, 65, 66, 68, 72, 129, 130, 132, 136,
                 257, 258, 260, 264, 513, 514, 516, 520, 1025, 1026, 1028, 1032],
        (1, 1): [2081, 2082, 2084, 2088, 2113, 2114, 2116, 2120, 2177, 2178, 2180,
                 2184, 2305, 2306, 2308, 2312, 2561, 2562, 2564, 2568, 3073, 3074,
                 3076, 3080, 4113, 4114, 4116, 4120, 8209, 8210, 8212, 8216],
        (0, 1): [17, 18, 20, 24, 33, 34, 36, 40, 65, 66, 68, 72],
    }

    # Normalise so that tuples / numpy arrays select the same table as lists.
    key = (int(codes[0]), int(codes[1]))
    if key not in interaction_codes:
        raise ValueError(
            f"unsupported (loop, structure) codes {key}; "
            f"expected one of {sorted(interaction_codes)}"
        )
    use_loop, use_structure = key[0] == 1, key[1] == 1

    rows = test.iloc[ixs]
    parts = [_one_hot_strings(rows["sequence"], ["A", "G", "C", "U"])]
    if use_loop:
        parts.append(
            _one_hot_strings(
                rows["predicted_loop_type"], ["S", "M", "I", "B", "H", "E", "X"]
            )
        )
    if use_structure:
        parts.append(_one_hot_strings(rows["structure"], [".", "(", ")"]))
    X_node = np.concatenate(parts, axis=2)

    ## interaction
    # Collapse each position's one-hot channels into a single integer code.
    bit_weights = 2 ** np.arange(X_node.shape[2])
    position_code = np.sum(X_node * bit_weights[None, None, :], axis=2)

    interactions = np.stack(
        [position_code == value for value in interaction_codes[key]], axis=2
    )
    X_node = np.concatenate([X_node, interactions], axis=2).astype(np.float32)
    return X_node


def attention(x_inner, x_outer, n_factor, dropout):
    """Single-head scaled dot-product attention assembled from Keras layers.

    Queries are projected from ``x_inner``; keys and values are projected from
    ``x_outer``. Every projection is a kernel-size-1 ``Conv1D`` with
    ``n_factor`` filters and a linear activation.

    Args:
        x_inner: tensor the queries are computed from.
        x_outer: tensor the keys and values are computed from.
        n_factor: number of filters of each projection; also sets the
            ``1 / sqrt(n_factor)`` scaling applied to the scores.
        dropout: accepted for signature compatibility; not used in this function.

    Returns:
        Tensor ``softmax(Q K^T / sqrt(n_factor)) V``.
    """
    # NOTE(review): the Q, K, V layers are created in this order; presumably
    # the saved checkpoints depend on that order — confirm before reordering.
    x_Q = L.Conv1D(
        n_factor,
        1,
        activation="linear",
        kernel_initializer="glorot_uniform",
        bias_initializer="glorot_uniform",
    )(x_inner)
    x_K = L.Conv1D(
        n_factor,
        1,
        activation="linear",
        kernel_initializer="glorot_uniform",
        bias_initializer="glorot_uniform",
    )(x_outer)
    x_V = L.Conv1D(
        n_factor,
        1,
        activation="linear",
        kernel_initializer="glorot_uniform",
        bias_initializer="glorot_uniform",
    )(x_outer)
    # Swap the two non-batch axes of K so that Q @ K^T can be taken per sample.
    x_KT = L.Permute((2, 1))(x_K)
    # Scaled similarity scores between every query and every key.
    res = L.Lambda(lambda c: K.batch_dot(c[0], c[1]) / np.sqrt(n_factor))([x_Q, x_KT])
    # Normalise the scores over the last axis (the keys).
    att = L.Lambda(lambda c: K.softmax(c, axis=-1))(res)
    # Attention-weighted sum of the values.
    att = L.Lambda(lambda c: K.batch_dot(c[0], c[1]))([att, x_V])
    return att


def multi_head_attention(x, y, n_factor, n_head, dropout):
    """Attention block with residual connection and layer normalisation.

    Args:
        x: tensor used for the queries and for the residual connection.
        y: tensor used for the keys and values.
        n_factor: total attention width. With several heads, each head gets
            ``n_factor // n_head`` filters and the concatenated heads are
            projected back to ``n_factor`` by a ``Dense`` layer.
        n_head: number of attention heads.
        dropout: dropout rate applied after the normalisation when > 0. It is
            also forwarded to ``attention``, which does not use it.

    Returns:
        ``LayerNormalization(x + attention)``, optionally followed by dropout.
    """
    if n_head == 1:
        att = attention(x, y, n_factor, dropout)
    else:
        # Integer division: any remainder of n_factor / n_head is dropped.
        n_factor_head = n_factor // n_head
        heads = [attention(x, y, n_factor_head, dropout) for i in range(n_head)]
        att = L.Concatenate()(heads)
        # Project the concatenated heads back to n_factor channels.
        att = L.Dense(
            n_factor,
            kernel_initializer="glorot_uniform",
            bias_initializer="glorot_uniform",
        )(att)
    # Residual connection; x and att must have matching shapes for Add.
    x = L.Add()([x, att])
    x = L.LayerNormalization()(x)
    if dropout > 0:
        x = L.Dropout(dropout)(x)
    return x


def res(x, unit, kernel=3, rate=0.1):
    """Residual block: ``x + Dropout(LeakyReLU(LayerNorm(Conv1D(x))))``.

    Args:
        x: input tensor; it is added to the branch output, so it must have the
            same shape as the convolution output.
        unit: number of convolution filters.
        kernel: convolution kernel size (stride 1, ``"same"`` padding).
        rate: dropout rate of the branch.

    Returns:
        The input plus the convolutional branch.
    """
    h = L.Conv1D(unit, kernel, 1, padding="same", activation=None)(x)
    h = L.LayerNormalization()(h)
    h = L.LeakyReLU()(h)
    h = L.Dropout(rate)(h)
    return L.Add()([x, h])


def forward(x, unit, kernel=3, rate=0.1):
    """Convolutional block followed by one residual block.

    Applies Conv1D -> LayerNormalization -> Dropout -> LeakyReLU and then
    passes the result through ``res`` with the same ``unit``, ``kernel`` and
    ``rate``.

    Args:
        x: input tensor.
        unit: number of convolution filters (output channels).
        kernel: convolution kernel size (stride 1, ``"same"`` padding).
        rate: dropout rate used here and inside ``res``.

    Returns:
        Tensor with ``unit`` channels.
    """
    #     h = L.Dense(unit, None)(x)
    h = L.Conv1D(unit, kernel, 1, padding="same", activation=None)(x)
    h = L.LayerNormalization()(h)
    # Dropout is applied before the activation in this block (res() does the opposite).
    h = L.Dropout(rate)(h)
    #         h = tf.keras.activations.swish(h)
    h = L.LeakyReLU()(h)
    h = res(h, unit, kernel, rate)
    return h


def adj_attn(x, adj, unit, n=2, rate=0.1):
    """Propagate node features through an adjacency matrix ``n`` times.

    Each round runs ``forward`` on the current features and multiplies the
    result by ``adj``. The outputs of all rounds are concatenated (when
    ``n > 1``) and passed through a final ``forward``.

    Args:
        x: node feature tensor.
        adj: adjacency tensor used as the left operand of ``tf.matmul``.
        unit: number of filters of every ``forward`` block.
        n: number of propagation rounds.
        rate: accepted but not forwarded; ``forward`` runs with its default rate.

    Returns:
        Tensor with ``unit`` channels.
    """
    x_a = x
    x_as = []
    for i in range(n):
        x_a = forward(x_a, unit)
        # Mix each position's features with those of the positions adj links it to.
        x_a = tf.matmul(adj, x_a)
        x_as.append(x_a)
    if n == 1:
        x_a = x_as[0]
    else:
        x_a = L.Concatenate()(x_as)
    x_a = forward(x_a, unit)
    return x_a


def get_base(config):
    """Build the shared feature extractor taking ``node`` and ``adj`` inputs.

    Reads the module-level globals ``X_CHANNEL`` and ``A_CHANNEL`` for the
    channel counts of the two inputs, so both must be set before the call.

    Args:
        config: not used by this function.

    Returns:
        ``tf.keras.Model`` mapping ``[node, adj]`` to the concatenation of the
        raw node features and the outputs of the two graph/attention stages.
    """
    # NOTE(review): layers are created in statement order; presumably the saved
    # checkpoints depend on that order — confirm before restructuring.
    node = tf.keras.Input(shape=(None, X_CHANNEL), name="node")
    adj = tf.keras.Input(shape=(None, None, A_CHANNEL), name="adj")
    # node = tf.keras.Input(shape=(None, None), name="node")
    # adj = tf.keras.Input(shape=(None, None, None), name="adj")

    # One extra adjacency channel learned as a ReLU-activated mix of the given ones.
    adj_learned = L.Dense(1, "relu")(adj)
    adj_all = L.Concatenate(axis=3)([adj, adj_learned])

    xs = []
    xs.append(node)
    # Stack of convolutions with growing kernel size and shrinking width;
    # all four outputs are concatenated.
    x1 = forward(node, 128, kernel=3, rate=0.0)
    x2 = forward(x1, 64, kernel=6, rate=0.0)
    x3 = forward(x2, 32, kernel=15, rate=0.0)
    x4 = forward(x3, 16, kernel=30, rate=0.0)
    x = L.Concatenate()([x1, x2, x3, x4])

    for unit in [64, 32]:
        x_as = []
        # One propagation branch per adjacency channel (given + learned).
        for i in range(adj_all.shape[3]):
            x_a = adj_attn(x, adj_all[:, :, :, i], unit, rate=0.0)
            x_as.append(x_a)
        # Plain convolutional branch with a wide kernel, no adjacency involved.
        x_c = forward(x, unit, kernel=30)

        x = L.Concatenate()(x_as + [x_c])
        x = forward(x, unit)
        # Self-attention with 4 heads and no dropout.
        x = multi_head_attention(x, x, unit, 4, 0.0)
        xs.append(x)

    x = L.Concatenate()(xs)

    model = tf.keras.Model(inputs=[node, adj], outputs=[x])
    return model


def get_ae_model(base, config):
    """Build a denoising auto-encoder around ``base``.

    The node features are corrupted with ``SpatialDropout1D(0.3)``, passed
    through ``base`` and a ``forward`` block, and reconstructed by a sigmoid
    ``Dense`` layer with ``X_CHANNEL`` outputs. The model's output is the
    reconstruction loss itself.

    Reads the module-level global ``X_CHANNEL`` and ``A_CHANNEL``.

    Args:
        base: model returned by ``get_base``.
        config: not used by this function.

    Returns:
        Compiled ``tf.keras.Model`` mapping ``[node, adj]`` to a scalar loss.
    """
    node = tf.keras.Input(shape=(None, X_CHANNEL), name="node")
    adj = tf.keras.Input(shape=(None, None, A_CHANNEL), name="adj")
    # node = tf.keras.Input(shape=(None, None), name="node")
    # adj = tf.keras.Input(shape=(None, None, None), name="adj")

    x = base([L.SpatialDropout1D(0.3)(node), adj])
    x = forward(x, 64, rate=0.3)
    p = L.Dense(X_CHANNEL, "sigmoid")(x)

    # Binary cross-entropy against the uncorrupted input, with the positive
    # term weighted 20x; 1e-4 keeps the logarithms finite.
    loss = -tf.reduce_mean(
        20 * node * tf.math.log(p + 1e-4) + (1 - node) * tf.math.log(1 - p + 1e-4)
    )
    model = tf.keras.Model(inputs=[node, adj], outputs=[loss])

    opt = get_optimizer()
    # The Keras loss just returns the model output, which already is the loss.
    model.compile(optimizer=opt, loss=lambda t, y: y)
    return model


def get_model(base, config):
    """Build the prediction model: ``base`` -> ``forward`` -> linear ``Dense``.

    Reads the module-level globals ``X_CHANNEL``, ``A_CHANNEL`` and
    ``COLS_EVALUATE``.

    Args:
        base: model returned by ``get_base``.
        config: not used by this function.

    Returns:
        Compiled ``tf.keras.Model`` mapping ``[node, adj]`` to one output per
        entry of ``COLS_EVALUATE`` at every position.
    """
    node = tf.keras.Input(shape=(None, X_CHANNEL), name="node")
    adj = tf.keras.Input(shape=(None, None, A_CHANNEL), name="adj")
    # node = tf.keras.Input(shape=(None, None), name="node")
    # adj = tf.keras.Input(shape=(None, None, None), name="adj")

    x = base([node, adj])
    x = forward(x, 128, rate=0.4)
    # Linear head (activation None).
    x = L.Dense(len(COLS_EVALUATE), None)(x)

    model = tf.keras.Model(inputs=[node, adj], outputs=[x])

    opt = get_optimizer()
    # Placeholder loss that returns the prediction unchanged; this notebook
    # only calls load_weights() and predict() on the model.
    model.compile(optimizer=opt, loss=lambda t, y: y)
    return model


def get_optimizer():
    """Return the optimizer used to compile the models: Adam with default settings."""
    return tf.optimizers.Adam()


def get_subformat(df: pd.DataFrame, length):
    """Expand every row of ``df`` into ``length`` rows, one per sequence position.

    Performs a cross join between ``df`` and the positions ``0 .. length - 1``
    through a temporary constant ``key`` column, then adds
    ``id_seqpos = "<id>_<index>"``.

    Args:
        df: frame with a string ``id`` column; it is not modified.
        length: number of positions generated for every row.

    Returns:
        New DataFrame with the columns of ``df`` plus ``index`` and ``id_seqpos``.
    """
    positions = pd.DataFrame({"index": range(length)}).assign(key=0)
    expanded = df.reset_index(drop=True).assign(key=0)
    # No `on=`: the merge uses the shared column(s), i.e. the constant key.
    expanded = expanded.merge(positions, how="outer")
    expanded["id_seqpos"] = expanded["id"] + "_" + expanded["index"].map(str)
    return expanded.drop(columns="key")


def get_feature(ixs, params):
    """Assemble the node and adjacency tensors for one sequence.

    Uses the module-level ``test`` frame, ``ORG_ROOT`` and ``AUG_ROOTS``.

    Args:
        ixs: row positions in ``test`` of the same sequence in the original
            data and in every augmentation, e.g. ``[0, 233, 466, 699]``. The
            bpps files are looked up with the ``id`` of ``ixs[0]`` only.
        params: ``[exp_name, [loop, structure, cat_channel]]``; only the code
            list is used here.

    Returns:
        Tuple ``(X_node, As)``: ``X_node`` is uint8 node features from
        ``get_input``; ``As`` is float32 and concatenates, along the last
        axis, the bpps matrix, the structure adjacency and the distance
        features. When ``cat_channel`` is truthy the views are moved from the
        batch axis into the last axis of both arrays.
    """
    codes = params[1]

    # One bpps matrix per view: original first, then AUG_ROOTS order.
    bpps_file = f"bpps/{test.iloc[ixs[0]]['id']}.npy"
    As = np.array([np.load(root / bpps_file) for root in [ORG_ROOT, *AUG_ROOTS]])

    Ss = get_structure_adj(test, ixs)
    Ds = get_distance_matrix(As)

    # concat adjacency channels
    As = np.concatenate([As[:, :, :, None], Ss, Ds], axis=3).astype(np.float32)
    del Ss, Ds

    X_node = get_input(test, ixs, codes).astype(np.uint8)

    if codes[2]:
        n_views = len(AUG_ROOTS) + 1

        def _views_to_channels(arr):
            # Split the batch axis into n_views equal chunks and join them
            # along the last axis.
            chunk = int(len(arr) / n_views)
            pieces = [arr[chunk * k : chunk * (k + 1)] for k in range(n_views)]
            return np.concatenate(pieces, axis=-1)

        X_node = _views_to_channels(X_node)
        As = _views_to_channels(As)

    return X_node, As


@contextmanager
def timer(name):
    """Context manager that prints how long the ``with`` body took.

    Args:
        name: label shown in the printed line.

    The elapsed wall-clock time (whole seconds) is printed when the block is
    left, including when the body raises; the exception then propagates
    unchanged.
    """
    t0 = time()
    try:
        yield
    finally:
        # In `finally` so a failing body still reports how long it ran.
        print(f"[{name}] done in {time() - t0:.0f} s")

In [None]:
# =============================================================================
# main
# =============================================================================


N_FOLDS = 8  # fold checkpoints loaded per experiment: model0 .. model{N_FOLDS - 1}
n_views = len(AUG_ROOTS) + 1  # original + one copy per augmentation

# `test` is the original frame followed by one same-sized copy per
# augmentation, so integer division gives the number of distinct sequences.
org_len = len(test) // n_views
# index_list[k] = row positions of sequence k in every view, original first.
index_list = [
    tuple(k + org_len * view for view in range(n_views)) for k in range(org_len)
]


for params in PARAMS_FEATURE:

    exp = params[0]
    codes = params[1]

    with timer(exp):
        # Features depend only on the codes, so experiments sharing the same
        # codes share one cache directory.
        fe_path = OUTPUT_ROOT / "".join(map(str, codes))
        fe_path.mkdir(exist_ok=True)
        sub_path = OUTPUT_ROOT / f"{exp}.csv"
        if sub_path.exists():
            print(f"[{exp}] Done")
            continue

        # get_base() / get_model() read X_CHANNEL and A_CHANNEL as globals, so
        # derive them from one sample before building the model.
        X_node, As = get_feature(list(index_list[0]), params)
        X_CHANNEL = X_node.shape[2]
        A_CHANNEL = As.shape[3]
        base = get_base({})
        model = get_model(base, {})

        # Predictions are accumulated into the submission template.
        sub = pd.read_csv(INPUT_ROOT / "stanford-covid-vaccine" / "post_deadline_files" / "new_sequences_submission.csv").set_index('id_seqpos')

        n_folds_done = 0
        for fold in range(N_FOLDS):
            model.load_weights(MODEL_ROOT / exp / f"model{fold}")
            print('loaded', fold)
            df_rows = []
            for ixs in tqdm(index_list, total=org_len):

                ixs = list(ixs)
                # NOTE(review): id_ is the row position in `test`, not test["id"];
                # the id_seqpos keys built from it only line up with `sub` if the
                # submission file keys sequences the same way — confirm.
                id_ = ixs[0]

                x_cache = fe_path / f"{id_}_X.npy"
                a_cache = fe_path / f"{id_}_A.npy"
                if x_cache.exists() and a_cache.exists():
                    X_node = np.load(x_cache)
                    As = np.load(a_cache)
                else:
                    X_node, As = get_feature(ixs, params)
                    np.save(x_cache, X_node)
                    np.save(a_cache, As)

                pred = model.predict([X_node, As])
                # Average over the first axis of the prediction (the views
                # that get_feature stacked along the batch axis).
                df = pd.DataFrame(np.array(pred).mean(0),
                                  columns=COLS_EVALUATE)
                df["id_seqpos"] = list(map(lambda x: f"{id_}_{x}", range(len(df))))
                df_rows.append(df.set_index('id_seqpos'))

                if DEBUG:
                    break

            df_rows = pd.concat(df_rows)
            sub += df_rows
            n_folds_done += 1

            if DEBUG:
                break

        # Average over the folds actually accumulated (N_FOLDS in a full run,
        # 1 when DEBUG stops after the first fold).
        sub /= n_folds_done
        sub.to_csv(sub_path)

        if DEBUG:
            break

In [None]:
# List the files written to the output directory (`ll` is an IPython shell alias, so this cell is notebook-only).
ll output

In [None]:
# Display the submission file written for exp201 as a quick sanity check.
pd.read_csv(OUTPUT_ROOT / 'exp201.csv')

### The output is available [here](https://www.kaggle.com/onodera/covid-233-onodera-outputs-v2)