In [10]:
import numpy as np
import pandas as pd
from glob import glob
import tensorflow as tf
import keras

In [11]:
# human antigens files
df_rep = pd.DataFrame(glob('../DeepTCR/Data/Human_Antigens/*/*.tsv'), columns=['filepath'])
df_rep[['HLA', 'Antigen']] = df_rep['filepath'].str.extract(r'/([^/-]+)-([^/-]+)/[^/]+$')

In [12]:
# load tcrseq data and label
df_tcr = []
for idx in df_rep.index:
    df_tcr.append(pd.read_csv(df_rep.loc[idx, 'filepath'], sep='\t'))
    df_tcr[-1]['index'] = idx
    df_tcr[-1][['HLA', 'Antigen']] = df_rep.loc[idx, ['HLA', 'Antigen']].values
df_tcr = pd.concat(df_tcr)
df_tcr['Antigen'] = df_tcr['Antigen'].astype('category')

In [13]:
# convert to tf dataset
idx = ~df_tcr[['aminoAcid', 'Antigen']].isna().any(axis=1)
X = df_tcr.loc[idx, 'aminoAcid'].values
y = df_tcr.loc[idx, 'Antigen'].cat.codes.values

In [14]:
y

array([], dtype=int8)

In [15]:
max_length = max(list(map(len, X)))
aa = set(''.join(X))
aa = dict(zip(aa, np.arange(1, len(aa) + 1)))

X = [[aa[l] for l in x] + [0] * (max_length - len(x)) for x in X]



ValueError: max() arg is an empty sequence

In [None]:
tfds = tf.data.Dataset.from_tensor_slices((X, y))
tfds = tfds.map(lambda X, y_true: (X, tf.one_hot(y_true, len(np.unique(y)))))
tfds = tfds.shuffle(idx.sum(), reshuffle_each_iteration=True)
tfds = tfds.batch(100, drop_remainder=True)

2025-06-18 17:37:26.111609: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1048 MB memory:  -> device: 0, name: NVIDIA H100 80GB HBM3, pci bus id: 0000:5f:00.0, compute capability: 9.0


In [None]:
tfds

<_BatchDataset element_spec=(TensorSpec(shape=(100, 23), dtype=tf.int32, name=None), TensorSpec(shape=(100, 7), dtype=tf.float32, name=None))>

In [None]:
# CDR3 AA encoding
tensors = []
tensors.append(keras.layers.Input(shape=(23, ),  dtype=tf.uint32))
tensors.append(keras.layers.Embedding(input_dim=len(aa) + 1, output_dim=64, mask_zero=True)(tensors[-1]))
tensors.append(keras.layers.Conv1D(filters=64, kernel_size=9, strides=1, activation=keras.activations.relu)(tensors[-1]))
tensors.append(keras.layers.Conv1D(filters=128, kernel_size=7, strides=2, activation=keras.activations.relu)(tensors[-1]))
tensors.append(keras.layers.Conv1D(filters=256, kernel_size=5, strides=1, activation=None)(tensors[-1])[:, 0, :])
encoder = keras.Model(inputs=tensors[0], outputs=tensors[-1], name='encoder')

In [None]:
# VDJ usage encoding
tensors = []
tensors.append(keras.layers.Input(shape=(23, ),  dtype=tf.uint32))
tensors.append(keras.layers.Embedding(input_dim=len(aa) + 1, output_dim=64, mask_zero=True)(tensors[-1]))
tensors.append(keras.layers.Conv1D(filters=64, kernel_size=9, strides=1, activation=keras.activations.relu)(tensors[-1]))
tensors.append(keras.layers.Conv1D(filters=128, kernel_size=7, strides=2, activation=keras.activations.relu)(tensors[-1]))
tensors.append(keras.layers.Conv1D(filters=256, kernel_size=5, strides=1, activation=None)(tensors[-1])[:, 0, :])
encoder = keras.Model(inputs=tensors[0], outputs=tensors[-1], name='encoder')

In [None]:
tensors = []
tensors.append(keras.layers.Input(shape=(encoder.output_shape[-1], ), dtype=tf.float32))
tensors.append(keras.layers.Dropout(rate=0.1)(tensors[-1]))
tensors.append(keras.layers.Dense(units=128, activation=keras.activations.relu)(tensors[-1]))
tensors.append(keras.layers.Dropout(rate=0.05)(tensors[-1]))
tensors.append(keras.layers.Dense(units=64, activation=keras.activations.relu)(tensors[-1]))
tensors.append(keras.layers.Dense(units=len(np.unique(y)), activation=keras.activations.sigmoid)(tensors[-1]))
classifier = keras.Model(inputs=tensors[0], outputs=tensors[-1], name='classifer')

In [None]:
tensors = []
tensors.append(keras.layers.Input(shape=encoder.input_shape[1:],  dtype=tf.uint32))
tensors.append(encoder(tensors[-1]))
tensors.append(classifier(tensors[-1]))
model = keras.Model(inputs=tensors[0], outputs=tensors[-1], name='full_model')

In [None]:
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=keras.losses.BinaryCrossentropy(from_logits=False)
)

In [None]:
model.fit(tfds, epochs=100)

Epoch 1/100


2025-06-18 17:37:27.455163: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
2025-06-18 17:37:29.396878: I external/local_xla/xla/service/service.cc:168] XLA service 0x1550ca3b8f50 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-06-18 17:37:29.396908: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA H100 80GB HBM3, Compute Capability 9.0
2025-06-18 17:37:29.401369: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1750282649.490841  614658 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.src.callbacks.History at 0x155491d165d0>

In [None]:
y_pred.shape

NameError: name 'y_pred' is not defined

In [None]:
b = next(iter(tfds))

In [None]:
b[0].shape, b[1].shape

(TensorShape([100, 23]), TensorShape([100, 7]))