In [31]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from glob import glob

import keras

In [32]:
# Debug: confirm the data directory is present and preview its first TSV file.
import os
from glob import glob

import pandas as pd

data_dir = '../DeepTCR/Data/Human_Antigens'

# isdir (rather than exists) so a stray regular file at this path is reported
# as missing instead of crashing os.listdir below.
dir_exists = os.path.isdir(data_dir)

print(f"Checking directory: {data_dir}")
print(f"Directory exists: {dir_exists}")

if dir_exists:
    print(f"Contents: {os.listdir(data_dir)}")

    # TSV files sit exactly one sub-directory below data_dir.
    tsv_files = glob(os.path.join(data_dir, '*/*.tsv'))
    print(f"TSV files found: {len(tsv_files)}")

    if tsv_files:
        print("First few TSV files:")
        for f in tsv_files[:3]:
            print(f"  {f}")

        # Parse the first file once and derive every summary from that frame
        # (previously the same file was read three times).
        first_file = tsv_files[0]
        first_df = pd.read_csv(first_file, sep='\t')
        columns = first_df.columns.tolist()
        shape = first_df.shape
        head = first_df.head()

        print(f"\nFirst file columns: {columns}")
        print(f"First file shape: {shape}")
        print("First file head:")
        print(head)
else:
    print("Directory does not exist!")
    print("You need to clone the DeepTCR repository:")
    print("Run: git clone https://github.com/sidhomj/DeepTCR.git")

Checking directory: ../DeepTCR/Data/Human_Antigens
Directory exists: True
Contents: ['A1-CTELKLSDY', 'A1-VTEHDTLLY', 'A2-GILGFVFTL', 'A2-GLCTLVAML', 'A2-NLVPMVATV', 'B7-LPRRSGAAGA', 'B7-TPRVTGGGAM']
TSV files found: 10
First few TSV files:
  ../DeepTCR/Data/Human_Antigens\A1-CTELKLSDY\A1-CTELKLSDY.tsv
  ../DeepTCR/Data/Human_Antigens\A1-VTEHDTLLY\A1-VTEHDTLLY.tsv
  ../DeepTCR/Data/Human_Antigens\A2-GILGFVFTL\A2-GILGFVFTL-Dash.tsv

First file columns: ['aminoAcid', 'counts', 'v_beta', 'j_beta']
First file shape: (25, 4)
First file head:
         aminoAcid  counts      v_beta      j_beta
0  CASSYSSSSYNEQFF       1  TCRBV06-06  TCRBJ02-01
1   CASSSGGPRAEQFF       1  TCRBV07-09  TCRBJ02-01
2  CATSRDLSFGYGYTF       1         NaN  TCRBJ01-02
3   CATSREQNNSPLHF       1         NaN  TCRBJ01-06
4    CATVGGIDQPQHF       1  TCRBV10-03  TCRBJ01-05


In [33]:
def load_tcr_dataset_from_dir(data_dir, batch_size=100, shuffle=True):
    """Build a batched ``tf.data.Dataset`` of encoded CDR3 sequences with one-hot antigen labels.

    Every ``<data_dir>/*/*.tsv`` file is read. The HLA and antigen names are
    parsed from the file's parent directory name (``<HLA>-<Antigen>``), and the
    ``aminoAcid`` column supplies the sequences. Rows lacking either a sequence
    or an antigen label are dropped.

    Args:
        data_dir: Root directory containing one sub-directory per HLA-antigen pair.
        batch_size: Number of examples per batch. Incomplete final batches are
            dropped (``drop_remainder=True``).
        shuffle: If true, shuffle examples with a buffer covering the whole dataset.

    Returns:
        A tuple ``(ds, aa_dict, label_map)`` where ``ds`` yields
        ``(int32 sequences [batch, max_length], float32 one-hot labels [batch, n_classes])``,
        ``aa_dict`` maps each amino-acid character to a 1-based index (0 is padding),
        and ``label_map`` maps class index to antigen name.

    Raises:
        FileNotFoundError: If no TSV file is found under ``data_dir``.
        ValueError: If no row has both a sequence and an antigen label.
    """
    import warnings

    # glob order is filesystem-dependent; sort so file indices (and the
    # unshuffled example order) are reproducible across machines.
    tsv_paths = sorted(glob(os.path.join(data_dir, '*/*.tsv')))
    if not tsv_paths:
        raise FileNotFoundError(f"No TSV files found in {data_dir}")
    df_rep = pd.DataFrame(tsv_paths, columns=['filepath'])

    # The pattern accepts both Windows backslashes and Unix forward slashes.
    df_rep[['HLA', 'Antigen']] = df_rep['filepath'].str.extract(r'[/\\]([^/\\-]+)-([^/\\-]+)[/\\][^/\\]+$')

    # Load every TSV, tagging each row with its source file and labels.
    frames = []
    for file_idx, rep_row in df_rep.iterrows():
        frame = pd.read_csv(rep_row['filepath'], sep='\t')
        frame['index'] = file_idx
        frame['HLA'] = rep_row['HLA']
        frame['Antigen'] = rep_row['Antigen']
        frames.append(frame)
    # ignore_index gives unique row labels, so the boolean mask below does not
    # depend on aligning against duplicated per-file indices.
    df_tcr = pd.concat(frames, ignore_index=True)
    df_tcr['Antigen'] = df_tcr['Antigen'].astype('category')

    # Keep only rows that have both a sequence and a label.
    valid = df_tcr[['aminoAcid', 'Antigen']].notna().all(axis=1)
    X = df_tcr.loc[valid, 'aminoAcid'].values
    if len(X) == 0:
        raise ValueError("No valid amino acid sequences found in the data")

    # Drop antigens left without any valid row so that every class index in
    # label_map (and every one-hot column) is backed by at least one example.
    labels = df_tcr.loc[valid, 'Antigen'].cat.remove_unused_categories()
    # cat.codes picks its integer width from the category count; cast to int32
    # so tf.one_hot always receives the same index dtype.
    y = labels.cat.codes.values.astype(np.int32)
    label_map = dict(enumerate(labels.cat.categories))

    # Vocabulary: index 0 is reserved for padding, residues start at 1.
    max_length = max(map(len, X))
    vocab = sorted(set(''.join(X)))
    aa_dict = {aa: i + 1 for i, aa in enumerate(vocab)}

    # Right-pad every sequence with zeros up to max_length.
    X_encoded = np.zeros((len(X), max_length), dtype=np.int32)
    for row, seq in enumerate(X):
        X_encoded[row, :len(seq)] = [aa_dict[aa] for aa in seq]

    if len(X_encoded) < batch_size:
        # With drop_remainder=True this would otherwise fail silently.
        warnings.warn(
            f"Only {len(X_encoded)} sequences for batch_size={batch_size}; "
            "drop_remainder=True will produce an empty dataset."
        )

    # Create TensorFlow dataset
    num_classes = len(label_map)
    ds = tf.data.Dataset.from_tensor_slices((X_encoded, y))
    ds = ds.map(lambda seq, label: (seq, tf.one_hot(label, num_classes)))
    if shuffle:
        ds = ds.shuffle(len(X_encoded))
    ds = ds.batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)

    return ds, aa_dict, label_map

In [34]:
# Root folder of the DeepTCR human-antigen TSVs, relative to the working directory.
data_dir = '../DeepTCR/Data/Human_Antigens'

In [35]:
# Build the batched dataset, the amino-acid -> index vocabulary,
# and the class-index -> antigen lookup.
tfds, aa_dict, label_map = load_tcr_dataset_from_dir(data_dir)

In [36]:
# Display the dataset's element spec (shape and dtype of one batch).
tfds

<_PrefetchDataset element_spec=(TensorSpec(shape=(100, 23), dtype=tf.int32, name=None), TensorSpec(shape=(100, 7), dtype=tf.float32, name=None))>

In [37]:
# Amino-acid character -> integer index; indices start at 1 because 0 is the padding value.
aa_dict

{'A': 1,
 'C': 2,
 'D': 3,
 'E': 4,
 'F': 5,
 'G': 6,
 'H': 7,
 'I': 8,
 'K': 9,
 'L': 10,
 'M': 11,
 'N': 12,
 'P': 13,
 'Q': 14,
 'R': 15,
 'S': 16,
 'T': 17,
 'V': 18,
 'W': 19,
 'Y': 20}

In [38]:
# Class index -> antigen name, matching the positions of the one-hot labels.
label_map

{0: 'CTELKLSDY',
 1: 'GILGFVFTL',
 2: 'GLCTLVAML',
 3: 'LPRRSGAAGA',
 4: 'NLVPMVATV',
 5: 'TPRVTGGGAM',
 6: 'VTEHDTLLY'}

In [39]:
# CDR3 AA encoding
# Maps a padded, integer-encoded CDR3 sequence to a 256-dimensional feature vector.
tensors = []
# Sequence length is read from the dataset spec instead of a hard-coded 23, and
# the dtype matches the int32 sequences the dataset actually yields.
tensors.append(keras.layers.Input(shape=(tfds.element_spec[0].shape[-1], ), dtype='int32'))
# +1 on the vocabulary size accounts for the padding index 0.
# NOTE(review): Conv1D is presumably not mask-aware, so mask_zero may have no
# effect on the layers below - confirm before relying on it.
tensors.append(keras.layers.Embedding(input_dim=len(aa_dict) + 1, output_dim=64, mask_zero=True)(tensors[-1]))
tensors.append(keras.layers.Conv1D(filters=64, kernel_size=9, strides=1, activation=keras.activations.relu)(tensors[-1]))
tensors.append(keras.layers.Conv1D(filters=128, kernel_size=7, strides=2, activation=keras.activations.relu)(tensors[-1]))
# With a length-23 input the last convolution leaves exactly one position, so
# global max pooling returns the same values as the former `[:, 0, :]` slice;
# for longer inputs it pools over all positions instead of keeping only the first.
tensors.append(keras.layers.GlobalMaxPooling1D()(
    keras.layers.Conv1D(filters=256, kernel_size=5, strides=1, activation=None)(tensors[-1])
))
encoder = keras.Model(inputs=tensors[0], outputs=tensors[-1], name='encoder')



In [40]:
# NOTE(review): this cell was headed "VDJ usage encoding", but it consumes the
# same integer-encoded amino-acid sequence as the CDR3 encoder cell and rebinds
# `encoder`; no V/D/J gene columns are read anywhere in it.
# TODO: implement a real V/J-usage encoder here, or delete this duplicate cell.
tensors = []
# Sequence length is read from the dataset spec instead of a hard-coded 23, and
# the dtype matches the int32 sequences the dataset actually yields.
tensors.append(keras.layers.Input(shape=(tfds.element_spec[0].shape[-1], ), dtype='int32'))
# +1 on the vocabulary size accounts for the padding index 0.
tensors.append(keras.layers.Embedding(input_dim=len(aa_dict) + 1, output_dim=64, mask_zero=True)(tensors[-1]))
tensors.append(keras.layers.Conv1D(filters=64, kernel_size=9, strides=1, activation=keras.activations.relu)(tensors[-1]))
tensors.append(keras.layers.Conv1D(filters=128, kernel_size=7, strides=2, activation=keras.activations.relu)(tensors[-1]))
# With a length-23 input the last convolution leaves exactly one position, so
# global max pooling returns the same values as the former `[:, 0, :]` slice;
# for longer inputs it pools over all positions instead of keeping only the first.
tensors.append(keras.layers.GlobalMaxPooling1D()(
    keras.layers.Conv1D(filters=256, kernel_size=5, strides=1, activation=None)(tensors[-1])
))
encoder = keras.Model(inputs=tensors[0], outputs=tensors[-1], name='encoder')



In [41]:
# Classification head: encoder features -> one sigmoid score per antigen class.
tensors = [keras.layers.Input(shape=(encoder.output_shape[-1], ), dtype=tf.float32)]

# Layers are listed in application order and chained onto the input below.
head_layers = [
    keras.layers.Dropout(rate=0.1),
    keras.layers.Dense(units=128, activation=keras.activations.relu),
    keras.layers.Dropout(rate=0.05),
    keras.layers.Dense(units=64, activation=keras.activations.relu),
    keras.layers.Dense(units=len(label_map), activation=keras.activations.sigmoid),
]
for layer in head_layers:
    tensors.append(layer(tensors[-1]))

# NOTE(review): the model name is spelled 'classifer'; left unchanged in case
# other code looks the sub-model up by this name.
classifier = keras.Model(inputs=tensors[0], outputs=tensors[-1], name='classifer')

In [42]:
# End-to-end model: encoded sequence -> encoder -> classifier.
tensors = []
# Shape and dtype are both taken from the encoder, so this wrapper cannot
# drift out of sync with whatever input the encoder declares.
tensors.append(keras.layers.Input(shape=encoder.input_shape[1:], dtype=encoder.input.dtype))
tensors.append(encoder(tensors[-1]))
tensors.append(classifier(tensors[-1]))
model = keras.Model(inputs=tensors[0], outputs=tensors[-1], name='full_model')

In [43]:
# Adam optimizer with binary cross-entropy on the classifier's sigmoid outputs
# (probabilities, hence from_logits=False).
# NOTE(review): the labels are one-hot; if each sequence has exactly one antigen,
# softmax + categorical cross-entropy may be the better fit - confirm intent.
adam = keras.optimizers.Adam(learning_rate=0.001)
bce = keras.losses.BinaryCrossentropy(from_logits=False)
model.compile(optimizer=adam, loss=bce)

In [44]:
# Train on the full dataset. The returned History is kept so the per-epoch loss
# can be inspected or plotted later without retraining.
history = model.fit(tfds, epochs=100)

Epoch 1/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - loss: 0.5758
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.3556
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.3520
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.3450
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.3348
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.3247
Epoch 7/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.3219
Epoch 8/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.3004
Epoch 9/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.2889
Epoch 10/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - lo

<keras.src.callbacks.history.History at 0x237fcaf5610>

In [45]:
# Pull a single batch from the dataset for inspection.
b = next(iter(tfds))

In [46]:
# Shapes of the batch's encoded sequences and of its one-hot labels.
b[0].shape, b[1].shape

(TensorShape([100, 23]), TensorShape([100, 7]))