# Collecting all epitope-specific TCRs in one dataframe

In [1]:
# Imports
import os
import pandas as pd

# Set directory
# 'path_to_your_dir' is a placeholder: replace it with the project root before
# running. The relative './data/...' paths used in this notebook are resolved
# against the working directory set here.
os.chdir('path_to_your_dir')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Read in all epitope-specific TCRs parsed by TCRex
data_dir = './data/parsed/TCRex_data'

# Get a list of all epitopes: every non-hidden sub-directory of data_dir.
# Filtering (instead of list.remove) keeps this cell working when '.DS_Store'
# or '.gitignore' are absent, where remove() would raise ValueError.
epitopes = [
    entry for entry in os.listdir(data_dir)
    if not entry.startswith('.')
    and os.path.isdir(os.path.join(data_dir, entry))
]

In [3]:
def read_data(data_dir, epitope):
    """Load the epitope-specific TCRs of one epitope in AIRR-style columns.

    Reads ``<data_dir>/<epitope>/training_data.tsv`` (tab-separated), keeps the
    rows whose ``Class`` equals 1 and renames/prefixes the gene columns.

    Parameters
    ----------
    data_dir : str
        Directory containing one sub-directory per epitope.
    epitope : str
        Name of the epitope sub-directory to read.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with columns ``v_call`` ('TRBV' + V_gene),
        ``junction_aa`` (CDR3_beta) and ``j_call`` ('TRBJ' + J_gene). The row
        index of the input file is preserved.
    """
    # All data parsed by TCRex
    training_path = os.path.join(data_dir, epitope, 'training_data.tsv')
    sequences = pd.read_csv(training_path, sep='\t')

    # Get epitope specific TCRs
    positives = sequences[sequences['Class'] == 1]

    # Parse into airr format. Building a fresh DataFrame (rather than adding
    # columns to the filtered slice) avoids SettingWithCopyWarning here and
    # when the caller later adds columns to the returned frame.
    return pd.DataFrame({
        'v_call': positives['V_gene'].map(lambda gene: 'TRBV' + str(gene)),
        'junction_aa': positives['CDR3_beta'],
        'j_call': positives['J_gene'].map(lambda gene: 'TRBJ' + str(gene)),
    })

In [4]:
# Assemble all epitope-specific TCRs in one df.
# Each per-epitope frame gets an 'epitope' column (via assign, which returns a
# new frame) and all frames are concatenated once, instead of re-copying the
# growing result on every loop iteration.
epitope_frames = [
    read_data(data_dir, epitope).assign(epitope=epitope)
    for epitope in epitopes
]

# pd.concat raises on an empty list; keep the original "empty DataFrame" result
# when no epitopes were found.
df = pd.concat(epitope_frames) if epitope_frames else pd.DataFrame()


In [5]:
# Preview the combined table (one row per TCR, labelled with its epitope)
df

Unnamed: 0,v_call,junction_aa,j_call,epitope
360,TRBV07-06,CASSLARGVLMNTEAFF,TRBJ01-01,TVYDPLQPELDSFK
361,TRBV10-02,CASSKGSTEAFF,TRBJ01-01,TVYDPLQPELDSFK
362,TRBV27,CASSLMGGSSYEQYF,TRBJ02-07,TVYDPLQPELDSFK
363,TRBV07-02,CASSLVLASYEQYF,TRBJ02-07,TVYDPLQPELDSFK
364,TRBV04-01,CASSLMAGPGNIQYF,TRBJ02-04,TVYDPLQPELDSFK
...,...,...,...,...
1414,TRBV04-02,CASSQDSGQIDTGELFF,TRBJ02-02,ALSKGVHFV
1415,TRBV27,CASSLSGGWAGGLEQYF,TRBJ02-07,ALSKGVHFV
1416,TRBV27,CASSLSGTYYEQYF,TRBJ02-07,ALSKGVHFV
1417,TRBV27,CASSISVYSPLHF,TRBJ01-06,ALSKGVHFV


In [6]:
# Export the combined TCR table without the row index.
# NOTE(review): no `sep` is passed, so the file is written with pandas' default
# comma delimiter despite the '.tsv' extension - confirm what downstream
# readers expect before changing it.
df.to_csv('./data/final/all_tcrs.tsv', index=False)

In [7]:
# Group TCRs with identical CDR3 sequences together
def _join_unique(values):
    """Return the distinct strings in ``values`` as one comma-separated string.

    The distinct values are sorted so the result is reproducible; joining a
    bare ``set`` of strings can give a different order on every kernel start.
    """
    return ','.join(sorted(set(values)))


# One row per CDR3 sequence; each of the other columns lists every distinct
# value observed for that CDR3 (a single value is kept as-is).
cdr3_groups = df.groupby('junction_aa')
df = pd.concat(
    [cdr3_groups[column].agg(_join_unique)
     for column in ('v_call', 'epitope', 'j_call')],
    axis=1,
).reset_index()
df

Unnamed: 0,junction_aa,v_call,epitope,j_call
0,CAAADEEIGNQPQHF,TRBV10-03,ATDALMTGY,TRBJ01-05
1,CAAADRMTDTQYF,TRBV24-01,FVDGVPFVV,TRBJ02-03
2,CAAAERNTGELFF,TRBV28,YLQPRTFLL,TRBJ02-02
3,CAAAGRGLADTQYF,TRBV04-01,KPLEFGATSAAL,TRBJ02-03
4,CAAAVDHSTDTQYF,TRBV27,HTTDPSFLGRY,TRBJ02-03
...,...,...,...,...
42045,CVSSVDKGGTDTQYF,TRBV09,IIKDYGKQM,TRBJ02-03
42046,CWTVNTEAFF,TRBV04-02,TLIGDCATV,TRBJ01-01
42047,CYSSDDRVGEQFF,TRBV24-01,ILIEGIFFV,TRBJ02-01
42048,CYSSFQGYTEAFF,TRBV28,ILIEGIFFV,TRBJ01-01


In [8]:
# Export the CDR3-grouped table without the row index.
# NOTE(review): no `sep` is passed, so the file is written with pandas' default
# comma delimiter despite the '.tsv' extension - confirm what downstream
# readers expect before changing it.
df.to_csv('./data/final/unique_CDR3s.tsv', index=False)