# Cluster epitope-specific TCRex data

In [1]:
# Imports
import os

from clustcr import Clustering
import pandas as pd

# Set directory
os.chdir('path_to_your_dir')


  from .autonotebook import tqdm as notebook_tqdm


### Read in TCRex data

In [2]:
# Read in all epitope-specific TCRs parsed by TCRex
# df may not contain , in epitopes: every tcr-epitope should take one row 
df = pd.read_csv('./data/final/all_tcrs.tsv')
df 

Unnamed: 0,v_call,junction_aa,j_call,epitope
0,TRBV07-06,CASSLARGVLMNTEAFF,TRBJ01-01,TVYDPLQPELDSFK
1,TRBV10-02,CASSKGSTEAFF,TRBJ01-01,TVYDPLQPELDSFK
2,TRBV27,CASSLMGGSSYEQYF,TRBJ02-07,TVYDPLQPELDSFK
3,TRBV07-02,CASSLVLASYEQYF,TRBJ02-07,TVYDPLQPELDSFK
4,TRBV04-01,CASSLMAGPGNIQYF,TRBJ02-04,TVYDPLQPELDSFK
...,...,...,...,...
44192,TRBV04-02,CASSQDSGQIDTGELFF,TRBJ02-02,ALSKGVHFV
44193,TRBV27,CASSLSGGWAGGLEQYF,TRBJ02-07,ALSKGVHFV
44194,TRBV27,CASSLSGTYYEQYF,TRBJ02-07,ALSKGVHFV
44195,TRBV27,CASSISVYSPLHF,TRBJ01-06,ALSKGVHFV


In [3]:
epitopes = list(set(df['epitope'].tolist()))

### Cluster TCRex data

In [4]:
# Cluster all TCRs in dataframe based on the selected column
def cluster_data(data, column):
    
    # Cluster data using default parameters
    clustering = Clustering(method='MCL')
    clustered_data = clustering.fit(data[column])
    
    return clustered_data

In [5]:
# Initialize empty df
results = pd.DataFrame()

# Loop over every epitope
for epitope in epitopes:
    
    # Read in all positive training data for the epitope
    data = df[df['epitope']==epitope]
    
    # Remove duplicated CDR3 beta sequences
    data = data.drop_duplicates(subset='junction_aa')

    # Cluster training data for selected epitope
    clustered_data = cluster_data(data, 'junction_aa')

    # Get motifs
    motifs = clustered_data.summary()
    
    if motifs.empty:
        print('no clusters for epitope: ', epitope)
    else:

        # Parse motifs into simple strings
        motifs = motifs.reset_index().rename(columns={'index':'cluster'}).set_index('cluster')
        motifs['clustcr_motif'] = motifs['motif']
        motifs['motif'] = motifs['motif'].str.replace(r'\[[A-Z]+\]','X',regex=True)
        motifs['motif'] = motifs['motif'].str.replace(r'\.','X',regex=True)
        motifs['motif'] = motifs['motif'].str.replace(r'[a-z]','X',regex=True)

        # Group CDR3 sequences per cluster
        cdr3 = clustered_data.clusters_df.groupby(['cluster'])['junction_aa'].apply(', '.join).reset_index().set_index('cluster')
        
        # Concatenate all info in one df
        info = pd.concat([motifs,cdr3], axis=1)
        info['epitope'] = epitope
        info = info.reset_index()
        
        # Append info to large results df
        results = pd.concat([results,info], axis=0)
    



Clustering using MCL approach.
Total time to run ClusTCR: 0.064s
Clustering using MCL approach.
Total time to run ClusTCR: 0.005s
Clustering using MCL approach.
Total time to run ClusTCR: 0.037s
Clustering using MCL approach.
Total time to run ClusTCR: 0.056s
Clustering using MCL approach.
Total time to run ClusTCR: 0.046s
Clustering using MCL approach.
Total time to run ClusTCR: 0.137s
Clustering using MCL approach.
Total time to run ClusTCR: 0.036s
Clustering using MCL approach.
Total time to run ClusTCR: 0.146s
Clustering using MCL approach.
Total time to run ClusTCR: 0.047s
Clustering using MCL approach.
Total time to run ClusTCR: 0.053s
Clustering using MCL approach.
Total time to run ClusTCR: 0.057s
Clustering using MCL approach.
Total time to run ClusTCR: 0.348s
Clustering using MCL approach.
Total time to run ClusTCR: 0.000s
no clusters for epitope:  LLMPILTLT
Clustering using MCL approach.
Total time to run ClusTCR: 0.045s
Clustering using MCL approach.
Total time to run ClusT

Total time to run ClusTCR: 0.118s
Clustering using MCL approach.
Total time to run ClusTCR: 0.004s
Clustering using MCL approach.
Total time to run ClusTCR: 0.040s
Clustering using MCL approach.
Total time to run ClusTCR: 0.004s
Clustering using MCL approach.
Total time to run ClusTCR: 0.040s
Clustering using MCL approach.
Total time to run ClusTCR: 0.048s


In [6]:
results

Unnamed: 0,cluster,size,motif,clustcr_motif,junction_aa,epitope
0,0,2,CASSXALSYNEQFF,CASS[FG]ALSYNEQFF,"CASSFALSYNEQFF, CASSGALSYNEQFF",ITEEVGHTDLMAAY
1,1,4,CASSLVXDSSYNEQFF,CASSLVsDSSYNEQFF,"CASSLVSDSSYNEQFF, CASSLVTDSSYNEQFF, CASSLASDSS...",ITEEVGHTDLMAAY
2,2,2,CASSLGXNTEAFF,CASSLG[LM]NTEAFF,"CASSLGLNTEAFF, CASSLGMNTEAFF",ITEEVGHTDLMAAY
3,3,2,CASSLADXYEQYF,CASSLAD[AS]YEQYF,"CASSLADAYEQYF, CASSLADSYEQYF",ITEEVGHTDLMAAY
4,4,2,CASSLVTDXNTEAFF,CASSLVTD[LM]NTEAFF,"CASSLVTDLNTEAFF, CASSLVTDMNTEAFF",ITEEVGHTDLMAAY
...,...,...,...,...,...,...
3,3,3,CXSSDRQSLVQF,C.SSDRQSLVQF,"CTSSDRQSLVQF, CVSSDRQSLVQF, CASSDRQSLVQF",LPPIVAKEI
4,4,2,CASSLXQSREQYF,CASSL[VA]QSREQYF,"CASSLAQSREQYF, CASSLVQSREQYF",LPPIVAKEI
5,5,2,CASSSRXGQEQYF,CASSSR[DG]GQEQYF,"CASSSRDGQEQYF, CASSSRGGQEQYF",LPPIVAKEI
6,6,2,CXSSDRQSLVQFF,C[AT]SSDRQSLVQFF,"CASSDRQSLVQFF, CTSSDRQSLVQFF",LPPIVAKEI


In [7]:
results.to_csv('./results/epitope_specific_clustering/epitope_specific_clusters.tsv', index=False)