# Cluster epitope-specific TCR motifs

In [1]:
# Imports
import os

from clustcr import Clustering
import pandas as pd

# Set the working directory. The relative './results/...' paths used further down
# are resolved against it, so replace the placeholder with your own project folder.
os.chdir('path_to_your_dir')

  from .autonotebook import tqdm as notebook_tqdm


### Read epitope-specific TCR motifs

In [2]:
# Load the unique-motif table: one row per motif, with the epitope(s) of that
# motif stored in a single 'epitope' field (several epitopes are comma-separated).
# No `sep` is passed, so pandas parses the file with its default delimiter.
data = pd.read_csv(
    filepath_or_buffer='./results/epitope_specific_clustering/unique_motifs.tsv',
)
data

Unnamed: 0,motif,epitope
0,CAGRGXMNTEAFF,LPRRSGAAGA
1,CAGXDXNTGELFF,YLQPRTFLL
2,CAIRXGGDEQF,HSKKKCDEL
3,CAIRXGGDEQFF,HSKKKCDEL
4,CAISESGYRGPPGANVLTF,FLKEKGGL
...,...,...
1916,CXSSPRGGGETQXF,TPGPGVRYPL
1917,CXSSXWDRSSGANVLTF,NLVPMVATV
1918,CXSSYSRQWNTEAFF,NLVPMVATV
1919,CXXSDDRVGEQFF,ILIEGIFFV


### Cluster all epitope-specific TCR motifs with clusTCR

In [3]:
# Cluster all TCRs in dataframe based on the selected column
def cluster_data(data, column):
    """Cluster the sequences held in one column of ``data`` with clusTCR.

    Parameters
    ----------
    data : DataFrame-like object that supports ``data[column]``.
    column : name of the column whose values are handed to clusTCR.

    Returns
    -------
    Whatever ``Clustering.fit`` returns for the selected column.
    """
    # Only the method is chosen explicitly; every other setting stays at
    # the clusTCR default.
    mcl = Clustering(method='MCL')
    return mcl.fit(data[column])

In [4]:
# Cluster on the motif strings themselves
clustered_data = cluster_data(data, column='motif')

Clustering using MCL approach.
Total time to run ClusTCR: 0.324s


In [5]:
# Cluster overview: one row per clustered sequence with its cluster id.
# The sequence column ('junction_aa') is renamed to 'motif' so it lines up with
# the column name used in `data`.
clusters = clustered_data.clusters_df.rename(columns={'junction_aa': 'motif'})
clusters

Unnamed: 0,motif,cluster
0,CASSGXGGYTF,0
1,CASSLXGGYTF,0
2,CASSXGTGVDQPQHF,1
3,CASSXGTGVNQPQHF,1
4,CASSXSTGVNQPQHF,1
...,...,...
459,CASSXGXYGYTF,133
460,CAXSTGDSNQPQHF,134
461,CAXXTGDSNQPQHF,134
462,CASSPXGGAYNEQFF,135


In [6]:
# Add epitope information to clustering results
def get_epitope(data, motif):
    """Return the 'epitope' value of the first row of ``data`` whose 'motif' equals ``motif``.

    Raises IndexError when no row matches.
    """
    is_match = data['motif'] == motif
    matching_epitopes = data.loc[is_match, 'epitope'].tolist()
    return matching_epitopes[0]

# Look up the epitope of every clustered motif in the original motif table
clusters['epitope'] = clusters['motif'].map(lambda motif: get_epitope(data, motif))
clusters

Unnamed: 0,motif,cluster,epitope
0,CASSGXGGYTF,0,KAFSPEVIPMF
1,CASSLXGGYTF,0,KAFSPEVIPMF
2,CASSXGTGVDQPQHF,1,TPINLVRDL
3,CASSXGTGVNQPQHF,1,TPINLVRDL
4,CASSXSTGVNQPQHF,1,HTTDPSFLGRY
...,...,...,...
459,CASSXGXYGYTF,133,GILGFVFTL
460,CAXSTGDSNQPQHF,134,ELAGIGILTV
461,CAXXTGDSNQPQHF,134,EPLPQGQLTAY
462,CASSPXGGAYNEQFF,135,LPRRSGAAGA


In [7]:
# Report the motif and epitope combinations within each cluster:
# one row per (cluster, epitope) pair, with the motifs of that pair joined by ', '.
final = (clusters.groupby(['cluster','epitope'])['motif'].apply(', '.join)
                 .reset_index())

# to_csv does not create missing folders, so make sure the output folder exists
# before writing (a fresh checkout may not have 'cluster_motifs' yet).
out_path = './results/epitope_specific_clustering/cluster_motifs/motif_clusters.tsv'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
final.to_csv(out_path, index=False)
final


Unnamed: 0,cluster,epitope,motif
0,0,KAFSPEVIPMF,"CASSGXGGYTF, CASSLXGGYTF"
1,1,HTTDPSFLGRY,CASSXSTGVNQPQHF
2,1,TPINLVRDL,"CASSXGTGVDQPQHF, CASSXGTGVNQPQHF"
3,2,FVDGVPFVV,"CASSQEXADTEAFF, CASSQEXANTEAFF"
4,3,FVDGVPFVV,CASSVGGXNTEAFF
...,...,...,...
374,133,FLNGSCGSV,CASSFGXYGYTF
375,133,GILGFVFTL,CASSXGXYGYTF
376,134,ELAGIGILTV,CAXSTGDSNQPQHF
377,134,EPLPQGQLTAY,CAXXTGDSNQPQHF


### Cluster statistics

In [8]:
# Summarise every cluster in a single row: its size plus the distinct motifs and
# distinct epitopes found among its members.
def _join_unique(values):
    """Collapse an iterable of strings into one ','-joined string of distinct tokens.

    Each element may itself contain several comma-separated tokens; all of
    them are split out and stripped of surrounding whitespace. Duplicates are
    dropped while keeping first-appearance order (``dict.fromkeys``). A ``set``
    would make the order, and therefore the displayed table, vary between
    kernel runs.
    """
    tokens = (token.strip() for value in values for token in value.split(','))
    return ','.join(dict.fromkeys(tokens))


by_cluster = clusters.groupby('cluster')
cls_motif = by_cluster['motif'].apply(_join_unique).to_frame()
cls_epitope = by_cluster['epitope'].apply(_join_unique).to_frame()
# Number of rows (motifs) assigned to each cluster
counts = by_cluster.size().to_frame('size')

# All three frames are indexed by cluster id, so they can be aligned column-wise;
# the largest clusters are listed first.
df = pd.concat([counts,cls_motif,cls_epitope],axis=1).reset_index().sort_values(by='size', ascending=False)
df

Unnamed: 0,cluster,size,motif,epitope
11,11,27,"CASSXGSYEQXF,CASSXXXYEQYF,CASSVGGXEQYF,CASSXSS...","KRWIILGLNK,KLVALGINAV,NLVPMVATV,RQLLFVVEV,YLNT..."
9,9,26,"CASSXLGXYEQYF,CASSGXGXYEQYF,CASSXGHSYEQYF,CASR...","FPPTSFGPL,KAYNVTQAF,HTTDPSFLGRY,FVDGVPFVV,KLPD..."
7,7,14,"CASSXTGXEAFF,CASSXGXTEAFF,CASSXGLTEAFF,CASSLXX...","GLCTLVAML,TLVPQEHYV,KAYNVTQAF,HTTDPSFLGRY,FVDG..."
6,6,11,"CASSXTXNTEAFF,CASSXGLNTEAFF,CASSLGXNTEAFF,CASS...","HLVDFQVTI,FVDGVPFVV,GILGFVFTL,LPRRSGAAGA,NLVPM..."
23,23,10,"CASSLGGXYEQYF,CASSXGTXYEQYF,CASSLADXYEQYF,CASS...","GLCTLVAML,LEPLVDLPI,WICLLQFAY,HTTDPSFLGRY,LPRR..."
...,...,...,...,...
65,65,2,"CASSYSXGNEQYF,CASSYSXGYEQYF","FVDGVPFVV,KLSYGIATV"
63,63,2,"CASSXTGGEQYF,CASSXXGGEQYF","LPRRSGAAGA,FVDGVPFVV"
60,60,2,"CASSLXQGTDTQYF,CASSQXQGTDTQYF","LPRRSGAAGA,KAYNVTQAF"
59,59,2,"CASSXGVNTGELFF,CASSXGLNTGELFF","NLVPMVATV,LPRRSGAAGA"


In [9]:
# `df` holds one row per cluster, so its length is the number of clusters
print('Nr of clusters: ', len(df))

Nr of clusters:  136


In [10]:

# A cluster counts as impure when its epitope field lists more than one epitope,
# i.e. when the ','-joined string contains a comma.
impure = df.loc[df['epitope'].str.contains(',')]
print('Nr of impure clusters: ', len(impure))
print('Nr of pure clusters: ', len(df) - len(impure))


Nr of impure clusters:  107
Nr of pure clusters:  29
