# Clustering TCRex data with clusTCR

In [1]:
# Imports
import os

from clustcr import Clustering
import pandas as pd

# Set directory
os.chdir('path_to_your_dir')



  from .autonotebook import tqdm as notebook_tqdm


### Reading TCRex data

In [2]:
df = pd.read_csv('./data/final/unique_CDR3s.tsv', 
                 sep=',')
df

Unnamed: 0,junction_aa,v_call,epitope,j_call
0,CAAADEEIGNQPQHF,TRBV10-03,ATDALMTGY,TRBJ01-05
1,CAAADRMTDTQYF,TRBV24-01,FVDGVPFVV,TRBJ02-03
2,CAAAERNTGELFF,TRBV28,YLQPRTFLL,TRBJ02-02
3,CAAAGRGLADTQYF,TRBV04-01,KPLEFGATSAAL,TRBJ02-03
4,CAAAVDHSTDTQYF,TRBV27,HTTDPSFLGRY,TRBJ02-03
...,...,...,...,...
42045,CVSSVDKGGTDTQYF,TRBV09,IIKDYGKQM,TRBJ02-03
42046,CWTVNTEAFF,TRBV04-02,TLIGDCATV,TRBJ01-01
42047,CYSSDDRVGEQFF,TRBV24-01,ILIEGIFFV,TRBJ02-01
42048,CYSSFQGYTEAFF,TRBV28,ILIEGIFFV,TRBJ01-01


### Clustering

In [3]:
# Cluster all TCRs in dataframe based on the selected column
def cluster_data(data, column):
    
    # Cluster data using default parameters
    clustering = Clustering(method='MCL')
    clustered_data = clustering.fit(data[column])
    
    # Remove intermediate cluster files
    #clustering.batch_cleanup()
    
    return clustered_data

In [4]:
# Cluster all data
clustered_data = cluster_data(df, 'junction_aa')

# Get clusters overview
clusters = clustered_data.clusters_df
clusters

Clustering using MCL approach.
Total time to run ClusTCR: 16.363s


Unnamed: 0,junction_aa,cluster
0,CASSHARAEAFF,0
1,CASSLARAEAFF,0
2,CASSLHRAEAFF,0
3,CASSLGENEQFF,1
4,CASSLGGNEQFF,1
...,...,...
14210,CASRTGTHTDTQYF,2118
14211,CASSPRGVADEQYF,2119
14212,CASSPRGVQDEQYF,2119
14213,CASGGINNEQFF,2120


In [5]:
# Input data
nr_unique = df.shape[0]
print('Size of input data: ', nr_unique)

# Nr of clustered TCRs
nr_clustered = clusters.shape[0]
print('Nr of clustered TCRs: ', nr_clustered)

# Percentage of clustered TCRs
print('Percentage of clustered TCRs: ',(nr_clustered/nr_unique)*100)

# Nr of clusters
print('Nr of clusters: ',len(set(clusters['cluster'].tolist())))

Size of input data:  42050
Nr of clustered TCRs:  14215
Percentage of clustered TCRs:  33.80499405469679
Nr of clusters:  2121


In [6]:
# Add cluster info to TCRex df
def get_cluster(clusters, cdr3):
    if cdr3 in clusters['junction_aa'].tolist():
        cluster = clusters[clusters['junction_aa']==cdr3]['cluster'].tolist()[0]
    else:
        cluster = 'NA'
    return cluster

df['cluster'] = df['junction_aa'].apply(lambda x: get_cluster(clusters,x))

In [7]:
# Export clustering results
df.to_csv('./results/tcrex_clustering/tcrex_clusters.tsv',
                index=False)

### Summary of clustering results

In [8]:
# Count occurences of cluster sizes
summary = clusters.groupby('cluster').size().reset_index().rename(columns={0:'size'})
print('Max cluster size: ', summary['size'].max())

# Count numbers per bin
bins = [int(x) for x in [1,2,3,4,5,10,15,20,50,100,200,300,400,500]]
summary = summary['size'].value_counts(bins=bins)
summary = summary.reset_index().sort_values(by='index',ascending=True)

# Make df publish ready
summary['index']=summary['index'].astype('str')
summary['index']=summary['index'].str.replace('(0.999, 2.0]','2',regex=False)
summary['index']=summary['index'].str.replace('(2.0, 3.0]','3',regex=False)
summary['index']=summary['index'].str.replace('(3.0, 4.0]','4',regex=False)
summary['index']=summary['index'].str.replace('(4.0, 5.0]','5',regex=False)
summary['index']=summary['index'].str.replace('(',']',regex=False)
summary['index']=summary['index'].str.replace('.0','', regex=False)
summary

Max cluster size:  319


Unnamed: 0,index,count
0,2,1298
1,3,335
2,4,134
5,5,59
3,"]5, 10]",109
6,"]10, 15]",40
7,"]15, 20]",28
4,"]20, 50]",65
8,"]50, 100]",27
9,"]100, 200]",22


In [9]:
# Export cluster_sizes
summary.to_csv('./results/tcrex_clustering/clusters_sizes.tsv',
                index=False)

### Cluster purity

In [10]:
# Collect all clustered CDR3s
purity = df[df['cluster']!='NA']
# Count CDR3s per cluster
size = purity.groupby(['cluster']).size().reset_index().rename(columns={0:'size'})
size = size.set_index('cluster')

# Get epitopes per cluster
purity = (purity.groupby(['cluster'])['epitope'].apply(', '.join)
           .reset_index())
purity['epitope'] = purity['epitope'].apply(
    lambda x: x if ',' not in str(x) else ','.join(set(y.strip()
                                                   for y in x.split(','))))  

# Count nr of epitopes per cluster
purity['count'] = purity['epitope'].str.count(',') +1
purity = purity.set_index('cluster')

# Concat size and purity information
purity = pd.concat([purity,size], axis=1)

# Select pure clusters
purity = purity[purity['count']==1].sort_values(by='size',ascending=False)

In [11]:
# Get epitopes per cluster size
purity_epitope = (purity.groupby(['size'])['epitope'].apply(', '.join)
           .reset_index())
purity_epitope['epitope'] = purity_epitope['epitope'].apply(
    lambda x: x if ',' not in str(x) else ','.join(set(y.strip()
                                                   for y in x.split(',')))) 

In [12]:
# Get size counts of pure clusters
purity_count = purity.groupby('size').size().reset_index().rename(columns={0:'count'})

In [13]:
pure_clusters = pd.concat([purity_epitope,purity_count],axis=1)
pure_clusters = pure_clusters.rename(columns={'size':'cluster size', 'count': 'nr of clusters', 'epitope': 'epitope specificity'})
pure_clusters[['cluster size', 'nr of clusters', 'epitope specificity']]



Unnamed: 0,cluster size,cluster size.1,nr of clusters,epitope specificity
0,2,2,586,"SFHSLHLLF,SEPVLKGVKL,NQKLIANQF,GTSGSPIVNR,LPAA..."
1,3,3,117,"KPLEFGATSAAL,SFHSLHLLF,RAKFKQLL,HPKVSSEVHI,KRW..."
2,4,4,48,"QASQEVKNW,RAKFKQLL,HPKVSSEVHI,ITEEVGHTDLMAAY,F..."
3,5,5,12,"SFHSLHLLF,HTTDPSFLGRY,FPRPWLHGL,LLWNGPMAV,NQKL..."
4,6,6,12,"ILIEGIFFV,TPRVTGGGAM,HTTDPSFLGRY,KAYNVTQAF,HPK..."
5,7,7,5,"VLWAHGFEL,NLVPMVATV,LPPIVAKEI,GLCTLVAML"
6,8,8,4,"KLPDDFTGCV,IIKDYGKQM,LPPIVAKEI,KAYNVTQAF"
7,9,9,1,GLCTLVAML
8,10,10,2,"GILGFVFTL,IPSINVHHY"
9,11,11,1,GLCTLVAML


In [14]:
# Export pure cluster results
pure_clusters.to_csv('./results/tcrex_clustering/pure_clusters.tsv',
                index=False)

In [15]:
# Nr of pure clusters
pure_clusters['nr of clusters'].sum()

796