### Cluster positive and negative training data separately

In [1]:
import os

from clustcr import Clustering
import numpy as np
import pandas as pd

# Adjust the working directory
os.chdir('path_to_your_dir')

  from .autonotebook import tqdm as notebook_tqdm


### functions

In [2]:
# Cluster all TCRs in dataframe based on the selected column
def cluster_data(data, column):
    
    # Cluster data using MCL
    clustering = Clustering(method='MCL')
    clustered_data = clustering.fit(data[column])
    
    return clustered_data


In [3]:
def parse_motifs(motifs):
    if motifs.shape[0]>0:
        # Parse motifs into simple strings
        motifs['motif'] = motifs['motif'].str.replace(r'\[[A-Z]+\]','X',regex=True)
        motifs['motif'] = motifs['motif'].str.replace(r'\.','X',regex=True)
        motifs['motif'] = motifs['motif'].str.replace(r'[a-z]','X',regex=True)
    return motifs

In [4]:
def get_overlapping_motfifs(df):
    if df.shape[0]>0:
        # Find overlapping motifs
        df = (df.groupby(['motif'])['data'].apply(', '.join)
                   .reset_index())
        df['data'] = df['data'].apply(
            lambda x: x if ',' not in str(x) else ','.join(set(y.strip()
                                                           for y in x.split(','))))    
        df = df[df['data'].str.contains(',')]
    return df

# Clustering positive and negative data



In [5]:
# Get all epitope sequences
df = pd.read_csv('./data/final/all_tcrs.tsv')
epitopes = set(df['epitope'].tolist())

shared_motifs = pd.DataFrame()

for epitope in epitopes:
 
     # Read in all training data for the epitope
    data = pd.read_csv(os.path.join('data/parsed/tcrex_data',epitope,'training_data.tsv'), sep='\t')
    
    # Retain one CDR3 beta per class
    data = data.drop_duplicates(subset=['CDR3_beta', 'Class'], keep= 'first')

    # Remove CDR3 beta sequences present in both positive and negative data
    data = data.drop_duplicates(subset=['CDR3_beta'], keep= False)
   
    # Cluster control and positive data separately
    neg = data[data['Class']==0]
    pos = data[data['Class']==1]
    pos_clusters = cluster_data(pos,'CDR3_beta')
    neg_clusters = cluster_data(neg,'CDR3_beta')

    # Define simple motifs 
    pos_motifs = pos_clusters.summary()
    pos_motifs = parse_motifs(pos_motifs)
    pos_motifs['data'] = 'positive'
    neg_motifs = neg_clusters.summary()
    neg_motifs = parse_motifs(neg_motifs)
    neg_motifs['data'] = 'control'
    final = pd.concat([pos_motifs,neg_motifs])
    
    # Identify overlapping motifs between positive and negative clusters
    overlap = get_overlapping_motfifs(final)
    
    if overlap.shape[0]>0:
        overlap['epitope'] = epitope
        shared_motifs = pd.concat([shared_motifs, overlap])
        print(epitope)
        print(overlap.shape[0])
        pos_results = pos_clusters.clusters_df
        neg_results = neg_clusters.clusters_df

        for motif in overlap['motif'].tolist():
            print('\n',motif)
            print('Positive TCRs')
            pos = pos_motifs[pos_motifs['motif'] == motif].reset_index()['index'].tolist()[0]
            print(pos_results[pos_results['cluster']==pos])
            print('Negative TCRs')
            neg = neg_motifs[neg_motifs['motif'] == motif].reset_index()['index'].tolist()[0]
            print(neg_results[neg_results['cluster']==neg])



Clustering using MCL approach.
Total time to run ClusTCR: 0.113s
Clustering using MCL approach.
Total time to run ClusTCR: 0.136s
Clustering using MCL approach.
Total time to run ClusTCR: 0.091s
Clustering using MCL approach.
Total time to run ClusTCR: 0.011s
Clustering using MCL approach.
Total time to run ClusTCR: 0.095s
Clustering using MCL approach.
Total time to run ClusTCR: 0.014s
Clustering using MCL approach.
Total time to run ClusTCR: 0.057s
Clustering using MCL approach.
Total time to run ClusTCR: 0.006s
Clustering using MCL approach.
Total time to run ClusTCR: 0.004s
Clustering using MCL approach.
Total time to run ClusTCR: 0.005s
Clustering using MCL approach.
Total time to run ClusTCR: 0.004s
Clustering using MCL approach.
Total time to run ClusTCR: 0.001s
Clustering using MCL approach.
Total time to run ClusTCR: 0.046s
Clustering using MCL approach.
Total time to run ClusTCR: 0.005s
Clustering using MCL approach.
Total time to run ClusTCR: 0.056s
Clustering using MCL appr

Total time to run ClusTCR: 0.100s
Clustering using MCL approach.
Total time to run ClusTCR: 0.088s
Clustering using MCL approach.
Total time to run ClusTCR: 0.034s
Clustering using MCL approach.
Total time to run ClusTCR: 0.005s
Clustering using MCL approach.
Total time to run ClusTCR: 0.000s
Clustering using MCL approach.
Total time to run ClusTCR: 0.005s
Clustering using MCL approach.
Total time to run ClusTCR: 0.049s
Clustering using MCL approach.
Total time to run ClusTCR: 0.010s
Clustering using MCL approach.
Total time to run ClusTCR: 0.318s
Clustering using MCL approach.
Total time to run ClusTCR: 4.612s
NLVPMVATV
5

 CASSLXGXTEAFF
Positive TCRs
       junction_aa  cluster
342  CASSLEGGTEAFF       64
343  CASSLEGYTEAFF       64
344  CASSLGGGTEAFF       64
345  CASSLGGTTEAFF       64
346  CASSGGGGTEAFF       64
347  CASSLGETTEAFF       64
Negative TCRs
      junction_aa  cluster
84  CASSLKGETEAFF       11
85  CASSLKGFTEAFF       11
86  CASSLAGPTEAFF       11
87  CASSLAGSTEAFF    

Total time to run ClusTCR: 0.744s
Clustering using MCL approach.
Total time to run ClusTCR: 1.785s
GILGFVFTL
2

 CASSPGTGXYEQYF
Positive TCRs
        junction_aa  cluster
796  CASSPGTGTYEQYF       53
797  CASSPGTGYYEQYF       53
Negative TCRs
        junction_aa  cluster
985  CASSPGTGQYEQYF      263
986  CASSPGTGVYEQYF      263

 CASSXGVGTEAFF
Positive TCRs
       junction_aa  cluster
921  CASSLGVGTEAFF      105
922  CASSYGVGTEAFF      105
Negative TCRs
        junction_aa  cluster
1863  CASSFGVGTEAFF      581
1864  CASSSGVGTEAFF      581
Clustering using MCL approach.
Total time to run ClusTCR: 0.000s
Clustering using MCL approach.
Total time to run ClusTCR: 0.005s
Clustering using MCL approach.
Total time to run ClusTCR: 0.052s
Clustering using MCL approach.
Total time to run ClusTCR: 0.004s
Clustering using MCL approach.
Total time to run ClusTCR: 0.033s
Clustering using MCL approach.
Total time to run ClusTCR: 0.005s
Clustering using MCL approach.
Total time to run ClusTCR: 0.004s


In [6]:
shared_motifs

Unnamed: 0,motif,data,epitope
520,CASSXGQGXYEQYF,"control,positive",KLSYGIATV
763,CASSXSXNTEAFF,"control,positive",FVDGVPFVV
829,CASSXYNEQFF,"control,positive",FVDGVPFVV
744,CASSLXGXTEAFF,"control,positive",NLVPMVATV
831,CASSLXXEQYF,"control,positive",NLVPMVATV
856,CASSLXXTEAFF,"control,positive",NLVPMVATV
1539,CASSXGDEQFF,"control,positive",NLVPMVATV
2055,CASSXXTDTQYF,"control,positive",NLVPMVATV
688,CASSLSXYGYTF,"control,positive",HTTDPSFLGRY
1440,CASSSXDSYEQYF,"control,positive",HTTDPSFLGRY


In [7]:
# Save shared_motifs
shared_motifs = shared_motifs[['motif', 'epitope']]
shared_motifs.to_csv('./results/background/shared_motifs.tsv', index=False)