# Cluster positive and negative data together

In [1]:
import os

from clustcr import Clustering
import numpy as np
import pandas as pd


# Adjust the working directory: replace the placeholder with the directory
# that holds the data/ and results/ folders used by the relative paths below
WORKING_DIR = 'path_to_your_dir'
os.chdir(WORKING_DIR)

  from .autonotebook import tqdm as notebook_tqdm


### Functions

In [2]:
# Assign class '2' to CDR3 beta sequences that occur with more than one class
def define_classes(data):
    """Collapse the data to one row per CDR3 beta sequence with a single class.

    Rows with an identical 'CDR3_beta' value are grouped together. A sequence
    that only ever occurs with one class keeps that class; a sequence that
    occurs with several different classes is labelled '2' (shared).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'CDR3_beta' and 'Class'. The frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'CDR3_beta' and 'Class', one row per
        distinct CDR3 beta sequence, with every class stored as a string.
    """
    # Work on a string copy of the labels so the caller's frame keeps its
    # original 'Class' column
    labels = data['Class'].astype('str')

    # Group TCRs with identical CDR3 sequences together
    per_sequence = labels.groupby(data['CDR3_beta'])

    # Keep the label of single-class sequences, flag mixed ones as '2'
    classes = per_sequence.first().mask(per_sequence.nunique() > 1, '2')

    return classes.reset_index()


In [3]:
# Cluster all TCRs in dataframe based on the selected column
def cluster_data(data, column):
    """Run ClusTCR with the MCL method on one column of ``data``.

    ``data[column]`` is passed to ``Clustering(method='MCL').fit`` and
    whatever ``fit`` returns is handed back unchanged.
    """
    sequences = data[column]
    return Clustering(method='MCL').fit(sequences)

In [4]:
def get_class(data, cdr3):
    """Return the 'Class' value of the first row whose 'CDR3_beta' equals ``cdr3``.

    Raises IndexError when no row matches.
    """
    is_match = data['CDR3_beta'] == cdr3
    matching_classes = data.loc[is_match, 'Class'].tolist()
    return matching_classes[0]

### Cluster training data per epitope

In [5]:
# Get all epitope sequences
# NOTE(review): this .tsv file is parsed with the default comma separator,
# whereas training_data.tsv is read with sep='\t' - confirm its delimiter
df = pd.read_csv('./data/final/all_tcrs.tsv')
# Sorted so that the epitopes are visited in the same order on every run; the
# iteration order of a plain set of strings changes between kernel sessions
epitopes = sorted(set(df['epitope'].tolist()))

In [6]:
# Per-epitope tally: 'nr' is the number of clusters that mix positive and
# negative TCRs, 'total' is the number of clusters found for that epitope
shared_clusters = {}

# Loop over every epitope
for epitope in epitopes:

    # Read in all training data for the epitope
    training_file = os.path.join('data/parsed/tcrex_data', epitope,
                                 'training_data.tsv')
    data = pd.read_csv(training_file, sep='\t')

    # Define three classes: positive 1, negative 0, shared 2
    data = define_classes(data)

    # Remove shared CDR3 beta sequences
    data = data[data['Class'] != '2']

    # Cluster training data for selected epitope
    clustered_data = cluster_data(data, 'CDR3_beta')

    # Add class info to a copy of the cluster table. A dict lookup replaces a
    # full scan of `data` per clustered sequence, and `assign` leaves the
    # clusters_df stored on the clustering result untouched.
    class_by_cdr3 = dict(zip(data['CDR3_beta'], data['Class']))
    clusters = clustered_data.clusters_df.assign(
        Class=lambda table: table['junction_aa'].map(class_by_cdr3))

    # Ids of the clusters holding at least one negative / one positive TCR
    negative_clusters = set(clusters.loc[clusters['Class'] == '0', 'cluster'])
    positive_clusters = set(clusters.loc[clusters['Class'] == '1', 'cluster'])

    # Store nr of clusters sharing positive and negative TCRs next to the
    # total nr of clusters
    shared_clusters[epitope] = {
        'nr': len(negative_clusters & positive_clusters),
        'total': clusters['cluster'].nunique(),
    }

    
    

Clustering using MCL approach.
Total time to run ClusTCR: 0.009s
Clustering using MCL approach.
Total time to run ClusTCR: 0.061s
Clustering using MCL approach.
Total time to run ClusTCR: 0.013s
Clustering using MCL approach.
Total time to run ClusTCR: 0.007s
Clustering using MCL approach.
Total time to run ClusTCR: 0.033s
Clustering using MCL approach.
Total time to run ClusTCR: 1.659s
Clustering using MCL approach.
Total time to run ClusTCR: 0.100s
Clustering using MCL approach.
Total time to run ClusTCR: 0.004s
Clustering using MCL approach.
Total time to run ClusTCR: 0.085s
Clustering using MCL approach.
Total time to run ClusTCR: 0.103s
Clustering using MCL approach.
Total time to run ClusTCR: 0.064s
Clustering using MCL approach.
Total time to run ClusTCR: 0.059s
Clustering using MCL approach.
Total time to run ClusTCR: 0.005s
Clustering using MCL approach.
Total time to run ClusTCR: 0.207s
Clustering using MCL approach.
Total time to run ClusTCR: 1.509s
Clustering using MCL appr

In [7]:
# One row per epitope, ordered from the highest to the lowest percentage of
# clusters that mix positive and negative TCRs
results = (
    pd.DataFrame(shared_clusters)
    .transpose()
    .assign(percentage=lambda table: (table['nr'] / table['total']) * 100)
    .sort_values(by='percentage', ascending=False)
)
results

Unnamed: 0,nr,total,percentage
NLSALGIFST,5,9,55.555556
VLAWLYAAV,3,6,50.000000
LLMPILTLT,1,2,50.000000
FTISVTTEIL,5,13,38.461538
SSNVANYQK,2,6,33.333333
...,...,...,...
RPRGEVRFL,0,3,0.000000
NLDSKVGGNY,0,3,0.000000
KEIDRLNEV,0,4,0.000000
KRWIIMGLNK,0,15,0.000000


In [8]:
# Fraction (0-1, not a percentage) of all clusters, pooled over every epitope,
# that contain a mix of both positive and negative data
n_mixed = results['nr'].sum()
n_clusters = results['total'].sum()
n_mixed / n_clusters

0.1924344363586434

In [9]:
# Save table
# NOTE(review): to_csv is called without `sep`, so the file is written
# comma-separated despite its .tsv extension - confirm what readers expect
output_file = './results/background/shared_clusters.tsv'
results.to_csv(output_file)