# TCR Data treatment
This notebook imports the concatenated TCR+phenotypic dataset and generates the data for the clonality analysis. The steps are the following:
1. New patient ID: A new patient ID that better organizes the patient characteristics is generated.
2. $\alpha\beta$ and $\gamma\delta$ productive columns: A column containing the sum of productives of all the alleles of $\alpha\beta$ chains is generated. The same for the $\gamma\delta$ chains.
3. CDR3 chain concatenation: All the possible combinations of $\alpha$ and $\beta$ chains are generated by concatenating the corresponding allele chains, i.e., A1_B1, A1_B2, A2_B1, A2_B2, both for amino acids and nucleotides. If either of the two chain alleles is NaN, the whole concatenated chain is NaN. The same is done for the $\gamma\delta$ chains.
4. Clone definition: Generates the clones for $\alpha\beta$ and $\gamma\delta$ as unordered sets of sequences. Each cell is assigned a clone (set) that contains the valid (non-NaN) sequences for the corresponding chains. Identical clones are grouped together and assigned a group number. The number of clones in each group is calculated and added as a frequency column. 
**Author: Juan Sebastian Diaz Boada**<br>
*juan.sebastian.diaz.boada@ki.se*<br>
22/03/2022<br>
Environment: `TCR_python`<br>

In [1]:
import os
import sys
import numpy as np
import pandas as pd
from itertools import product

In [2]:
# Make the sibling ``bin`` directory importable (presumably where the
# ``data_functions`` module imported below lives -- confirm).
module_path = os.path.abspath(os.path.join(os.pardir, 'bin'))
if sys.path.count(module_path) == 0:
    # Only register the directory once, so re-running the cell is harmless.
    sys.path.append(module_path)

In [3]:
from data_functions import *

### Settings

In [4]:
# Do not truncate wide tables: show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

In [5]:
# Load the collected TCR table for plate SS3_21_231 using the project helper
# ``read_dataframe`` (brought in by the star import from ``data_functions``).
DF = read_dataframe('../data/05_SS3_collected_TCRs/SS3_21_231/SS3_21_231.tsv')

# Merge

In [None]:
# Load the two tables that are merged in the next cell.
# NOTE(review): the two paths differ only in the directory name ('placa_pepit'
# vs 'placa_pepita') and end in the same file name. The first one looks like a
# typo -- confirm which two plates are meant to be merged before running this.
DF_1 = read_dataframe('../data/05_SS3_collected_TCRs/placa_pepit/placa_pepita.tsv')
DF_2 = read_dataframe('../data/05_SS3_collected_TCRs/placa_pepita/placa_pepita.tsv')

In [None]:
# Stack the two tables row-wise (axis=0). This rebinds DF, so it replaces any
# table previously stored under that name.
DF = pd.concat([DF_1,DF_2],axis=0)

# Clonality

In [6]:
# Display the loaded table for a quick visual check.
DF

Unnamed: 0,A_1_productive,A_1_TPM,A_1_stop_codon,A_1_in_frame,A_1_ID,A_1_CDR3nt,A_1_CDR3aa,A_1_V,A_1_J,A_2_productive,A_2_TPM,A_2_stop_codon,A_2_in_frame,A_2_ID,A_2_CDR3nt,A_2_CDR3aa,A_2_V,A_2_J,B_1_productive,B_1_TPM,B_1_stop_codon,B_1_in_frame,B_1_ID,B_1_CDR3nt,B_1_CDR3aa,B_1_V,B_1_J,B_1_D,B_2_productive,B_2_TPM,B_2_stop_codon,B_2_in_frame,B_2_ID,B_2_CDR3nt,B_2_CDR3aa,B_2_V,B_2_J,B_2_D,G_1_productive,G_1_TPM,G_1_stop_codon,G_1_in_frame,G_1_ID,G_1_CDR3nt,G_1_CDR3aa,G_1_V,G_1_J,D_1_productive,D_1_TPM,D_1_stop_codon,D_1_in_frame,D_1_ID,D_1_CDR3nt,D_1_CDR3aa,D_1_V,D_1_J,D_1_D,G_2_productive,G_2_TPM,G_2_stop_codon,G_2_in_frame,G_2_ID,G_2_CDR3nt,G_2_CDR3aa,G_2_V,G_2_J
Plate231_PB_M24,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Plate231_MUSL_E13,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Plate231_PB_I14,True,182.962,False,True,TRAV4_GGTGAAAGGAACCA_TRAJ23,CTCGTGGGTGAAAGGAACCAGGGAGGAAAGCTTATC,LVGERNQGGKLI,TRAV4*01,TRAJ23*01,False,199.233,False,False,TRAV8-4_CTGTGTATACACG_TRAJ5,gctgtgtatacacgggcaggagagcacttact,avytragehll,"TRAV8-4*01,TRAV8-4*04,TRAV8-4*05",TRAJ5*01,True,1895.490,False,True,TRBV3-1_CAGCCGGGACAGCAA_TRBJ1-5,GCCAGCAGCCGGGACAGCAATCAGCCCCAGCAT,ASSRDSNQPQH,TRBV3-1*01,TRBJ1-5*01,"TRBD1*01,TRBD2*01,TRBD2*02",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Plate231_PB_M5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Plate231_MUSL_A23,True,163.792,False,True,TRAV17_TACGGTCCCAACTGG_TRAJ9,GCTACGGTCCCAACTGGAGGCTTCAAAACTATC,ATVPTGGFKTI,TRAV17*01,TRAJ9*01,,,,,,,,,,True,629.671,False,True,TRBV3-1_CAAGATGCGGACAGGGACGGCTA_TRBJ1-2,GCCAGCAGCCAAGATGCGGACAGGGACGGCTACACC,ASSQDADRDGYT,TRBV3-1*01,TRBJ1-2*01,TRBD1*01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Plate231_MUSL_E23,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Plate231_PB_L13,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Plate231_MUSL_D24,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Plate231_PB_I8,True,362.557,False,True,TRAV5_TGCAGCCCGCTCT_TRAJ37,GCAGCCCGCTCTAGCAACACAGGCAAACTAATC,AARSSNTGKLI,TRAV5*01,TRAJ37*02,,,,,,,,,,True,1674.770,False,True,TRBV7-6_CTTAGGGAACGGGGCCTTGAAC_TRBJ1-1,GCCAGCAGCTTAGGGAACGGGGCCTTGAACACTGAAGCTTTC,ASSLGNGALNTEAF,TRBV7-6*01,TRBJ1-1*01,"TRBD1*01,TRBD2*01",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [7]:
# Work on a copy so that DF keeps the data exactly as loaded.
df = DF.copy()

In [8]:
# List the column labels of the working copy.
df.columns

Index(['A_1_productive', 'A_1_TPM', 'A_1_stop_codon', 'A_1_in_frame', 'A_1_ID',
       'A_1_CDR3nt', 'A_1_CDR3aa', 'A_1_V', 'A_1_J', 'A_2_productive',
       'A_2_TPM', 'A_2_stop_codon', 'A_2_in_frame', 'A_2_ID', 'A_2_CDR3nt',
       'A_2_CDR3aa', 'A_2_V', 'A_2_J', 'B_1_productive', 'B_1_TPM',
       'B_1_stop_codon', 'B_1_in_frame', 'B_1_ID', 'B_1_CDR3nt', 'B_1_CDR3aa',
       'B_1_V', 'B_1_J', 'B_1_D', 'B_2_productive', 'B_2_TPM',
       'B_2_stop_codon', 'B_2_in_frame', 'B_2_ID', 'B_2_CDR3nt', 'B_2_CDR3aa',
       'B_2_V', 'B_2_J', 'B_2_D', 'G_1_productive', 'G_1_TPM',
       'G_1_stop_codon', 'G_1_in_frame', 'G_1_ID', 'G_1_CDR3nt', 'G_1_CDR3aa',
       'G_1_V', 'G_1_J', 'D_1_productive', 'D_1_TPM', 'D_1_stop_codon',
       'D_1_in_frame', 'D_1_ID', 'D_1_CDR3nt', 'D_1_CDR3aa', 'D_1_V', 'D_1_J',
       'D_1_D', 'G_2_productive', 'G_2_TPM', 'G_2_stop_codon', 'G_2_in_frame',
       'G_2_ID', 'G_2_CDR3nt', 'G_2_CDR3aa', 'G_2_V', 'G_2_J'],
      dtype='object')

# Data treating

In [9]:
# Encode the flag columns (productive / stop_codon / in_frame) as 0/1 integers,
# treating missing values as 0.
cols = df.columns[df.columns.str.endswith('productive')
                  | df.columns.str.endswith('stop_codon')
                  | df.columns.str.endswith('in_frame')]
for col in cols:
    # Replace the whole column with ``df[col] = ...`` instead of
    # ``df.loc[:, col] = ...``: on pandas >= 2.0 the ``.loc`` form writes the
    # new values into the existing column in place, so a column that was loaded
    # with object dtype would stay object and the ``astype(int)`` result would
    # be silently lost.
    df[col] = df[col].fillna(0).astype(int)
df

Unnamed: 0,A_1_productive,A_1_TPM,A_1_stop_codon,A_1_in_frame,A_1_ID,A_1_CDR3nt,A_1_CDR3aa,A_1_V,A_1_J,A_2_productive,A_2_TPM,A_2_stop_codon,A_2_in_frame,A_2_ID,A_2_CDR3nt,A_2_CDR3aa,A_2_V,A_2_J,B_1_productive,B_1_TPM,B_1_stop_codon,B_1_in_frame,B_1_ID,B_1_CDR3nt,B_1_CDR3aa,B_1_V,B_1_J,B_1_D,B_2_productive,B_2_TPM,B_2_stop_codon,B_2_in_frame,B_2_ID,B_2_CDR3nt,B_2_CDR3aa,B_2_V,B_2_J,B_2_D,G_1_productive,G_1_TPM,G_1_stop_codon,G_1_in_frame,G_1_ID,G_1_CDR3nt,G_1_CDR3aa,G_1_V,G_1_J,D_1_productive,D_1_TPM,D_1_stop_codon,D_1_in_frame,D_1_ID,D_1_CDR3nt,D_1_CDR3aa,D_1_V,D_1_J,D_1_D,G_2_productive,G_2_TPM,G_2_stop_codon,G_2_in_frame,G_2_ID,G_2_CDR3nt,G_2_CDR3aa,G_2_V,G_2_J
Plate231_PB_M24,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,
Plate231_MUSL_E13,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,
Plate231_PB_I14,1,182.962,0,1,TRAV4_GGTGAAAGGAACCA_TRAJ23,CTCGTGGGTGAAAGGAACCAGGGAGGAAAGCTTATC,LVGERNQGGKLI,TRAV4*01,TRAJ23*01,0,199.233,0,0,TRAV8-4_CTGTGTATACACG_TRAJ5,gctgtgtatacacgggcaggagagcacttact,avytragehll,"TRAV8-4*01,TRAV8-4*04,TRAV8-4*05",TRAJ5*01,1,1895.490,0,1,TRBV3-1_CAGCCGGGACAGCAA_TRBJ1-5,GCCAGCAGCCGGGACAGCAATCAGCCCCAGCAT,ASSRDSNQPQH,TRBV3-1*01,TRBJ1-5*01,"TRBD1*01,TRBD2*01,TRBD2*02",0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,
Plate231_PB_M5,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,
Plate231_MUSL_A23,1,163.792,0,1,TRAV17_TACGGTCCCAACTGG_TRAJ9,GCTACGGTCCCAACTGGAGGCTTCAAAACTATC,ATVPTGGFKTI,TRAV17*01,TRAJ9*01,0,,0,0,,,,,,1,629.671,0,1,TRBV3-1_CAAGATGCGGACAGGGACGGCTA_TRBJ1-2,GCCAGCAGCCAAGATGCGGACAGGGACGGCTACACC,ASSQDADRDGYT,TRBV3-1*01,TRBJ1-2*01,TRBD1*01,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Plate231_MUSL_E23,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,
Plate231_PB_L13,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,
Plate231_MUSL_D24,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,
Plate231_PB_I8,1,362.557,0,1,TRAV5_TGCAGCCCGCTCT_TRAJ37,GCAGCCCGCTCTAGCAACACAGGCAAACTAATC,AARSSNTGKLI,TRAV5*01,TRAJ37*02,0,,0,0,,,,,,1,1674.770,0,1,TRBV7-6_CTTAGGGAACGGGGCCTTGAAC_TRBJ1-1,GCCAGCAGCTTAGGGAACGGGGCCTTGAACACTGAAGCTTTC,ASSLGNGALNTEAF,TRBV7-6*01,TRBJ1-1*01,"TRBD1*01,TRBD2*01",0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,


## Fill missing data

In [10]:
# Make sure every locus/allele pair has its full set of columns, appending
# placeholder columns for any pair that is entirely absent from the table.
loci = ['A_1','A_2','B_1','B_2','G_1','G_2','D_1','D_2']
# (column suffix, placeholder value) in the order the columns are appended.
placeholder_fields = [
    ('_productive', 0),
    ('_TPM', np.nan),
    ('_stop_codon', 0),
    ('_in_frame', 0),
    ('_ID', np.nan),
    ('_CDR3nt', np.nan),
    ('_CDR3aa', np.nan),
    ('_V', np.nan),
    ('_J', np.nan),
]
# Only these loci get an additional ``_D`` column.
loci_with_d_column = ('B_1', 'B_2', 'D_1', 'D_2')
for locus in loci:
    # A locus counts as present as soon as any column name contains its tag.
    if df.columns.str.contains(locus).any():
        continue
    for field_suffix, placeholder in placeholder_fields:
        df.insert(len(df.columns), locus + field_suffix, placeholder)
    if locus in loci_with_d_column:
        df.insert(len(df.columns), locus + '_D', np.nan)

In [11]:
# Display df to check the placeholder columns added for missing loci.
df

Unnamed: 0,A_1_productive,A_1_TPM,A_1_stop_codon,A_1_in_frame,A_1_ID,A_1_CDR3nt,A_1_CDR3aa,A_1_V,A_1_J,A_2_productive,A_2_TPM,A_2_stop_codon,A_2_in_frame,A_2_ID,A_2_CDR3nt,A_2_CDR3aa,A_2_V,A_2_J,B_1_productive,B_1_TPM,B_1_stop_codon,B_1_in_frame,B_1_ID,B_1_CDR3nt,B_1_CDR3aa,B_1_V,B_1_J,B_1_D,B_2_productive,B_2_TPM,B_2_stop_codon,B_2_in_frame,B_2_ID,B_2_CDR3nt,B_2_CDR3aa,B_2_V,B_2_J,B_2_D,G_1_productive,G_1_TPM,G_1_stop_codon,G_1_in_frame,G_1_ID,G_1_CDR3nt,G_1_CDR3aa,G_1_V,G_1_J,D_1_productive,D_1_TPM,D_1_stop_codon,D_1_in_frame,D_1_ID,D_1_CDR3nt,D_1_CDR3aa,D_1_V,D_1_J,D_1_D,G_2_productive,G_2_TPM,G_2_stop_codon,G_2_in_frame,G_2_ID,G_2_CDR3nt,G_2_CDR3aa,G_2_V,G_2_J,D_2_productive,D_2_TPM,D_2_stop_codon,D_2_in_frame,D_2_ID,D_2_CDR3nt,D_2_CDR3aa,D_2_V,D_2_J,D_2_D
Plate231_PB_M24,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,
Plate231_MUSL_E13,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,
Plate231_PB_I14,1,182.962,0,1,TRAV4_GGTGAAAGGAACCA_TRAJ23,CTCGTGGGTGAAAGGAACCAGGGAGGAAAGCTTATC,LVGERNQGGKLI,TRAV4*01,TRAJ23*01,0,199.233,0,0,TRAV8-4_CTGTGTATACACG_TRAJ5,gctgtgtatacacgggcaggagagcacttact,avytragehll,"TRAV8-4*01,TRAV8-4*04,TRAV8-4*05",TRAJ5*01,1,1895.490,0,1,TRBV3-1_CAGCCGGGACAGCAA_TRBJ1-5,GCCAGCAGCCGGGACAGCAATCAGCCCCAGCAT,ASSRDSNQPQH,TRBV3-1*01,TRBJ1-5*01,"TRBD1*01,TRBD2*01,TRBD2*02",0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,
Plate231_PB_M5,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,
Plate231_MUSL_A23,1,163.792,0,1,TRAV17_TACGGTCCCAACTGG_TRAJ9,GCTACGGTCCCAACTGGAGGCTTCAAAACTATC,ATVPTGGFKTI,TRAV17*01,TRAJ9*01,0,,0,0,,,,,,1,629.671,0,1,TRBV3-1_CAAGATGCGGACAGGGACGGCTA_TRBJ1-2,GCCAGCAGCCAAGATGCGGACAGGGACGGCTACACC,ASSQDADRDGYT,TRBV3-1*01,TRBJ1-2*01,TRBD1*01,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Plate231_MUSL_E23,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,
Plate231_PB_L13,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,
Plate231_MUSL_D24,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,
Plate231_PB_I8,1,362.557,0,1,TRAV5_TGCAGCCCGCTCT_TRAJ37,GCAGCCCGCTCTAGCAACACAGGCAAACTAATC,AARSSNTGKLI,TRAV5*01,TRAJ37*02,0,,0,0,,,,,,1,1674.770,0,1,TRBV7-6_CTTAGGGAACGGGGCCTTGAAC_TRBJ1-1,GCCAGCAGCTTAGGGAACGGGGCCTTGAACACTGAAGCTTTC,ASSLGNGALNTEAF,TRBV7-6*01,TRBJ1-1*01,"TRBD1*01,TRBD2*01",0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,


### 1.3 Reorder columns
The order of the columns is based on the productive sequences that TraCeR `assemble` finds first. This is changed to follow the loci order (A, B, G, D) and allele order (1, 2) within a locus.

In [12]:
# Rebuild df with its columns grouped by locus/allele in the order given by
# ``loci``; within each group the existing relative column order is kept.
# Columns whose name starts with none of the tags are not carried over.
loci = ['A_1','A_2','B_1','B_2','G_1','G_2','D_1','D_2']
new_cols = [
    column
    for locus in loci
    for column in df.columns[df.columns.str.startswith(locus)]
]
df = df[new_cols]
df

Unnamed: 0,A_1_productive,A_1_TPM,A_1_stop_codon,A_1_in_frame,A_1_ID,A_1_CDR3nt,A_1_CDR3aa,A_1_V,A_1_J,A_2_productive,A_2_TPM,A_2_stop_codon,A_2_in_frame,A_2_ID,A_2_CDR3nt,A_2_CDR3aa,A_2_V,A_2_J,B_1_productive,B_1_TPM,B_1_stop_codon,B_1_in_frame,B_1_ID,B_1_CDR3nt,B_1_CDR3aa,B_1_V,B_1_J,B_1_D,B_2_productive,B_2_TPM,B_2_stop_codon,B_2_in_frame,B_2_ID,B_2_CDR3nt,B_2_CDR3aa,B_2_V,B_2_J,B_2_D,G_1_productive,G_1_TPM,G_1_stop_codon,G_1_in_frame,G_1_ID,G_1_CDR3nt,G_1_CDR3aa,G_1_V,G_1_J,G_2_productive,G_2_TPM,G_2_stop_codon,G_2_in_frame,G_2_ID,G_2_CDR3nt,G_2_CDR3aa,G_2_V,G_2_J,D_1_productive,D_1_TPM,D_1_stop_codon,D_1_in_frame,D_1_ID,D_1_CDR3nt,D_1_CDR3aa,D_1_V,D_1_J,D_1_D,D_2_productive,D_2_TPM,D_2_stop_codon,D_2_in_frame,D_2_ID,D_2_CDR3nt,D_2_CDR3aa,D_2_V,D_2_J,D_2_D
Plate231_PB_M24,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,
Plate231_MUSL_E13,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,
Plate231_PB_I14,1,182.962,0,1,TRAV4_GGTGAAAGGAACCA_TRAJ23,CTCGTGGGTGAAAGGAACCAGGGAGGAAAGCTTATC,LVGERNQGGKLI,TRAV4*01,TRAJ23*01,0,199.233,0,0,TRAV8-4_CTGTGTATACACG_TRAJ5,gctgtgtatacacgggcaggagagcacttact,avytragehll,"TRAV8-4*01,TRAV8-4*04,TRAV8-4*05",TRAJ5*01,1,1895.490,0,1,TRBV3-1_CAGCCGGGACAGCAA_TRBJ1-5,GCCAGCAGCCGGGACAGCAATCAGCCCCAGCAT,ASSRDSNQPQH,TRBV3-1*01,TRBJ1-5*01,"TRBD1*01,TRBD2*01,TRBD2*02",0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,
Plate231_PB_M5,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,
Plate231_MUSL_A23,1,163.792,0,1,TRAV17_TACGGTCCCAACTGG_TRAJ9,GCTACGGTCCCAACTGGAGGCTTCAAAACTATC,ATVPTGGFKTI,TRAV17*01,TRAJ9*01,0,,0,0,,,,,,1,629.671,0,1,TRBV3-1_CAAGATGCGGACAGGGACGGCTA_TRBJ1-2,GCCAGCAGCCAAGATGCGGACAGGGACGGCTACACC,ASSQDADRDGYT,TRBV3-1*01,TRBJ1-2*01,TRBD1*01,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Plate231_MUSL_E23,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,
Plate231_PB_L13,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,
Plate231_MUSL_D24,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,
Plate231_PB_I8,1,362.557,0,1,TRAV5_TGCAGCCCGCTCT_TRAJ37,GCAGCCCGCTCTAGCAACACAGGCAAACTAATC,AARSSNTGKLI,TRAV5*01,TRAJ37*02,0,,0,0,,,,,,1,1674.770,0,1,TRBV7-6_CTTAGGGAACGGGGCCTTGAAC_TRBJ1-1,GCCAGCAGCTTAGGGAACGGGGCCTTGAACACTGAAGCTTTC,ASSLGNGALNTEAF,TRBV7-6*01,TRBJ1-1*01,"TRBD1*01,TRBD2*01",0,,0,0,,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,0,,0,0,,,,,,,0,,0,0,,,,,,


# $\alpha\beta$ and $\gamma\delta$ productive columns
Create columns of total $\alpha\beta$ and $\gamma\delta$ productives, summing up all their productive values per cell.

In [13]:
# Productive dataframe: every column of df whose name ends in 'productive'.
productive_columns = [name for name in df.columns if name.endswith('productive')]
P = df[productive_columns]
P

Unnamed: 0,A_1_productive,A_2_productive,B_1_productive,B_2_productive,G_1_productive,G_2_productive,D_1_productive,D_2_productive
Plate231_PB_M24,0,0,0,0,0,0,0,0
Plate231_MUSL_E13,0,0,0,0,0,0,0,0
Plate231_PB_I14,1,0,1,0,0,0,0,0
Plate231_PB_M5,0,0,0,0,0,0,0,0
Plate231_MUSL_A23,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
Plate231_MUSL_E23,0,0,0,0,0,0,0,0
Plate231_PB_L13,0,0,0,0,0,0,0,0
Plate231_MUSL_D24,0,0,0,0,0,0,0,0
Plate231_PB_I8,1,0,1,0,0,0,0,0


In [14]:
# Add one '<locus>_productive' column per locus holding the row-wise sum of that
# locus' per-allele productive flags, placed right after the locus' last column.
loci = ['A','B','G','D']
# Suffix of the last per-allele column of each locus (A/G end in _2_J, B/D in _2_D).
closing_suffix = {'A': '_2_J', 'G': '_2_J', 'B': '_2_D', 'D': '_2_D'}
for locus in loci:
    anchor = int(np.flatnonzero(df.columns == locus + closing_suffix[locus])[0])
    locus_total = P.loc[:, P.columns.str.startswith(locus)].sum(axis=1)
    df.insert(anchor + 1, locus + '_productive', locus_total)
df

Unnamed: 0,A_1_productive,A_1_TPM,A_1_stop_codon,A_1_in_frame,A_1_ID,A_1_CDR3nt,A_1_CDR3aa,A_1_V,A_1_J,A_2_productive,A_2_TPM,A_2_stop_codon,A_2_in_frame,A_2_ID,A_2_CDR3nt,A_2_CDR3aa,A_2_V,A_2_J,A_productive,B_1_productive,B_1_TPM,B_1_stop_codon,B_1_in_frame,B_1_ID,B_1_CDR3nt,B_1_CDR3aa,B_1_V,B_1_J,B_1_D,B_2_productive,B_2_TPM,B_2_stop_codon,B_2_in_frame,B_2_ID,B_2_CDR3nt,B_2_CDR3aa,B_2_V,B_2_J,B_2_D,B_productive,G_1_productive,G_1_TPM,G_1_stop_codon,G_1_in_frame,G_1_ID,G_1_CDR3nt,G_1_CDR3aa,G_1_V,G_1_J,G_2_productive,G_2_TPM,G_2_stop_codon,G_2_in_frame,G_2_ID,G_2_CDR3nt,G_2_CDR3aa,G_2_V,G_2_J,G_productive,D_1_productive,D_1_TPM,D_1_stop_codon,D_1_in_frame,D_1_ID,D_1_CDR3nt,D_1_CDR3aa,D_1_V,D_1_J,D_1_D,D_2_productive,D_2_TPM,D_2_stop_codon,D_2_in_frame,D_2_ID,D_2_CDR3nt,D_2_CDR3aa,D_2_V,D_2_J,D_2_D,D_productive
Plate231_PB_M24,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0
Plate231_MUSL_E13,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0
Plate231_PB_I14,1,182.962,0,1,TRAV4_GGTGAAAGGAACCA_TRAJ23,CTCGTGGGTGAAAGGAACCAGGGAGGAAAGCTTATC,LVGERNQGGKLI,TRAV4*01,TRAJ23*01,0,199.233,0,0,TRAV8-4_CTGTGTATACACG_TRAJ5,gctgtgtatacacgggcaggagagcacttact,avytragehll,"TRAV8-4*01,TRAV8-4*04,TRAV8-4*05",TRAJ5*01,1,1,1895.490,0,1,TRBV3-1_CAGCCGGGACAGCAA_TRBJ1-5,GCCAGCAGCCGGGACAGCAATCAGCCCCAGCAT,ASSRDSNQPQH,TRBV3-1*01,TRBJ1-5*01,"TRBD1*01,TRBD2*01,TRBD2*02",0,,0,0,,,,,,,1,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0
Plate231_PB_M5,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0
Plate231_MUSL_A23,1,163.792,0,1,TRAV17_TACGGTCCCAACTGG_TRAJ9,GCTACGGTCCCAACTGGAGGCTTCAAAACTATC,ATVPTGGFKTI,TRAV17*01,TRAJ9*01,0,,0,0,,,,,,1,1,629.671,0,1,TRBV3-1_CAAGATGCGGACAGGGACGGCTA_TRBJ1-2,GCCAGCAGCCAAGATGCGGACAGGGACGGCTACACC,ASSQDADRDGYT,TRBV3-1*01,TRBJ1-2*01,TRBD1*01,0,,0,0,,,,,,,1,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Plate231_MUSL_E23,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0
Plate231_PB_L13,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0
Plate231_MUSL_D24,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0
Plate231_PB_I8,1,362.557,0,1,TRAV5_TGCAGCCCGCTCT_TRAJ37,GCAGCCCGCTCTAGCAACACAGGCAAACTAATC,AARSSNTGKLI,TRAV5*01,TRAJ37*02,0,,0,0,,,,,,1,1,1674.770,0,1,TRBV7-6_CTTAGGGAACGGGGCCTTGAAC_TRBJ1-1,GCCAGCAGCTTAGGGAACGGGGCCTTGAACACTGAAGCTTTC,ASSLGNGALNTEAF,TRBV7-6*01,TRBJ1-1*01,"TRBD1*01,TRBD2*01",0,,0,0,,,,,,,1,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0


In [15]:
# Total productive chains per cell for the alpha-beta and gamma-delta pairs.
# The positional slices rely on P holding the A and B columns in positions 0-3
# and the G and D columns in positions 4-7.
ab_total = P.iloc[:, :4].sum(axis=1)
gd_total = P.iloc[:, 4:8].sum(axis=1)
# AB_productive goes right after B_productive; GD_productive goes last.
after_b_productive = int(np.flatnonzero(df.columns == 'B_productive')[0]) + 1
df.insert(after_b_productive, 'AB_productive', ab_total)
df.insert(len(df.columns), 'GD_productive', gd_total)
df

Unnamed: 0,A_1_productive,A_1_TPM,A_1_stop_codon,A_1_in_frame,A_1_ID,A_1_CDR3nt,A_1_CDR3aa,A_1_V,A_1_J,A_2_productive,A_2_TPM,A_2_stop_codon,A_2_in_frame,A_2_ID,A_2_CDR3nt,A_2_CDR3aa,A_2_V,A_2_J,A_productive,B_1_productive,B_1_TPM,B_1_stop_codon,B_1_in_frame,B_1_ID,B_1_CDR3nt,B_1_CDR3aa,B_1_V,B_1_J,B_1_D,B_2_productive,B_2_TPM,B_2_stop_codon,B_2_in_frame,B_2_ID,B_2_CDR3nt,B_2_CDR3aa,B_2_V,B_2_J,B_2_D,B_productive,AB_productive,G_1_productive,G_1_TPM,G_1_stop_codon,G_1_in_frame,G_1_ID,G_1_CDR3nt,G_1_CDR3aa,G_1_V,G_1_J,G_2_productive,G_2_TPM,G_2_stop_codon,G_2_in_frame,G_2_ID,G_2_CDR3nt,G_2_CDR3aa,G_2_V,G_2_J,G_productive,D_1_productive,D_1_TPM,D_1_stop_codon,D_1_in_frame,D_1_ID,D_1_CDR3nt,D_1_CDR3aa,D_1_V,D_1_J,D_1_D,D_2_productive,D_2_TPM,D_2_stop_codon,D_2_in_frame,D_2_ID,D_2_CDR3nt,D_2_CDR3aa,D_2_V,D_2_J,D_2_D,D_productive,GD_productive
Plate231_PB_M24,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_MUSL_E13,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_PB_I14,1,182.962,0,1,TRAV4_GGTGAAAGGAACCA_TRAJ23,CTCGTGGGTGAAAGGAACCAGGGAGGAAAGCTTATC,LVGERNQGGKLI,TRAV4*01,TRAJ23*01,0,199.233,0,0,TRAV8-4_CTGTGTATACACG_TRAJ5,gctgtgtatacacgggcaggagagcacttact,avytragehll,"TRAV8-4*01,TRAV8-4*04,TRAV8-4*05",TRAJ5*01,1,1,1895.490,0,1,TRBV3-1_CAGCCGGGACAGCAA_TRBJ1-5,GCCAGCAGCCGGGACAGCAATCAGCCCCAGCAT,ASSRDSNQPQH,TRBV3-1*01,TRBJ1-5*01,"TRBD1*01,TRBD2*01,TRBD2*02",0,,0,0,,,,,,,1,2,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_PB_M5,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_MUSL_A23,1,163.792,0,1,TRAV17_TACGGTCCCAACTGG_TRAJ9,GCTACGGTCCCAACTGGAGGCTTCAAAACTATC,ATVPTGGFKTI,TRAV17*01,TRAJ9*01,0,,0,0,,,,,,1,1,629.671,0,1,TRBV3-1_CAAGATGCGGACAGGGACGGCTA_TRBJ1-2,GCCAGCAGCCAAGATGCGGACAGGGACGGCTACACC,ASSQDADRDGYT,TRBV3-1*01,TRBJ1-2*01,TRBD1*01,0,,0,0,,,,,,,1,2,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Plate231_MUSL_E23,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_PB_L13,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_MUSL_D24,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_PB_I8,1,362.557,0,1,TRAV5_TGCAGCCCGCTCT_TRAJ37,GCAGCCCGCTCTAGCAACACAGGCAAACTAATC,AARSSNTGKLI,TRAV5*01,TRAJ37*02,0,,0,0,,,,,,1,1,1674.770,0,1,TRBV7-6_CTTAGGGAACGGGGCCTTGAAC_TRBJ1-1,GCCAGCAGCTTAGGGAACGGGGCCTTGAACACTGAAGCTTTC,ASSLGNGALNTEAF,TRBV7-6*01,TRBJ1-1*01,"TRBD1*01,TRBD2*01",0,,0,0,,,,,,,1,2,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0


# 3. CDR3 chain concatenation
Create columns of concatenated $\alpha-\beta$ and $\gamma-\delta$ chain combinations.

In [16]:
# CDR3 dataframes
CDR3nt = df.loc[:,df.columns.str.endswith('CDR3nt')]
CDR3aa = df.loc[:,df.columns.str.endswith('CDR3aa')]

In [17]:
# Display the amino-acid CDR3 columns.
CDR3aa

Unnamed: 0,A_1_CDR3aa,A_2_CDR3aa,B_1_CDR3aa,B_2_CDR3aa,G_1_CDR3aa,G_2_CDR3aa,D_1_CDR3aa,D_2_CDR3aa
Plate231_PB_M24,,,,,,,,
Plate231_MUSL_E13,,,,,,,,
Plate231_PB_I14,LVGERNQGGKLI,avytragehll,ASSRDSNQPQH,,,,,
Plate231_PB_M5,,,,,,,,
Plate231_MUSL_A23,ATVPTGGFKTI,,ASSQDADRDGYT,,,,,
...,...,...,...,...,...,...,...,...
Plate231_MUSL_E23,,,,,,,,
Plate231_PB_L13,,,,,,,,
Plate231_MUSL_D24,,,,,,,,
Plate231_PB_I8,AARSSNTGKLI,,ASSLGNGALNTEAF,,,,,


## 3.1 $\alpha\beta$ concatenation

In [18]:
# Alpha-beta part of the CDR3 tables: the first four columns of each, taken by
# position.
AB_CDR3nt, AB_CDR3aa = (frame.iloc[:, :4] for frame in (CDR3nt, CDR3aa))

In [19]:
# Display the alpha-beta amino-acid CDR3 columns.
AB_CDR3aa

Unnamed: 0,A_1_CDR3aa,A_2_CDR3aa,B_1_CDR3aa,B_2_CDR3aa
Plate231_PB_M24,,,,
Plate231_MUSL_E13,,,,
Plate231_PB_I14,LVGERNQGGKLI,avytragehll,ASSRDSNQPQH,
Plate231_PB_M5,,,,
Plate231_MUSL_A23,ATVPTGGFKTI,,ASSQDADRDGYT,
...,...,...,...,...
Plate231_MUSL_E23,,,,
Plate231_PB_L13,,,,
Plate231_MUSL_D24,,,,
Plate231_PB_I8,AARSSNTGKLI,,ASSLGNGALNTEAF,


### 3.1.1 Masking
Create a boolean mask based on the productives to filter out unproductive chains from concatenation.

In [20]:
# Boolean mask of productive alpha-beta chains, built from the first four
# columns of P (taken by position).
ab_productive_flags = P.iloc[:, :4]
AB_mask = ab_productive_flags.astype(bool)
AB_mask

Unnamed: 0,A_1_productive,A_2_productive,B_1_productive,B_2_productive
Plate231_PB_M24,False,False,False,False
Plate231_MUSL_E13,False,False,False,False
Plate231_PB_I14,True,False,True,False
Plate231_PB_M5,False,False,False,False
Plate231_MUSL_A23,True,False,True,False
...,...,...,...,...
Plate231_MUSL_E23,False,False,False,False
Plate231_PB_L13,False,False,False,False
Plate231_MUSL_D24,False,False,False,False
Plate231_PB_I8,True,False,True,False


### 3.1.2 Nucleotide concatenation

In [21]:
# Relabel the mask so its columns line up with the nucleotide table, then keep
# only the sequences of productive chains (everything else becomes NaN).
AB_mask.columns = AB_CDR3nt.columns
masked_ABnt = AB_CDR3nt.where(AB_mask)
masked_ABnt

Unnamed: 0,A_1_CDR3nt,A_2_CDR3nt,B_1_CDR3nt,B_2_CDR3nt
Plate231_PB_M24,,,,
Plate231_MUSL_E13,,,,
Plate231_PB_I14,CTCGTGGGTGAAAGGAACCAGGGAGGAAAGCTTATC,,GCCAGCAGCCGGGACAGCAATCAGCCCCAGCAT,
Plate231_PB_M5,,,,
Plate231_MUSL_A23,GCTACGGTCCCAACTGGAGGCTTCAAAACTATC,,GCCAGCAGCCAAGATGCGGACAGGGACGGCTACACC,
...,...,...,...,...
Plate231_MUSL_E23,,,,
Plate231_PB_L13,,,,
Plate231_MUSL_D24,,,,
Plate231_PB_I8,GCAGCCCGCTCTAGCAACACAGGCAAACTAATC,,GCCAGCAGCTTAGGGAACGGGGCCTTGAACACTGAAGCTTTC,


### 3.1.3 Amino acid concatenation

In [22]:
# Relabel the mask so its columns line up with the amino-acid table, then keep
# only the sequences of productive chains (everything else becomes NaN).
AB_mask.columns = AB_CDR3aa.columns
masked_ABaa = AB_CDR3aa.where(AB_mask)
masked_ABaa

Unnamed: 0,A_1_CDR3aa,A_2_CDR3aa,B_1_CDR3aa,B_2_CDR3aa
Plate231_PB_M24,,,,
Plate231_MUSL_E13,,,,
Plate231_PB_I14,LVGERNQGGKLI,,ASSRDSNQPQH,
Plate231_PB_M5,,,,
Plate231_MUSL_A23,ATVPTGGFKTI,,ASSQDADRDGYT,
...,...,...,...,...
Plate231_MUSL_E23,,,,
Plate231_PB_L13,,,,
Plate231_MUSL_D24,,,,
Plate231_PB_I8,AARSSNTGKLI,,ASSLGNGALNTEAF,


### 3.1.4 Inserting concatenation into dataset

In [None]:
# Append the four alpha/beta allele pairings to df, as nucleotide and
# amino-acid concatenations. Positions 0-1 of the masked tables are the alpha
# alleles and 2-3 the beta alleles, so product() yields the pairs in the same
# order as ``names``.
names = ['A1_B1','A1_B2','A2_B1','A2_B2']
for pair_name, (alpha_pos, beta_pos) in zip(names, product([0, 1], [2, 3])):
    # Adding two string columns gives NaN wherever either chain is NaN.
    nt_pair = masked_ABnt.iloc[:, alpha_pos] + masked_ABnt.iloc[:, beta_pos]
    aa_pair = masked_ABaa.iloc[:, alpha_pos] + masked_ABaa.iloc[:, beta_pos]
    df.insert(len(df.columns), pair_name + 'nt', nt_pair)
    df.insert(len(df.columns), pair_name + 'aa', aa_pair)

In [23]:
# Display df after the alpha-beta concatenation step.
# NOTE(review): the saved output of this cell has no A1_B1/A1_B2/A2_B1/A2_B2
# columns and the preceding cell has no execution count, so that cell was
# probably not run when the notebook was saved -- re-run and confirm.
df

Unnamed: 0,A_1_productive,A_1_TPM,A_1_stop_codon,A_1_in_frame,A_1_ID,A_1_CDR3nt,A_1_CDR3aa,A_1_V,A_1_J,A_2_productive,A_2_TPM,A_2_stop_codon,A_2_in_frame,A_2_ID,A_2_CDR3nt,A_2_CDR3aa,A_2_V,A_2_J,A_productive,B_1_productive,B_1_TPM,B_1_stop_codon,B_1_in_frame,B_1_ID,B_1_CDR3nt,B_1_CDR3aa,B_1_V,B_1_J,B_1_D,B_2_productive,B_2_TPM,B_2_stop_codon,B_2_in_frame,B_2_ID,B_2_CDR3nt,B_2_CDR3aa,B_2_V,B_2_J,B_2_D,B_productive,AB_productive,G_1_productive,G_1_TPM,G_1_stop_codon,G_1_in_frame,G_1_ID,G_1_CDR3nt,G_1_CDR3aa,G_1_V,G_1_J,G_2_productive,G_2_TPM,G_2_stop_codon,G_2_in_frame,G_2_ID,G_2_CDR3nt,G_2_CDR3aa,G_2_V,G_2_J,G_productive,D_1_productive,D_1_TPM,D_1_stop_codon,D_1_in_frame,D_1_ID,D_1_CDR3nt,D_1_CDR3aa,D_1_V,D_1_J,D_1_D,D_2_productive,D_2_TPM,D_2_stop_codon,D_2_in_frame,D_2_ID,D_2_CDR3nt,D_2_CDR3aa,D_2_V,D_2_J,D_2_D,D_productive,GD_productive
Plate231_PB_M24,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_MUSL_E13,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_PB_I14,1,182.962,0,1,TRAV4_GGTGAAAGGAACCA_TRAJ23,CTCGTGGGTGAAAGGAACCAGGGAGGAAAGCTTATC,LVGERNQGGKLI,TRAV4*01,TRAJ23*01,0,199.233,0,0,TRAV8-4_CTGTGTATACACG_TRAJ5,gctgtgtatacacgggcaggagagcacttact,avytragehll,"TRAV8-4*01,TRAV8-4*04,TRAV8-4*05",TRAJ5*01,1,1,1895.490,0,1,TRBV3-1_CAGCCGGGACAGCAA_TRBJ1-5,GCCAGCAGCCGGGACAGCAATCAGCCCCAGCAT,ASSRDSNQPQH,TRBV3-1*01,TRBJ1-5*01,"TRBD1*01,TRBD2*01,TRBD2*02",0,,0,0,,,,,,,1,2,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_PB_M5,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_MUSL_A23,1,163.792,0,1,TRAV17_TACGGTCCCAACTGG_TRAJ9,GCTACGGTCCCAACTGGAGGCTTCAAAACTATC,ATVPTGGFKTI,TRAV17*01,TRAJ9*01,0,,0,0,,,,,,1,1,629.671,0,1,TRBV3-1_CAAGATGCGGACAGGGACGGCTA_TRBJ1-2,GCCAGCAGCCAAGATGCGGACAGGGACGGCTACACC,ASSQDADRDGYT,TRBV3-1*01,TRBJ1-2*01,TRBD1*01,0,,0,0,,,,,,,1,2,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Plate231_MUSL_E23,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_PB_L13,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_MUSL_D24,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_PB_I8,1,362.557,0,1,TRAV5_TGCAGCCCGCTCT_TRAJ37,GCAGCCCGCTCTAGCAACACAGGCAAACTAATC,AARSSNTGKLI,TRAV5*01,TRAJ37*02,0,,0,0,,,,,,1,1,1674.770,0,1,TRBV7-6_CTTAGGGAACGGGGCCTTGAAC_TRBJ1-1,GCCAGCAGCTTAGGGAACGGGGCCTTGAACACTGAAGCTTTC,ASSLGNGALNTEAF,TRBV7-6*01,TRBJ1-1*01,"TRBD1*01,TRBD2*01",0,,0,0,,,,,,,1,2,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0


## 3.2 $\gamma\delta$ concatenation

In [24]:
# Gamma-delta part of the CDR3 tables: columns 4 to 7 of each, taken by position.
GD_CDR3nt, GD_CDR3aa = (frame.iloc[:, 4:8] for frame in (CDR3nt, CDR3aa))

### 3.2.1 Masking
Create a boolean mask based on the productives to filter out unproductive chains from concatenation.

In [25]:
# Boolean mask of productive gamma-delta chains, built from columns 4 to 7 of P
# (taken by position).
gd_productive_flags = P.iloc[:, 4:8]
GD_mask = gd_productive_flags.astype(bool)
GD_mask

Unnamed: 0,G_1_productive,G_2_productive,D_1_productive,D_2_productive
Plate231_PB_M24,False,False,False,False
Plate231_MUSL_E13,False,False,False,False
Plate231_PB_I14,False,False,False,False
Plate231_PB_M5,False,False,False,False
Plate231_MUSL_A23,False,False,False,False
...,...,...,...,...
Plate231_MUSL_E23,False,False,False,False
Plate231_PB_L13,False,False,False,False
Plate231_MUSL_D24,False,False,False,False
Plate231_PB_I8,False,False,False,False


### 3.2.2 Nucleotide concatenation

In [26]:
# Relabel the mask so its columns line up with the nucleotide table, then keep
# only the sequences of productive chains (everything else becomes NaN).
GD_mask.columns = GD_CDR3nt.columns
masked_GDnt = GD_CDR3nt.where(GD_mask)
masked_GDnt

Unnamed: 0,G_1_CDR3nt,G_2_CDR3nt,D_1_CDR3nt,D_2_CDR3nt
Plate231_PB_M24,,,,
Plate231_MUSL_E13,,,,
Plate231_PB_I14,,,,
Plate231_PB_M5,,,,
Plate231_MUSL_A23,,,,
...,...,...,...,...
Plate231_MUSL_E23,,,,
Plate231_PB_L13,,,,
Plate231_MUSL_D24,,,,
Plate231_PB_I8,,,,


### 3.2.3 Amino acid concatenation

In [27]:
# Relabel the mask so its columns line up with the amino-acid table, then keep
# only the sequences of productive chains (everything else becomes NaN).
GD_mask.columns = GD_CDR3aa.columns
masked_GDaa = GD_CDR3aa.where(GD_mask)
masked_GDaa

Unnamed: 0,G_1_CDR3aa,G_2_CDR3aa,D_1_CDR3aa,D_2_CDR3aa
Plate231_PB_M24,,,,
Plate231_MUSL_E13,,,,
Plate231_PB_I14,,,,
Plate231_PB_M5,,,,
Plate231_MUSL_A23,,,,
...,...,...,...,...
Plate231_MUSL_E23,,,,
Plate231_PB_L13,,,,
Plate231_MUSL_D24,,,,
Plate231_PB_I8,,,,


### 3.2.4 Inserting concatenation into dataset

In [None]:
# Create the four possible combinations of sequences
# Append the four gamma/delta allele pairings to df, as nucleotide and
# amino-acid concatenations. Positions 0-1 of the masked tables are the gamma
# alleles and 2-3 the delta alleles, so product() yields the pairs in the same
# order as ``names``.
names = ['G1_D1','G1_D2','G2_D1','G2_D2']
for pair_name, (gamma_pos, delta_pos) in zip(names, product([0, 1], [2, 3])):
    # Adding two string columns gives NaN wherever either chain is NaN.
    nt_pair = masked_GDnt.iloc[:, gamma_pos] + masked_GDnt.iloc[:, delta_pos]
    aa_pair = masked_GDaa.iloc[:, gamma_pos] + masked_GDaa.iloc[:, delta_pos]
    df.insert(len(df.columns), pair_name + 'nt', nt_pair)
    df.insert(len(df.columns), pair_name + 'aa', aa_pair)

In [28]:
# Display df after the gamma-delta concatenation step.
# NOTE(review): the saved output of this cell has no G1_D1/G1_D2/G2_D1/G2_D2
# columns and the preceding cell has no execution count, so that cell was
# probably not run when the notebook was saved -- re-run and confirm.
df

Unnamed: 0,A_1_productive,A_1_TPM,A_1_stop_codon,A_1_in_frame,A_1_ID,A_1_CDR3nt,A_1_CDR3aa,A_1_V,A_1_J,A_2_productive,A_2_TPM,A_2_stop_codon,A_2_in_frame,A_2_ID,A_2_CDR3nt,A_2_CDR3aa,A_2_V,A_2_J,A_productive,B_1_productive,B_1_TPM,B_1_stop_codon,B_1_in_frame,B_1_ID,B_1_CDR3nt,B_1_CDR3aa,B_1_V,B_1_J,B_1_D,B_2_productive,B_2_TPM,B_2_stop_codon,B_2_in_frame,B_2_ID,B_2_CDR3nt,B_2_CDR3aa,B_2_V,B_2_J,B_2_D,B_productive,AB_productive,G_1_productive,G_1_TPM,G_1_stop_codon,G_1_in_frame,G_1_ID,G_1_CDR3nt,G_1_CDR3aa,G_1_V,G_1_J,G_2_productive,G_2_TPM,G_2_stop_codon,G_2_in_frame,G_2_ID,G_2_CDR3nt,G_2_CDR3aa,G_2_V,G_2_J,G_productive,D_1_productive,D_1_TPM,D_1_stop_codon,D_1_in_frame,D_1_ID,D_1_CDR3nt,D_1_CDR3aa,D_1_V,D_1_J,D_1_D,D_2_productive,D_2_TPM,D_2_stop_codon,D_2_in_frame,D_2_ID,D_2_CDR3nt,D_2_CDR3aa,D_2_V,D_2_J,D_2_D,D_productive,GD_productive
Plate231_PB_M24,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_MUSL_E13,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_PB_I14,1,182.962,0,1,TRAV4_GGTGAAAGGAACCA_TRAJ23,CTCGTGGGTGAAAGGAACCAGGGAGGAAAGCTTATC,LVGERNQGGKLI,TRAV4*01,TRAJ23*01,0,199.233,0,0,TRAV8-4_CTGTGTATACACG_TRAJ5,gctgtgtatacacgggcaggagagcacttact,avytragehll,"TRAV8-4*01,TRAV8-4*04,TRAV8-4*05",TRAJ5*01,1,1,1895.490,0,1,TRBV3-1_CAGCCGGGACAGCAA_TRBJ1-5,GCCAGCAGCCGGGACAGCAATCAGCCCCAGCAT,ASSRDSNQPQH,TRBV3-1*01,TRBJ1-5*01,"TRBD1*01,TRBD2*01,TRBD2*02",0,,0,0,,,,,,,1,2,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_PB_M5,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_MUSL_A23,1,163.792,0,1,TRAV17_TACGGTCCCAACTGG_TRAJ9,GCTACGGTCCCAACTGGAGGCTTCAAAACTATC,ATVPTGGFKTI,TRAV17*01,TRAJ9*01,0,,0,0,,,,,,1,1,629.671,0,1,TRBV3-1_CAAGATGCGGACAGGGACGGCTA_TRBJ1-2,GCCAGCAGCCAAGATGCGGACAGGGACGGCTACACC,ASSQDADRDGYT,TRBV3-1*01,TRBJ1-2*01,TRBD1*01,0,,0,0,,,,,,,1,2,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Plate231_MUSL_E23,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_PB_L13,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_MUSL_D24,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0
Plate231_PB_I8,1,362.557,0,1,TRAV5_TGCAGCCCGCTCT_TRAJ37,GCAGCCCGCTCTAGCAACACAGGCAAACTAATC,AARSSNTGKLI,TRAV5*01,TRAJ37*02,0,,0,0,,,,,,1,1,1674.770,0,1,TRBV7-6_CTTAGGGAACGGGGCCTTGAAC_TRBJ1-1,GCCAGCAGCTTAGGGAACGGGGCCTTGAACACTGAAGCTTTC,ASSLGNGALNTEAF,TRBV7-6*01,TRBJ1-1*01,"TRBD1*01,TRBD2*01",0,,0,0,,,,,,,1,2,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0


# 4. Clone definition
Generate clones defined as a unique combination of chains. Clones can be $\alpha\beta$ clones or $\gamma\delta$ clones. Each clone is created by generating a set of all non-NaN sequences coming from any of the alleles of the loci.  

## 4.1 Generate clones

In [29]:
# Nucleotide CDR3 columns of both alleles of the alpha and beta chains:
# A_1, A_2, B_1, B_2
cols = [f'{chain}_{allele}_CDR3nt' for chain in 'AB' for allele in (1, 2)]
# One clone (set of sequences) per row of the masked alpha-beta nucleotide
# frame, then one value per clone via concat_seqs_in_set
# NOTE(review): both helpers come from data_functions; confirm exact output types
seq_set_ABnt = generate_clone_sets(masked_ABnt, cols)
tcr_ABnt = concat_seqs_in_set(seq_set_ABnt)

In [30]:
# Amino-acid CDR3 columns of both alleles of the alpha and beta chains:
# A_1, A_2, B_1, B_2
cols = [f'{chain}_{allele}_CDR3aa' for chain in 'AB' for allele in (1, 2)]
# One clone (set of sequences) per row of the masked alpha-beta amino-acid
# frame, then one value per clone via concat_seqs_in_set
# NOTE(review): both helpers come from data_functions; confirm exact output types
seq_set_ABaa = generate_clone_sets(masked_ABaa, cols)
tcr_ABaa = concat_seqs_in_set(seq_set_ABaa)

In [31]:
# Nucleotide CDR3 columns of both alleles of the gamma and delta chains:
# G_1, G_2, D_1, D_2
cols = [f'{chain}_{allele}_CDR3nt' for chain in 'GD' for allele in (1, 2)]
# One clone (set of sequences) per row of the masked gamma-delta nucleotide
# frame, then one value per clone via concat_seqs_in_set
# NOTE(review): both helpers come from data_functions; confirm exact output types
seq_set_GDnt = generate_clone_sets(masked_GDnt, cols)
tcr_GDnt = concat_seqs_in_set(seq_set_GDnt)

In [32]:
# Amino-acid CDR3 columns of both alleles of the gamma and delta chains:
# G_1, G_2, D_1, D_2
cols = [f'{chain}_{allele}_CDR3aa' for chain in 'GD' for allele in (1, 2)]
# One clone (set of sequences) per row of the masked gamma-delta amino-acid
# frame, then one value per clone via concat_seqs_in_set
# NOTE(review): both helpers come from data_functions; confirm exact output types
seq_set_GDaa = generate_clone_sets(masked_GDaa, cols)
tcr_GDaa = concat_seqs_in_set(seq_set_GDaa)

## 4.2 Group clones + frequency
Assign a group number to each clone. Empty clones (sets) are all placed together in a single group, which is numbered 0 in the outputs below.

In [33]:
# Clone columns to append, in output order: AB nt, AB aa, GD nt, GD aa
clone_columns = (
    ('TCR_AB_nt', tcr_ABnt),
    ('TCR_AB_aa', tcr_ABaa),
    ('TCR_GD_nt', tcr_GDnt),
    ('TCR_GD_aa', tcr_GDaa),
)
for clone_col, clone_values in clone_columns:
    # Append the clone column at the end of df ...
    df.insert(len(df.columns), clone_col, clone_values)
    # ... and let group_col_with_freq add its frequency and group columns
    df = group_col_with_freq(df, clone_col, group_unique=False)

In [34]:
# Inspect the dataframe with the clone, frequency and group columns appended
df

Unnamed: 0,A_1_productive,A_1_TPM,A_1_stop_codon,A_1_in_frame,A_1_ID,A_1_CDR3nt,A_1_CDR3aa,A_1_V,A_1_J,A_2_productive,A_2_TPM,A_2_stop_codon,A_2_in_frame,A_2_ID,A_2_CDR3nt,A_2_CDR3aa,A_2_V,A_2_J,A_productive,B_1_productive,B_1_TPM,B_1_stop_codon,B_1_in_frame,B_1_ID,B_1_CDR3nt,B_1_CDR3aa,B_1_V,B_1_J,B_1_D,B_2_productive,B_2_TPM,B_2_stop_codon,B_2_in_frame,B_2_ID,B_2_CDR3nt,B_2_CDR3aa,B_2_V,B_2_J,B_2_D,B_productive,AB_productive,G_1_productive,G_1_TPM,G_1_stop_codon,G_1_in_frame,G_1_ID,G_1_CDR3nt,G_1_CDR3aa,G_1_V,G_1_J,G_2_productive,G_2_TPM,G_2_stop_codon,G_2_in_frame,G_2_ID,G_2_CDR3nt,G_2_CDR3aa,G_2_V,G_2_J,G_productive,D_1_productive,D_1_TPM,D_1_stop_codon,D_1_in_frame,D_1_ID,D_1_CDR3nt,D_1_CDR3aa,D_1_V,D_1_J,D_1_D,D_2_productive,D_2_TPM,D_2_stop_codon,D_2_in_frame,D_2_ID,D_2_CDR3nt,D_2_CDR3aa,D_2_V,D_2_J,D_2_D,D_productive,GD_productive,TCR_AB_nt,freq_TCR_AB_nt,group_TCR_AB_nt,TCR_AB_aa,freq_TCR_AB_aa,group_TCR_AB_aa,TCR_GD_nt,freq_TCR_GD_nt,group_TCR_GD_nt,TCR_GD_aa,freq_TCR_GD_aa,group_TCR_GD_aa
Plate231_PB_M24,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,,198,0,,198,0,,371,0,,371,0
Plate231_MUSL_E13,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,,198,0,,198,0,,371,0,,371,0
Plate231_PB_L23,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,,198,0,,198,0,,371,0,,371,0
Plate231_PB_K10,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,,198,0,,198,0,,371,0,,371,0
Plate231_MUSL_G17,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,,198,0,,198,0,,371,0,,371,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Plate231_PB_J21,1,676.787,0,1,TRAV41_GCGCTTATTC_TRAJ45,GCTTATTCAGGAGGAGGTGCTGACGGACTCACC,AYSGGGADGLT,TRAV41*01,TRAJ45*01,0,,0,0,,,,,,1,1,551.906,0,1,TRBV10-2_AGTGACGCTATTTTTAATT_TRBJ1-6,GCCAGCAGTGACGCTATTTTTAATTCACCCCTCCAC,ASSDAIFNSPLH,TRBV10-2*01,TRBJ1-6*01,,0,,0,0,,,,,,,1,2,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,GCCAGCAGTGACGCTATTTTTAATTCACCCCTCCACGCTTATTCAG...,1,83,,198,0,,371,0,ACDTRVYWGIRCTDKLI,1,8
Plate231_PB_J2,1,1090.240,0,1,TRAV13-2_AGAATTCCGGGGGTGG_TRAJ4,GCAGAGAATTCCGGGGGTGGCTACAATAAGCTGATT,AENSGGGYNKLI,TRAV13-2*01,TRAJ4*01,0,,0,0,,,,,,1,1,722.038,0,1,TRBV7-3_GCAGCCAGGGACAGAACTA_TRBJ1-2,GCCAGCAGCCAGGGACAGAACTATGGCTACACC,ASSQGQNYGYT,TRBV7-3*01,TRBJ1-2*01,TRBD1*01,0,,0,0,,,,,,,1,2,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,GCCAGCAGCCAGGGACAGAACTATGGCTACACCGCAGAGAATTCCG...,1,120,,198,0,,371,0,AAWEHPSDYKKL,1,9
Plate231_MUSL_D16,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,,198,0,,198,0,,371,0,AAWDPTGWFKI,1,10
Plate231_PB_M5,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,0,,0,0,,,,,,0,,0,0,,,,,,0,0,,0,0,,,,,,,0,,0,0,,,,,,,0,0,,198,0,ASSRDSNQPQHLVGERNQGGKLI,10,1,,371,0,AAWDPFKKLF,1,11


In [36]:
# Alpha-beta amino-acid clones with their frequency and group number,
# ordered by increasing group number
ab_aa_clones = df[['TCR_AB_aa', 'freq_TCR_AB_aa', 'group_TCR_AB_aa']]
ab_aa_clones = ab_aa_clones.sort_values(by='group_TCR_AB_aa')
# Lift the row limit for this display only, so that every row is listed
with pd.option_context('display.max_rows', None):
    display(ab_aa_clones)

Unnamed: 0,TCR_AB_aa,freq_TCR_AB_aa,group_TCR_AB_aa
Plate231_PB_M24,,198,0
Plate231_PB_N7,,198,0
Plate231_PB_J19,,198,0
Plate231_PB_I13,,198,0
Plate231_MUSL_B18,,198,0
Plate231_MUSL_A19,,198,0
Plate231_PB_N2,,198,0
Plate231_PB_I5,,198,0
Plate231_MUSL_C8,,198,0
Plate231_MUSL_A18,,198,0


## Export dataset

In [None]:
# Write the final dataframe, including its row index, as a comma-separated file.
# NOTE(review): out_file is not defined in the visible part of the notebook;
# it must be set before running this cell.
df.to_csv(path_or_buf=out_file, sep=',', index=True)