# Read/Merge Datasets

In [3]:
# imports
import numpy as np
import anndata as ad
import pandas as pd
import scanpy as sc
import seaborn as sns
import copy
# import harmonypy as hm
# import cellanova as canova
from tqdm import tqdm
import os
# import pyarrow
# set matplotlib to show figures inline
%matplotlib inline

## Read Satpathy Data

In [4]:
# read data from matrix
satpathy_adata = sc.read_10x_mtx(f'../data/Satpathy2022/10X', gex_only=False)

# view the adata object
satpathy_adata

AnnData object with n_obs × n_vars = 96750 × 31053
    var: 'gene_ids'

In [5]:
# add metadata to adata object
satpathy_metadata = pd.read_csv("/Genomics/argo/users/ta6403/ParameterEstimation/data/Satpathy2022/metadata.tsv", sep="\t", index_col=0)

for c in satpathy_metadata.columns:
    satpathy_adata.obs[c] = satpathy_metadata[c]

# experiment defined in Satpathy's github
satpathy_adata.obs['experiment'] = 'exp1'
exp2 = satpathy_adata.obs['orig.ident'].isin(["Spleen_Chronic_D21_PD1Pos"
                                 , "Spleen_Chronic_D21_CX3CR1Pos"
                                 , "Spleen_Chronic_D21_CX3CR1Neg_Slamf6Pos"
                                 , "Spleen_Chronic_D21_CX3CR1Neg"
                                 , "Spleen_Chronic_D21_TetPos_rep1"
                                 , "Spleen_Chronic_D21_TetNeg_rep1"])
satpathy_adata.obs.loc[exp2.to_numpy(), 'experiment'] = 'exp2'

PermissionError: [Errno 13] Permission denied: '/Genomics/argo/users/ta6403/ParameterEstimation/data/Satpathy2022/metadata.tsv'

In [None]:
# view metadata
satpathy_adata.obs

In [None]:
# subset only spleen chronic day 21 cells
satpathy_adata = satpathy_adata[satpathy_adata.obs.lcmv=='Chronic_D21']
# satpathy_adata = satpathy_adata[satpathy_adata.obs.tissue=='Spleen']

# filter only cells with both TRA and TRB
# satpathy_adata = satpathy_adata[~satpathy_adata.obs['tcr_cdr3s_aa'].isna()]
# satpathy_adata = satpathy_adata[satpathy_adata.obs['tcr_cdr3s_aa'].str.contains('TRA:')]
# satpathy_adata = satpathy_adata[satpathy_adata.obs['tcr_cdr3s_aa'].str.contains('TRB:')]

satpathy_adata.obs

In [None]:
satpathy_adata.obs

## Read Cui Data

In [6]:
# read data from Cui 2023
cui_adatas= [sc.read_10x_mtx(f'/Genomics/argo/users/ta6403/TExhaustion/Reanalysis/Cui2023/M{i+1}', gex_only=False) for i in range(3)]
# correct for file processed in v2
cui_adatas[-1].var['feature_types'] = ['Gene Expression'] * len(cui_adatas[-1].var['gene_ids'])


# concatenate all data and save as adata
cui_adata = cui_adatas[0].concatenate(cui_adatas[1:])

cui_adata.obs['cell_id'] = range(len(cui_adata.obs))

# view the adata object
cui_adata


See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html


AnnData object with n_obs × n_vars = 9094 × 31053
    obs: 'batch', 'cell_id'
    var: 'gene_ids', 'feature_types'

In [7]:
# create dictionary mapping batch numbers to names
batch_name_dict = {'0':'M1', '1':'M2', '2':'M3'}

# use dictionary to replace batch numbers with names
cui_adata.obs = cui_adata.obs.replace({'batch': batch_name_dict})

# view updated obs
cui_adata.obs


Unnamed: 0,batch,cell_id
AAACCTGAGCTCCTTC-1-0,M1,0
AAACCTGAGGATGCGT-1-0,M1,1
AAACCTGAGGTAAACT-1-0,M1,2
AAACCTGGTGGTTTCA-1-0,M1,3
AAACCTGGTTATCACG-1-0,M1,4
...,...,...
TTTGGTTTCTCAAACG-1-2,M3,9089
TTTGTCAAGATGTCGG-1-2,M3,9090
TTTGTCACACGGACAA-1-2,M3,9091
TTTGTCACATGGGACA-1-2,M3,9092


In [8]:
# read Cui 2023 metadata
cui_metadata = pd.read_csv('../data/Cui2023/cui_metadata.tsv', sep='\t')

# filter to only those with TCRs
cui_metadata = cui_metadata[~cui_metadata['cdr3'].isna()]
cui_metadata = cui_metadata[cui_metadata['chain']!='Multi']
cui_metadata = cui_metadata[cui_metadata['productive']==True]

barcodes = []
tcrs_aa = []
tcrs_nt = []
mice = []

# TRA and TRB are separate rows of the dataset, so we combine them into one dataframe with TRA/TRB as one row
for barcode in tqdm(cui_metadata['barcode'].unique()):
    cui_subset = cui_metadata[cui_metadata.barcode==barcode]
    vcs = cui_subset.chain.value_counts()
    if len(vcs)<2 or len(vcs)>3:
        continue
    cui_subset = cui_subset.sort_values('chain')
    mouse_int = int(cui_subset.mouse.iloc[0][1])-1
    barcode = barcode+f'-{mouse_int}'
    if vcs['TRA']==1 and vcs['TRB']==1:
        tcrs_aa.append(f'TRA:{cui_subset.cdr3.iloc[0]};TRB:{cui_subset.cdr3.iloc[1]}')
        tcrs_nt.append(f'TRA:{cui_subset.cdr3_nt.iloc[0]};TRB:{cui_subset.cdr3_nt.iloc[1]}')
        barcodes.append(barcode) # barcode the same index as the dataframe
    if vcs['TRA']==2 and vcs['TRB']==1:
        tcrs_aa.append(f'TRA:{cui_subset.cdr3.iloc[0]};TRA:{cui_subset.cdr3.iloc[1]};TRB:{cui_subset.cdr3.iloc[2]}')
        tcrs_nt.append(f'TRA:{cui_subset.cdr3_nt.iloc[0]};TRA:{cui_subset.cdr3_nt.iloc[1]};TRB:{cui_subset.cdr3_nt.iloc[2]}')
        barcodes.append(barcode) # barcode the same index as the dataframe
    if vcs['TRB']==2 and vcs['TRA']==1:
        tcrs_aa.append(f'TRA:{cui_subset.cdr3.iloc[0]};TRB:{cui_subset.cdr3.iloc[1]};TRB:{cui_subset.cdr3.iloc[2]}')
        tcrs_nt.append(f'TRA:{cui_subset.cdr3_nt.iloc[0]};TRB:{cui_subset.cdr3_nt.iloc[1]};TRB:{cui_subset.cdr3_nt.iloc[2]}')
        barcodes.append(barcode) # barcode the same index as the dataframe



cui_metadata = pd.DataFrame({'tcr_cdr3s_aa': tcrs_aa, 'tcr_cdr3s_nt': tcrs_nt}, index=barcodes)
cui_metadata


  2%|▏         | 155/8273 [00:00<00:10, 770.93it/s]100%|██████████| 8273/8273 [00:10<00:00, 768.61it/s]


Unnamed: 0,tcr_cdr3s_aa,tcr_cdr3s_nt
AAACCTGAGGATGCGT-1-0,TRA:CAAREGGSALGRLHF;TRA:CAASPDNNAPRF;TRB:CASSD...,TRA:TGTGCAGCAAGGGAGGGAGGTTCAGCCTTAGGGAGGCTGCAT...
AAACCTGAGGTAAACT-1-0,TRA:CAMREGGGTGSKLSF;TRB:CASSIWDGQNTLYF,TRA:TGTGCTATGAGAGAGGGCGGAGGCACTGGGTCTAAGCTGTCA...
AAACCTGGTGGTTTCA-1-0,TRA:CAVSPDYSNNRLTL;TRB:CASRANSYNSPLYF,TRA:TGTGCTGTGAGCCCGGACTACAGCAACAACAGACTTACTTTG...
AAACCTGGTTATCACG-1-0,TRA:CAMREGTEGADRLTF;TRB:CASSLIQGRYAEQFF,TRA:TGTGCTATGAGAGAGGGAACAGAAGGTGCAGATAGACTCACC...
AAACCTGTCACATAGC-1-0,TRA:CAAYTSSSFSKLVF;TRB:CASSTWGNTGQLYF,TRA:TGTGCTGCATACACATCCTCCTCCTTCAGCAAGCTGGTGTTT...
...,...,...
TTTGCGCTCATGTCTT-1-2,TRA:CAVSDSGTYQRF;TRB:CASSPGQYNSPLYF,TRA:TGTGCTGTGAGCGATTCTGGGACTTACCAGAGGTTT;TRB:T...
TTTGGTTGTCAGCTAT-1-2,TRA:CAIEDSNYQLIW;TRB:CSADHGNYAEQFF;TRB:CASSPGQ...,TRA:TGTGCTATAGAGGATAGCAACTATCAGTTGATCTGG;TRB:T...
TTTGGTTTCTCAAACG-1-2,TRA:CAVSDSGTYQRF;TRB:CASSPGQYNSPLYF,TRA:TGTGCTGTGAGCGATTCTGGGACTTACCAGAGGTTT;TRB:T...
TTTGTCAAGATGTCGG-1-2,TRA:CAASGGSALGRLHF;TRB:CASSRKRGGQNTLYF,TRA:TGTGCAGCAAGTGGCGGTTCAGCCTTAGGGAGGCTGCATTTT...


In [9]:
cui_adata.obs

Unnamed: 0,batch,cell_id
AAACCTGAGCTCCTTC-1-0,M1,0
AAACCTGAGGATGCGT-1-0,M1,1
AAACCTGAGGTAAACT-1-0,M1,2
AAACCTGGTGGTTTCA-1-0,M1,3
AAACCTGGTTATCACG-1-0,M1,4
...,...,...
TTTGGTTTCTCAAACG-1-2,M3,9089
TTTGTCAAGATGTCGG-1-2,M3,9090
TTTGTCACACGGACAA-1-2,M3,9091
TTTGTCACATGGGACA-1-2,M3,9092


In [10]:
# add TCRs back to adata
cui_merged = pd.merge(cui_metadata, cui_adata.obs, how='right', left_index=True, right_index=True)
cui_adata = cui_adata[cui_merged.cell_id]
cui_adata.obs = cui_merged

In [11]:
cui_adata.obs

Unnamed: 0,tcr_cdr3s_aa,tcr_cdr3s_nt,batch,cell_id
AAACCTGAGCTCCTTC-1-0,,,M1,0
AAACCTGAGGATGCGT-1-0,TRA:CAAREGGSALGRLHF;TRA:CAASPDNNAPRF;TRB:CASSD...,TRA:TGTGCAGCAAGGGAGGGAGGTTCAGCCTTAGGGAGGCTGCAT...,M1,1
AAACCTGAGGTAAACT-1-0,TRA:CAMREGGGTGSKLSF;TRB:CASSIWDGQNTLYF,TRA:TGTGCTATGAGAGAGGGCGGAGGCACTGGGTCTAAGCTGTCA...,M1,2
AAACCTGGTGGTTTCA-1-0,TRA:CAVSPDYSNNRLTL;TRB:CASRANSYNSPLYF,TRA:TGTGCTGTGAGCCCGGACTACAGCAACAACAGACTTACTTTG...,M1,3
AAACCTGGTTATCACG-1-0,TRA:CAMREGTEGADRLTF;TRB:CASSLIQGRYAEQFF,TRA:TGTGCTATGAGAGAGGGAACAGAAGGTGCAGATAGACTCACC...,M1,4
...,...,...,...,...
TTTGGTTTCTCAAACG-1-2,TRA:CAVSDSGTYQRF;TRB:CASSPGQYNSPLYF,TRA:TGTGCTGTGAGCGATTCTGGGACTTACCAGAGGTTT;TRB:T...,M3,9089
TTTGTCAAGATGTCGG-1-2,TRA:CAASGGSALGRLHF;TRB:CASSRKRGGQNTLYF,TRA:TGTGCAGCAAGTGGCGGTTCAGCCTTAGGGAGGCTGCATTTT...,M3,9090
TTTGTCACACGGACAA-1-2,,,M3,9091
TTTGTCACATGGGACA-1-2,,,M3,9092


## Read Yermanos Data

In [12]:
# read data from matrix
yermanos_adata = sc.read_10x_mtx(f'../data/Yermanos2022/10X', gex_only=False)

# view the adata object
yermanos_adata

AnnData object with n_obs × n_vars = 4503 × 32285
    var: 'gene_ids', 'feature_types'

In [13]:
barcodes = pd.read_csv('../data/Yermanos2022/10X/barcodes.tsv.gz', header=None)

yermanos_adata.obs['barcode'] = np.array(barcodes[0])

yermanos_adata.obs.set_index('barcode')
yermanos_adata.obs['sample'] = 'sample'+(yermanos_adata.obs.barcode.str[-1].astype(int)+2).astype(str)

yermanos_adata.obs['cell_id'] = np.arange(len(yermanos_adata))

del yermanos_adata.obs['barcode']
yermanos_adata.obs

Unnamed: 0,sample,cell_id
AAACCTGAGAAGGACA-1,sample3,0
AAACCTGCACAGCGTC-1,sample3,1
AAACCTGCACCGAATT-1,sample3,2
AAACCTGGTAGCGCTC-1,sample3,3
AAACCTGTCCAGGGCT-1,sample3,4
...,...,...
TTTGGTTTCATCACCC-2,sample4,4498
TTTGTCAAGTACATGA-2,sample4,4499
TTTGTCACACCATCCT-2,sample4,4500
TTTGTCAGTGCAGACA-2,sample4,4501


In [14]:
# read Cui 2023 metadata
yermanos_metadata = pd.read_csv('../data/Yermanos2022/10X/metadata.csv')

# filter to only those with TCRs
yermanos_metadata = yermanos_metadata[~yermanos_metadata['cdr3'].isna()]
yermanos_metadata = yermanos_metadata[yermanos_metadata['chain']!='Multi']
yermanos_metadata = yermanos_metadata[yermanos_metadata['productive']==True]

barcodes = []
tcrs_aa = []
tcrs_nt = []

# TRA and TRB are separate rows of the dataset, so we combine them into one dataframe with TRA/TRB as one row
for barcode in tqdm(yermanos_metadata['barcode'].unique()):
    yermanos_subset = yermanos_metadata[yermanos_metadata.barcode==barcode]
    vcs = yermanos_subset.chain.value_counts()
    if len(vcs)<2 or len(vcs)>3:
        continue
    yermanos_subset = yermanos_subset.sort_values('chain')
    if vcs['TRA']==1 and vcs['TRB']==1:
        tcrs_aa.append(f'TRA:{yermanos_subset.cdr3.iloc[0]};TRB:{yermanos_subset.cdr3.iloc[1]}')
        tcrs_nt.append(f'TRA:{yermanos_subset.cdr3_nt.iloc[0]};TRB:{yermanos_subset.cdr3_nt.iloc[1]}')
        barcodes.append(barcode) # barcode the same index as the dataframe
    if vcs['TRA']==2 and vcs['TRB']==1:
        tcrs_aa.append(f'TRA:{yermanos_subset.cdr3.iloc[0]};TRA:{yermanos_subset.cdr3.iloc[1]};TRB:{yermanos_subset.cdr3.iloc[2]}')
        tcrs_nt.append(f'TRA:{yermanos_subset.cdr3_nt.iloc[0]};TRA:{yermanos_subset.cdr3_nt.iloc[1]};TRB:{yermanos_subset.cdr3_nt.iloc[2]}')
        barcodes.append(barcode) # barcode the same index as the dataframe
    if vcs['TRB']==2 and vcs['TRA']==1:
        tcrs_aa.append(f'TRA:{yermanos_subset.cdr3.iloc[0]};TRB:{yermanos_subset.cdr3.iloc[1]};TRB:{yermanos_subset.cdr3.iloc[2]}')
        tcrs_nt.append(f'TRA:{yermanos_subset.cdr3_nt.iloc[0]};TRB:{yermanos_subset.cdr3_nt.iloc[1]};TRB:{yermanos_subset.cdr3_nt.iloc[2]}')
        barcodes.append(barcode) # barcode the same index as the dataframe

yermanos_metadata = pd.DataFrame({'tcr_cdr3s_aa': tcrs_aa, 'tcr_cdr3s_nt': tcrs_nt}, index=barcodes)
yermanos_metadata


  5%|▌         | 136/2640 [00:00<00:01, 1355.97it/s]100%|██████████| 2640/2640 [00:01<00:00, 1377.99it/s]


Unnamed: 0,tcr_cdr3s_aa,tcr_cdr3s_nt
AAACCTGAGAAGGACA-1,TRA:CAVSAGSNYNVLYF;TRB:CASSFWGDSSYEQYF;TRB:CAS...,TRA:TGCGCAGTCAGTGCGGGGTCTAATTACAACGTGCTTTACTTC...
AAACCTGTCGTGACAT-1,TRA:CDSGTYQRF;TRB:CASRDRGRNSPLYF;TRB:CASSFWGDS...,TRA:TGTGATTCTGGGACTTACCAGAGGTTT;TRB:TGTGCCAGCA...
AAACGGGTCCACGCAG-1,TRA:CAVRDNYAQGLTF;TRB:CASSSRTTNSDYTF,TRA:TGTGCTGTGAGGGATAACTATGCCCAGGGATTAACCTTC;TR...
AAAGATGAGAGTAAGG-1,TRA:CAVRDNYAQGLTF;TRB:CASSSRTTNSDYTF,TRA:TGTGCTGTGAGGGATAACTATGCCCAGGGATTAACCTTC;TR...
AAAGATGAGGAATTAC-1,TRA:CAASSDTNAYKVIF;TRB:CASKTGGPYEQYF,TRA:TGTGCAGCTAGTTCTGACACAAATGCTTACAAAGTCATCTTT...
...,...,...
TTTGGTTAGAGAACAG-2,TRA:CAMREITGNTGKLIF;TRB:CASSLLYEQYF,TRA:TGTGCTATGAGAGAGATAACAGGCAATACCGGAAAACTCATC...
TTTGGTTAGAGTACCG-2,TRA:CAANYGNEKITF;TRB:CASSFVTNQDTQYF,TRA:TGTGCTGCAAACTATGGAAATGAGAAAATAACTTTT;TRB:T...
TTTGGTTGTCGACTGC-2,TRA:CALGFMGYKLTF;TRB:CASSIWQGTNERLFF,TRA:TGTGCTCTGGGTTTCATGGGCTACAAACTTACCTTC;TRB:T...
TTTGTCAAGTACATGA-2,TRA:CAAIYPGYQNFYF;TRB:CASSDNGNTEVFF,TRA:TGTGCAGCAATCTATCCGGGTTACCAGAACTTCTATTTT;TR...


In [15]:
# add TCRs back to adata
yermanos_merged = pd.merge(yermanos_metadata, yermanos_adata.obs, how='right', left_index=True, right_index=True)
yermanos_adata = yermanos_adata[yermanos_merged.cell_id]

In [16]:
yermanos_adata.obs = yermanos_merged
yermanos_adata.obs

Unnamed: 0,tcr_cdr3s_aa,tcr_cdr3s_nt,sample,cell_id
AAACCTGAGAAGGACA-1,TRA:CAVSAGSNYNVLYF;TRB:CASSFWGDSSYEQYF;TRB:CAS...,TRA:TGCGCAGTCAGTGCGGGGTCTAATTACAACGTGCTTTACTTC...,sample3,0
AAACCTGCACAGCGTC-1,,,sample3,1
AAACCTGCACCGAATT-1,,,sample3,2
AAACCTGGTAGCGCTC-1,,,sample3,3
AAACCTGTCCAGGGCT-1,,,sample3,4
...,...,...,...,...
TTTGGTTTCATCACCC-2,,,sample4,4498
TTTGTCAAGTACATGA-2,TRA:CAAIYPGYQNFYF;TRB:CASSDNGNTEVFF,TRA:TGTGCAGCAATCTATCCGGGTTACCAGAACTTCTATTTT;TR...,sample4,4499
TTTGTCACACCATCCT-2,TRA:CAAIYPGYQNFYF;TRB:CASSDHTNTEVFF,TRA:TGTGCAGCAATCTATCCGGGTTACCAGAACTTCTATTTT;TR...,sample4,4500
TTTGTCAGTGCAGACA-2,,,sample4,4501


## Read Wherry Data

In [17]:
wherry_adata = sc.read_10x_mtx(f'../data/Wherry2022/10X', gex_only=False)
wherry_adata

AnnData object with n_obs × n_vars = 3856 × 31053
    var: 'gene_ids', 'feature_types'

## Merge Datasets

In [18]:
satpathy_adata.obs['dataset'] = 'Satpathy'
cui_adata.obs['dataset'] = 'Cui'
cui_adata.obs['ident'] = 'Cui 2023'
yermanos_adata.obs['dataset'] = 'Yermanos'
yermanos_adata.obs['ident'] = 'Yermanos 2022'
wherry_adata.obs['dataset'] = 'Wherry'
wherry_adata.obs['ident'] = 'Wherry 2022'

satpathy_adata.obs['batch'] = 'Sathpathy_'+satpathy_adata.obs['experiment']
cui_adata.obs['batch'] = 'Cui_'+cui_adata.obs['batch'].astype(str)
yermanos_adata.obs['batch'] = 'Yermanos_'+yermanos_adata.obs['sample'].astype(str)
wherry_adata.obs['batch'] = 'Wherry'

try:
    del satpathy_adata.obs['tissue']
    del satpathy_adata.obs['lcmv']
    del satpathy_adata.obs['experiment']
    del satpathy_adata.obs['UMAP_1']
    del satpathy_adata.obs['UMAP_2']
    del satpathy_adata.obs['gated_D8_prog']
    del satpathy_adata.obs['is_D8_prog']
except:
    pass

adata = satpathy_adata.concatenate(cui_adata, yermanos_adata, wherry_adata, batch_key=None)

  satpathy_adata.obs['dataset'] = 'Satpathy'

See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html


In [19]:
# create unique cell id
adata.obs['cell_id'] = range(len(adata.obs))

adata.obs.loc[~adata.obs['tcr_cdr3s_aa'].str.contains('TRA:', na=False), 'tcr_cdr3s_aa'] = np.nan
adata.obs.loc[~adata.obs['tcr_cdr3s_aa'].str.contains('TRA:', na=False), 'tcr_cdr3s_nt'] = np.nan
adata.obs.loc[~adata.obs['tcr_cdr3s_aa'].str.contains('TRB:', na=False), 'tcr_cdr3s_aa'] = np.nan
adata.obs.loc[~adata.obs['tcr_cdr3s_aa'].str.contains('TRB:', na=False), 'tcr_cdr3s_nt'] = np.nan

adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,tcr_cdr3s_aa,tcr_cdr3s_nt,sort,ident,batch,cell_id,dataset,sample
Spleen_Chronic_D21_CD8_TetPos_gex_AAACCTGAGATCGGGT-1-0,Spleen_Chronic_D21_TetPos,12513.0,3674.0,,,TetPos,TEx_prog,Sathpathy_exp1,0,Satpathy,
Spleen_Chronic_D21_CD8_TetPos_gex_AAACCTGAGCCACTAT-1-0,Spleen_Chronic_D21_TetPos,3212.0,1535.0,TRA:CAAFANSGTYQRF;TRB:CASSEDWVEQYF,TRA:TGTGCAGCTTTTGCAAATTCTGGGACTTACCAGAGGTTT;TR...,TetPos,TEx_KLR,Sathpathy_exp1,1,Satpathy,
Spleen_Chronic_D21_CD8_TetPos_gex_AAACCTGAGCGTTGCC-1-0,Spleen_Chronic_D21_TetPos,4726.0,1959.0,TRA:CAFEVVGQLTF;TRB:CASSFRDSSYEQYF,TRA:TGTGCTTTCGAGGTTGTGGGGCAGCTCACTTTC;TRB:TGTG...,TetPos,TEx,Sathpathy_exp1,2,Satpathy,
Spleen_Chronic_D21_CD8_TetPos_gex_AAACCTGAGGCTCTTA-1-0,Spleen_Chronic_D21_TetPos,3969.0,1722.0,TRA:CALSEPNYNVLYF;TRB:CASSDWDNTEVFF,TRA:TGTGCTCTGAGTGAGCCTAATTACAACGTGCTTTACTTC;TR...,TetPos,TEx,Sathpathy_exp1,3,Satpathy,
Spleen_Chronic_D21_CD8_TetPos_gex_AAACCTGCAAAGCAAT-1-0,Spleen_Chronic_D21_TetPos,4089.0,1833.0,TRA:CAVSIPGSWQLIF;TRB:CASSSRAGNTEVFF,TRA:TGTGCTGTGAGCATCCCTGGCAGCTGGCAACTCATCTTT;TR...,TetPos,TEx_int,Sathpathy_exp1,4,Satpathy,
...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGAGGTCTGGA-1-3,,,,,,,Wherry 2022,Wherry,85804,Wherry,
TTTGTTGAGTGAACAT-1-3,,,,,,,Wherry 2022,Wherry,85805,Wherry,
TTTGTTGAGTGGCCTC-1-3,,,,,,,Wherry 2022,Wherry,85806,Wherry,
TTTGTTGCAAGAGCTG-1-3,,,,,,,Wherry 2022,Wherry,85807,Wherry,


In [20]:
adata.obs.tcr_cdr3s_aa.value_counts()

tcr_cdr3s_aa
TRA:CAFEVVGQLTF;TRB:CASSFRDSSYEQYF                            3748
TRA:CAVSAWVGDNSKLIW;TRB:CASSRDSLTEVFF                         1839
TRA:CAQLEGADRLTF;TRB:CASSDHTNTEVFF                            1300
TRA:CAMSDLSNNAGAKLTF;TRB:CASSRDNNYAEQFF                       1007
TRA:CAALNNYAQGLTF;TRB:CASSLIRTGGYEQYF                          959
                                                              ... 
TRA:CAVSLDRGSALGRLHF;TRA:CALGPDTNAYKVIF;TRB:CASSIGTSYAEQFF       1
TRA:CAANYNQGKLIF;TRA:CAMREGEQGTGSKLSF;TRB:CASSFDWGGGQDTQYF       1
TRA:CAANTGYQNFYF;TRB:CASSQDREDSDYTF                              1
TRA:CALSERYNQGKLIF;TRB:CASSLNLEVFF                               1
TRA:CAASSNSNNRIFF;TRB:CASSDHDNTEVFF                              1
Name: count, Length: 10757, dtype: int64

In [21]:
adata.write('../data/datasets_combined.h5ad')