In [1]:
import pandas as pd
import math
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import sys
import gzip as gz
import scipy
from importlib import reload
import glob
import tqdm
import itertools
from scipy.spatial.distance import cdist
import scanpy as sc

import sys
sys.path.append('/Genomics/pritykinlab/dillon/perturbseq/scripts/utils')
import dataloader
import umap_analysis
import reimplementation
import normalization
import adata_utils
from scipy.stats import mannwhitneyu

In [2]:
# Notebook-wide matplotlib defaults, applied in one call.
# Keys are kept in their original order; each one is validated by RcParams on assignment.
plt.rcParams.update({
    # Title / axis-label / tick-label font sizes.
    'axes.titlesize': 25,
    'axes.labelsize': 25,
    'xtick.labelsize': 20,
    'ytick.labelsize': 20,
    # Marker size, grid off, and tick marks drawn on the bottom and left spines.
    'lines.markersize': 5,
    'axes.grid': False,
    'xtick.bottom': True,
    'ytick.left': True,
    # Tick lengths (major / minor) for both axes.
    'xtick.major.size': 5,
    'xtick.minor.size': 3,
    'ytick.major.size': 5,
    'ytick.minor.size': 3,
    # Axis label, axis edge and x-tick colours.
    'axes.labelcolor': 'black',
    'axes.edgecolor': 'black',
    'xtick.color': 'black',
    # Legend text sizes and frame colour.
    'legend.fontsize': 15,
    'legend.title_fontsize': 15,
    'legend.edgecolor': 'dimgray',
    # Default figure size as (width, height).
    'figure.figsize': (7, 7),
})

In [3]:
# Load the COVID19_ALL.h5ad file from the shared published-studies directory
# into `adata` using scanpy's h5ad reader.
adata = sc.read_h5ad("/Genomics/pritykinlab/share/published_studies_misc/COVID_TCR_RenZhang2021/COVID19_ALL.h5ad")

In [20]:
# Display the AnnData summary (dimensions and the available obs / uns / obsm / obsp fields).
adata

AnnData object with n_obs × n_vars = 1462702 × 27943
    obs: 'celltype', 'majorType', 'sampleID', 'PatientID', 'datasets', 'City', 'Age', 'Sex', 'Sample type', 'CoVID-19 severity', 'Sample time', 'Sampling day (Days after symptom onset)', 'SARS-CoV-2', 'Single cell sequencing platform', 'BCR single cell sequencing', 'TCR single cell sequencing', 'Outcome', 'Comorbidities', 'COVID-19-related medication and anti-microbials', 'Leukocytes [G/L]', 'Neutrophils [G/L]', 'Lymphocytes [G/L]', 'Unpublished'
    uns: 'neighbors', 'pca'
    obsm: 'X_pca', 'X_tsne', 'har_emb'
    obsp: 'connectivities', 'distances'

In [19]:
# Densify the first row of the matrix stored under `adata.raw` to inspect its values.
# NOTE(review): presumably these are unnormalized counts — TODO confirm.
adata.raw.X[0].toarray()

array([[0., 0., 0., ..., 0., 3., 0.]], dtype=float32)

In [12]:
# Densify the first row of `adata.X` to inspect its values.
# NOTE(review): presumably normalized / transformed expression, unlike `adata.raw.X` — TODO confirm.
adata.X[0].toarray()

array([[0.       , 0.       , 0.       , ..., 0.       , 1.0692092,
        0.       ]], dtype=float32)

In [4]:
# View of the cells flagged "Yes" for TCR single-cell sequencing whose majorType
# is CD8 or CD4. The view is only displayed here; it is not assigned to a name.
adata[
    adata.obs['TCR single cell sequencing'].eq("Yes")
    & adata.obs['majorType'].isin(['CD8', 'CD4'])
]

View of AnnData object with n_obs × n_vars = 515498 × 27943
    obs: 'celltype', 'majorType', 'sampleID', 'PatientID', 'datasets', 'City', 'Age', 'Sex', 'Sample type', 'CoVID-19 severity', 'Sample time', 'Sampling day (Days after symptom onset)', 'SARS-CoV-2', 'Single cell sequencing platform', 'BCR single cell sequencing', 'TCR single cell sequencing', 'Outcome', 'Comorbidities', 'COVID-19-related medication and anti-microbials', 'Leukocytes [G/L]', 'Neutrophils [G/L]', 'Lymphocytes [G/L]', 'Unpublished'
    uns: 'neighbors', 'pca'
    obsm: 'X_pca', 'X_tsne', 'har_emb'
    obsp: 'connectivities', 'distances'

In [5]:
# Read the tab-separated TCR clonotype table (GSE158055) into a DataFrame.
# `read_table` uses a tab separator by default; compression is left at the
# pandas default, which infers it from the file extension.
tcr_data = pd.read_table(
    "/Genomics/pritykinlab/share/published_studies_misc/COVID_TCR_RenZhang2021/GSE158055_covid19_BCR_TCR/GSE158055_covid19_tcr_vdjnt_pclone.tsv.gz"
)

In [6]:
# Preview the first 50 rows of the TCR table to check its columns and values.
tcr_data.head(50)

Unnamed: 0,cellBarcode,sampleID,PatientID,TCRA_cgene,TCRA_vgene,TCRA_dgene,TCRA_jgene,TCRA_cdr3aa,TCRA_cdr3nt,TCRB_cgene,...,TCR_pclone.id,TCR_pclone.seq,TCR_pclone.freq,TCR_pclonal,TCR_pidentifier,TCR_sclone.id,TCR_sclone.seq,TCR_sclone.freq,TCR_sclonal,TCR_sidentifier
0,AAACCTGAGAAACCTA-13,S-M044-1,P-M044,TRAC,TRAV4,,TRAJ34,CLVDLYNTDKLIF,TGCCTCGTGGACCTCTATAACACCGACAAGCTCATCTTT,TRBC2,...,pclone_P-M044:90842:1,P-M044:TRAV4_None_TRAJ34_TGCCTCGTGGACCTCTATAAC...,1,False,90842.0,sclone_S-M044-1:1212:1,S-M044-1:TRAV4_None_TRAJ34_TGCCTCGTGGACCTCTATA...,1,False,1212.0
1,AAACCTGAGTCTCCTC-13,S-M044-1,P-M044,TRAC,TRAV13-2,,TRAJ8,CAEMENTGFQKLVF,TGTGCAGAGATGGAGAACACAGGCTTTCAGAAACTTGTATTT,TRBC2,...,pclone_P-M044:89490:145,P-M044:TRAV13-2_None_TRAJ8_TGTGCAGAGATGGAGAACA...,145,True,89490.0,sclone_S-M044-1:514:75,S-M044-1:TRAV13-2_None_TRAJ8_TGTGCAGAGATGGAGAA...,75,True,514.0
2,AAACCTGGTAACGTTC-13,S-M044-1,P-M044,TRAC,TRAV13-2,,TRAJ8,CAEMENTGFQKLVF,TGTGCAGAGATGGAGAACACAGGCTTTCAGAAACTTGTATTT,TRBC2,...,pclone_P-M044:89490:145,P-M044:TRAV13-2_None_TRAJ8_TGTGCAGAGATGGAGAACA...,145,True,89490.0,sclone_S-M044-1:514:75,S-M044-1:TRAV13-2_None_TRAJ8_TGTGCAGAGATGGAGAA...,75,True,514.0
3,AAACCTGGTAGCCTCG-13,S-M044-1,P-M044,TRAC,TRAV1-2,,TRAJ20,CAVRDGDYKLSF,TGTGCTGTGAGAGATGGCGACTACAAGCTCAGCTTT,TRBC2,...,pclone_P-M044:88705:1,P-M044:TRAV1-2_None_TRAJ20_TGTGCTGTGAGAGATGGCG...,1,False,88705.0,sclone_S-M044-1:66:1,S-M044-1:TRAV1-2_None_TRAJ20_TGTGCTGTGAGAGATGG...,1,False,66.0
4,AAACCTGTCCACGAAT-13,S-M044-1,P-M044,TRAC,TRAV13-2,,TRAJ8,CAEMENTGFQKLVF,TGTGCAGAGATGGAGAACACAGGCTTTCAGAAACTTGTATTT,TRBC2,...,pclone_P-M044:89490:145,P-M044:TRAV13-2_None_TRAJ8_TGTGCAGAGATGGAGAACA...,145,True,89490.0,sclone_S-M044-1:514:75,S-M044-1:TRAV13-2_None_TRAJ8_TGTGCAGAGATGGAGAA...,75,True,514.0
5,AAACCTGTCTTAGAGC-13,S-M044-1,P-M044,TRAC,TRAV1-2,,TRAJ20,CAVTANDYKLSF,TGTGCTGTGACCGCTAACGACTACAAGCTCAGCTTT,TRBC2,...,pclone_P-M044:88693:6,P-M044:TRAV1-2_None_TRAJ20_TGTGCTGTGACCGCTAACG...,6,True,88693.0,sclone_S-M044-1:59:3,S-M044-1:TRAV1-2_None_TRAJ20_TGTGCTGTGACCGCTAA...,3,True,59.0
6,AAACGGGCAAACTGCT-13,S-M044-1,P-M044,TRAC,TRAV13-1,,TRAJ7,CAASSSRYGNNRLAF,TGTGCAGCAAGTAGTTCGAGATATGGGAACAACAGACTCGCTTTT,TRBC2,...,pclone_P-M044:89416:1,P-M044:TRAV13-1_None_TRAJ7_TGTGCAGCAAGTAGTTCGA...,1,False,89416.0,sclone_S-M044-1:477:1,S-M044-1:TRAV13-1_None_TRAJ7_TGTGCAGCAAGTAGTTC...,1,False,477.0
7,AAACGGGCATCGATGT-13,S-M044-1,P-M044,TRAC,TRAV13-1,,TRAJ53,CAASFGSNYKLTF,TGTGCAGCAAGTTTCGGTAGCAACTATAAACTGACATTT,TRBC2,...,pclone_P-M044:89407:1,P-M044:TRAV13-1_None_TRAJ53_TGTGCAGCAAGTTTCGGT...,1,False,89407.0,sclone_S-M044-1:471:1,S-M044-1:TRAV13-1_None_TRAJ53_TGTGCAGCAAGTTTCG...,1,False,471.0
8,AAACGGGGTCGAACAG-13,S-M044-1,P-M044,TRAC,TRAV12-2,,TRAJ11,CAAEYSTLTF,TGTGCCGCCGAATACAGCACCCTCACCTTT,TRBC2,...,pclone_P-M044:89098:204,P-M044:TRAV12-2_None_TRAJ11_TGTGCCGCCGAATACAGC...,204,True,89098.0,sclone_S-M044-1:300:117,S-M044-1:TRAV12-2_None_TRAJ11_TGTGCCGCCGAATACA...,117,True,300.0
9,AAAGCAAAGGATATAC-13,S-M044-1,P-M044,TRAC,TRAV5,,TRAJ36,CAEPTGANNLFF,TGTGCAGAGCCAACTGGGGCAAACAACCTCTTCTTT,TRBC2,...,pclone_P-M044:90952:2,P-M044:TRAV5_None_TRAJ36_TGTGCAGAGCCAACTGGGGCA...,2,True,90952.0,sclone_S-M044-1:1268:2,S-M044-1:TRAV5_None_TRAJ36_TGTGCAGAGCCAACTGGGG...,2,True,1268.0


In [22]:
# Count how many rows carry each TCR_pclone.id value, sorted from most to least frequent.
tcr_data['TCR_pclone.id'].value_counts()

pclone_P-M025:30070:660     660
pclone_P-S084:141450:643    643
pclone_P-HC004:36283:507    507
pclone_P-M026:30505:495     495
pclone_P-S053:77375:484     484
                           ... 
pclone_P-S087:159078:1        1
pclone_P-S087:158430:1        1
pclone_P-S087:158466:1        1
pclone_P-S087:158999:1        1
pclone_P-S050:73809:1         1
Name: TCR_pclone.id, Length: 163234, dtype: int64