In [354]:
library(tidyverse)
library(LymphoSeq2)
library(forcats)
library(lubridate)
library(readxl)
library(ggpubr)
library(patchwork)
library(vegan)
library(tidygraph)
library(ggraph)
library(gtools)
library(graphlayouts)
library(Biostrings)
library(corrgram)
library(GGally)
library(ggseqlogo)
library(ggh4x)
library(ggcorrplot)
library(ggdendro)
library("ltm")
library("extrafont")
library(svglite)
library(UpSetR)
library(patchwork)
library(grid)
library(gridExtra)
library(ComplexUpset)
library(Seurat)
library(SeuratDisk)
library(SeuratData)
library(SingleCellExperiment)
library(SeuratWrappers)
library(org.Hs.eg.db)
library(ggseqlogo)
library(ggmsa)
# Suppress summarise info
options(dplyr.summarise.inform = FALSE)
options(repr.matrix.max.rows=5, repr.matrix.max.columns=5)

ggmsa v1.3.4  Document: http://yulab-smu.top/ggmsa/

If you use ggmsa in published research, please cite:
L Zhou, T Feng, S Xu, F Gao, TT Lam, Q Wang, T Wu, H Huang, L Zhan, L Li, Y Guan, Z Dai*, G Yu* ggmsa: a visual exploration tool for multiple sequence alignment and associated data. Briefings in Bioinformatics. DOI:10.1093/bib/bbac222



## Path configuration

### Configure main paths

In [None]:
analysis_path <- "/path/to/kstme/folder" 
tcr_sequencing_path <- "/path/to/ImmunoSEQ/folder" 
tcr_database_path <- "/path/to/TCR/database/folder" 
tcr_v3_sequencing_path <- "/path/to/Adaptive/v3/formated/ImmunoSEQ/data/" 
scrna_sequencing_path <- "/path/to/scRNASeq/results/folder"
results_path <- "/path/to/store/figures/tables/and/checkpoints" #You must create three sub-folders under path called figures, tables, checkpoints
reference_path <- "/path/to/reference/genomes" 

### Metadata path configuration

In [23]:
hippos_lab_metrics_path <- str_c(analysis_path, "metadata/Hippos_titer_values.xls", sep = "/")
hippos_hiv_status_path <- str_c(analysis_path, "metadata/Hippos_hiv_status.xls", sep = "/")
hippos_metadata_path <- str_c(analysis_path, "metadata/KSTME_main_metadata.csv", sep = "/")
hippos_rnaseq_formating_path <- str_c(analysis_path, "metadata/KSTME_RNASeq_formating_metadata.csv", sep = "/")
blghana_metadata_path <- str_c(analysis_path, "metadata/BLGhana_metadata.csv", sep = "/")
bluganda_metadata_path <- str_c(analysis_path, "metadata/BLUganda_metadata.csv", sep = "/")
public_rnaseq_metadata_path <- str_c(analysis_path, "metadata/RNASeq_metadata.csv", sep = "/")

### AIRR-Seq data paths

In [427]:
hippos_path <- str_c(tcr_v3_sequencing_path, "ks", sep = "/")
blghana_path <- str_c(tcr_sequencing_path, "TRB/Burkitt_Ghana", sep = "/")
bluganda_path <- str_c(tcr_sequencing_path, "TRB/UCI", sep = "/")
kspbmc_trb_path <- str_c(tcr_sequencing_path, "TRB/KS", sep = "/")
kskobs_trb_path <- str_c(tcr_sequencing_path, "TRB/KOBS", sep = "/")
ksu035_trb_path <- str_c(tcr_sequencing_path, "U035_KSHV", sep = "/")
arks_pbmc_path <- str_c(tcr_sequencing_path, "ACSR_ARKS_2", sep = "/")

### AIRR-Seq ananlysis results

In [6]:
# GLIPH analysis 
bl_gliph_trb_input_path <- str_c(analysis_path, "gliph/bl_gliph_run/bl_gliph_trb_input_table.tsv", sep = "/")
bl_gliph_clusters_path <- str_c(analysis_path, "gliph/bl_gliph_run/bl_gliph_cluster.csv", sep = "/")
arks_gliph_input_path <- str_c(analysis_path, "gliph/arks_gliph_run/arks_gliph_run.tsv", sep = "/")
arks_gliph_clusters_path <- str_c(analysis_path, "gliph/arks_gliph_run/arks_gliph_cluster.csv", sep = "/")
kstme_gliph_input_path <-  str_c(analysis_path, "gliph/kstme_tumor_normal_final/gliph_trb_table.tsv", sep = "/")
kstme_gliph_hla_input_path <- str_c(analysis_path, "gliph/kstme_tumor_normal_final/gliph_hla_table.tsv", sep = "/")
kstme_gliph_clusters_path <- str_c(analysis_path, "gliph/kstme_tumor_normal_final/gliph_clusters_cluster.csv", sep = "/") 
kstme_gliph_hlapred_path <- str_c(analysis_path, "gliph/kstme_tumor_normal_final/gliph_clusters_HLA.csv", sep = "/")
# CompAIRR results
comp_path <- str_c(tcr_sequencing_path, "Preprocessing/tables/compairr/compairr_output.tsv", sep = "/")
# Public antigen paths
vdjdb_slim_path <- str_c(tcr_database_path, "VdjDB/vdjdb.slim.txt", sep = "/")
mcpas_tcr_path <- str_c(tcr_database_path, "McPASDB/McPAS-TCR.csv", sep = "/")

### RNA-Seq and Nanostring data paths

In [15]:
hiv_fasta_path <- str_c(reference_path, "HIV_genomes/ncbi_dataset/data/GCF_000864765.1/protein.faa", sep = "/")
hhv8_fasta_path <- str_c(reference_path, "KSHV_genomes/ncbi_dataset/data/GCF_000838265.1/protein.faa", sep = "/")
cibersortx_results_path <- str_c(analysis_path, "cibersort_inputs/outputs", sep = "/")
hiv_gene_expression_path <- str_c(analysis_path, "rnaseq/hiv/salmon/kstme_hiv_genes_counts.tsv", sep = "/")
hhv8_gene_expression_path <- str_c(analysis_path, "rnaseq/hhv8/salmon/kstme_hhv8_genes_counts.tsv", sep = "/")
ilc_path <- str_c(analysis_path, "CRI_iAtlas_Portal_end_epi.csv", sep = "/") 
gtex_nses_expression_path <- str_c(analysis_path, "cibersort_inputs/quant/gtex_nses/gene_tpm_2017-06-05_v8_skin_not_sun_exposed_suprapubic.gct", sep = "/")
gtex_ses_expression_path <- str_c(analysis_path, "cibersort_inputs/quant/gtex_ses/gene_tpm_2017-06-05_v8_skin_sun_exposed_lower_leg.gct", sep = "/")
lidenge_batchone_expression_path <- str_c(analysis_path, "rnaseq/human/cibersort_inputs/lidenge_batchone.tsv", sep = "/")
lidenge_batchtwo_expression_path <- str_c(analysis_path, "rnaseq/human/cibersort_inputs/lidenge_batchtwo.tsv", sep = "/")
lidenge_batchthree_expression_path <- str_c(analysis_path, "rnaseq/human/cibersort_inputs/lidenge_batchthree.tsv", sep = "/")
lidenge_batchfour_expression_path <- str_c(analysis_path, "rnaseq/human/cibersort_inputs/lidenge_batchfour.tsv", sep = "/")
tso_expression_path <- str_c(analysis_path, "rnaseq/human/cibersort_inputs/tso_epidemic.tsv", sep = "/")
hippos_epidemic_expression <- str_c(analysis_path, "rnaseq/human/cibersort_inputs/hippos_epidemic_merged.tsv", sep = "/")
hippos_endemic_expression <- str_c(analysis_path, "rnaseq/human/cibersort_inputs/hippos_endemic_merged.tsv", sep = "/")
# Targeted gene expression
targerted_expression_path <- str_c(analysis_path, "nanostring/nanostring_counts.xlsx", sep = "/")

### scRNA-Seq GEX + VDJ data paths

In [465]:
seurat_input_path <- str_c(scrna_sequencing_path, "KS/preprocess/object/TCRmerged", sep = "/")
azimuth_reference_path <- str_c(analysis_path, "pbmc_multimodal.h5seurat", sep = "/")

ERROR: Error in parse(text = x, srcfile = src): <text>:3:0: unexpected end of input
1: seurat_input_path <- str_c(scrna_sequencing_path, "KS/preprocess/object/TCRmerged", sep = 
2: azimuth_reference_path <- str_c(analysis_path, "pbmc_multimodal.h5seurat", sep = "/")
  ^


### GLIPH2 results

In [439]:
kspbmc_gliph_clusters_path <- str_c(analysis_path, "gliph/kspbmc_gliph_run/gliph_clusters_cluster.csv", sep = "/")
kspbmc_gliph_hlapreds_path <- str_c(analysis_path, "gliph/kspbmc_gliph_run/gliph_clusters_HLA.csv", sep = "/")

### Output paths

In [12]:
figures_path <- str_c(results_path, "figures", sep = "/")
table_path <- str_c(results_path, "tables", sep = "/")
checkpoint_path <- str_c(results_path, "checkpoints", sep = "/")

## Format metadata

### HIPPOS

#### Load raw titer tables from the HIPPOS study

In [25]:
hippos_titer_table <- readxl::read_excel(hippos_lab_metrics_path) |>
  dplyr::select(repertoire_id, ptid, collected_date_idi,
    matches("visitcode|hivvl|cd4abs|cd8abs|colldt_kshv_or\\d+|titer_kshv_or\\d+|
    colldt_kshv_pl\\d+|titer_kshv_pl\\d+"))
kstme_titer_table

repertoire_id,ptid,collected_date_idi,visitcode_idi,hivvlyn_idi,hivvl_lod_idi,hivvl_detected_idi,hivvl_detectedcat_idi,hivvl_idi,cd4abs_test_date_idi,⋯,titer_kshv_pl2,titer_kshv_pl3,titer_kshv_pl4,titer_kshv_pl5,titer_kshv_pl6,titer_kshv_pl7,visitcode_kshv_pl8,titer_kshv_pl8,titer_kshv_pl9,titer_kshv_pl10
<chr>,<chr>,<dttm>,<chr>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<dttm>,⋯,<dbl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>
008_001_A,U008-001,2012-10-04,V00,NO,,,,,,⋯,,,,,,,,,,
008_001_A,U008-001,2012-10-08,V01,YES,20,YES,DETECTED>LOD,276908,2012-10-08,⋯,,,,,,,,,,
008_001_A,U008-001,2012-12-31,V05,YES,20,YES,DETECTED>LOD,347326,2012-12-31,⋯,,,,,,,,,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
008_261_V9_10_26_21,U008-261,,,,,,,,,⋯,,,,,,,,,,
008_264_V1_5_6_21,U008-264,,,,,,,,,⋯,,,,,,,,,,


Next we are going to create a table that shows the study entry dates for each of the individuals enrolled in the HIPPOS cohort

In [26]:
hippos_study_entry_table <- hippos_titer_table |> 
  dplyr::select(ptid, matches("date|colldt"))  |> 
  pivot_longer(-ptid, names_to = "group", values_to = "date") |> 
  group_by(ptid) |> 
  summarize(min_date = min(date, na.rm = TRUE)) |>
  ungroup() |> 
  dplyr::rename(patient_id = ptid, study_entry_date = min_date) |> 
  mutate(patient_id = str_replace(str_remove(patient_id, "U"), "-", "_"),
    study_entry_date = as_date(study_entry_date)) 
hippos_study_entry_table

patient_id,study_entry_date
<chr>,<date>
008_001,2012-10-04
008_002,2012-10-15
008_003,2012-10-22
⋮,⋮
008_263,2021-04-20
008_264,2021-04-26


#### HIV titer values
HIV titre values are just recorded at every entry for individuals with epidemic 
KS. PTIDs in the titre tables do not match the rest of our metadata this needs
to standardized first. The visit codes need to be transformed into consistent 
factors. Dates need to be converted to relative time frames. When the viral 
loads are below LOD, titre values are set to NA.

In [27]:
hippos_hiv_titer_table <- read_excel(hippos_hiv_status_path) |> #Just one sheet in dataset
  dplyr::select(-c(ptid, visitcode, accession_no, hivrna_comments, 
    hivrna_detectedcat_bl, hivrna_dt_bl, hivrna_bl)) |> #We just need the HIV counts in blood by visit
  dplyr::rename(patient_id = PTID, screen_date = hivscreendt, titre = hivrna,
    collection_date = hivrna_dt, visit_code = visitname) |> #Rename variables 
  filter(!is.na(hiv)) |> #Remove missing information
  mutate(patient_id = str_replace(str_extract(patient_id, "008-\\d+"), 
      "-", "_"), #Need to convert the PTIDs to the format compatible with sequencing data
    collection_date = as_date(collection_date), 
    screen_date = as_date(screen_date),
    visit_code = factor(visit_code, levels = c("V00", "V01", "V02", "V03",
      "V04", "V05", "V06", "V07", "V08", "V09", "V10", "V11")), #The max visit observed was V11
    days_in_study = interval(screen_date, collection_date) %/% days(1),
    category = "HIV", units = "copies/mL") |>
  dplyr::select(patient_id, visit_code, titre, category, units, days_in_study) |>
  arrange(patient_id, visit_code)
hippos_hiv_titer_table

patient_id,visit_code,titre,category,units,days_in_study
<chr>,<fct>,<dbl>,<chr>,<chr>,<dbl>
008_001,V01,276908,HIV,copies/mL,4
008_001,V05,347326,HIV,copies/mL,88
008_001,V08,685,HIV,copies/mL,174
⋮,⋮,⋮,⋮,⋮,⋮
008_265,V01,139653,HIV,copies/mL,
008_265,V05,141524,HIV,copies/mL,


#### CD4 values 

These follow the same table format as the CD8 table, so the code is 
essentially reused here and in the next code chunk. It is important to note that
CD4+ T-cell counts were mostly collected for just the individuals with epidemic
KS. Though a handful of individuals with endemic KS do have CD4+ counts
associated with their case

In [29]:
hippos_cd4_titer_table <- hippos_titer_table |> 
  dplyr::select(ptid, visitcode_idi, cd4abs_test_date_idi, cd4abs_idi) |> 
  dplyr::rename(patient_id = ptid, visit_code = visitcode_idi, 
    collection_date = cd4abs_test_date_idi, titre = cd4abs_idi) |>
  mutate(patient_id = str_replace(str_remove(patient_id, "U"), "-", "_"),
    collection_date = as_date(collection_date),
    visit_code = factor(visit_code, levels = c("V00", "V01", "V02", "V03", 
      "V04", "V05", "V06", "V07", "V08", "V09", "V10", "V11")),
    category = "CD4_count", units = "cells/uL") |>
  left_join(hippos_study_entry_table, by = "patient_id") |>
  mutate(days_in_study = interval(study_entry_date, collection_date) %/% days(1)) |> 
  ungroup() |> 
  dplyr::select(-collection_date, -study_entry_date) |>
  filter(!is.na(titre))
hippos_cd4_titer_table

patient_id,visit_code,titre,category,units,days_in_study
<chr>,<fct>,<dbl>,<chr>,<chr>,<dbl>
008_001,V01,127,CD4_count,cells/uL,4
008_001,V05,127,CD4_count,cells/uL,88
008_001,V08,225,CD4_count,cells/uL,174
⋮,⋮,⋮,⋮,⋮,⋮
008_263,V09,286,CD4_count,cells/uL,275
008_263,V10,406,CD4_count,cells/uL,366


#### KSHV plasma titer
Now we repeat this process for KSHV titres in peripheral blood samples

In [31]:
hippos_plasma_titre_dates <- hippos_titer_table |> 
  dplyr::select(ptid, visitcode_kshv_pl, matches("colldt_kshv_or1")) |>
  pivot_longer(cols = -c(ptid, visitcode_kshv_pl)) |>
  dplyr::rename(patient_id = ptid, visit_code = visitcode_kshv_pl, observation = name,
    collection_date = value) |> 
  mutate(patient_id = str_replace(str_remove(patient_id, "U"), "-", "_"),
    collection_date = as_date(collection_date), 
    observation = str_remove(observation, "colldt_kshv_"),
    observation = str_replace(observation, "or", "pl")) |>
  filter(!is.na(collection_date)) |> 
  left_join(hippos_study_entry_table, by = "patient_id") |>
  mutate(days_in_study = interval(study_entry_date, collection_date) %/% days(1))
# Separate out titres for each swab at each time point
hippos_kshv_plasma_titre_values <- hippos_titer_table |> 
  dplyr::select(ptid, visitcode_kshv_pl, matches("titer_kshv_pl1")) |>
  pivot_longer(cols = -c(ptid, visitcode_kshv_pl)) |>
  dplyr::rename(patient_id = ptid, visit_code = visitcode_kshv_pl, observation = name,
    titre = value) |> 
  mutate(patient_id = str_replace(str_remove(patient_id, "U"), "-", "_"),
    observation = str_remove(observation, "titer_kshv_")) |>
  filter(!is.na(titre)) 
# Merge the dates and titres on patient_id, visit_code, and swab number. As 
# mentioned earlier, for now, we are just keeping one titre value for each
# visit. For each visit the lowest value of days_in_study and the highest value
# of titre in recorded
hippos_kshv_plasma_titre_table <- left_join(hippos_plasma_titre_dates,
    hippos_kshv_plasma_titre_values,
    by = c("patient_id", "visit_code", "observation")) |>
  dplyr::select(-study_entry_date) |>
  ungroup() |> 
  group_by(patient_id, visit_code) |> 
  summarize(titre = max(titre), days_in_study = min(days_in_study)) |> 
  ungroup() |> 
  mutate(visit_code = factor(visit_code, levels = c("V00", "V01", "V02", "V03", 
    "V04", "V05", "V06", "V07", "V08", "V09", "V10", "V11")),
    category = "KSHV_plasma", units = "copies/mL")
hippos_kshv_plasma_titre_table

patient_id,visit_code,titre,days_in_study,category,units
<chr>,<fct>,<dbl>,<dbl>,<chr>,<chr>
008_001,V01,0.00,4,KSHV_plasma,copies/mL
008_001,V02,5062.27,26,KSHV_plasma,copies/mL
008_001,V03,2158.66,46,KSHV_plasma,copies/mL
⋮,⋮,⋮,⋮,⋮,⋮
008_263,V01,19.85711,0,KSHV_plasma,copies/mL
008_264,V01,23.17499,10,KSHV_plasma,copies/mL


In [32]:
# Now we can merge all six tables into one titre table for whole study
hippos_titer_tables <- bind_rows(hippos_hiv_titer_table, hippos_cd4_titer_table, 
  hippos_kshv_plasma_titre_table)
hippos_titer_tables

patient_id,visit_code,titre,category,units,days_in_study
<chr>,<fct>,<dbl>,<chr>,<chr>,<dbl>
008_001,V01,276908,HIV,copies/mL,4
008_001,V05,347326,HIV,copies/mL,88
008_001,V08,685,HIV,copies/mL,174
⋮,⋮,⋮,⋮,⋮,⋮
008_263,V01,19.85711,KSHV_plasma,copies/mL,0
008_264,V01,23.17499,KSHV_plasma,copies/mL,10


In [33]:
# By pivoting wide, we can generate a table with one row for each of the
# subjects in the study. 
hippos_titre_table_wide <- hippos_titer_tables |> 
  pivot_wider(id_cols = c(patient_id, visit_code), 
    names_from = category, values_from = titre) |>
  group_by(patient_id, visit_code) |>
  summarize_all(max)
hippos_titre_table_wide

patient_id,visit_code,HIV,CD4_count,KSHV_plasma
<chr>,<fct>,<dbl>,<dbl>,<dbl>
008_001,V01,276908,127,0.00
008_001,V02,,,5062.27
008_001,V03,,,2158.66
⋮,⋮,⋮,⋮,⋮
008_265,V01,139653,,
008_265,V05,141524,,


### Sequencing metadata
The current study was concieved with a subset of the Hippos datasets where we
had collected many modalities of sequencing data. This dataset will be reffered
to as the KSTME dataset. A total of 622 TRB repertoires from the 130 individuals in the study. With a total of 436 repertoires (74 PBMCs, 84 NAT, 268 Tumor, and 10 scREP) from individuals with epidemic KS and 152 repertoires (42 PBMCs, 26 NAT, 80 Tumor, and 4 scREP) from individuals with endemic KS. This is accompanied with 27 nanostring datasets and 52 bulk RNA-Seq datasets from Endemic and Epidemic KS lesions. For each of the individuals with sequencing data, we also have accompanying HLA typing information. 

In [34]:
hippos_sequencing_table <- readr::read_csv(hippos_metadata_path, 
  show_col_types = FALSE)
hippos_sequencing_table

cohort,patient_id,repertoire_id,tra_repertoire_id,trb_repertoire_id,rna_libraries,nanostring_libraries,gex_library,vdj_library,sc_library_count,⋯,DPB1allele2,DQA1allele1,DQA1allele2,DQB1allele1,DQB1allele2,DRB1allele1,DRB1allele2,DRB345allele1,DRB345allele2,timestamp
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Hippos,008_001,008_001_A,,008_001_A,,001-A,,,,⋯,DPB1*105:01:01,DQA1*01:02:01,DQA1*04:01:01,DQB1*04:02:01,DQB1*05:01:01,DRB1*03:02:01,DRB1*11:01:02,DRB3*01:01:02,DRB3*03:01:01,1/12/23
Hippos,008_001,008_001_B,008_001_B,008_001_B,008-001-B-12,001-B,,,,⋯,DPB1*105:01:01,DQA1*01:02:01,DQA1*04:01:01,DQB1*04:02:01,DQB1*05:01:01,DRB1*03:02:01,DRB1*11:01:02,DRB3*01:01:02,DRB3*03:01:01,1/12/23
Hippos,008_001,008_001_C,,,008-001-C-13,,,,,⋯,DPB1*105:01:01,DQA1*01:02:01,DQA1*04:01:01,DQB1*04:02:01,DQB1*05:01:01,DRB1*03:02:01,DRB1*11:01:02,DRB3*01:01:02,DRB3*03:01:01,1/12/23
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
TsoEtAl,SRR5787176,SRR5787176,,,p83,,,,,⋯,,,,,,,,,,
TsoEtAl,SRR5787177,SRR5787177,,,p23,,,,,⋯,,,,,,,,,,


#### Merge laboratory and sequencing metadata
The sequencing data was obtained from a subset of tissue that was used to 
quantify the laboratory metrics. Therefore we can merge the two tables by 
patient ID and visit code

In [35]:
hippos_metadata_table <- left_join(hippos_sequencing_table, hippos_titre_table_wide,
    by = c("patient_id", "visit_code")) |>
  mutate(phenotype = case_when(
    hiv_status == "Positive" & ks_status == "Positive" ~ "Epidemic KS",
    hiv_status == "Negative" & ks_status == "Positive" ~ "Endemic KS",
    hiv_status == "Negative" & ks_status == "Negative" ~ "Control"))
hippos_metadata_table

cohort,patient_id,repertoire_id,tra_repertoire_id,trb_repertoire_id,rna_libraries,nanostring_libraries,gex_library,vdj_library,sc_library_count,⋯,DQB1allele2,DRB1allele1,DRB1allele2,DRB345allele1,DRB345allele2,timestamp,HIV,CD4_count,KSHV_plasma,phenotype
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>
Hippos,008_001,008_001_A,,008_001_A,,001-A,,,,⋯,DQB1*05:01:01,DRB1*03:02:01,DRB1*11:01:02,DRB3*01:01:02,DRB3*03:01:01,1/12/23,276908,127,0,Epidemic KS
Hippos,008_001,008_001_B,008_001_B,008_001_B,008-001-B-12,001-B,,,,⋯,DQB1*05:01:01,DRB1*03:02:01,DRB1*11:01:02,DRB3*01:01:02,DRB3*03:01:01,1/12/23,276908,127,0,Epidemic KS
Hippos,008_001,008_001_C,,,008-001-C-13,,,,,⋯,DQB1*05:01:01,DRB1*03:02:01,DRB1*11:01:02,DRB3*01:01:02,DRB3*03:01:01,1/12/23,276908,127,0,Epidemic KS
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
TsoEtAl,SRR5787176,SRR5787176,,,p83,,,,,⋯,,,,,,,,,,Epidemic KS
TsoEtAl,SRR5787177,SRR5787177,,,p23,,,,,⋯,,,,,,,,,,Epidemic KS


### BL metadata
As a way to benchmark the T-cell responses in KS, we compare the diversity of 
the T-cell repertoire of the KS TME with the repertoire of other Pathogen
associated malingnancies like BL. Here we use two datasets of pediatric BL from
Ghana and Uganda. But before we load the datasets, lets format the metadata

In [36]:
blghana_metadata_table <- read_csv(blghana_metadata_path, show_col_types = FALSE) |>
  dplyr::select(fileName, tissueType) |> 
  dplyr::rename(trb_repertoire_id = fileName, tissue_type = tissueType) |>
  mutate(patient_id = str_extract(trb_repertoire_id, "\\w+"),
    cohort = "BL - Ghana",
    phenotype = "BL") 
head(blghana_metadata_table)

trb_repertoire_id,tissue_type,patient_id,cohort,phenotype
<chr>,<chr>,<chr>,<chr>,<chr>
H003840_01,Tumor,H003840_01,BL - Ghana,BL
H003840_02,Tumor,H003840_02,BL - Ghana,BL
H031201_02,Tumor,H031201_02,BL - Ghana,BL
⋮,⋮,⋮,⋮,⋮
H057518_02,Tumor,H057518_02,BL - Ghana,BL
H057519_01,Tumor,H057519_01,BL - Ghana,BL


In [37]:
bluganda_metadata_table <- read_csv(bluganda_metadata_path,
  show_col_types = FALSE, name_repair = "unique_quiet") |>
  dplyr::select(samples, patientID, tissueType, aAllele1:DRB345allele2, HIVStatus,
    EBVstatus, BL, vitalStatus, sex, age) |> 
  dplyr::rename(trb_repertoire_id = samples, tissue_type = tissueType,
    hiv_status = HIVStatus, ebv_status = EBVstatus, bl_status = BL,
    died = vitalStatus, gender = sex, patient_id = patientID) |>
  mutate(bl_status = recode(bl_status, "Yes" = "Positive", "No" = "Negative"),
    died = recode(died, "Alive" = "No", "Dead" = "Yes"),
    patient_id = as.character(patient_id),
    cohort = "BL - Uganda",
    phenotype = "BL") 
head(bluganda_metadata_table)


trb_repertoire_id,patient_id,tissue_type,aAllele1,aAllele2,bAllele1,bAllele2,cAllele1,cAllele2,DPA1allele1,⋯,DRB345allele1,DRB345allele2,hiv_status,ebv_status,bl_status,died,gender,age,cohort,phenotype
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>
MVQ80482a,009_0103,Tumor,A*02:05:01,A*74:01,B*45:01:01,B*58:02,C*06:02:01,C*06:02:01,DPA1*01:03:01,⋯,DRB3*03:01:01,DRB4*01:03:01,Negative,Positive,Positive,No,Male,10,BL - Uganda,BL
MVQ81040A,009_0109,Tumor,A*02:01:01,A*30:01:01,B*39:10:01,B*45:01:01,C*12:03:01,C*16:01:01,DPA1*01:03:01,⋯,,,Negative,Positive,Positive,No,Male,12,BL - Uganda,BL
MVQ81231A,009_0112,Tumor,A*01:01:01,A*74:01,B*45:01:01,B*58:01:01,C*03:02:02,C*06:02:01,DPA1*01:03:01,⋯,DRB3*02:02:01,DRB3*03:01:01,Negative,Negative,Positive,Yes,Female,7,BL - Uganda,BL
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
MVQ90143A,009_0128,Tumor,A*31:04,A*66:02,B*15:03:01,B*81:01,C*08:02:01,C*08:04:01,DPA1*01:03:01,⋯,DRB4*01:01:01,DRB5*01:01:01,Positive,Negative,Positive,Yes,Male,5,BL - Uganda,BL
MVQ91163A,009_0145,Tumor,,,,,,,,⋯,,,Negative,Negative,Negative,Yes,Female,8,BL - Uganda,BL


### Merge metadata

In the present study we will use multiple sources of data:
1. AIRR-Seq, RNA-Seq and Nanostring data from the HIPPOS cohort
2. RNA-Seq data from Tso et al.,
3. RNA-Seq data from Lidenge et al.,
4. AIRR-Seq data from pediatric BL from Ghana and Uganda

In the next few codeblock we will compile a joint `study_metdata` and summarize
the samples counts of the study

In [38]:
study_metadata <- bind_rows(hippos_metadata_table, blghana_metadata_table,
  bluganda_metadata_table) |> 
  mutate(repertoire_id = if_else(is.na(repertoire_id), trb_repertoire_id, 
    repertoire_id))
study_metadata

cohort,patient_id,repertoire_id,tra_repertoire_id,trb_repertoire_id,rna_libraries,nanostring_libraries,gex_library,vdj_library,sc_library_count,⋯,DRB345allele1,DRB345allele2,timestamp,HIV,CD4_count,KSHV_plasma,phenotype,DPBallele2,ebv_status,bl_status
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,⋯,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>
Hippos,008_001,008_001_A,,008_001_A,,001-A,,,,⋯,DRB3*01:01:02,DRB3*03:01:01,1/12/23,276908,127,0,Epidemic KS,,,
Hippos,008_001,008_001_B,008_001_B,008_001_B,008-001-B-12,001-B,,,,⋯,DRB3*01:01:02,DRB3*03:01:01,1/12/23,276908,127,0,Epidemic KS,,,
Hippos,008_001,008_001_C,,,008-001-C-13,,,,,⋯,DRB3*01:01:02,DRB3*03:01:01,1/12/23,276908,127,0,Epidemic KS,,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
BL - Uganda,009_0232,MVQ108912A,,MVQ108912A,,,,,,⋯,DRB3*02:02:01,DRB3*02:02:01,,,,,BL,DPB1*105:01,Positive,Positive
BL - Uganda,009_0249,MVQ194745A,,MVQ194745A,,,,,,⋯,DRB3*01:01:02,DRB3*02:02:01,,,,,BL,DPB1*105:01,Positive,Positive


## Section 1: Viral gene expression, and immune cell type abundance in KS tumor and control tissue

### Prepare HIV gene expression table

In [9]:
suppressWarnings({
hiv_dictionary <- read.table(hiv_fasta_path, sep="\n")$V1[grep(">", readLines(hiv_fasta_path))] |>
  as_tibble() |>
  separate(value, into = c("pid", "gene_name", "org"), sep = "\\s") |>
  mutate(pid = str_remove(pid, ">"),
    gene_name = if_else(gene_name == "Envelope", "Env", gene_name)) |>
    pull(gene_name)
names(hiv_dictionary) <- read.table(hiv_fasta_path, sep="\n")$V1[grep(">", readLines(hiv_fasta_path))] |>
  as_tibble() |>
  separate(value, into = c("pid", "gene_name", "org"), sep = "\\s") |>
  mutate(pid = str_remove(pid, ">"),
    gene_name = if_else(gene_name == "Envelope", "Env", gene_name)) |>
    pull(pid)
head(hiv_dictionary)})

In [10]:
# Prepare HIV gene expression table
hiv_expression_table <- read_tsv(hiv_gene_expression_path,
    show_col_types = FALSE) |>
  mutate(Name = str_remove_all(Name, "lcl\\|NC_001802.1_cds_|_\\d+$"),
    Name = hiv_dictionary[Name]) 
hiv_gene_names <- hiv_expression_table |>
  pull(Name)
head(hiv_expression_table)

Name,008_001_B,008_001_C,008_001_I,008_003_C,008_004_D,008_005_C,008_006_B,008_006_C,008_007_B,⋯,008_029_C,008_030_B,008_030_C,008_032_B,008_034_B,008_037_B,008_122_B,008_140_B,008_175_B,008_211_D
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Vpr,0.0,0,109427.0,0,0.0,0,0,146631,108206.0,⋯,146745,36466.7,532094,77129.1,0,73775.5,0,821708,0,0
Tat,57184.7,0,55645.2,658517,392418.0,0,0,0,0.0,⋯,227184,142175.0,0,142841.0,0,20955.2,0,0,0,0
Vif,0.0,0,57443.8,137771,52443.8,0,226270,203311,88059.1,⋯,42024,92065.8,0,41340.5,292284,53127.4,436469,0,0,0
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
Nef,0.0,0,285122,62371.1,430348,0,0,92812.1,258860,⋯,280027,337332,191108,354976,0,395778,387058.0,0,567126,0
Env,40654.9,0,105637,80992.5,56292,634090,0,80359.4,125382,⋯,126295,116803,159696,144293,113451,107418,61869.2,178292,201805,777618


### Prepare HHV8 gene expression tables

In [11]:
# Prepare HHV8 gene dictionary
suppressWarnings({
hhv8_dictionary <- read.table(hhv8_fasta_path, sep="\n")$V1[grep(">", readLines(hhv8_fasta_path))] |>
  as_tibble() |>
  separate(value, into = c("pid", "gene_name", "org"), sep = "\\s") |>
  mutate(pid = str_remove(pid, ">"),
    gene_name = if_else(gene_name == "Envelope", "Env", gene_name)) |>
    pull(gene_name)
names(hhv8_dictionary) <- read.table(hhv8_fasta_path, sep="\n")$V1[grep(">", readLines(hhv8_fasta_path))] |>
  as_tibble() |>
  separate(value, into = c("pid", "gene_name", "org"), sep = "\\s") |>
  mutate(pid = str_remove(pid, ">"),
    gene_name = if_else(gene_name == "Envelope", "Env", gene_name)) |>
    pull(pid)
head(hhv8_dictionary)})

In [13]:
# Prepare HHV8 gene expression data
hhv8_expression_table <- read_tsv(hhv8_gene_expression_path,
    show_col_types = FALSE) |>
  dplyr::mutate(Name = str_remove_all(Name, "lcl\\|NC_009333.1_cds_|_\\d+$"),
    Name = hhv8_dictionary[Name])
# Break up KSHV genes into groups based on stage of expression
#https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1001013#s5
#https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3894221/
kshv_groups <- c("vIRF-3" = "La", "ORF73" = "La", "K4.2" = "IE", "ORF45" = "IE",
  "ORF48" = "IE", "ORF50" = "IE", "K8" = "IE", "ORF70" = "E1", "ORF10" = "E1",
  "ORF56" = "E1", "ORF11" = "E1", "K3" = "E1", "ORF40" = "E1", "ORF59" = "E1",
  "K15" = "E1", "ORF54" = "E1", "ORF2" = "E2", "K1" = "E2", "K2" = "E2",
  "ORF46" = "E2", "ORF9" = "E2", "K5" = "E2", "K7" = "E2", "K6" = "E2",
  "vIRF-1" = "E2", "K14" = "E2", "vIRF-2" = "E2", "ORF49" = "E2", "ORF74" = "E2",
  "ORF7" = "E2", "ORF6" = "E2", "ORF44" = "E3", "ORF31" = "E3", "ORF19" = "E3",
  "ORF37" = "E3",  "ORF57" = "E3", "ORF17" = "E3", "ORF36" = "E3", "ORF29" = "E3",
  "ORF21" = "E3", "ORF61" = "E3", "ORF60" = "E3", "ORF66" = "E3", "ORF16" = "E3",
  "vIRF-4" = "E3", "ORF69" = "E3", "ORF23" = "L4", "ORF22" = "L4", "ORF25" = "L4",
  "ORF20" = "L4", "ORF26" = "L4", "ORF24" = "L4", "ORF27" = "L4", "ORF30" = "L4",
  "ORF28" = "L4", "ORF34" = "L4", "ORF18" = "L4", "ORF35" = "L4", "ORF63" = "L4",
  "ORF62" = "L4", "ORF65" = "L4", "ORF32" = "L4", "ORF38" = "L4", "ORF33" = "L4", 
  "ORF43" = "L4", "ORF64" = "L4", "ORF67" = "L4", "ORF68" = "L4", "ORF42" = "L5",
  "ORF4" = "L5", "ORF39" = "L5", "K8.1" = "L5", "ORF8" = "L5", "ORF47" = "L5",
  "ORF75" = "L5", "ORF58" = "L5", "ORF52" = "L5", "ORF53" = "L5", "ORF55" = "L5",
  "K12" = "La", "K4" = "E1", "K4.1" = "E1", "ORF17.5" = "E2", "ORF67A" = "L5",
  "ORF71" = "La", "ORF72" = "La" )
  head(hhv8_expression_table)

Name,008_001_B,008_001_C,008_001_I,008_003_C,008_004_D,008_005_C,008_006_B,008_006_C,008_007_B,⋯,008_098_C,008_122_B,008_140_B,008_140_D,008_175_B,008_189_B,008_196_C,008_211_D,008_216_D,008_234_D
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ORF43,313.259,22.1425,1172.3,194.571,618.727,0,15.0339,69.0059,0.000,⋯,0.000,0,163.988,0.00,791.9940,28.4459,98.8281,0.000,296.225,640.2350
ORF4,116.006,24.6127,0.0,648.511,1721.300,0,4563.1200,3313.0000,344.554,⋯,729.133,0,1569.200,2361.93,84.9846,1699.8700,278.5380,239.734,0.000,385.9460
ORF19,116.241,9.8651,1306.5,0.000,1724.840,0,16.7403,0.0000,0.000,⋯,0.000,0,0.000,0.00,0.0000,32.1476,0.0000,0.000,0.000,77.3667
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
ORF36,295.124,150.500,0,1100.03,3079.28,0,21.2659,48.8927,2192.8,⋯,0.0,6102.28,0.000,0.000,618.423,297.66,0.00,0.00,583.544,713.919
K8.1,5628.400,837.321,0,9254.41,8110.30,0,335.3740,444.0080,22716.7,⋯,690.7,5818.84,333.623,713.613,3145.210,4220.06,1020.26,215.76,1569.710,9186.370


### Prepare CibersortX results

In [21]:
# Load and format CibersortX results
cibersort_table <- list.files(cibersortx_results_path, pattern = "hippos",
    full.names = TRUE) |>
    map_df(~read_csv(.x, show_col_types = FALSE)) |>
    mutate(Mixture = str_remove(Mixture, "_quant"),
        Mixture = str_replace_all(Mixture, "-", "_"),
        Mixture = str_extract(Mixture, "008_\\d+_\\w")) |>
    dplyr::select(-`P-value`, -Correlation, -RMSE) |> 
    pivot_longer(c(-Mixture, -`Absolute score (sig.score)`), names_to = "cell_type", values_to = "score") |>
    mutate(percentage = (score*100)/`Absolute score (sig.score)`) |>
    pivot_wider(id_cols = Mixture, names_from = cell_type, values_from = percentage)
cibersort_table 

Mixture,B cells naive,B cells memory,Plasma cells,T cells CD8,T cells CD4 naive,T cells CD4 memory resting,T cells CD4 memory activated,T cells follicular helper,T cells regulatory (Tregs),⋯,Monocytes,Macrophages M0,Macrophages M1,Macrophages M2,Dendritic cells resting,Dendritic cells activated,Mast cells resting,Mast cells activated,Eosinophils,Neutrophils
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
008_061_C,7.912886,0,3.921065,19.70740,0,3.996909,1.182755,5.857817,0.5966646,⋯,1.1638893,10.835613,11.778665,23.80936,1.9529614,0,2.60010266,0.0000000,0,0.00000000
008_098_B,6.596161,0,3.678067,21.50593,0,0.000000,9.657895,3.845360,0.9626662,⋯,0.0000000,1.311762,6.150924,39.87146,0.0000000,0,0.00000000,0.0000000,0,0.23090733
008_098_C,5.366013,0,2.474363,42.23846,0,0.000000,4.580758,3.435627,6.1273057,⋯,0.5155276,0.000000,4.586556,24.21384,0.1408064,0,0.03316393,0.2484124,0,0.03748412
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
008_034_B,0.000000,0.3521525,4.483397,25.12626,0,0,0,16.59834,0.0000000,⋯,0.1082690,0.00000,22.656619,14.69550,0.3275799,0,8.387082,0,0,1.730945
008_037_B,7.057123,0.0000000,14.341140,18.42728,0,0,0,14.77712,0.3843394,⋯,0.5192566,8.19816,7.489275,20.11378,0.0000000,0,3.648959,0,0,0.000000


### Format HIV table to include all samples form the study

In [22]:
# Generate HIV sample list table
# We will need this table to generate the final heatmap. 
# Since many samples do not show expression of HIV, this table will help
# generate a full table with 51 samples
sample_list <- cibersort_table |> 
  pull(Mixture)
hiv_gene_list <- hiv_expression_table |>
  pull(Name)

hiv_sample_table <- expand_grid(sample_list, hiv_gene_list) |>
  mutate(tpm = NULL) |>
  dplyr::rename(samples = sample_list, Name = hiv_gene_list)
sample_table <- cibersort_table |> 
  pull(Mixture) |> 
  as_tibble() |> 
  dplyr::rename(samples = value)
head(hiv_sample_table)

samples,Name
<chr>,<chr>
008_061_C,Vpr
008_061_C,Tat
008_061_C,Vif
⋮,⋮
008_061_C,Nef
008_061_C,Env


In [41]:
# Prep RNA-Seq metadata
rna_seq_metadata <-  study_metadata |> 
  filter(!is.na(rna_libraries)) |> 
  dplyr::select(cohort, repertoire_id, rna_libraries, tissue_type, phenotype)
rna_seq_metadata

cohort,repertoire_id,rna_libraries,tissue_type,phenotype
<chr>,<chr>,<chr>,<chr>,<chr>
Hippos,008_001_B,008-001-B-12,Tumor,Epidemic KS
Hippos,008_001_C,008-001-C-13,Tumor,Epidemic KS
Hippos,008_001_I,008-001-I-282,Tumor,Epidemic KS
⋮,⋮,⋮,⋮,⋮
TsoEtAl,SRR5787176,p83,Tumor,Epidemic KS
TsoEtAl,SRR5787177,p23,Tumor,Epidemic KS


### Figure 1

In [94]:
plot_order <- cibersort_table |> 
    left_join(rna_seq_metadata, by = c("Mixture" = "repertoire_id")) |>
    dplyr::arrange(phenotype, `Macrophages M2`) |>
    pull(Mixture)

In [95]:
# Plot cibersortX panel
cibersort_plot <- cibersort_table |>
  pivot_longer(cols = `B cells naive`:Neutrophils, names_to = "cell_type", 
    values_to = "abundance" ) |> 
  left_join(rna_seq_metadata, by = c("Mixture" = "repertoire_id")) |>
  mutate(Mixture = factor(Mixture, levels = plot_order)) |>
  ggplot(aes(x = cell_type, y = Mixture, fill = abundance)) +
  geom_tile(color = "white", lwd = 0.25, linetype = 1) +
  labs(y = "TRB repertoire ID", x = "Cell type", fill = "Cell type\nabundance") +
  ggtitle("Immune cell-type\ncomposition") +
  #scale_fill_gradientn(colors = rev(c('#b2182b','#d6604d','#d1e5f0'))) +
  scale_fill_viridis_b(option = "plasma") + 
  theme_classic(base_size = 20) +
  facet_grid(rows = vars(phenotype),scales = "free", space = "free", margins = F) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, face="bold"),  
    axis.title.x = element_blank(), 
    axis.text.y = element_blank(), axis.line.y = element_blank(), 
    axis.title.y = element_blank(), axis.ticks.y = element_blank(), 
    strip.background.x = element_blank(), 
    strip.background.y = element_rect(linewidth = 2, color = "black", linetype = 1), 
    strip.text.y.right = element_text(face = "bold"),
    strip.placement = "inside", plot.title = element_text(face = "bold")) 

In [96]:
# Plot HIV gene expression panel
hiv_plot <- hiv_expression_table |>
  pivot_longer(-Name, names_to = "samples",
    values_to = "tpm") |>
  full_join(hiv_sample_table) |> 
  left_join(rna_seq_metadata, by = c("samples" = "repertoire_id")) |>
  mutate(tpm = if_else(tpm == 0, NA_integer_, tpm),
    samples = factor(samples, levels = plot_order)) |>
  ggplot(aes(x = Name, y = samples, fill = tpm)) +
  geom_tile(color = "white", lwd = 0.25, linetype = 1) +
  labs(x = "Gene", fill = "TPM") +
  ggtitle("HIV gene\nexpression") +
  scale_fill_viridis_c(option = "viridis", 
    trans = scales::pseudo_log_trans(sigma = 0.001),
    breaks = c(0, 10, 100, 1000, 10000, 100000, 1000000)) +
  theme_classic(base_size = 20) +
  facet_grid(rows = vars(phenotype), scales = "free", space = "free", 
    margins = F) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, face="bold"), 
    axis.title.x = element_blank(), legend.position = "none", 
    axis.text.y = element_blank(), axis.line.y = element_blank(), 
    axis.title.y = element_blank(), axis.ticks.y = element_blank(),  
    strip.background = element_blank(), strip.text.y = element_blank(),
    strip.placement = "inside", plot.title = element_text(face = "bold")) 

[1m[22mJoining with `by = join_by(Name, samples)`


In [97]:
# Plot KSHV gene expression
kshv_plot <- hhv8_expression_table |>
  pivot_longer(col = `008_001_B`:`008_234_D`, names_to = "samples",
    values_to = "tpm") |>
  mutate(tpm = if_else(tpm == 0, NA_integer_, tpm)) |>
  right_join(sample_table) |> 
  left_join(rna_seq_metadata, by = c("samples" = "repertoire_id")) |>
  mutate(stage = factor(kshv_groups[Name], 
    levels =c("La", "IE", "E1", "E2", "E3", "L4", "L5", NA)),
    samples = factor(samples, levels = plot_order)) |>
  ggplot(aes(x = Name, y = samples, fill = tpm)) +
  geom_tile(color = "white", lwd = 0.25, linetype = 1) +
  labs(x = "Gene", fill = "TPM", y = "Sample ID") +
  ggtitle("KSHV gene\nexpression") +
  scale_fill_viridis_c(option = "viridis", 
    trans = scales::pseudo_log_trans(sigma = 0.001),
    breaks = c(0, 10, 100, 1000, 10000, 100000, 1000000)) +
  theme_classic(base_size = 20) +
  facet_grid(rows = vars(phenotype), cols = vars(stage), scales = "free", 
    space = "free", margins = F)  +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, face="bold"),
    axis.text.y = element_text(face = "bold"),
    axis.title.x = element_blank(),panel.spacing.x = unit(0.25, "lines"),
    strip.text.y = element_blank(), strip.text.x = element_text(face="bold"),
    strip.placement = "inside", axis.title.y = element_text(face = "bold"),
    plot.title = element_text(face = "bold")) 

[1m[22mJoining with `by = join_by(samples)`


In [124]:
# Plot cell type boxplot
cohort_palette <- c("Epidemic KS" = "#e41a1c", "Endemic KS" = "#377eb8")
cibersort_boxplots <- cibersort_table |>
  pivot_longer(`B cells naive`:Neutrophils, names_to = "cell_type", 
    values_to = "abundance") |>
  filter(cell_type %in% c("Macrophages M2", 
    "T cells CD4 memory activated", "T cells CD4 memory resting",
    "T cells CD8", "T cells follicular helper")) |>
  left_join(rna_seq_metadata, by = c("Mixture" = "repertoire_id")) |>
  mutate(cell_type = case_when(cell_type == "T cells CD4 memory activated" ~ "T cells CD4\nmemory activated",
    cell_type == "T cells CD4 memory resting" ~ "T cells CD4\nmemory resting",
    cell_type == "T cells follicular helper" ~ "T cells\nfollicular helper",
    TRUE ~ cell_type)) |>
  ggboxplot(x = "phenotype", y = "abundance", color = "phenotype", 
    facet.by = "cell_type", nrow = 1) |>
  ggpar(xlab = "Cohort", ylab = "Abundance", legend.title = "Cohort",
    ggtheme = theme_classic(base_size = 20), ylim = c(0, 60),
    legend = "right", palette = cohort_palette,
    panel.labs.font = list(face = "bold")) +
  stat_compare_means(comparisons = list(c("Endemic KS", "Epidemic KS")),
    size = 5, label = "p.signif") +
  theme(strip.text.x = element_text(face="bold"),
    axis.text.x = element_blank(), axis.ticks.x = element_blank(),
    axis.title = element_text(face = "bold"))

In [133]:
## Plot correlation scatter plot for M2 macrophages and CD8 T-cells
tcell_mac_corr_plot <- cibersort_table |>
    left_join(rna_seq_metadata, by = c("Mixture" = "repertoire_id")) |>
    ggplot(aes(x = `Macrophages M2`, y = `T cells CD8`, color = phenotype)) +
    geom_point() +
    labs(ylab = "T cells CD8 (%)",
        xlab = "Macrophage M2 (%)") +
    geom_smooth(method = "glm", aes(fill=phenotype)) +
    ylim(0, 50) + 
    stat_cor(p.accuracy = 0.001, r.accuracy = 0.01, size = 8)+
    scale_color_manual(values = cohort_palette) + 
    scale_fill_manual(values = cohort_palette) + 
    theme_classic(base_size = 20) +
    theme(axis.title = element_text(face = "bold"),
        legend.position = "none")

In [134]:
# Assemble all panels of Figure 1
figure_one_layout <- "
1111111111111111112233333
1111111111111111112233333
1111111111111111112233333
1111111111111111112233333
4444444444444444444455555"
suppressWarnings({
figure_one <- kshv_plot +  hiv_plot +
  cibersort_plot  +  cibersort_boxplots + tcell_mac_corr_plot +
  plot_layout(design = figure_one_layout, guides = "collect",
    widths =c(9, 1, 2.5, 10, 2.5)) +
  plot_annotation(tag_levels = 'A') & theme(text = element_text('NimbusSan'))
ggsave(str_c(figures_path, "pdf", "Figure_one.pdf", sep = "/"), figure_one, 
  limitsize = FALSE, width = 30, height = 24, device = cairo_pdf, family = "Arial Unicode MS")
ggsave(str_c(figures_path, "svg", "Figure_one.svg", sep = "/"), figure_one,
  limitsize = FALSE, width = 30, height = 24, device = "svg")
ggsave(str_c(figures_path, "png", "Figure_one.png", sep = "/"), figure_one,
  limitsize = FALSE, width = 30, height = 24, device = "png")})

[1m[22m`geom_smooth()` using formula = 'y ~ x'
[1m[22m`geom_smooth()` using formula = 'y ~ x'
[1m[22m`geom_smooth()` using formula = 'y ~ x'


### Supplementary figure 1:

In [137]:
phenotype_table <- study_metadata |> 
    filter(cohort == "Hippos") |>
    dplyr::select(patient_id, phenotype) |>
    mutate(phenotype = replace_na(phenotype, "Epidemic KS")) |>
    distinct()


In [139]:
targeted_viral_expression <- read_excel(targerted_expression_path, sheet = 4)
supplementary_figure_one <- targeted_viral_expression  |>
    pivot_longer(c(-Target, -`Probe Name`), names_to = "sample", values_to = "l2fc") |>
    mutate(sample = case_when(sample == "8020" ~ "008_020",
        sample == "030-B" ~ "008_030", sample == "028-B" ~ "008_028",
        sample == "024-B" ~ "008_024", sample == "013-B" ~ "008_013",
        sample == "008-C" ~ "008_008", sample == "001-B" ~ "008_001",
        TRUE ~ str_replace(sample, "-", "_")),
      sample = factor(sample, levels = c("008_001", "008_008", "008_013",
          "008_020", "008_024", "008_028", "008_030", "008_036", "008_037",
          "008_052", "008_059", "008_061", "008_062", "008_075", "008_085",
          "008_088", "008_092", "008_093", "008_097", "008_099", "008_101",
          "008_106"))) |>
    left_join(phenotype_table, by = c("sample" = "patient_id")) |>
    arrange(sample) |>
    ggplot(aes(y = `sample`, x = `Probe Name`, fill = l2fc)) +
    geom_tile(color = "white", lwd = 0.5, linetype = 1) +
    labs(x = "Probe name", y = "Sample", fill = "Log2FC") +
    #scale_fill_gradientn(colors = rev(c('#b2182b','#d6604d','#d1e5f0'))) +
    scale_fill_viridis_b(breaks = c(2,4,6,8,10)) +
    theme_classic(base_size = 20) +
    facet_grid(cols = vars(Target), rows = vars(phenotype), 
      scales = "free", space = "free", margins = F) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, face = "bold", family = 'NimbusSan'),
        axis.text.y = element_text(face = "bold", family = 'NimbusSan'),
         strip.text.x = element_text(face="bold", family = 'NimbusSan'),
         strip.text.y = element_text(face = "bold", angle = 0, family = 'NimbusSan'),
        axis.title.x = element_blank(), 
        axis.line.y = element_blank(), legend.key.size = unit(1, "cm"),
        axis.title.y = element_blank(), axis.ticks.y = element_blank()) 
ggsave(str_c(figures_path, "pdf", "Supplementary_figure_one.pdf", sep = "/"), supplementary_figure_one, 
  limitsize = FALSE, width = 20, height = 12, device = cairo_pdf, family = "Arial Unicode MS")
ggsave(str_c(figures_path, "svg", "Supplementary_figure_one.svg", sep = "/"), supplementary_figure_one,
  limitsize = FALSE, width = 20, height = 12, device = "svg")
ggsave(str_c(figures_path, "png", "Supplementary_figure_one.png", sep = "/"), supplementary_figure_one,
  limitsize = FALSE, width = 20, height = 12, device = "png")

### Supplementary figure 2:


In [141]:
readCiberSort <- function(path) {
    study = basename(path)
    cib_table <- read_csv(path, show_col_types = FALSE) |>
        mutate(Mixture = str_remove(Mixture, "_quant"),
            cohort = study,
            cohort = case_when(cohort == "gtex_nes.csv" ~ "Non sun-exposed skin",
                cohort == "gtex_ses.csv" ~ "Sun-exposed skin",
                str_detect(cohort, "lid") ~ "Lidenge et al.",
                str_detect(cohort, "tso") ~ "Tso et al."))
    return(cib_table)
}


public_cibersort_table <- list.files(cibersortx_results_path,
        pattern = "lid|tso|gtex", full.names = TRUE) |>
    map_df(readCiberSort)  |>
    dplyr::select(-`P-value`, -Correlation, -RMSE) |> 
    pivot_longer(c(-Mixture, -cohort,  -`Absolute score (sig.score)`),
        names_to = "cell_type", values_to = "score") |>
    mutate(percentage = (score*100)/`Absolute score (sig.score)`) |>
    pivot_wider(id_cols = c(Mixture, cohort), names_from = cell_type,
        values_from = percentage)
public_cibersort_table

Mixture,cohort,B cells naive,B cells memory,Plasma cells,T cells CD8,T cells CD4 naive,T cells CD4 memory resting,T cells CD4 memory activated,T cells follicular helper,⋯,Monocytes,Macrophages M0,Macrophages M1,Macrophages M2,Dendritic cells resting,Dendritic cells activated,Mast cells resting,Mast cells activated,Eosinophils,Neutrophils
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
GTEX-1117F-2926-SM-5GZYI,Non sun-exposed skin,0.000000,0.0000000,0,7.4767371,0,5.5761521,0,3.842084,⋯,0.000000,5.678447,0.000000,18.45082,0.000000,0.000000,55.45436,0,0,0.000000
GTEX-111CU-1126-SM-5EGIM,Non sun-exposed skin,3.686808,0.0000000,0,1.6520993,0,0.8580771,0,5.497539,⋯,0.000000,6.602357,0.000000,16.72780,11.188591,1.285114,47.10898,0,0,1.528191
GTEX-111FC-2526-SM-5GZXU,Non sun-exposed skin,0.000000,0.7061475,0,0.7025772,0,10.7521579,0,9.144440,⋯,2.019597,0.000000,0.549818,14.75316,1.070596,0.000000,54.85222,0,0,0.000000
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
SRR5787176,Tso et al.,0.000000,1.838458,9.234666,7.635842,0,0.000000,0,3.198196,⋯,0.08701132,18.28287,4.904041,36.51076,3.438229,0,5.2369415,0.000000,0,0.08552486
SRR5787177,Tso et al.,2.546005,0.000000,6.736545,11.449475,0,1.000833,0,8.065807,⋯,0.00000000,13.29125,10.929159,27.07443,1.822308,0,0.4075915,2.379944,0,0.00000000


In [457]:
supplementary_figure_two_heatmap <- public_cibersort_table |>
  pivot_longer(cols = `B cells naive`:Neutrophils, names_to = "cell_type", 
    values_to = "abundance" ) |>
  mutate(Mixture = if_else(cohort == "Lidenge et al.", 
    str_extract(Mixture, "SRR\\d+$"), Mixture)) |>
  left_join(rna_seq_metadata, by = c("Mixture" = "repertoire_id")) |>
  mutate(phenotype = case_when(cohort.x %in% c("Non sun-exposed skin", "Sun-exposed skin") & is.na(tissue_type) ~ "Control", 
    cohort.x == "Lidenge et al." & tissue_type == "Normal" ~ "Con-\ntrol",
    tissue_type == "Tumor" & phenotype == "Endemic KS" ~ "Ende-\nmic KS", 
    cohort.x == "Tso et al." & tissue_type == "Tumor" & phenotype == "Epidemic KS" ~ "Epide-\nmic KS", 
    tissue_type == "Tumor" & phenotype == "Epidemic KS" ~ "Epidemic KS",
    TRUE ~ tissue_type),
    cohort.x = factor(cohort.x, levels = c("Lidenge et al.", "Tso et al.", "Non sun-exposed skin", "Sun-exposed skin")),
    phenotype = factor(phenotype, levels = c("Con-\ntrol", "Control", "NAT", "Ende-\nmic KS", "Endemic KS", "Epide-\nmic KS", "Epidemic KS"))) |>
  ggplot(aes(y = cell_type, x = Mixture, fill = abundance)) +
  geom_tile(color = "white", lwd = 0.5, linetype = 1) +
  labs(x = "Sample", y = "Cell type", fill = "Cell type\nabundance") +
  #scale_fill_gradientn(colors = rev(c('#b2182b','#d6604d','#d1e5f0'))) +
  scale_fill_viridis_b(option = "plasma") + 
  theme_classic(base_size = 20) +
  facet_nested(cols = vars(cohort.x, phenotype),scales = "free", space = "free", margins = F) +
  theme(axis.text.x = element_blank(), strip.background = element_blank(),
    axis.title.x = element_blank(), strip.text = element_text(face = "bold"),
    axis.text.y = element_text(face = "bold"), axis.line = element_blank(), 
    axis.ticks = element_blank(),
    text = element_text(family = 'NimbusSan')) 

In [458]:
supp_cohort_palatte <- c("Epidemic KS - NAT" = "#fb8072", 
  "Epidemic KS - Tumor" = "#e41a1c", "Endemic KS - NAT" = "#80b1d3",
  "Endemic KS - Tumor" = "#377eb8", "Control" = "#67a628")


supplementary_figure_two_corrplot <- public_cibersort_table |>
    mutate(Mixture = if_else(cohort == "Lidenge et al.", 
    str_extract(Mixture, "SRR\\d+$"), Mixture)) |>
    left_join(rna_seq_metadata, by = c("Mixture" = "repertoire_id")) |>
    mutate(phenotype = case_when(cohort.x %in% c("Non sun-exposed skin", "Sun-exposed skin") & is.na(tissue_type) ~ "Control", 
    cohort.x == "Lidenge et al." & tissue_type == "Normal" ~ "Control",
    tissue_type == "Tumor" & phenotype == "Endemic KS" ~ "Endemic KS - Tumor", 
    tissue_type == "Tumor" & phenotype == "Epidemic KS" ~ "Epidemic KS - Tumor", 
    tissue_type == "NAT" & phenotype == "Epidemic KS" ~ "Epidemic KS - NAT", 
    tissue_type == "NAT" & phenotype == "Endemic KS" ~ "Endemic KS - NAT"),
    cohort.x = factor(cohort.x, levels = c("Lidenge et al.", "Tso et al.", "Non sun-exposed skin", "Sun-exposed skin")),
    phenotype = factor(phenotype)) |>
    ggscatter(x = "Macrophages M2", y = "T cells CD8", color = "phenotype", 
        facet.by = "cohort.x", nrow = 1,
    add = "reg.line")  + 
    stat_cor(aes(color = phenotype), p.accuracy = 0.001, r.accuracy = 0.01, size = 6) +
    theme_classic(base_size = 20) + 
    labs(y = "T cells CD8 (%)", x = "Macrophages M2 (%)", color = "Tissue type") +
    ylim(0,50) +
    scale_color_manual(values = supp_cohort_palatte) +
    scale_fill_manual(values = supp_cohort_palatte) +
    theme(strip.text.x = element_text(face="bold"),
      axis.title = element_text(face = "bold"),
      legend.position = "right")

In [459]:
 
supplementary_figure_two <- supplementary_figure_two_heatmap / supplementary_figure_two_corrplot
ggsave(str_c(figures_path, "pdf", "Supplementary_figure_two.pdf", sep = "/"), supplementary_figure_two, 
  limitsize = FALSE, width = 30, height = 14, device = cairo_pdf, family = "Arial Unicode MS")
ggsave(str_c(figures_path, "svg", "Supplementary_figure_two.svg", sep = "/"), supplementary_figure_two,
  limitsize = FALSE, width = 30, height = 14, device = "svg")
ggsave(str_c(figures_path, "png", "Supplementary_figure_two.png", sep = "/"), supplementary_figure_two,
  limitsize = FALSE, width = 30, height = 14, device = "png")

### Supplementary figure 3:

In [230]:
nanostring_matrix <- read_excel(targerted_expression_path, sheet=1) |>
  pivot_longer(cols = c(-`Probe Name`, -Function), names_to = "sample", values_to = "counts")
nanostring_meta <- read_excel(targerted_expression_path, sheet=2) |>
  pivot_longer(cols = `001-B`:His9DP, names_to = "sample", values_to = "tissue_type") |> 
  dplyr::select(sample, tissue_type) |> 
  distinct()

nanostring_matrix <- left_join(nanostring_matrix, nanostring_meta, by = "sample")
my_breaks <- c(0, 0.01, 0.1, 1, 10, 100, 1000, 10000)
nanostring_matrix

Function,Probe Name,sample,counts,tissue_type
<chr>,<chr>,<chr>,<dbl>,<chr>
Macrophage,CD14,001-B,903.94,Lesion
Macrophage,CD14,008-C,734.67,Lesion
Macrophage,CD14,013-B,377.48,Lesion
⋮,⋮,⋮,⋮,⋮
T-cell inhibition,TIGIT,KS Norm,1.789021,
T-cell inhibition,TIGIT,KS + HC,1.438920,


In [238]:
nanostring_plot <- nanostring_matrix |> 
  dplyr::filter(!is.na(tissue_type) & sample != "KS + HC") |>
  mutate(tissue_type = case_when(str_detect(sample, "A$") ~ "NAT",
        str_detect(sample, "^His") ~ "Histology",
        TRUE ~ "Tumor"),
    Function = case_when(Function == "Tumor proliferation" ~ "Proliferation",
      Function == "T-cell proliferation" ~ "T-cell",
      Function == "Tumor inhibition" ~ "M1 Macrophage",
      TRUE ~ Function),
    Function = factor(Function, levels = c("T-cell", "CD4 T-cell", "CD8 T-cell", 
      "T-reg", "T-cell inhibition", "NK cell", 
      "Macrophage", "M1 Macrophage", "M2 Macrophage", 
      "Proliferation"))) |>
  filter(tissue_type != "Histology") |>
  ggplot(aes(x = sample, y = `Probe Name`, fill = counts)) +
  geom_tile(aes(width=0.9, height=0.9), color = "black", linewidth=0.1) +
  labs(x = "Samples", y = "Probe", fill = "Counts") +
  scale_fill_viridis_c(breaks = my_breaks, labels = my_breaks, 
                       trans = scales::pseudo_log_trans(sigma = 0.001)) +
  theme_classic(base_size = 26) +
  facet_grid(cols = vars(tissue_type), rows = vars(Function), scales = "free", space = "free", margins = F) +
  theme(axis.text.x = element_blank(),  axis.title.x = element_blank(),
    axis.line.y = element_blank(), axis.title.y = element_blank(), 
    axis.text.y = element_text(face = "bold"),
    strip.placement = "inside", axis.ticks.y = element_blank(),
    strip.text.y = element_blank(), strip.text.x = element_text(face = "bold"),
    legend.position = "bottom",
    legend.key.size = unit(2, "cm"),
    text = element_text(family = "NimbusSan")) 

In [232]:
nanostringfc_plot <- nanostring_matrix |> 
  dplyr::filter(is.na(tissue_type) & sample != "KS + HC") |>
  mutate(tissue_type = if_else(sample == "KS + HC", "Lesion\nvs\nNormal/HC", "FC"),
    Function = case_when(Function == "Tumor proliferation" ~ "Proliferation",
      Function == "T-cell proliferation" ~ "T-cell",
      Function == "Tumor inhibition" ~ "M1 Macrophage",
      TRUE ~ Function),
    Function = factor(Function, levels = c("T-cell", "CD4 T-cell", "CD8 T-cell", 
      "T-reg", "T-cell inhibition", "NK cell", 
      "Macrophage", "M1 Macrophage", "M2 Macrophage", 
      "Proliferation"))) |>
  ggplot(aes(x = sample, y = `Probe Name`, fill = counts)) +
  geom_tile(aes(width=0.9, height=0.9), color = "black", linewidth=0.1) +
  labs(x = "Samples", y = "Probe", fill = "Log2FC") +
  scale_fill_viridis_b(option = "plasma") +
  theme_classic(base_size = 26) +
  facet_grid(cols = vars(tissue_type), rows = vars(Function), scales = "free", space = "free", margins = F) +
  theme(axis.text.x = element_blank(),  axis.title.x = element_blank(),
    axis.text.y = element_blank(),  
    axis.line.y = element_blank(), axis.title.y = element_blank(), 
    strip.placement = "inside", axis.ticks.y = element_blank(),
    strip.text.x = element_text(face = "bold"),
    strip.text.y = element_text(angle = 0, face = "bold"), legend.position = "bottom",
    text = element_text(family = "NimbusSan")) 

In [239]:
plot_design <- "11111111111111111111111111112"
supplementary_figure_three <- nanostring_plot + nanostringfc_plot +plot_layout(design = plot_design)

ggsave(str_c(figures_path, "pdf", "Supplementary_figure_three.pdf", sep = "/"), supplementary_figure_three, 
  limitsize = FALSE, width = 25, height = 15, device = cairo_pdf, family = "Arial Unicode MS")
ggsave(str_c(figures_path, "svg", "Supplementary_figure_three.svg", sep = "/"), supplementary_figure_three,
  limitsize = FALSE, width = 25, height = 15, device = "svg")
ggsave(str_c(figures_path, "png", "Supplementary_figure_three.png", sep = "/"), supplementary_figure_three,
  limitsize = FALSE, width = 25, height = 15, device = "png")


## Section 2: KS TIL repertoires are diverse and poorly correlated with viral loads 

### AIRR-Seq data preparation
The coming sections will outline the steps taken to analyze the following groups 
of TCR datasets 
1. Study entry samples with one NAT and one tumor sample from the Hippos cohort 
2. Pediatric BL samples from Ghana and Uganda

Only datasets with more than 1000 sequences will be considered for the study


In [241]:
# Load AIRR-Seq data
trb_paths <- list.files(c(hippos_path, blghana_path, bluganda_path, arks_path), 
    pattern = "tsv", recursive = TRUE, full.names = TRUE)
study_raw_table <- readImmunoSeq(trb_paths)
study_nprod_table <- productiveSeq(study_raw_table, aggregate = "junction")
study_amino_acid_table <- productiveSeq(study_raw_table, aggregate = "junction_aa")


Reading AIRR-Seq files [----------------------------------------] 2/816 (  0%) eta:  2m elapsed:  0s

Reading AIRR-Seq files [----------------------------------------] 3/816 (  0%) eta:  3m elapsed:  1s

Reading AIRR-Seq files [----------------------------------------] 4/816 (  0%) eta:  4m elapsed:  1s

Reading AIRR-Seq files [----------------------------------------] 5/816 (  1%) eta:  5m elapsed:  2s

Reading AIRR-Seq files [----------------------------------------] 6/816 (  1%) eta:  4m elapsed:  2s

Reading AIRR-Seq files [----------------------------------------] 7/816 (  1%) eta:  5m elapsed:  3s

Reading AIRR-Seq files [----------------------------------------] 8/816 (  1%) eta:  5m elapsed:  3s

Reading AIRR-Seq files [----------------------------------------] 9/816 (  1%) eta:  6m elapsed:  4s

Reading AIRR-Seq files [---------------------------------------] 10/816 (  1%) eta:  6m elapsed:  4s

Reading AIRR-Seq files [>--------------------------------------] 11/816 (  1%) et

### Dataset summary

#### Raw repertoire counts

In [242]:
annotated_nprod_table <- study_metadata |>
  inner_join(study_nprod_table, by = c("trb_repertoire_id" = "repertoire_id"))
print(str_c("Total number of repertoires", 
  annotated_nprod_table |> pull(trb_repertoire_id) |> unique() |> length(),
  sep = " : "))
print(str_c("Total number of ptids", 
  annotated_nprod_table |> pull(patient_id) |> unique() |> length(),
  sep = " : "))

[1] "Total number of repertoires : 656"
[1] "Total number of ptids : 215"


#### Hippos TRB count

In [243]:
annotated_summary <- clonality(annotated_nprod_table)

In [244]:
# Study datasets summary table
# Displays all modalities of data avaiable from each dataset 
# Datasets are filtered out if their TRB sequencing contains less than 1000 sequences
options(repr.matrix.max.rows=10, repr.matrix.max.columns=5)
study_metadata |> 
    inner_join(annotated_summary, by = c("repertoire_id" = "repertoire_id")) |>
    filter(total_sequences >= 1000) |>
    group_by(cohort, phenotype, tissue_type) |> 
    summarize(number_patient =  length(unique(patient_id)),
        number_repertoires = length(unique(repertoire_id)),
        number_trb_repertoires = length(unique(trb_repertoire_id)),
        number_rna_libraries = length(unique(na.omit(rna_libraries))),
        number_nanostring = length(unique(na.omit(nanostring_libraries))))

cohort,phenotype,tissue_type,number_patient,number_repertoires,number_trb_repertoires,number_rna_libraries,number_nanostring
<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>
BL - Ghana,BL,Tumor,37,37,37,0,0
BL - Uganda,BL,Tumor,12,12,12,0,0
Hippos,Endemic KS,NAT,25,25,25,0,0
Hippos,Endemic KS,PBMC,15,42,42,0,0
Hippos,Endemic KS,Tumor,36,78,78,12,1
Hippos,Endemic KS,scREP,1,1,1,0,0
Hippos,Epidemic KS,NAT,39,39,39,0,3
Hippos,Epidemic KS,PBMC,33,73,73,0,0
Hippos,Epidemic KS,Tumor,101,221,221,18,17
Hippos,Epidemic KS,scREP,5,6,7,0,0


In [245]:
# Tumors per individual

study_metadata |> 
    inner_join(annotated_summary, by = c("repertoire_id" = "repertoire_id")) |>
    filter(total_sequences >= 1000 & tissue_type == "Tumor") |>
    group_by(patient_id) |>
    summarize(number_of_repertoires = length(unique(trb_repertoire_id))) |>
    pull(number_of_repertoires) |>
    summary()

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   1.000   1.000   1.871   3.000   7.000 

In [246]:
# Repertoires per study
options(repr.matrix.max.rows=5)
study_metadata |> 
    inner_join(annotated_summary, by = c("repertoire_id" = "repertoire_id")) |>
    filter(total_sequences >= 1000) |>
    group_by(cohort, phenotype) |>
    summarize(number_of_repertoires = length(unique(patient_id))) 

cohort,phenotype,number_of_repertoires
<chr>,<chr>,<int>
BL - Ghana,BL,37
BL - Uganda,BL,12
Hippos,Endemic KS,38
Hippos,Epidemic KS,106


In [247]:
# RNA-Seq samples by HIV status
study_metadata |> 
    filter(!is.na(rna_libraries) & cohort == "Hippos") |>
    group_by(hiv_status) |>
    summarize(count = length(unique(rna_libraries)))

hiv_status,count
<chr>,<int>
Negative,12
Positive,39


In [248]:
# Nanostring sample summary 
study_metadata |> 
    filter(!is.na(nanostring_libraries) & cohort == "Hippos") |>
    group_by(tissue_type) |>
    summarize(count = length(unique(nanostring_libraries)))

tissue_type,count
<chr>,<int>
NAT,5
Tumor,27


In [249]:
study_metadata |> 
    inner_join(annotated_summary, by = c("repertoire_id" = "repertoire_id")) |>
    filter(total_sequences >= 1000 ) |>
    group_by(cohort, phenotype) |> 
    summarize(number_patient =  length(unique(patient_id)))

cohort,phenotype,number_patient
<chr>,<chr>,<int>
BL - Ghana,BL,37
BL - Uganda,BL,12
Hippos,Endemic KS,38
Hippos,Epidemic KS,106


#### Get summarized Renyi numbers
Vegan has a wonderful utility to calculate the Renyi numbers accumulated over 
100 iterations. Since vegan will sample down to the smallest sample, we will 
need to see how many of these samlpes have at least 5000 sequences, as well as
how many of them have 1000 sequences so that we can pick a sample diversity 
threshold based on depth of sequencing. Using LymphoSeq2 we can get a summary of
repertoire characteristics for all 656 re

In [250]:
study_metadata |> 
    inner_join(annotated_summary, by = c("repertoire_id" = "repertoire_id")) |>
    filter(total_sequences >= 1000 & visit_code == "V01") |>
    group_by(cohort, phenotype) |> 
    summarize(number_patient =  length(unique(patient_id)),
        number_repertoires = length(unique(repertoire_id)))

cohort,phenotype,number_patient,number_repertoires
<chr>,<chr>,<int>,<int>
Hippos,Endemic KS,38,99
Hippos,Epidemic KS,105,214


In [252]:
study_sequenced_fitlered_sample_list <- annotated_summary |> filter(total_sequences > 1000)
study_annotated_nprod_table <- annotated_nprod_table |>
  filter(repertoire_id %in% study_sequenced_fitlered_sample_list$repertoire_id)
study_annotated_nprod_table

cohort,patient_id,repertoire_id,tra_repertoire_id,trb_repertoire_id,rna_libraries,nanostring_libraries,gex_library,vdj_library,sc_library_count,⋯,junction_aa,v_call,d_call,j_call,v_family,d_family,j_family,reading_frame,duplicate_count,duplicate_frequency
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
Hippos,008_001,008_001_B,008_001_B,008_001_B,008-001-B-12,001-B,,,,⋯,PNDPTNEKLFF,TRBV19-1,TRBD2-1,TRBJ1-4,TRBV19,TRBD2,TRBJ1,in-frame,1,3.218953e-05
Hippos,008_001,008_001_B,008_001_B,008_001_B,008-001-B-12,001-B,,,,⋯,RASSPPLGLNYGYTF,TRBV7-3,TRBD1-1,TRBJ1-2,TRBV7,TRBD1,TRBJ1,in-frame,10,3.218953e-04
Hippos,008_001,008_001_B,008_001_B,008_001_B,008-001-B-12,001-B,,,,⋯,RASSSPGTGCGYTF,TRBV7-3,TRBD1-1,TRBJ1-2,TRBV7,TRBD1,TRBJ1,in-frame,1,3.218953e-05
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
BL - Uganda,009_0249,MVQ194745A,,MVQ194745A,,,,,,⋯,CASNLDSREGGGAEAFF,TRBV5-5,,TRBJ1-1,TRBV5,,TRBJ1,in-frame,1,0.0004258944
BL - Uganda,009_0249,MVQ194745A,,MVQ194745A,,,,,,⋯,CASSLHPVGGTSYEQYF,TRBV5-5,TRBD2-1,TRBJ2-7,TRBV5,TRBD2,TRBJ2,in-frame,1,0.0004258944


##### Sampled Renyi numbers
Given that the sequencing depth across all the samples vary quite a lot due to
various factors, we will need to normalize the diversity metrics by depth. One
way to do this is to iteratively sample all the datasets to the lowest possible
depth of sequencing and then calculate the Renyi metrics for each iteration. 
This can be then average to get normalized Renyi metrics for each repertoire

In [None]:
# This function randomly samples 1000 TRB sequences from each repertoire and 
# iteratively calls the sampledRenyi function based on user input. Finally
# it calculates the mean Renyi numbers for a sample across all the iterations
iterativeRenyi <- function(study_table, iterations, min_count = 1000) {
    uncount_table <- study_table %>% 
        tidyr::uncount(weights = duplicate_count) 
    progress_bar <- progress::progress_bar$new(
        format = "Iteration [:bar] :current/:total (:percent) eta: :eta elapsed: :elapsed",
        total = iterations, clear = FALSE, width = 100)
    progress_bar$tick(0)
    srenyi_table <- purrr::map(1:iterations, 
                                \(i)sampledRenyi(uncount_table, min_count, progress_bar)) %>%
        dplyr::bind_rows() %>% 
        dplyr::group_by(repertoire_id) %>%
        dplyr::summarise_all(mean)
    return(srenyi_table)
}
# Given an uncounted table of TCR sequences with each sequence in the repertoire
# represented as a unique row of count 1. This function will first collapse 
# all rows by junction_aa, calculate the total count and frequency of each
# unique CDR3 sequence and then use the vegan package to calculate the Renyi 
# numbers for the sampled repertoire across multiple values of alpha
sampledRenyi <- function(study_table, min_count, progress) {
    progress$tick()
    study_table <- study_table %>% 
        dplyr::group_by(repertoire_id) |>
        dplyr::sample_n(min_count) %>%
        dplyr::select(-duplicate_frequency) %>% 
        dplyr::group_by(repertoire_id, junction_aa) %>% 
        dplyr::summarise(duplicate_count = dplyr::n()) %>%
        dplyr::group_by(repertoire_id) |>
        dplyr::mutate(duplicate_frequency = duplicate_count/sum(duplicate_count)) %>%
        dplyr::ungroup() 
    study_matrix <- study_table |>
      pivot_wider(id_cols = repertoire_id, names_from = junction_aa,
        values_from = duplicate_frequency, values_fn = sum, values_fill = 0)
    rep_names <- study_matrix |>
      pull(repertoire_id)
    study_matrix <- study_matrix |>
      select(-repertoire_id) |>
      as.matrix()
    rownames(study_matrix) <- rep_names
    if (length(rep_names) == 1) {
      renyi_table <- renyi(study_matrix) |>
        as_tibble_row() |>
        mutate(repertoire_id = rownames(study_matrix))
    } else {
      renyi_table <- renyi(study_matrix) |>
        as_tibble() |>
        mutate(repertoire_id = rownames(study_matrix))
    }
    return(renyi_table)
}
study_renyi_table <- study_annotated_nprod_table |>
  iterativeRenyi(100, min_count = 1000)

#### KS TME visit one sample list

Since the evaluation of the TME is primarily looking at the visit one tumor-NAT
samples. Lets now identify these samples tumor-NAT samples that pass our 
sequencing filters

In [253]:
ks_visitone <- study_metadata |> 
  filter(repertoire_id %in% study_sequenced_fitlered_sample_list$repertoire_id &
    phenotype %in% c("Epidemic KS", "Endemic KS") & 
    tissue_type %in% c("Tumor", "NAT")) |> 
  filter(visit_code == "V01") |> 
  group_by(phenotype, patient_id, tissue_type) |> 
  summarize(reps =length(unique(repertoire_id))) |> 
  pivot_wider(id_cols = patient_id, names_from = tissue_type, 
    values_from = reps) |> 
  filter(!is.na(NAT) & !is.na(Tumor)) |>
  mutate(total = sum(NAT, Tumor)) 
print(str_c("number of PTID with complete visitone data", length(ks_visitone$patient_id),
  sep = ":"))
print(str_c("number of repertoires with complete visitone data", sum(ks_visitone$total),
  sep = ":"))
print(str_c("number of repertoires with complete visitone data", sum(ks_visitone$total),
  sep = ":"))

[1] "number of PTID with complete visitone data:59"
[1] "number of repertoires with complete visitone data:162"
[1] "number of repertoires with complete visitone data:162"


#### Checkpoint 3:

At this point lets create our first checkpoint, we will store all the AIRR-Seq
data, the metadata and summary metrics in RDA files for ease of access

In [466]:
# Save raw study table, and annotated nucleotide table
save(study_raw_table, study_annotated_nprod_table, study_amino_acid_table,
    file = str_c(checkpoint_path, "kstme_paper_raw_tables.rda", sep = "/"))
# Save Renyi summary metrics, clonality table, and a list of samples that passed
# the seqeuencing threshold of 1000 total sequences
save(annotated_summary, study_renyi_table, study_sequenced_fitlered_sample_list, 
    file = str_c(checkpoint_path, "kstme_paper_tcr_tables.rda", sep = "/"))

In [254]:
load(str_c(checkpoint_path, "kstme_paper_tcr_tables.rda", sep = "/"))


### Figure 2:

In [255]:
# Subset tumor and NAT samples from the Renyi table and merge with metadata
til_normal_renyi <- study_renyi_table |> 
    dplyr::rename(trb_repertoire_id = repertoire_id) |>
    pivot_longer(cols = `0`:`Inf`, names_to = "alpha", 
        values_to = "renyi_number") |>
    inner_join(study_metadata, by = "trb_repertoire_id") |>
    filter(cohort %in% c("Hippos", "BL - Ghana", "BL - Uganda") & 
        tissue_type %in% c("NAT", "Tumor") & 
        (visit_code == "V01" | is.na(visit_code)) & !is.na(trb_repertoire_id)) |>
    mutate(cohort = case_when(
        cohort == "Hippos"  ~  str_c(phenotype, tissue_type, sep = " - "),
        cohort != "Hippos" ~ cohort),
        cohort = factor(cohort,  
            levels = c("Endemic KS - NAT", "Endemic KS - Tumor",
                "Epidemic KS - NAT", "Epidemic KS - Tumor", 
                "BL - Ghana", "BL - Uganda")),
        alpha = str_replace(alpha, "Inf", "\u221e")) 

In [256]:
# Summary of all ptids and repertoires with tumor and normal data from the three studies
options(repr.matrix.max.rows=7)
til_normal_renyi |>
  group_by(cohort) |>
  summarize(ptid_count = length(unique(patient_id)),
    repertoire_count = length(unique(trb_repertoire_id)))
options(repr.matrix.max.rows=5)

cohort,ptid_count,repertoire_count
<fct>,<int>,<int>
Endemic KS - NAT,23,23
Endemic KS - Tumor,21,45
Epidemic KS - NAT,34,34
Epidemic KS - Tumor,70,109
BL - Ghana,37,37
BL - Uganda,12,12


In [258]:
comparisons <- list(c("Endemic KS - Tumor", "Epidemic KS - Tumor"),
   c("Epidemic KS - NAT", "Epidemic KS - Tumor"), 
   c("Endemic KS - Tumor", "BL - Ghana"),
  c("Epidemic KS - Tumor", "BL - Ghana"),
  c("Epidemic KS - Tumor", "BL - Uganda"))
comparisons_clin <- list(c("Endemic KS", "Epidemic KS"))
cohort_palette <- c("Epidemic KS - NAT" = "#fb8072", 
  "Epidemic KS - Tumor" = "#e41a1c", "Endemic KS - NAT" = "#80b1d3",
  "Endemic KS - Tumor" = "#377eb8", "BL - Uganda" = "#a65628",
  "BL - Ghana" = "#fdb462")

alpha_zero <- til_normal_renyi |>
  filter(alpha == "0") |>
  ggboxplot(x = "cohort", y = "renyi_number", color = "cohort") |>
  ggpar(xlab = FALSE, ylab = "Species richness\n(\u03b1 = 0)", legend.title = "Cohort",
    ggtheme = theme_classic(base_size = 24),
    legend = "none", palette = cohort_palette) +
  rremove("x.text") +
  stat_compare_means(comparisons = list(c("Endemic KS - Tumor", "Epidemic KS - Tumor"),
   c("Endemic KS - Tumor", "BL - Ghana"),
  c("Epidemic KS - Tumor", "BL - Ghana"),
  c("Epidemic KS - Tumor", "BL - Uganda")), 
  hide.ns = TRUE, size = 5, label = "p.signif", show.legend = T)
alpha_one <- til_normal_renyi |>
  filter(alpha == "1") |>
  ggboxplot(x = "cohort", y = "renyi_number", color = "cohort") |>
  ggpar(xlab = FALSE, ylab = "Shannon entropy\n(\u03b1 = 1)", legend.title = "Cohort",
    ggtheme = theme_classic(base_size = 24),
    legend = "none", palette = cohort_palette) + 
  rremove("x.text") +
  stat_compare_means(comparisons = list(c("Endemic KS - Tumor", "Epidemic KS - Tumor"),
   c("Endemic KS - Tumor", "BL - Ghana"),
  c("Epidemic KS - Tumor", "BL - Uganda")), hide.ns = TRUE, size = 5, label = "p.signif", show.legend = T)
alpha_two <- til_normal_renyi |>
  filter(alpha == "2") |>
  ggboxplot(x = "cohort", y = "renyi_number", color = "cohort") |>
  ggpar(xlab = "Cohort", ylab = "Simpson's diversity\n(\u03b1 = 2)", legend.title = "Cohort",
    ggtheme = theme_classic(base_size = 24), 
    legend = "none", palette = cohort_palette, x.text.angle = 45) +
  stat_compare_means(comparisons = list(c("Endemic KS - Tumor", "Epidemic KS - Tumor"),
   c("Epidemic KS - NAT", "Epidemic KS - Tumor"), 
   c("Endemic KS - Tumor", "BL - Ghana"),
  c("Epidemic KS - Tumor", "BL - Uganda")), 
  hide.ns = TRUE, size = 5, label = "p.signif", show.legend = T)
alpha_inf <- til_normal_renyi |>
  filter(alpha == "\u221e") |>
  ggboxplot(x = "cohort", y = "renyi_number", color = "cohort") |>
  ggpar(xlab = "Cohort", ylab = "Berger-Parker index\n(\u03b1 = \u221e)", legend.title = "Cohort",
    ggtheme = theme_classic(base_size = 24), 
    legend = "right", palette = cohort_palette, x.text.angle = 45) +
  stat_compare_means(comparisons = list(
   c("Epidemic KS - NAT", "Epidemic KS - Tumor"), 
  c("Epidemic KS - Tumor", "BL - Uganda")), 
  hide.ns = TRUE, size = 5, label = "p.signif", show.legend = T)
renyi_plot <- ggline(til_normal_renyi, x = "alpha", y = "renyi_number",
    color = "cohort", add = "mean_se") |> 
  ggpar(xlab = "\u03b1", ylab = "Renyi entropy", legend.title = "Cohort",
    ggtheme = theme_classic(base_size = 24), 
    legend = "none", palette = cohort_palette)
figure_two_layout <- "
1122
1122
3344
3344
5555
5555"
figure_two <- alpha_zero + alpha_one + alpha_two + 
  alpha_inf + renyi_plot + 
  plot_layout(design = figure_two_layout, guides = "collect") +
  plot_annotation(tag_levels = 'A') & theme(text = element_text('NimbusSan'))
ggsave(str_c(figures_path, "pdf", "Figure_two.pdf", sep = "/"), figure_two, width = 16, 
    height = 18, device = cairo_pdf, family = "Arial Unicode MS")
ggsave(str_c(figures_path, "svg", "Figure_two.svg", sep = "/"), figure_two, width = 16, 
    height = 18)
ggsave(str_c(figures_path, "png", "Figure_two.png", sep = "/"), figure_two, width = 16, 
    height = 18)

### Supplementary Figure 4:

In [261]:
plot_titer_tables <- study_metadata |>
  filter(trb_repertoire_id %in% til_normal_renyi$repertoire_id) |>
  dplyr::select(patient_id, visit_code, hiv_status, ks_status,
    HIV, CD4_count, KSHV_plasma) |>
  distinct()
kshv_titer_plot <- plot_titer_tables |>
  filter(!is.na(KSHV_plasma)) |>
  mutate(cohort = if_else(hiv_status == "Positive", "Epidemic KS",
    "Endemic KS")) |>
  ggboxplot(x = "cohort", y = "KSHV_plasma", color = "cohort", 
    add = "jitter") |>
  ggpar(xlab = FALSE, ylab = "Plasma KSHV titer\n(copies/mL)", 
    legend.title = "Cohort",
    ggtheme = theme_classic(base_size = 24),
    legend = "none", palette = c("Epidemic KS" = "#e41a1c",
      "Endemic KS" = "#377eb8")) +
  stat_compare_means(comparisons = comparisons_clin, hide.ns = TRUE, size = 5, label = "p.signif", show.legend = T)
cd4_plot <- plot_titer_tables |>
  filter(!is.na(CD4_count)) |>
  mutate(cohort = if_else(hiv_status == "Positive", "Epidemic KS",
    "Endemic KS")) |>
  ggboxplot(x = "cohort", y = "CD4_count", color = "cohort", 
    add = "jitter") |>
  ggpar(xlab = FALSE, ylab = str_c("CD4+ count\n(cells/\u03BC", "L)", sep = ""), 
    legend.title = "Cohort",
    ggtheme = theme_classic(base_size = 24),
    legend = "none", palette = c("Epidemic KS" = "#e41a1c",
      "Endemic KS" = "#377eb8")) +
  geom_hline(yintercept = 200, linetype = 2) +
  stat_compare_means(comparisons = comparisons_clin, hide.ns = TRUE, size = 5, label = "p.signif", show.legend = T)
supp_figure_four_layout <- "
1122
1122"
supplementary_figure_four <- kshv_titer_plot + cd4_plot +
  plot_layout(design = supp_figure_four_layout, guides = "collect") +
  plot_annotation(tag_levels = 'A') & theme(text = element_text('NimbusSan'))
ggsave(str_c(figures_path, "pdf", "Supplementarty_figure_four.pdf", sep = "/"), 
    supplementary_figure_four, width = 16, 
    height = 6, device = cairo_pdf, family = "Arial Unicode MS")
ggsave(str_c(figures_path, "svg", "Supplementarty_figure_four.svg", sep = "/"), 
    supplementary_figure_four, width = 16, 
    height = 6)
ggsave(str_c(figures_path, "png", "Supplementarty_figure_four.png", sep = "/"), 
    supplementary_figure_four, width = 16, 
    height = 6)


### Supplementary Figure 5:

In [263]:
correlation_data_frame <- study_metadata |>
    filter(cohort == "Hippos" & tissue_type %in% c("NAT", "Tumor") & visit_code == "V01") |>
    filter(!is.na(HIV) & !is.na(CD4_count) & !is.na(KSHV_plasma)) |>
    dplyr::select(patient_id, repertoire_id, HIV, CD4_count, KSHV_plasma, phenotype) |> 
    inner_join(study_renyi_table, by = "repertoire_id") |>
    dplyr::select(repertoire_id, HIV:`Inf`, -phenotype)
corr_row_names <- correlation_data_frame |>
    pull(repertoire_id)
correlation_data_frame <- correlation_data_frame |>
    dplyr::select(-repertoire_id) |>
    as.data.frame()
rownames(correlation_data_frame) <- corr_row_names
colnames(correlation_data_frame) <- c("HIV\nVL", "CD4+\ncount", "KSHV\nPlasma\nVL", "0", "0.25", "0.5", "1", "2", "4", "8", "16", "32", "64", "\u221e")

correlation_data_frame


Unnamed: 0_level_0,HIV VL,CD4+ count,KSHV Plasma VL,0,0.25,0.5,1,2,4,8,16,32,64,∞
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
008_001_B,276908,127,0.00,5.936510,5.798321,5.634872,5.244745,4.502215,3.826136,3.438104,3.240889,3.143218,3.094753,3.046694
008_001_D,276908,127,0.00,6.002887,5.869040,5.709425,5.323547,4.581217,3.931988,3.589271,3.413928,3.321192,3.272770,3.222954
008_002_B,4658,418,10805.01,6.185585,6.086141,5.967807,5.674548,5.020768,4.293126,3.846466,3.617506,3.505721,3.451238,3.397933
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
008_262_D,100493,342,50.17757,6.463758,6.394088,6.305485,6.051294,5.263333,4.246675,3.707140,3.463881,3.352327,3.299121,3.247573
008_263_D,313,231,19.85711,6.124420,6.017465,5.891833,5.590408,4.973164,4.351676,3.987935,3.792147,3.688173,3.634169,3.579372


In [264]:
correlation_matrix <- round(cor(correlation_data_frame), 1)
correlation_matrix

Unnamed: 0,HIV VL,CD4+ count,KSHV Plasma VL,0,0.25,0.5,1,2,4,8,16,32,64,∞
HIV VL,1.0,-0.2,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CD4+ count,-0.2,1.0,0.0,-0.1,-0.1,-0.1,-0.1,-0.1,0.0,0.0,0.0,0.0,0.0,0.0
KSHV Plasma VL,0.1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
64,0,0,0.1,0.5,0.5,0.6,0.7,0.9,1,1,1,1,1,1
∞,0,0,0.1,0.5,0.5,0.6,0.7,0.9,1,1,1,1,1,1


In [265]:
supplementary_figure_five <- ggcorrplot(correlation_matrix,  outline.col = "white",  lab = TRUE) +
  theme_classic() +
  theme(axis.text = element_text(size = 12, face = "bold"), axis.title = element_blank(),
    legend.key.size = unit(1, "cm"), text = element_text(family = "NimbusSan"))
ggsave(str_c(figures_path, "pdf", "Supplementary_figure_five.pdf", sep = "/"), supplementary_figure_five, 
  limitsize = FALSE, width = 14, height = 10, device = cairo_pdf, family = "Arial Unicode MS")
ggsave(str_c(figures_path, "svg", "Supplementary_figure_five.svg", sep = "/"), supplementary_figure_five,
  limitsize = FALSE, width = 14, height = 10, device = "svg")
ggsave(str_c(figures_path, "png", "Supplementary_figure_five.png", sep = "/"), supplementary_figure_five,
  limitsize = FALSE, width = 14, height = 10, device = "png")


### Supplementary analysis 1:
Calculate the biserial correlation between renyi metrics 

In [266]:
clin_correlation_data_frame <- study_metadata |>
    filter(cohort == "Hippos" & tissue_type %in% c("NAT", "Tumor") & visit_code == "V01") |>
    dplyr::select(patient_id, repertoire_id, phenotype, response) |> 
    inner_join(study_renyi_table, by = "repertoire_id") |>
    filter(!is.na(response)) |>
    mutate(response = if_else(response == "CR" | response == "PR", 1, 0)) 
# Endemic KS
biserial.cor(clin_correlation_data_frame |> filter(phenotype == "Endemic KS") |> pull(`0`), clin_correlation_data_frame |> filter(phenotype == "Endemic KS") |> pull(response), use = c("all.obs"), level = 2)
biserial.cor(clin_correlation_data_frame |> filter(phenotype == "Endemic KS") |> pull(`1`), clin_correlation_data_frame |> filter(phenotype == "Endemic KS") |> pull(response), use = c("all.obs"), level = 2)
biserial.cor(clin_correlation_data_frame |> filter(phenotype == "Endemic KS") |> pull(`2`), clin_correlation_data_frame |> filter(phenotype == "Endemic KS") |> pull(response), use = c("all.obs"), level = 2)
biserial.cor(clin_correlation_data_frame |> filter(phenotype == "Endemic KS") |> pull(`Inf`), clin_correlation_data_frame |> filter(phenotype == "Endemic KS") |> pull(response), use = c("all.obs"), level = 2)

In [268]:
# Epidemic KS
biserial.cor(clin_correlation_data_frame |> filter(phenotype == "Epidemic KS") |> pull(`0`), clin_correlation_data_frame |> filter(phenotype == "Epidemic KS") |> pull(response), use = c("all.obs"), level = 2)
biserial.cor(clin_correlation_data_frame |> filter(phenotype == "Epidemic KS") |> pull(`1`), clin_correlation_data_frame |> filter(phenotype == "Epidemic KS") |> pull(response), use = c("all.obs"), level = 2)
biserial.cor(clin_correlation_data_frame |> filter(phenotype == "Epidemic KS") |> pull(`2`), clin_correlation_data_frame |> filter(phenotype == "Epidemic KS") |> pull(response), use = c("all.obs"), level = 2)
biserial.cor(clin_correlation_data_frame |> filter(phenotype == "Epidemic KS") |> pull(`Inf`), clin_correlation_data_frame |> filter(phenotype == "Epidemic KS") |> pull(response), use = c("all.obs"), level = 2)

In [269]:
# Epidemic and Endemic KS
biserial.cor(clin_correlation_data_frame$`0`, clin_correlation_data_frame$response, use = c("all.obs"), level = 2)
biserial.cor(clin_correlation_data_frame$`1`, clin_correlation_data_frame$response, use = c("all.obs"), level = 2)
biserial.cor(clin_correlation_data_frame$`2`, clin_correlation_data_frame$response, use = c("all.obs"), level = 2)
biserial.cor(clin_correlation_data_frame$`Inf`, clin_correlation_data_frame$response, use = c("all.obs"), level = 2)

## Section 3: TIL repertoires of KS tumors are largely private to each individual

#### Comparison of seqeunce sharing at a individual TCR level

In [271]:
rep_dict <- study_metadata |>
  filter(!is.na(trb_repertoire_id)) |>
  pull(repertoire_id)
names(rep_dict) <- study_metadata |>
  filter(!is.na(trb_repertoire_id)) |>
  pull(trb_repertoire_id)
ptid_dict <- study_metadata |>
  filter(!is.na(patient_id)) |>
  pull(phenotype)
names(ptid_dict) <- study_metadata |>
  filter(!is.na(patient_id)) |>
  pull(patient_id)
ptid_dict["008_265"] <- "Epidemic KS"
compairr_table <- read_tsv(comp_path, show_col_types = FALSE) |>
  dplyr::rename(rep_one = `#`) 
comp_names <- compairr_table |>
  pull(rep_one)
compairr_matrix <- compairr_table |>
  dplyr::select(-rep_one) |>
  as.matrix() 
rownames(compairr_matrix) <- comp_names
compairr_matrix[lower.tri(compairr_matrix, diag = FALSE)] <- NA 
compairr_table <- compairr_matrix |>
  as_tibble(rownames = "rep_one") |>
  pivot_longer(-rep_one, names_to = "rep_two", values_to = "overlap") |>
  filter(str_detect(rep_one, "008_\\d+") & str_detect(rep_two, "008_\\d+") & 
    !is.na(overlap) & 
    str_extract(rep_one, "[A-Z]+$") %in% c("A","B", "C", "D") &
    str_extract(rep_two, "[A-Z]+$") %in% c("A","B", "C", "D")) |>
  mutate(rep_one = rep_dict[rep_one], rep_two = rep_dict[rep_two],
    sample_group = if_else(str_extract(rep_one, "008_\\d+") ==
      str_extract(rep_two, "008_\\d+"), "Intra-subject", "Inter-subject"),
    tumor_group = case_when(
      str_extract(rep_one, "[A-Z]+$") == "A" &
        str_extract(rep_two, "[A-Z]+$") == "A" ~ "NAT - NAT",
      str_extract(rep_one, "[A-Z]+$") == "A" &
        str_extract(rep_two, "[A-Z]+$") %in% c("B", "C", "D") ~ "NAT - Tumor",
      str_extract(rep_two, "[A-Z]+$") == "A" &
        str_extract(rep_one, "[A-Z]+$") %in% c("B", "C", "D") ~ "NAT - Tumor",
      str_extract(rep_two, "[A-Z]+$") %in% c("B", "C", "D") &
        str_extract(rep_one, "[A-Z]+$") %in% c("B", "C", "D") ~ "Tumor - Tumor"),
    ks_group = str_c(ptid_dict[str_extract(rep_one, "008_\\d+")],
      ptid_dict[str_extract(rep_two, "008_\\d+")], sep = " - "),
    ks_group = str_replace(ks_group, "Epidemic KS - Endemic KS",
      "Endemic KS - Epidemic KS")) |>
  filter(!(sample_group == "Inter-subject" & tumor_group == "NAT - Tumor") &
    !(rep_one == rep_two))

alluvial_data <- study_annotated_nprod_table |>
  filter(patient_id %in% c("008_057", "008_061", "008_021", "008_048") & 
    visit_code == "V01") |>
  productiveSeq()



Subsetting productive sequences [=>----------]  12% eta:  2s

Subsetting productive sequences [=>----------]  19% eta: 13s

Subsetting productive sequences [==>---------]  25% eta: 10s

Subsetting productive sequences [===>--------]  31% eta:  8s

Subsetting productive sequences [===>--------]  38% eta:  7s

Subsetting productive sequences [====>-------]  44% eta:  9s

Subsetting productive sequences [=====>------]  50% eta:  7s












### Understanding the prevelance of TCRs with known antigenic specificity

#### Generate database of TCRs with known antigenic specificity

In [272]:
# Create a database of VDJdb and McPAS-TCR sequences
vdjdb_table <- read_tsv(vdjdb_slim_path, show_col_types = FALSE) |>
    dplyr::filter(vdjdb.score > 1 & gene == "TRB" & species == "HomoSapiens") |>
    dplyr::rename(epitope = antigen.epitope, pathology = antigen.species, 
        mhc_allele = mhc.a, trb_cdr3_aa = cdr3, antigen = antigen.gene) |>
    dplyr::select(epitope, pathology, antigen, mhc_allele, trb_cdr3_aa) |>
    separate_rows(mhc_allele, sep = ",") |>
    mutate(mhc_allele = str_remove(mhc_allele, "HLA-"))
vdjdb_table

epitope,pathology,antigen,mhc_allele,trb_cdr3_aa
<chr>,<chr>,<chr>,<chr>,<chr>
GLCTLVAML,EBV,BMLF1,A*02,CASSEGRISPGELFF
VLEETSVML,CMV,IE1,A*02:01,CASSPDSQSSGNTIYTF
VAANIVLTV,HomoSapiens,SLC30A8,A*02,CASSSVGVDTQYF
⋮,⋮,⋮,⋮,⋮
VAANIVLTV,HomoSapiens,SLC30A8,A*02,CASSPFLTGSNTEAFF
RLARLALVL,HomoSapiens,5T4,A*02:01,CASSYMGPEAFF


In [273]:
mcpas_table <- read_csv(mcpas_tcr_path, show_col_types = FALSE) |> 
    mutate(Pathology = case_when(Pathology == "Epstein Barr Virus (EBV)" ~ "EBV",
            Pathology == "Cytomegalovirus (CMV)" ~ "CMV",
            Pathology == "COVID-19" ~ "SARS-CoV-2", 
            Pathology == "Hepatitis C virus (HCV)" ~ "HCV",
            Pathology == "Human immunodeficiency virus (HIV)" ~ "HIV-1",
            Pathology == "Human immunodeficiency virus (HIV) related" ~ "HIV-1",
            Pathology == "Influenza" ~ "InfluenzaA",
            Pathology == "M.tuberculosis" ~ "M.tuberculosis",
            Pathology == "Herpes simplex virus 2 (HSV2)" ~ "HSV-2",
            Pathology == "HTLV-1" ~ "HTLV-1",
            Pathology == "HTLV-1 (Chronic" ~ "HTLV-1",
            Pathology == "Human herpes virus 1" ~ "HHV",
            TRUE ~ "Other")) |> 
    filter(Species == "Human" & Antigen.identification.method < 3 & !is.na(Epitope.peptide)) |>
    dplyr::select(CDR3.beta.aa, Pathology, Antigen.protein,
        Epitope.peptide, MHC) |>
    dplyr::rename(epitope = Epitope.peptide, pathology = Pathology, 
        mhc_allele = MHC, trb_cdr3_aa = CDR3.beta.aa, antigen = Antigen.protein) |>
    separate_rows(mhc_allele, sep = ",") |>
    mutate(mhc_allele = str_remove(mhc_allele, "HLA-"),
        mhc_allele = case_when(mhc_allele == "A2" ~ "A*02",
            mhc_allele == "Cw*16:01" ~ "C*16:01",
            mhc_allele == "A*011" ~ "A*11",
            mhc_allele == "A*2:01" ~ "A*02:01",
            mhc_allele == "B*8" ~ "B*08",
            mhc_allele == "Cw*16:01" ~ "C*16:01",
            mhc_allele == "A1" ~ "A*01",
            mhc_allele == "B7" ~ "B*07",
            mhc_allele == "DRB1*04-01" ~ "DRB1*04:01",
            mhc_allele == "A2:01" ~ "A*02:01")) |>
    filter(!is.na(mhc_allele))
mcpas_table  

trb_cdr3_aa,pathology,antigen,epitope,mhc_allele
<chr>,<chr>,<chr>,<chr>,<chr>
CASSLGSSYEQYF,Other,gp100,YLEPGPVTA,A*02
CASSESAGGYYNEQF,Other,EBNA-4,IVTDFSVIK,A*11
CASSFGFGSSYGYTF,Other,EBNA-4,IVTDFSVIK,A*11
⋮,⋮,⋮,⋮,⋮
CASSQGVNTGELFF,Other,,FLCMKALLL,A*02:01
CSAETGLSYEQYF,Other,,FLCMKALLL,A*02:01


In [274]:
antigen_db <- bind_rows(vdjdb_table, mcpas_table)
antigen_db

epitope,pathology,antigen,mhc_allele,trb_cdr3_aa
<chr>,<chr>,<chr>,<chr>,<chr>
GLCTLVAML,EBV,BMLF1,A*02,CASSEGRISPGELFF
VLEETSVML,CMV,IE1,A*02:01,CASSPDSQSSGNTIYTF
VAANIVLTV,HomoSapiens,SLC30A8,A*02,CASSSVGVDTQYF
⋮,⋮,⋮,⋮,⋮
FLCMKALLL,Other,,A*02:01,CASSQGVNTGELFF
FLCMKALLL,Other,,A*02:01,CSAETGLSYEQYF


#### Identify all TCRs with known antigenic specificities from KSTME cohort

In [275]:
hla_matched_tcr <- annotated_nprod_table |>
    mutate(visit_code = if_else(is.na(visit_code), "V01", visit_code)) |>
    filter(cohort %in% c("Hippos", "ARKS") & visit_code == "V01" &
        tissue_type %in% c("NAT", "Tumor")) |>
    inner_join(antigen_db, by = c("junction_aa" = "trb_cdr3_aa"), 
        relationship = "many-to-many") |>
    dplyr::select(patient_id, repertoire_id, junction, junction_aa, epitope, pathology,
        antigen, mhc_allele, aAllele1:DRB345allele2) |> 
    pivot_longer(cols = aAllele1:DRB345allele2, names_to = "allele",
        values_to = "mhc_value") |> 
    mutate(mhc_allele = str_remove(mhc_allele, "HLA-"),
        hla_matched = if_else(mhc_allele == mhc_value |
            mhc_allele == str_extract(mhc_value, "\\w+\\*\\d+:\\d+") |
            mhc_allele == str_extract(mhc_value, "\\w+\\*\\d+"), TRUE, FALSE)) |>
    filter(hla_matched) |> 
    dplyr::select(patient_id:junction_aa, epitope:antigen) |>
    distinct() |> 
    group_by(patient_id, repertoire_id, junction, junction_aa) |>
    summarise(npath = length(unique(pathology)),
        pathology = str_c(unique(pathology), collapse = ";")) |>
    ungroup() |>
    mutate(pathology = if_else(npath > 1, "Multi-pathogen", pathology)) |>
    dplyr::select(patient_id, repertoire_id, junction, junction_aa, pathology) |>
    distinct() |>
    mutate(pathology = recode(pathology, 
        `HomoSapiens` = "Other", "Homo sapiens" = "Other", 
        "SelaginellaMoellendorffii" = "Other", "TriticumAestivum" = "Other", 
        "synthetic" = "Other"))
hla_matched_tcr

patient_id,repertoire_id,junction,junction_aa,pathology
<chr>,<chr>,<chr>,<chr>,<chr>
008_001,008_001_D,TGTGCCACCGGGACAGGGGATAGCAATCAGCCCCAGCATTTT,CATGTGDSNQPQHF,EBV
008_001,008_001_D,TGTGCCAGCAGTACAGGGGATAGCAATCAGCCCCAGCATTTT,CASSTGDSNQPQHF,EBV
008_001,008_001_D,TGTGCCAGCGGGACAGGTGATAGCAATCAGCCCCAGCATTTT,CASGTGDSNQPQHF,EBV
⋮,⋮,⋮,⋮,⋮
Unk2,008_037_B,TGTGCCAGCAGTTTGAACACTGAAGCTTTCTTT,CASSLNTEAFF,Other
Unk2,008_037_B,TGTGCCAGCAGTTTGGGGAACACTGAAGCTTTCTTT,CASSLGNTEAFF,Other


In [276]:
path_annotated_nprod_table <- annotated_nprod_table |> 
    mutate(visit_code = if_else(is.na(visit_code), "V01", visit_code)) |>
    filter(cohort %in% c("Hippos", "ARKS") & visit_code == "V01" &
        tissue_type %in% c("NAT", "Tumor")) |>
    left_join(hla_matched_tcr, by = c("patient_id", "repertoire_id", "junction",
        "junction_aa")) |>
    dplyr::select(cohort, patient_id, repertoire_id, tissue_type, phenotype, pathology, junction_aa, duplicate_frequency) |>
    mutate(pathology = str_replace_na(pathology, "Unknown"),
        phenotype = if_else(cohort == "ARKS", "ARKS", phenotype)) |>
    group_by(cohort, repertoire_id, pathology, phenotype, tissue_type) |>
    summarize(frequency = sum(duplicate_frequency)) |>
    ungroup() |>
    mutate(cohort = if_else(cohort == "Hippos", 
            str_c(phenotype, tissue_type, sep = " - "), "ARKS"))
path_annotated_nprod_table 

cohort,repertoire_id,pathology,phenotype,tissue_type,frequency
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
Epidemic KS - NAT,008_001_A,Unknown,Epidemic KS,NAT,1.0000000000
Epidemic KS - Tumor,008_001_B,Unknown,Epidemic KS,Tumor,1.0000000000
Epidemic KS - Tumor,008_001_D,EBV,Epidemic KS,Tumor,0.0001045779
⋮,⋮,⋮,⋮,⋮,⋮
Epidemic KS - Tumor,008_265_D,Multi-pathogen,Epidemic KS,Tumor,0.0002486449
Epidemic KS - Tumor,008_265_D,Unknown,Epidemic KS,Tumor,0.9972151773


#### Prepare GLIPH inputs for KSTME datasets

In [281]:
# KSTME GLIPH table
kstme_gliph_trb_input <- annotated_nprod_table |>
    mutate(visit_code = if_else(is.na(visit_code), "V01", visit_code)) |>
    filter(cohort %in% c("Hippos") & visit_code == "V01" &
        tissue_type %in% c("NAT", "Tumor")) |>
    mutate(phenotype = if_else(is.na(phenotype), "Epidemic KS", phenotype)) |>
    dplyr::select(junction_aa, v_call, j_call, repertoire_id, tissue_type, phenotype,
        duplicate_count) |>
    mutate(gliph_id = str_c(repertoire_id,
            str_c(phenotype, tissue_type, sep = " - "), sep = ":"),
            cdra = NA_character_) |>
    dplyr::select(junction_aa, v_call, j_call, cdra, gliph_id, duplicate_count)
write_tsv(kstme_gliph_trb_input, file = kstme_gliph_input_path, col_names = FALSE)
kstme_gliph_trb_input 

junction_aa,v_call,j_call,cdra,gliph_id,duplicate_count
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
YSSAATIGQSRANVLTF,TRBV17-1,TRBJ2-6,,008_001_A:Epidemic KS - NAT,1
CSVTPGLSYEQYF,TRBV29-1,TRBJ2-7,,008_001_A:Epidemic KS - NAT,1
CSAYKGQTGVNTEAFF,,TRBJ1-1,,008_001_A:Epidemic KS - NAT,1
⋮,⋮,⋮,⋮,⋮,⋮
CVALTRGTEAFF,TRBV2-1,TRBJ1-1,,008_265_D:Epidemic KS - Tumor,9
CYGALAKNIQYF,TRBV5-4,TRBJ2-4,,008_265_D:Epidemic KS - Tumor,6


In [282]:
# KSTME GLIPH HLA table
kstme_gliph_hla_input <- annotated_nprod_table |>
    mutate(visit_code = if_else(is.na(visit_code), "V01", visit_code)) |>
    filter(cohort %in% c("Hippos") & visit_code == "V01" &
        tissue_type %in% c("NAT", "Tumor")) |>
    dplyr::select(repertoire_id, aAllele1:DRB345allele2) |>
    distinct()
write_tsv(kstme_gliph_hla_input, file = kstme_gliph_hla_input_path,
    col_names = FALSE)
kstme_gliph_hla_input

repertoire_id,aAllele1,aAllele2,bAllele1,bAllele2,cAllele1,cAllele2,DPA1allele1,DPA1allele2,DPB1allele1,DPB1allele2,DQA1allele1,DQA1allele2,DQB1allele1,DQB1allele2,DRB1allele1,DRB1allele2,DRB345allele1,DRB345allele2
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
008_001_A,A*23:17:01,A*29:02:01,B*35:01:01,B*42:01:01,C*04:01:01,C*17:01:01,DPA1*02:02:02,DPA1*03:01:01,DPB1*01:01:01,DPB1*105:01:01,DQA1*01:02:01,DQA1*04:01:01,DQB1*04:02:01,DQB1*05:01:01,DRB1*03:02:01,DRB1*11:01:02,DRB3*01:01:02,DRB3*03:01:01
008_001_B,A*23:17:01,A*29:02:01,B*35:01:01,B*42:01:01,C*04:01:01,C*17:01:01,DPA1*02:02:02,DPA1*03:01:01,DPB1*01:01:01,DPB1*105:01:01,DQA1*01:02:01,DQA1*04:01:01,DQB1*04:02:01,DQB1*05:01:01,DRB1*03:02:01,DRB1*11:01:02,DRB3*01:01:02,DRB3*03:01:01
008_001_D,A*23:17:01,A*29:02:01,B*35:01:01,B*42:01:01,C*04:01:01,C*17:01:01,DPA1*02:02:02,DPA1*03:01:01,DPB1*01:01:01,DPB1*105:01:01,DQA1*01:02:01,DQA1*04:01:01,DQB1*04:02:01,DQB1*05:01:01,DRB1*03:02:01,DRB1*11:01:02,DRB3*01:01:02,DRB3*03:01:01
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
008_264_D,A*02:01:01,A*74:01:01,B*08:01:01,B*53:01:01,C*04:01:01,C*07:01:02,DPA1*02:01:01,DPA1*03:01:01,DPB1*105:01:01,DPB1*13:01:01,DQA1*01:02:01,DQA1*04:01:02,DQB1*03:19:01,DQB1*06:09:01,DRB1*08:04:01,DRB1*13:02:01,DRB3*03:01:01,DRB345*Not_Present
008_265_D,A*30:02:01,A*32:01:01,B*07:02:01,B*18:01:01,C*07:04:01,C*07:18:01,DPA1*03:01:01,DPA1*03:01:01,DPB1*105:01:01,DPB1*105:01:01,DQA1*01:02:01,DQA1*01:03:01,DQB1*06:02:01,DQB1*06:03:01,DRB1*11:01:02,DRB1*13:01:01,DRB3*02:02:01,DRB3*03:01:01


#### Format GLIPH results and annotate with antigenic specificity

In [283]:
kstme_gliph_cluster_pred_table <- read_csv(kstme_gliph_clusters_path,
        show_col_types = FALSE) 
kstme_gliph_hla_pred_table <- read_csv(kstme_gliph_hlapred_path, 
    show_col_types = FALSE)

In [284]:
kstme_gliph_cluster_pred_table

index,pattern,Fisher_score,number_subject,number_unique_cdr3,final_score,hla_score,vb_score,expansion_score,length_score,⋯,HLA-B,HLA-C,HLA-DPA1,HLA-DPB1,HLA-DQA1,HLA-DQB1,HLA-DRB1,HLA-DRB3,HLA-DRB4,HLA-DRB5
<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,KTYG,0,226,374,2.6e-08,0.0027,1,1,0.97,⋯,B*15:03:01/B*41:02:01,C*02:10:01/C*17:01:01,DPA1*01:03:01/DPA1*03:01:01,DPB1*105:01:01/DPB1*61:01N,DQA1*01:02:01/DQA1*04:01:02,DQB1*03:19:01/DQB1*06:09:01,DRB1*08:04:01/DRB1*13:02:01,DRB3*03:01:01/DRB345*Not_Present,-,-
1,KTYG,0,226,374,2.6e-08,0.0027,1,1,0.97,⋯,B*49:01:01!/B*58:01:01,C*03:02:02/C*07:01:01!,DPA1*01:03:01/DPA1*01:03:01,DPB1*30:01:01/DPB1*39:01:01,DQA1*01:01:02/DQA1*01:02:01,DQB1*05:01:01/DQB1*06:04:01,DRB1*01:02:01/DRB1*13:02:01,DRB3*03:01:01/DRB345*Not_Present,-,-
1,KTYG,0,226,374,2.6e-08,0.0027,1,1,0.97,⋯,B*49:01:01!/B*58:01:01,C*03:02:02/C*07:01:01!,DPA1*01:03:01/DPA1*01:03:01,DPB1*30:01:01/DPB1*39:01:01,DQA1*01:01:02/DQA1*01:02:01,DQB1*05:01:01/DQB1*06:04:01,DRB1*01:02:01/DRB1*13:02:01,DRB3*03:01:01/DRB345*Not_Present,-,-
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
873940,RKTY,1,6,4,2.5e-08,0.021,0.001,1,0.51,⋯,B*51:01:01/B*58:02:01,C*06:02:01/C*16:46,DPA1*02:01:01/DPA1*02:01:01,DPB1*17:01:01!/DPB1*17:01:01!,DQA1*01:01:02/DQA1*01:02:01,DQB1*05:01:01/DQB1*06:02:01,DRB1*01:02:01/DRB1*11:01:02,DRB3*02:02:01/DRB345*Not_Present,-,-
873940,RKTY,1,6,4,2.5e-08,0.021,0.001,1,0.51,⋯,B*18:01:01/B*53:01:01,C*04:01:01/C*07:04:01,DPA1*02:01:08/DPA1*03:01:01,DPB1*01:01:01/DPB1*105:01:01,DQA1*01:02:01/DQA1*02:01:01,DQB1*02:02:01/DQB1*06:02:01,DRB1*07:01:01/DRB1*11:01:02,DRB3*02:02:01,DRB4*01:01:01:01,-


In [285]:
options(repr.matrix.max.rows=5, repr.matrix.max.columns=5)
# Total number of GLIPH groups identified from the KSTME dataset
print(str_c("Total number of GLIPH groups in KSTME datasets",
    length(unique(kstme_gliph_cluster_pred_table$pattern)), sep = " : "))
print(str_c("Number of GLIPH groups of length >= 3",
    kstme_gliph_cluster_pred_table |> 
        filter(pattern != "single" & nchar(pattern) > 2) |>
        pull(pattern) |>
        unique() |>
        length(), sep = " : ")) 
print(str_c("Number of GLIPH groups of length >= 3, vb_score <= 0.01, unique CDR3s >= 3, and unique PTIDs >= 3",
    kstme_gliph_cluster_pred_table |> 
        filter(pattern != "single" & nchar(pattern) > 2) |>
        separate(Sample, into = c("repertoire_id", "cohort"), sep = ":") |>
        mutate(patient_id = str_extract(repertoire_id, "008_\\d+"),
            pattern = if_else(pattern == "single", TcRb, pattern)) |>
        group_by(pattern) |>
        mutate(number_of_unique_cdr = length(unique(TcRb)),
                number_of_unique_ptid = length(unique(patient_id))) |>
        filter(vb_score <= 0.01 & number_of_unique_cdr >= 3 &
                number_of_unique_ptid >= 3) |>
        pull(pattern) |>
        unique() |>
        length(), sep = " : ")) 


[1] "Total number of GLIPH groups in KSTME datasets : 215850"
[1] "Number of GLIPH groups of length >= 3 : 215816"
[1] "Number of GLIPH groups of length >= 3, vb_score <= 0.01, unique CDR3s >= 3, and unique PTIDs >= 3 : 43545"


In [286]:
kstme_gliph_hla_pred_table

index,pattern,'HLA allele with lowest Fisher Score',Pvalue,'Number of subjects in this cluster with this allele','Number of subjects in this cluster with HLA','Number of subjects with this allele in total','Number of subjects with HLA in total'
<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,lKTYG,B*81,0.0250,25,226,30,342
1,lKTYG,C*18,0.0027,44,226,53,342
1,lKTYG,B*18,0.0580,55,226,74,342
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
873940,lRKTY,A*31,0.021,2,6,14,342
873940,lRKTY,DPB1*17,0.089,2,6,30,342


In [287]:
print(str_c("Total number of HLA restrictions predicted by GLIPH", 
    nrow(kstme_gliph_hla_pred_table), sep = " : "))
print(str_c("Total number of HLA restrictions for GLIPH gorups of length >= 3",
    kstme_gliph_hla_pred_table |> 
        mutate(pattern = str_remove(pattern, "^[lg]")) |>
        filter(pattern != "single" & nchar(pattern) > 2) |>
        nrow(), sep = " : ")) 
print(str_c("Total number of significant HLA restrictions for GLIPH gorups of length >= 3",
    kstme_gliph_hla_pred_table |> 
        mutate(pattern = str_remove(pattern, "^[lg]")) |>
        filter(pattern != "single" & nchar(pattern) > 2 & Pvalue <= 0.05) |>
        nrow(), sep = " : ")) 

[1] "Total number of HLA restrictions predicted by GLIPH : 1516816"
[1] "Total number of HLA restrictions for GLIPH gorups of length >= 3 : 993612"
[1] "Total number of significant HLA restrictions for GLIPH gorups of length >= 3 : 640223"


In [288]:
# Filter out high confidence GLIPH groups with a pattern length of >2 where, 
# the pattern has at 3 unique CDR3s associated with it, at least 3 PTIDs 
# associated with it and with a vb_score <= 0.01
kstme_hc_gliph_cluster_table <- kstme_gliph_cluster_pred_table |>
  dplyr::select(pattern, vb_score, hla_score, expansion_score, TcRb, Sample, V, J,
    Freq) |>
  separate(Sample, into = c("repertoire_id", "cohort"), sep = ":") |>
  mutate(patient_id = str_extract(repertoire_id, "008_\\d+")) |>
  group_by(pattern) |>
  mutate(number_of_unique_cdr = length(unique(TcRb)),
    number_of_unique_ptid = length(unique(patient_id))) |>
  ungroup() |>
  filter(vb_score <= 0.01 & number_of_unique_cdr >= 3 &
    number_of_unique_ptid >= 3 & pattern != "single" & nchar(pattern) > 2)
kstme_hc_gliph_cluster_table

pattern,vb_score,hla_score,expansion_score,TcRb,repertoire_id,cohort,V,J,Freq,patient_id,number_of_unique_cdr,number_of_unique_ptid
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<int>,<int>
HHQP,0.001,0.023,1,CASSALIHFWHHQPQHF,008_074_D,Endemic KS - Tumor,,TRBJ1-5,1,008_074,155,89
HHQP,0.001,0.023,1,CASSLNLRHHQPQHF,008_143_D,Endemic KS - Tumor,,TRBJ1-5,1,008_143,155,89
HHQP,0.001,0.023,1,CASSECLPIFHHQPQHF,008_189_B,Endemic KS - Tumor,,TRBJ1-5,1,008_189,155,89
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
RKTY,0.001,0.021,1,CASGSTRAARLATLLDPRKTYGYTF,008_008_C,Epidemic KS - Tumor,,TRBJ1-2,1,008_008,243,99
RKTY,0.001,0.021,1,CTSRLRSLCERPMVRKTYGYTF,008_175_B,Endemic KS - Tumor,TRBV1-1,TRBJ1-2,1,008_175,243,99


In [289]:
# Filter out HLA predictions for high confidence GLIPH groups identified in the
# previous step
kstme_hc_gliph_hla_table <- kstme_gliph_hla_pred_table |>
  dplyr::rename(sig_allele = `'HLA allele with lowest Fisher Score'`) |>
  mutate(pattern = str_remove(pattern, "^[lg]")) |>
  filter(pattern %in% kstme_hc_gliph_cluster_table$pattern & Pvalue <= 0.05) |>
  dplyr::select(pattern, sig_allele, Pvalue)
kstme_hc_gliph_hla_table

pattern,sig_allele,Pvalue
<chr>,<chr>,<dbl>
RKTY,B*81,0.04500
RKTY,C*18,0.00016
RKTY,B*18,0.04900
⋮,⋮,⋮
SLGLAGG%NE,A*74,0.021
RKTY,A*31,0.021


In [290]:
# Merge GLIPH clusters with significant HLA association
kstme_high_confidence_gliph_groups <- kstme_hc_gliph_cluster_table |>
  left_join(kstme_hc_gliph_hla_table, by = "pattern",
    relationship = "many-to-many") 
kstme_high_confidence_gliph_groups

pattern,vb_score,hla_score,expansion_score,TcRb,repertoire_id,cohort,V,J,Freq,patient_id,number_of_unique_cdr,number_of_unique_ptid,sig_allele,Pvalue
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<int>,<int>,<chr>,<dbl>
HHQP,0.001,0.023,1,CASSALIHFWHHQPQHF,008_074_D,Endemic KS - Tumor,,TRBJ1-5,1,008_074,155,89,DPB1*03,0.023
HHQP,0.001,0.023,1,CASSLNLRHHQPQHF,008_143_D,Endemic KS - Tumor,,TRBJ1-5,1,008_143,155,89,DPB1*03,0.023
HHQP,0.001,0.023,1,CASSECLPIFHHQPQHF,008_189_B,Endemic KS - Tumor,,TRBJ1-5,1,008_189,155,89,DPB1*03,0.023
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
RKTY,0.001,0.021,1,CTSRLRSLCERPMVRKTYGYTF,008_175_B,Endemic KS - Tumor,TRBV1-1,TRBJ1-2,1,008_175,243,99,DRB1*04,0.037
RKTY,0.001,0.021,1,CTSRLRSLCERPMVRKTYGYTF,008_175_B,Endemic KS - Tumor,TRBV1-1,TRBJ1-2,1,008_175,243,99,A*31,0.021


In [291]:
# Annotate GLIPH groups with antigenic specificity based on the CDR3 sequences
# they contain
path_annotated_gliph_clusters <- hla_matched_tcr |>
  distinct() |>
  inner_join(kstme_high_confidence_gliph_groups, by = c("repertoire_id", "junction_aa" = "TcRb"),
    relationship = "many-to-many") |>
  dplyr::select(repertoire_id, pathology, pattern) |>
  distinct()
path_annotated_gliph_clusters


repertoire_id,pathology,pattern
<chr>,<chr>,<chr>
008_001_D,EBV,%TGDSNQP
008_001_D,EBV,GT%DSNQP
008_001_D,EBV,GTGD%NQP
⋮,⋮,⋮
008_037_B,Other,SL%NTE
008_037_B,Other,SLG%TE


In [292]:
 kstme_ck_gliph_table <- kstme_high_confidence_gliph_groups |>
  dplyr::select(repertoire_id, pattern, TcRb, V, J) |>
  left_join(path_annotated_gliph_clusters, by = c("repertoire_id", "pattern"), relationship = "many-to-many") |>
  distinct() |>
  filter(is.na(pathology)) |>
  mutate(pathology = if_else(is.na(pathology), "Clustered\nUnknown", pathology)) |>
  group_by(TcRb, V, J) |>
  mutate(npath = length(unique(pathology)),
    pathology = str_c(unique(pathology), collapse =";")) |>
  ungroup() |>
  mutate(pathology = if_else(npath > 1, "Multi-pathogen", pathology)) 

In [293]:
# Annotate GLIPH groups by antigenic specificity of CDR3s goruped within
kstme_path_annotated_gliph_groups <- kstme_high_confidence_gliph_groups |>
  dplyr::select(repertoire_id, pattern, TcRb, V, J) |>
  left_join(path_annotated_gliph_clusters, by = c("repertoire_id", "pattern"), relationship = "many-to-many") |>
  distinct() |>
  mutate(pathology = if_else(is.na(pathology), "Clustered\nUnknown", pathology)) |>
  group_by(TcRb, V, J) |>
  mutate(npath = length(unique(pathology)),
    pathology = str_c(unique(pathology), collapse =";")) |>
  ungroup() |>
  mutate(pathology = if_else(npath > 1, "Multi-pathogen", pathology)) 
kstme_path_annotated_gliph_groups |>
  group_by(pathology) |> 
  summarize(count = length(unique(TcRb))) 


pathology,count
<chr>,<int>
CMV,65
Clustered Unknown,151771
EBV,17
⋮,⋮
Other,275
SARS-CoV-2,6


In [294]:
kstme_gliph_nprod_table <- annotated_nprod_table |>
    mutate(visit_code = if_else(is.na(visit_code), "V01", visit_code)) |>
    filter(cohort %in% c("Hippos") & visit_code == "V01" &
        tissue_type %in% c("NAT", "Tumor")) |>
    left_join(kstme_path_annotated_gliph_groups, by = c("repertoire_id", 
        "junction_aa" = "TcRb", "v_call" = "V",
        "j_call" = "J"), relationship = "many-to-many") |>
    distinct() |>
    dplyr::select(cohort, patient_id, repertoire_id, tissue_type, phenotype, pathology, junction_aa, v_call, j_call, duplicate_frequency) |>
    mutate(pathology = if_else(is.na(pathology), "Unclustered\nUnknown", pathology),
        phenotype = if_else(is.na(phenotype) , "Epidemic KS", phenotype)) |>
    group_by(junction_aa, v_call, j_call ) |>
    mutate(npath = length(unique(pathology)),
        pathology = str_c(unique(pathology), collapse =";")) |>
    ungroup() |>
    mutate(pathology = if_else(npath > 1, "Multi-pathogen", pathology)) |>
    distinct()
kstme_gliph_nprod_table

cohort,patient_id,repertoire_id,tissue_type,phenotype,pathology,junction_aa,v_call,j_call,duplicate_frequency,npath
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<int>
Hippos,008_001,008_001_A,NAT,Epidemic KS,Unclustered Unknown,YSSAATIGQSRANVLTF,TRBV17-1,TRBJ2-6,0.01190476,1
Hippos,008_001,008_001_A,NAT,Epidemic KS,Clustered Unknown,CSVTPGLSYEQYF,TRBV29-1,TRBJ2-7,0.01190476,1
Hippos,008_001,008_001_A,NAT,Epidemic KS,Unclustered Unknown,CSAYKGQTGVNTEAFF,,TRBJ1-1,0.01190476,1
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
Hippos,008_265,008_265_D,Tumor,Epidemic KS,Unclustered Unknown,CVALTRGTEAFF,TRBV2-1,TRBJ1-1,0.0004475608,1
Hippos,008_265,008_265_D,Tumor,Epidemic KS,Clustered Unknown,CYGALAKNIQYF,TRBV5-4,TRBJ2-4,0.0002983739,1


In [295]:
path_annotated_gliph_nprod_table <- kstme_gliph_nprod_table |>
    group_by(cohort, repertoire_id, pathology, phenotype, tissue_type) |>
    summarize(frequency = sum(duplicate_frequency)) |>
    ungroup() |>
    mutate(cohort = if_else(cohort == "Hippos", 
            str_c(phenotype, tissue_type, sep = " - "), "ARKS"))
path_annotated_gliph_nprod_table

cohort,repertoire_id,pathology,phenotype,tissue_type,frequency
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
Epidemic KS - NAT,008_001_A,Clustered Unknown,Epidemic KS,NAT,0.2380952
Epidemic KS - NAT,008_001_A,Unclustered Unknown,Epidemic KS,NAT,0.7619048
Epidemic KS - Tumor,008_001_B,Clustered Unknown,Epidemic KS,Tumor,0.2932144
⋮,⋮,⋮,⋮,⋮,⋮
Epidemic KS - Tumor,008_265_D,Multi-pathogen,Epidemic KS,Tumor,0.002585907
Epidemic KS - Tumor,008_265_D,Unclustered Unknown,Epidemic KS,Tumor,0.718683177


In [296]:
annotated_nprod_table |> 
    mutate(visit_code = if_else(is.na(visit_code), "V01", visit_code)) |>
    filter(cohort %in% c("Hippos", "ARKS") & visit_code == "V01" &
        tissue_type %in% c("NAT", "Tumor")) |>
    left_join(hla_matched_tcr, by = c("patient_id", "repertoire_id", "junction",
        "junction_aa")) |>
    dplyr::select(cohort, patient_id, repertoire_id, tissue_type, phenotype, pathology, junction_aa, duplicate_frequency)|> 
    filter(pathology %in% c("CMV", "DENV1", "DENV3/4", "EBV", "HCV", "HIV-1",
       "HSV-2", "InfluenzaA", "M.tuberculosis")) |>
    arrange(desc(duplicate_frequency)) |> 
    left_join(antigen_db, by = c("pathology", "junction_aa" = "trb_cdr3_aa")) |>
    dplyr::select(-mhc_allele) |>
    distinct() |>
    write_csv(str_c(table_path, "Top_known_sequences.csv", sep = "/"))

“[1m[22mDetected an unexpected many-to-many relationship between `x` and `y`.
[36mℹ[39m Row 3 of `x` matches multiple rows in `y`.
[36mℹ[39m Row 3399 of `y` matches multiple rows in `x`.


In [297]:
kstme_gliph_nprod_table |> filter(pathology %in% c("CMV", "DENV1", "DENV3/4", "EBV", "HCV", "HIV-1",
       "HSV-2", "InfluenzaA", "M.tuberculosis")) |>
    arrange(desc(duplicate_frequency)) |>
    left_join(antigen_db, by = c("pathology", "junction_aa" = "trb_cdr3_aa")) |>
    write_csv(str_c(table_path, "Top_gliph_confirmed_known_sequences.csv", sep = "/"))

“[1m[22mDetected an unexpected many-to-many relationship between `x` and `y`.
[36mℹ[39m Row 6 of `x` matches multiple rows in `y`.
[36mℹ[39m Row 3399 of `y` matches multiple rows in `x`.


#### Identify overlapping Clustered unknown GLIPH groups

In [299]:
kstme_gliph_tpack_table <- annotated_nprod_table |>
    mutate(visit_code = if_else(is.na(visit_code), "V01", visit_code)) |>
    filter(cohort %in% c("Hippos") & visit_code == "V01" &
        tissue_type %in% c("NAT", "Tumor")) |>
    left_join(kstme_path_annotated_gliph_groups, by = c("repertoire_id", 
        "junction_aa" = "TcRb", "v_call" = "V",
        "j_call" = "J"), relationship = "many-to-many") |>
    distinct() |>
    dplyr::select(cohort, patient_id, repertoire_id, tissue_type, phenotype, pathology, 
        junction_aa, v_call, j_call, duplicate_count, duplicate_frequency, pattern) |>
    mutate(pathology = if_else(is.na(pathology), "Unclustered\nUnknown", pathology),
        phenotype = if_else(is.na(phenotype) , "Epidemic KS", phenotype)) |>
    group_by(junction_aa, v_call, j_call ) |>
    mutate(npath = length(unique(pathology)),
        pathology = str_c(unique(pathology), collapse =";")) |>
    ungroup() |>
    mutate(pathology = if_else(npath > 1, "Multi-pathogen", pathology)) |>
    distinct()

In [300]:
kstme_clustered_unknown_table <- kstme_gliph_tpack_table |>
    filter(pathology == "Clustered\nUnknown") 
kstme_clustered_unknown_table

cohort,patient_id,repertoire_id,tissue_type,phenotype,pathology,junction_aa,v_call,j_call,duplicate_count,duplicate_frequency,pattern,npath
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<int>
Hippos,008_001,008_001_A,NAT,Epidemic KS,Clustered Unknown,CSVTPGLSYEQYF,TRBV29-1,TRBJ2-7,1,0.01190476,TPG%SYE,1
Hippos,008_001,008_001_A,NAT,Epidemic KS,Clustered Unknown,CSVTPGLSYEQYF,TRBV29-1,TRBJ2-7,1,0.01190476,%PGLSYE,1
Hippos,008_001,008_001_A,NAT,Epidemic KS,Clustered Unknown,CASSQEGGGRGQPQHF,TRBV4-1,TRBJ1-5,2,0.02380952,SQEGGG%GQP,1
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
Hippos,008_265,008_265_D,Tumor,Epidemic KS,Clustered Unknown,CASGSRVHEQYF,TRBV12-5,TRBJ2-7,12,0.0005967477,GSRV%E,1
Hippos,008_265,008_265_D,Tumor,Epidemic KS,Clustered Unknown,CYGALAKNIQYF,TRBV5-4,TRBJ2-4,6,0.0002983739,%LAKNI,1


In [302]:
kstme_gliph_pattern_table <- kstme_clustered_unknown_table |>
    dplyr::select(repertoire_id, pattern) |>
    filter(!is.na(repertoire_id) & !is.na(pattern)) |>
    distinct()
kstme_gliph_pattern_table

repertoire_id,pattern
<chr>,<chr>
008_001_A,TPG%SYE
008_001_A,%PGLSYE
008_001_A,SQEGGG%GQP
⋮,⋮
008_265_D,GSRV%E
008_265_D,%LAKNI


In [303]:
kstme_gliph_dist_table <- kstme_gliph_pattern_table |>
    group_by(repertoire_id) |>
    summarize(gliph_groups = list(unique(pattern))) |>
    mutate(gliph_groups = map(gliph_groups, ~sort(as.character(.)))) |>
    ungroup()
kstme_gliph_dist_table

repertoire_id,gliph_groups
<chr>,<list>
008_001_A,"%PGLSYE,...."
008_001_B,"%AGE, %A...."
008_001_D,"%ADNE, %...."
⋮,⋮
008_264_D,"%ADNE, %...."
008_265_D,"%AGGE, %...."


In [304]:
kstme_clustered_unknown_table |> pull(pattern) |> unique() |> length()

In [305]:
intra_tumor_sharing_count <- kstme_clustered_unknown_table |>
    filter(!str_detect(repertoire_id, "008_\\d+_A")) |>
    group_by(pattern, patient_id) |>
    summarise(nreps = length(unique(repertoire_id))) |>
    filter(nreps > 1) |>
    pull(pattern) |>
    unique() |>
    length()
str_c("Number of specificity groups shared by at least two tumors derieved from spatially distinct sites from the same individual", intra_tumor_sharing_count, sep = " : ")

In [306]:
inter_tumor_sharing_count <- kstme_clustered_unknown_table |>
    filter(!str_detect(repertoire_id, "008_\\d+_A")) |>
    group_by(pattern) |>
    summarise(nptids = length(unique(patient_id))) |>
    filter(nptids > 1) |>
    pull(pattern) |>
    unique() |>
    length()
str_c("Number of specificity groups shared by at least two tumors derieved from spatially distinct sites from the different individuals", inter_tumor_sharing_count, sep = " : ")

In [307]:
inter_by_cond_tumor_sharing_count <- kstme_clustered_unknown_table |>
    filter(!str_detect(repertoire_id, "008_\\d+_A")) |>
    group_by(pattern,phenotype) |>
    summarise(nptids = length(unique(patient_id))) |>
    filter(nptids > 1) |>
    ungroup() |>
    group_by(phenotype) |>
    summarize(ngliphs = length(unique(pattern)))
inter_by_cond_tumor_sharing_count 

phenotype,ngliphs
<chr>,<int>
Endemic KS,26735
Epidemic KS,33469


In [308]:
inter_cond_tumor_sharing_count <- kstme_clustered_unknown_table |>
    filter(!str_detect(repertoire_id, "008_\\d+_A")) |>
    group_by(pattern) |>
    summarise(nptids = length(unique(phenotype))) |>
    filter(nptids > 1) |>
    pull(pattern) |>
    unique() |>
    length()
str_c("Number of specificity groups shared by at least two tumors derieved from spatially distinct sites from the epidemic and endemic individuals", inter_cond_tumor_sharing_count, sep = " : ")

In [309]:
kstme_gliph_dist_tibble <- expand_grid(kstme_gliph_dist_table, 
    kstme_gliph_dist_table, .name_repair = "unique") |>
  dplyr::rename(rep_one = repertoire_id...1,
    glist_one = gliph_groups...2,
    rep_two = repertoire_id...3,
    glist_two = gliph_groups...4) |>
  rowwise() |>
  mutate(jaccard = length(intersect(glist_one, glist_two))/length(union(glist_one, glist_two)))  |>
  dplyr::select(rep_one, rep_two, jaccard) |>
  filter(str_detect(rep_one, "008_\\d+") & str_detect(rep_two, "008_\\d+") & 
    !is.na(jaccard) & 
    str_extract(rep_one, "[A-Z]+$") %in% c("A","B", "C", "D") &
    str_extract(rep_two, "[A-Z]+$") %in% c("A","B", "C", "D")) |>
  ungroup() |> 
  mutate(sample_group = if_else(str_extract(rep_one, "008_\\d+") ==
      str_extract(rep_two, "008_\\d+"), "Intra-sample", "Inter-sample"),
    tumor_group = case_when(
      str_extract(rep_one, "[A-Z]+$") == "A" &
        str_extract(rep_two, "[A-Z]+$") == "A" ~ "NAT - NAT",
      str_extract(rep_one, "[A-Z]+$") == "A" &
        str_extract(rep_two, "[A-Z]+$") %in% c("B", "C", "D") ~ "NAT - Tumor",
      str_extract(rep_two, "[A-Z]+$") == "A" &
        str_extract(rep_one, "[A-Z]+$") %in% c("B", "C", "D") ~ "NAT - Tumor",
      str_extract(rep_two, "[A-Z]+$") %in% c("B", "C", "D") &
        str_extract(rep_one, "[A-Z]+$") %in% c("B", "C", "D") ~ "Tumor - Tumor"),
    ks_group = str_c(ptid_dict[str_extract(rep_one, "008_\\d+")],
      ptid_dict[str_extract(rep_two, "008_\\d+")], sep = " - "),
    ks_group = str_replace(ks_group, "Epidemic KS - Endemic KS",
      "Endemic KS - Epidemic KS")) |>
  filter(!(sample_group == "Inter-sample" & tumor_group == "NAT - Tumor") &
    !(rep_one == rep_two))
kstme_gliph_dist_tibble

[1m[22mNew names:
[36m•[39m `repertoire_id` -> `repertoire_id...1`
[36m•[39m `gliph_groups` -> `gliph_groups...2`
[36m•[39m `repertoire_id` -> `repertoire_id...3`
[36m•[39m `gliph_groups` -> `gliph_groups...4`


rep_one,rep_two,jaccard,sample_group,tumor_group,ks_group
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>
008_001_A,008_001_B,0.014041514,Intra-sample,NAT - Tumor,Epidemic KS - Epidemic KS
008_001_A,008_001_D,0.008120179,Intra-sample,NAT - Tumor,Epidemic KS - Epidemic KS
008_001_A,008_002_A,0.000000000,Inter-sample,NAT - NAT,Epidemic KS - Epidemic KS
⋮,⋮,⋮,⋮,⋮,⋮
008_265_D,008_263_D,0.0375829,Inter-sample,Tumor - Tumor,Epidemic KS - Epidemic KS
008_265_D,008_264_D,0.0606463,Inter-sample,Tumor - Tumor,Endemic KS - Epidemic KS


### Figure 3:

In [316]:
### Overlapping TCRs section
endemic_pal <- alluvial_data |>
  filter(str_detect(repertoire_id,  "008_057|008_061")) |>
  topSeqs(top = 500) |>
  cloneTrack() |> 
  group_by(junction_aa) |>
  summarize(reps = str_c(unique(str_extract(repertoire_id, "[ABCD]$")), collapse= ""),
    samples = length(unique(str_extract(repertoire_id, "\\d+_\\d+"))),
    seen = max(seen)) |>
  mutate(pal = case_when(seen == 1 ~ "#f0f0f0",
    samples > 1 & reps != "A" ~ "#e7298a",
    samples > 1 & reps == "A" ~ "#1b9e77",
    samples == 1 & seen > 1 & str_detect(reps, "A") & 
      str_detect(reps, "B|C|D") ~ "#d95f02",
    samples == 1 & seen > 1 & !str_detect(reps, "A") ~ "#7570b3"))
endemic_alluvial <- alluvial_data |>
  filter(str_detect(repertoire_id, "008_057|008_061")) |>
  topSeqs(top = 500) |>
  cloneTrack() |>
  plotTrack(alist = endemic_pal$junction_aa, apal = endemic_pal$pal) +
  scale_x_discrete(limits = c("008_061_A", "008_061_B", "008_061_C", "008_061_D",
    "008_057_D", "008_057_C", "008_057_B", "008_057_A"),
    labels = c("NAT", "Tumor B", "Tumor C", "Tumor D", "Tumor D", "Tumor C",
      "Tumor B", "NAT")) +
  annotate("text", label = "Endemic KS\nStudy Entry", y = 0.95, size = 12, x = 2.5) +
  annotate("text", label = "Endemic KS\nStudy Entry", y = 0.95, size = 12, x = 6.5) +
  geom_vline(xintercept = 4.5, linetype="dotted") + 
  ylim(0,1) +
  theme_classic(base_size = 30) +
  theme(legend.position = "none")


epidemic_pal <- alluvial_data |>
  filter(str_detect(repertoire_id, "008_021|008_048")) |>
  topSeqs(top = 500) |>
  cloneTrack() |> 
  group_by(junction_aa) |>
  summarize(reps = str_c(unique(str_extract(repertoire_id, "[ABCD]$")), collapse= ""),
    samples = length(unique(str_extract(repertoire_id, "\\d+_\\d+"))),
    seen = max(seen)) |>
  mutate(pal = case_when(seen == 1 ~ "#f0f0f0",
    samples > 1 & reps != "A" ~ "#e7298a",
    samples > 1 & reps == "A" ~ "#1b9e77",
    samples == 1 & seen > 1 & str_detect(reps, "A") & 
      str_detect(reps, "B|C|D") ~ "#d95f02",
    samples == 1 & seen > 1 & !str_detect(reps, "A") ~ "#7570b3"))

epidemic_alluvial <- alluvial_data |>
  filter(str_detect(repertoire_id, "008_021|008_048")) |>
  topSeqs(top = 500) |>
  cloneTrack() |>
  plotTrack(alist = epidemic_pal$junction_aa, apal = epidemic_pal$pal) +
  scale_x_discrete(limits = c("008_021_A", "008_021_B", "008_021_C", "008_021_D",
    "008_048_D", "008_048_C", "008_048_B", "008_048_A"),
    labels = c("NAT", "Tumor B", "Tumor C", "Tumor D", "Tumor D", "Tumor C",
      "Tumor B", "NAT")) +
  geom_vline(xintercept = 4.5, linetype="dotted") + 
  annotate("text", label = "Epidemic KS\nStudy Entry", y = 0.95, size = 12, x = 2.5) +
  annotate("text", label = "Epidemic KS\nStudy Entry", y = 0.95, size = 12, x = 6.5) +
  ylim(0,1) +
  theme_classic(base_size = 30) +
  theme(legend.position = "none")

endepi_pal <- alluvial_data |>
  filter(str_detect(repertoire_id,  "008_021|008_057")) |>
  topSeqs(top = 500) |>
  cloneTrack() |> 
  group_by(junction_aa) |>
  summarize(reps = str_c(unique(str_extract(repertoire_id, "[ABCD]$")), collapse= ""),
    samples = length(unique(str_extract(repertoire_id, "\\d+_\\d+"))),
    seen = max(seen)) |>
  mutate(pal = case_when(seen == 1 ~ "#f0f0f0",
    samples > 1 & reps != "A" ~ "#e7298a",
    samples > 1 & reps == "A" ~ "#1b9e77",
    samples == 1 & seen > 1 & str_detect(reps, "A") & 
      str_detect(reps, "B|C|D") ~ "#d95f02",
    samples == 1 & seen > 1 & !str_detect(reps, "A") ~ "#7570b3"))
endepi_alluvial <- alluvial_data |>
  filter(str_detect(repertoire_id, "008_021|008_057")) |>
  topSeqs(top = 500) |>
  cloneTrack() |>
  plotTrack(alist = endepi_pal$junction_aa, apal = endepi_pal$pal) +
  scale_x_discrete(limits = c("008_021_A", "008_021_B", "008_021_C", "008_021_D",
    "008_057_D", "008_057_C", "008_057_B", "008_057_A"),
    labels = c("NAT", "Tumor B", "Tumor C", "Tumor D", "Tumor D", "Tumor C",
      "Tumor B", "NAT")) +
  geom_vline(xintercept = 4.5, linetype="dotted") + 
  annotate("text", label = "Epidemic KS\nStudy Entry", y = 0.95, size = 12, x = 2.5) +
  annotate("text", label = "Endemic KS\nStudy Entry", y = 0.95, size = 12, x = 6.5) +
  ylim(0,1) +
  theme_classic(base_size = 30) +
  theme(legend.position = "none")

endemic_compairr <- compairr_table |>
  mutate(color_group = str_c(sample_group, tumor_group, sep = "\n"),
    overlap = if_else(overlap > 1, 1, overlap)) |>
  filter(ks_group == "Endemic KS - Endemic KS") |>
  ggboxplot(x = "color_group", y = "overlap", fill = "color_group",
    order = c("Inter-subject\nNAT - NAT", "Intra-subject\nNAT - Tumor",
      "Intra-subject\nTumor - Tumor", "Inter-subject\nTumor - Tumor")) |>
  ggpar(xlab = FALSE, ylab = "Morisita-Horn index", 
    ggtheme = theme_classic(base_size = 30), legend = "none",
    legend.title = "Group", palette = c("Inter-subject\nNAT - NAT" = "#1b9e77", 
      "Intra-subject\nNAT - Tumor" = "#d95f02",
      "Intra-subject\nTumor - Tumor" = "#7570b3", 
      "Inter-subject\nTumor - Tumor" = "#e7298a")) +
  stat_compare_means(comparisons = list(c("Intra-subject\nNAT - Tumor",
      "Intra-subject\nTumor - Tumor")), size = 6,  label = "p.signif") +
  scale_y_continuous(expand = expansion(mult = c(0.05, 0.25)))

epidemic_compairr <- compairr_table |>
  mutate(color_group = str_c(sample_group, tumor_group, sep = "\n"),
    overlap = if_else(overlap > 1, 1, overlap)) |>
  filter(ks_group == "Epidemic KS - Epidemic KS") |>
  ggboxplot(x = "color_group", y = "overlap", fill = "color_group",
    order = c("Inter-subject\nNAT - NAT", "Intra-subject\nNAT - Tumor",
      "Intra-subject\nTumor - Tumor", "Inter-subject\nTumor - Tumor")) |>
  ggpar(xlab = FALSE, ylab = "Morisita-Horn index", 
    ggtheme = theme_classic(base_size = 30), legend = "right",
    legend.title = "Group", palette = c("Inter-subject\nNAT - NAT" = "#1b9e77", 
      "Intra-subject\nNAT - Tumor" = "#d95f02",
      "Intra-subject\nTumor - Tumor" = "#7570b3", 
      "Inter-subject\nTumor - Tumor" = "#e7298a")) +
  stat_compare_means(comparisons = list(c("Intra-subject\nNAT - Tumor",
      "Intra-subject\nTumor - Tumor")), size = 6,  label = "p.signif") +
  scale_y_continuous(expand = expansion(mult = c(0.05, 0.25)))

endemic_epidemic_compairr <- compairr_table |>
  mutate(color_group = str_c(sample_group, tumor_group, sep = "\n"),
    overlap = if_else(overlap > 1, 1, overlap)) |>
  filter(ks_group == "Endemic KS - Epidemic KS") |>
  ggboxplot(x = "color_group", y = "overlap", fill = "color_group",
    order = c("Inter-subject\nNAT - NAT", "Inter-subject\nTumor - Tumor")) |>
  ggpar(xlab = FALSE, ylab = "Morisita-Horn index", ylim = c(0,1),
    ggtheme = theme_classic(base_size = 30),
    legend = "none", palette = c("Inter-subject\nNAT - NAT" = "#1b9e77", 
      "Intra-subject\nNAT - Tumor" = "#d95f02",
      "Intra-subject\nTumor - Tumor" = "#7570b3", 
      "Inter-subject\nTumor - Tumor" = "#e7298a"), size = 6, label = "p.signif")
### Public TCRs section
public_tcrs_plot <- path_annotated_nprod_table |>
  mutate(frequency = frequency * 100) |>
  filter(cohort != "ARKS") |>
  ggboxplot(x = "pathology", y = "frequency", color = "cohort", 
    add = "jitter", order = c("CMV", "DENV1", "DENV3/4", "EBV", "HCV", "HIV-1",
       "HSV-2",
      "InfluenzaA", "M.tuberculosis", "SARS-CoV-2", "Other", "Multi-pathogen",
      "Unknown")) |>
  ggpar(xlab = FALSE, ylab = "Cummulative frequency", 
    legend.title = "Cohort", 
    format.scale = T, legend = "right",
    ggtheme = theme_classic(base_size = 30),
    palette = c("Epidemic KS - NAT" = "#fb8072", 
  "Epidemic KS - Tumor" = "#e41a1c", "Endemic KS - NAT" = "#80b1d3",
  "Endemic KS - Tumor" = "#377eb8")) +
  geom_hline(yintercept = 1, linetype = 2) + 
  geom_hline(yintercept = 25, linetype = 2) + 
  geom_hline(yintercept = 75, linetype = 2) + 
  geom_hline(yintercept = 90, linetype = 2) + 
  scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  annotation_logticks(sides = "l", short = unit(2,"mm"),
    mid = unit(4,"mm"),long = unit(8,"mm"), size = 1) +
  theme_classic(base_size = 30) +
  theme(axis.text = element_text(face = "bold"),
    axis.title.x = element_blank())

public_gliph_plot <- path_annotated_gliph_nprod_table |>
  mutate(frequency = frequency * 100) |>
  ggboxplot(x = "pathology", y = "frequency", color = "cohort", 
    add = "jitter", order = c("CMV", "EBV", "HCV", "HIV-1",
       "HSV-2", "InfluenzaA", "M.tuberculosis", "SARS-CoV-2", "Other", 
       "Multi-pathogen",   "Clustered\nUnknown", "Unclustered\nUnknown")) |>
  ggpar(xlab = FALSE, ylab = "Cummulative frequency", 
    legend.title = "Cohort", 
    format.scale = T, legend = "right",
    ggtheme = theme_classic(base_size = 30),
    palette = c("Epidemic KS - NAT" = "#fb8072", 
  "Epidemic KS - Tumor" = "#e41a1c", "Endemic KS - NAT" = "#80b1d3",
  "Endemic KS - Tumor" = "#377eb8")) +
  geom_hline(yintercept = 1, linetype = 2) +
  geom_hline(yintercept = 25, linetype = 2) +
  geom_hline(yintercept = 75, linetype = 2) +  
  geom_hline(yintercept = 90, linetype = 2) +  
  scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  annotation_logticks(sides = "l", short = unit(2,"mm"),
    mid = unit(4,"mm"),long = unit(8,"mm"), size = 1) + 
  theme_classic(base_size = 30) +
  theme(axis.text = element_text(face = "bold"),
    axis.title.x = element_blank())

### Overlapping clustered unknown section
endemic_gliph_compairr <- kstme_gliph_dist_tibble |>
  mutate(color_group = str_c(str_replace(sample_group, "sample", "subject"), tumor_group, sep = "\n"),
    jaccard = if_else(jaccard > 1, 1, jaccard)) |>
  filter(ks_group == "Endemic KS - Endemic KS") |>
  ggboxplot(x = "color_group", y = "jaccard", fill = "color_group",
    order = c("Inter-subject\nNAT - NAT", "Intra-subject\nNAT - Tumor",
      "Intra-subject\nTumor - Tumor", "Inter-subject\nTumor - Tumor")) |>
  ggpar(xlab = FALSE, ylab = "Jaccard index",  #title = "Endemic KS - Endemic KS",
    ggtheme = theme_classic(base_size = 30), legend = "none",
    legend.title = "Group", palette = c("Inter-subject\nNAT - NAT" = "#1b9e77", 
      "Intra-subject\nNAT - Tumor" = "#d95f02",
      "Intra-subject\nTumor - Tumor" = "#7570b3", 
      "Inter-subject\nTumor - Tumor" = "#e7298a"),
    ylim = c(0,1)) +  stat_compare_means(comparisons = list(c("Intra-subject\nNAT - Tumor",
      "Intra-subject\nTumor - Tumor")), size = 6)

epidemic_gliph_compairr <- kstme_gliph_dist_tibble |>
  mutate(color_group = str_c(str_replace(sample_group, "sample", "subject"), tumor_group, sep = "\n"),
    jaccard = if_else(jaccard > 1, 1, jaccard)) |>
  filter(ks_group == "Epidemic KS - Epidemic KS") |>
  ggboxplot(x = "color_group", y = "jaccard", fill = "color_group",
    order = c("Inter-subject\nNAT - NAT", "Intra-subject\nNAT - Tumor",
      "Intra-subject\nTumor - Tumor", "Inter-subject\nTumor - Tumor")) |>
  ggpar(xlab = FALSE, ylab = "Jaccard index",  #title = "Epidemic KS - Epidemic KS",
    ggtheme = theme_classic(base_size = 30), legend = "right",
    legend.title = "Group", palette = c("Inter-subject\nNAT - NAT" = "#1b9e77", 
      "Intra-subject\nNAT - Tumor" = "#d95f02",
      "Intra-subject\nTumor - Tumor" = "#7570b3", 
      "Inter-subject\nTumor - Tumor" = "#e7298a"),
    ylim = c(0,1)) +
  stat_compare_means(comparisons = list(c("Intra-subject\nNAT - Tumor",
      "Intra-subject\nTumor - Tumor")), size = 6) 

endemic_epidemic_gliph_compairr <- kstme_gliph_dist_tibble |>
  mutate(color_group = str_c(str_replace(sample_group, "sample", "subject"), tumor_group, sep = "\n"),
    jaccard = if_else(jaccard > 1, 1, jaccard)) |>
  filter(ks_group == "Endemic KS - Epidemic KS") |>
  ggboxplot(x = "color_group", y = "jaccard", fill = "color_group",
    order = c("Inter-subject\nNAT - NAT", "Inter-subject\nTumor - Tumor")) |>
  ggpar(xlab = FALSE, ylab = "Jaccard index", ylim = c(0,1),
    ggtheme = theme_classic(base_size = 30), #title = "Endemic KS - Epidemic KS",
    legend = "none", palette = c("Inter-sample\nNAT - NAT" = "#1b9e77", 
      "Intra-subject\nNAT - Tumor" = "#d95f02",
      "Intra-subject\nTumor - Tumor" = "#7570b3", 
      "Inter-subject\nTumor - Tumor" = "#e7298a")) +
  stat_compare_means(comparisons = list(c("Intet-subject\nNAT - NAT",
      "Intra-subject\nTumor - Tumor")), size = 6) 


figure_three_layout <- "
AAAABBBBCCCC
AAAABBBBCCCC
AAAABBBBCCCC
DDDDEEEEFFFF
DDDDEEEEFFFF
GGGGGGGGGGGG
GGGGGGGGGGGG
HHHHHHHHHHHH
HHHHHHHHHHHH
IIIIJJJJKKKK
IIIIJJJJKKKK
"

figure_three <- endemic_alluvial + 
  epidemic_alluvial + 
  endepi_alluvial +
  endemic_compairr +
  epidemic_compairr +
  endemic_epidemic_compairr + 
  public_tcrs_plot + 
  public_gliph_plot +
  endemic_gliph_compairr +
  epidemic_gliph_compairr +
  endemic_epidemic_gliph_compairr + 
  plot_layout(design = figure_three_layout, guides = "collect") +
  plot_annotation(tag_levels = "A") 

ggsave(str_c(figures_path, "pdf", "Figure_three.pdf", sep = "/"), figure_three, 
  width = 48, height = 42, device = cairo_pdf, family = "Arial Unicode MS")
ggsave(str_c(figures_path, "svg", "Figure_three.svg", sep = "/"), figure_three, 
  width = 48, height = 42)
ggsave(str_c(figures_path, "png", "Figure_three.png", sep = "/"), figure_three, 
  width = 48, height = 42)


“cannot compute exact p-value with ties”
“[1m[22mComputation failed in `stat_signif()`
Caused by error in `if (scales$x$map(comp[1]) == data$group[1] | manual) ...`:
[33m![39m missing value where TRUE/FALSE needed”
“cannot compute exact p-value with ties”
“[1m[22mComputation failed in `stat_signif()`
Caused by error in `if (scales$x$map(comp[1]) == data$group[1] | manual) ...`:
[33m![39m missing value where TRUE/FALSE needed”
“cannot compute exact p-value with ties”
“[1m[22mComputation failed in `stat_signif()`
Caused by error in `if (scales$x$map(comp[1]) == data$group[1] | manual) ...`:
[33m![39m missing value where TRUE/FALSE needed”


### Supplementary figure 6:

In [318]:
# Prep the BL datasets for GLIPH
bl_gliph_trb_input_table <- study_annotated_nprod_table |> 
    filter(cohort %in% c("BL - Ghana", "BL - Uganda")) |>
    dplyr::select(cohort, repertoire_id, junction_aa, v_call, j_call, duplicate_count) |>
    mutate(gliph_id = str_c(repertoire_id, "BL", sep = ":"),
        cdra = NA_character_) |>
    dplyr::select(junction_aa, v_call, j_call, cdra, gliph_id, duplicate_count) |>
    write_tsv(bl_gliph_trb_input_path, col_names = FALSE)

In [320]:
bl_ebv_groups <- read_csv(bl_gliph_clusters_path, show_col_types = FALSE) |>
    dplyr::select(pattern, vb_score, hla_score, expansion_score, TcRb, Sample, V, J,
    Freq) |>
    separate(Sample, into = c("repertoire_id", "cohort"), sep = ":") |>
    mutate(patient_id = repertoire_id) |>
    group_by(pattern) |>
    mutate(number_of_unique_cdr = length(unique(TcRb)),
        number_of_unique_ptid = length(unique(patient_id))) |>
    ungroup() |>
    filter(vb_score <= 0.01 & number_of_unique_cdr >= 3 &
        number_of_unique_ptid >= 3 & pattern != "single" & nchar(pattern) > 2) |>
    left_join(antigen_db, by = c("TcRb" = "trb_cdr3_aa"), relationship = "many-to-many") |>
    filter(pathology == "EBV")
    
bl_gliph_cluster_table <- read_csv(bl_gliph_clusters_path, show_col_types = FALSE) |>
    dplyr::select(pattern, vb_score, hla_score, expansion_score, TcRb, Sample, V, J,
    Freq) |>
    separate(Sample, into = c("repertoire_id", "cohort"), sep = ":") |>
    mutate(patient_id = repertoire_id) |>
    filter(pattern %in% bl_ebv_groups$pattern)

bl_gliph_cluster_table

pattern,vb_score,hla_score,expansion_score,TcRb,repertoire_id,cohort,V,J,Freq,patient_id
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>
SSL%GSNQP,0.006,1,0.14,CASSSLDGSNQPQHF,HU10496_0001,BL,TRBV6-6,TRBJ1-5,3,HU10496_0001
SSL%GSNQP,0.006,1,0.14,CATSSLTGSNQPQHF,HU10496_0001,BL,TRBV15-1,TRBJ1-5,9,HU10496_0001
SSL%GSNQP,0.006,1,0.14,CASSSLSGSNQPQHF,HU10496_0001,BL,TRBV6-6,TRBJ1-5,20,HU10496_0001
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
S%GQAYE,0.01,1,0.21,CASSRGQAYEQYF,H003840_02,BL,TRBV11-2,TRBJ2-7,1,H003840_02
S%GQAYE,0.01,1,0.21,CASSRGQAYEQYF,HU11595_0008,BL,TRBV19-1,TRBJ2-7,2,HU11595_0008


In [322]:
bl_nprod_table <- study_annotated_nprod_table |>
    filter(cohort %in% c("BL - Ghana", "BL - Uganda")) |>
    dplyr::select(cohort, patient_id, repertoire_id, tissue_type, phenotype, junction_aa, v_call, j_call, duplicate_count, duplicate_frequency) |>
    mutate(phenotype = cohort)
bl_nprod_table

cohort,patient_id,repertoire_id,tissue_type,phenotype,junction_aa,v_call,j_call,duplicate_count,duplicate_frequency
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
BL - Ghana,H003840_01,H003840_01,Tumor,BL - Ghana,CYQQFF,TRBV21-1,TRBJ2-1,3,2.215445e-05
BL - Ghana,H003840_01,H003840_01,Tumor,BL - Ghana,CASSTDNSYGYTF,TRBV9-1,TRBJ1-2,1,7.384815e-06
BL - Ghana,H003840_01,H003840_01,Tumor,BL - Ghana,CASSVEQGDEQYF,TRBV9-1,TRBJ2-7,1,7.384815e-06
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
BL - Uganda,009_0249,MVQ194745A,Tumor,BL - Uganda,CASNLDSREGGGAEAFF,TRBV5-5,TRBJ1-1,1,0.0004258944
BL - Uganda,009_0249,MVQ194745A,Tumor,BL - Uganda,CASSLHPVGGTSYEQYF,TRBV5-5,TRBJ2-7,1,0.0004258944


In [323]:
kstme_clustered_unknown_summary_table <- study_annotated_nprod_table |>
    filter(cohort == "Hippos" & junction_aa %in% kstme_clustered_unknown_table$junction_aa) |>
    clonality()
kstme_clustered_unknown_summary_table

In [324]:
kstme_clustered_unknown_summary_table

repertoire_id,total_sequences,unique_productive_sequences,total_count,clonality,gini_coefficient,top_productive_sequence,convergence
<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
008_001_B,972,972,9132,0.351524,0.8388728,15.82348,1.000000
008_001_D,1517,1517,10932,0.336478,0.8188119,12.35821,1.000000
008_001_I,553,553,2155,0.168007,0.6487356,5.75406,1.010101
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
008_264_V09,5358,5358,6884,0.028300,0.2070225,0.7553748,1.010101
008_265_D,1013,1013,5638,0.229788,0.7400670,5.0549840,1.020408


In [325]:
    
supplementary_figure_six <- kstme_gliph_tpack_table |>
    filter(pathology == "Clustered\nUnknown") |>
    mutate(duplicate_frequency = duplicate_frequency * 100,
        group = if_else(str_detect(repertoire_id, "008_\\d+"), 
          str_c(phenotype, tissue_type, sep = " - "), phenotype),
        group = factor(group, levels = c("Epidemic KS - NAT", 
          "Epidemic KS - Tumor", "Endemic KS - NAT",
          "Endemic KS - Tumor", "BL- Ghana", "BL - Uganda"))) |>
    ggboxplot(x = "group", y = "duplicate_frequency", color = "group") |>
    ggpar(xlab = FALSE, ylab = "Cummulative frequency", 
    legend.title = "Cohort", 
    format.scale = T, legend = "right",
    ggtheme = theme_classic(base_size = 32),
    palette = c("Epidemic KS - NAT" = "#fb8072", 
  "Epidemic KS - Tumor" = "#e41a1c", "Endemic KS - NAT" = "#80b1d3",
  "Endemic KS - Tumor" = "#377eb8")) +
  geom_hline(yintercept = 1, linetype = 2) +
  geom_hline(yintercept = 10, linetype = 2) +
  geom_hline(yintercept = 75, linetype = 2) +  
  scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  annotation_logticks(sides = "l", short = unit(2,"mm"),
    mid = unit(4,"mm"),long = unit(8,"mm"), size = 1) +
  theme(axis.text =  element_text(face = "bold"))
ggsave(str_c(figures_path, "pdf", "Supplementary_figure_six.pdf", sep = "/"), supplementary_figure_six, 
  width = 24, height = 10)
ggsave(str_c(figures_path, "svg", "Supplementary_figure_six.svg", sep = "/"), supplementary_figure_six, 
  width = 24, height = 10)
ggsave(str_c(figures_path, "png", "Supplementary_figure_six.png", sep = "/"), supplementary_figure_six, 
  width = 24, height = 10)

### Supplementary figure 7:

In [406]:
endemic_upset_input_table <- alluvial_data |>
  filter(str_detect(repertoire_id,  "008_057|008_061"))
endemic_upset_input_table

repertoire_id,junction_aa,v_call,d_call,j_call,v_family,d_family,j_family,reading_frame,duplicate_count,duplicate_frequency
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
008_057_A,CAGGGRGAEDTQYF,TRBV7-8,TRBD2-1,TRBJ2-3,TRBV7,TRBD2,TRBJ2,in-frame,1,0.0007429421
008_057_A,CAGRREETQYF,TRBV2-1,TRBD1-1,TRBJ2-5,TRBV2,TRBD1,TRBJ2,in-frame,1,0.0007429421
008_057_A,CAGSDRGRVDYGYTF,TRBV2-1,TRBD1-1,TRBJ1-2,TRBV2,TRBD1,TRBJ1,in-frame,1,0.0007429421
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
008_061_D,TLESCTGELFF,TRBV7-5,TRBD2-1,TRBJ2-2,TRBV7,TRBD2,TRBJ2,in-frame,1,6.928566e-05
008_061_D,WPAAATIPFHSDNEQFF,,TRBD2-1,TRBJ2-1,,TRBD2,TRBJ2,in-frame,1,6.928566e-05


In [407]:
# Define your list
my_list <- c("008_061_B", "008_061_C", "008_061_D", "008_057_B", "008_057_C", "008_057_D")

# Function to generate all subsets
all_subsets <- function(lst) {
  all_combinations <- list()
  for (i in 1:length(lst)) {
    all_combinations <- c(all_combinations, combn(lst, i, simplify = FALSE))
  }
  return(all_combinations)
}

# Get all subsets
subsets <- all_subsets(my_list)

formatter_list <- c()
# Print the formatted subsets
for (formatted_subset in subsets) {
  #print(str_c("upset_query(intersect = c(", str_c("'", formatted_subset, "'", collapse = ","), ") color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')", sep = ""))
  formatter_list <- c(str_c("upset_query(intersect = c(", str_c("'", formatted_subset, "'", collapse = ","), ") color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),", sep = ""), formatter_list)
}
collapsed_string <- paste(formatter_list, collapse = "\n")
cat(collapsed_string, '\n')


upset_query(intersect = c('008_061_B','008_061_C','008_061_D','008_057_B','008_057_C','008_057_D') color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
upset_query(intersect = c('008_061_C','008_061_D','008_057_B','008_057_C','008_057_D') color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
upset_query(intersect = c('008_061_B','008_061_D','008_057_B','008_057_C','008_057_D') color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
upset_query(intersect = c('008_061_B','008_061_C','008_057_B','008_057_C','008_057_D') color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
upset_query(intersect = c('008_061_B','008_061_C','008_061_D','008_057_C','008_057_D') color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
upset_query(intersect = c('008_061_B','008_

In [411]:
# UpSetR plots for endemic KS comparisons

endemic_upset_input_tibble <- alluvial_data |>
    filter(str_detect(repertoire_id,  "008_057|008_061"))  |>
    left_join(study_metadata, by = "repertoire_id") |>
    dplyr::select(repertoire_id, junction_aa, tissue_type, phenotype) |>
    mutate(present = 1,
        row_id = str_c(repertoire_id, tissue_type, phenotype, sep = " ")) |>
    pivot_wider(id_cols = junction_aa, names_from = repertoire_id, values_from = present, values_fill = 0) |>
    dplyr::select(junction_aa, "008_057_A", "008_057_B", "008_057_C", "008_057_D", "008_061_D",
        "008_061_C", "008_061_B", "008_061_A")

endemic_upset_rownames <- endemic_upset_input_tibble |>
    pull(junction_aa)
endemic_upset_input_table <- endemic_upset_input_tibble |>
    dplyr::select(-junction_aa) |>
    as.data.frame() 
rownames(endemic_upset_input_table) <- endemic_upset_rownames

endemic_upset <- upset(endemic_upset_input_table, 
    intersect = c("008_057_A", "008_057_B", "008_057_C", "008_057_D", "008_061_D", "008_061_C", "008_061_B", "008_061_A"),
    queries = list(
    upset_query(set = '008_061_A', fill = '#80b1d3'),
    upset_query(set = '008_061_B', fill = '#377eb8'),
    upset_query(set = '008_061_C', fill = '#377eb8'),
    upset_query(set = '008_061_D', fill = '#377eb8'),
    upset_query(set = '008_057_A', fill = '#80b1d3'),
    upset_query(set = '008_057_B', fill = '#377eb8'),
    upset_query(set = '008_057_C', fill = '#377eb8'),
    upset_query(set = '008_057_D', fill = '#377eb8'),
    upset_query(intersect = c('008_061_A'), color = '#80b1d3', fill = '#80b1d3', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_057_A'), color = '#80b1d3', fill = '#80b1d3', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B'), color = '#377eb8', fill = '#377eb8', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_C'), color = '#377eb8', fill = '#377eb8', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_D'), color = '#377eb8', fill = '#377eb8', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_057_B'), color = '#377eb8', fill = '#377eb8', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_057_C'), color = '#377eb8', fill = '#377eb8', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_057_D'), color = '#377eb8', fill = '#377eb8', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B','008_061_D'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_057_B','008_057_D'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_057_B','008_057_C', '008_057_D'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),    
    upset_query(intersect = c('008_061_B','008_061_C', '008_061_D'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),    
    upset_query(intersect = c('008_057_B','008_057_C'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_057_C','008_057_D'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B','008_061_C'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_C','008_061_D'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_057_A','008_057_B', '008_057_C', '008_057_D'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_A','008_061_B', '008_061_D'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_A','008_061_B'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_A','008_061_B', '008_061_C', '008_061_D'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_A','008_061_D'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_057_A','008_057_D'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_057_A','008_057_B'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_057_A','008_057_B', '008_057_D'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_057_A','008_057_C'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_057_A','008_057_C', '008_057_D'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_A','008_061_B', '008_061_C'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_A','008_061_C'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B','008_057_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_057_A','008_057_B', '008_057_C'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_D','008_057_D'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B','008_057_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B','008_057_D'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_A','008_061_C', '008_061_D'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_D','008_057_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B','008_057_D', '008_057_C', '008_057_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_A','008_057_D'), color = '#606060', fill = '#606060', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_D','008_057_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_D','008_057_D', '008_057_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B','008_057_D', '008_057_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B','008_061_C', '008_057_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B','008_061_C', '008_057_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_C','008_057_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B','008_061_C', '008_061_D', '008_057_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B','008_061_D', '008_057_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_C','008_057_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_A','008_057_B'), color = '#606060', fill = '#606060', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_A','008_057_D','008_057_C','008_057_B'), color = '#606060', fill = '#606060', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B','008_061_D', '008_057_D'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B','008_061_C', '008_061_D', '008_057_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_C','008_061_D', '008_057_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),    
    upset_query(intersect = c('008_057_C','008_061_A'), color = '#606060', fill = '#606060', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B','008_057_D','008_057_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_A','008_061_B', '008_057_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_C','008_061_D', '008_057_D', '008_057_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_A','008_061_D', '008_057_D', '008_057_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_C','008_057_D', '008_057_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_C','008_061_D', '008_057_C', '008_057_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_A','008_061_B', '008_061_D', '008_057_C', '008_057_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B','008_057_C', '008_057_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_D','008_057_D', '008_057_C', '008_057_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_A','008_061_D','008_057_A'), color = '#606060', fill = '#606060', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B','008_057_A'), color = '#606060', fill = '#606060', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_A','008_057_A'), color = '#1b9e77', fill = '#1b9e77', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B','008_057_D','008_057_A'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_B','008_061_C','008_061_D', '008_057_C', '008_057_A'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_D','008_057_B','008_057_A'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_A','008_057_B','008_057_A'), color = '#606060', fill = '#606060', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_A','008_057_D','008_057_B','008_057_A'), color = '#606060', fill = '#606060', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_D','008_057_D','008_057_C','008_057_B','008_057_A'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_C','008_057_D','008_057_C','008_057_B','008_057_A'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
    upset_query(intersect = c('008_061_A','008_057_D','008_057_C','008_057_B','008_057_A'), color = '#606060', fill = '#606060', only_components=c('intersections_matrix', 'Intersection size'))),
    intersections = list('008_061_B', '008_057_B','008_057_D', '008_061_D',
        '008_057_C', '008_061_A', '008_061_C', '008_057_A', 
        c('008_061_B','008_061_D'),
        c('008_061_B','008_061_C','008_061_D'),
        c('008_061_B','008_061_C'),
        c('008_061_D','008_061_C'),
        c('008_057_D','008_057_B'),
        c('008_057_D','008_057_C','008_057_B'), 
        c('008_057_C','008_057_B'),
        c('008_057_D','008_057_C'),
        c('008_061_A','008_061_B','008_061_D'),
        c('008_061_A','008_061_B'),
        c('008_061_A','008_061_B','008_061_C','008_061_D'),
        c('008_061_A','008_061_D'),
        c('008_061_A','008_061_B','008_061_C'),
        c('008_061_A','008_061_C'),
        c('008_061_A','008_061_C','008_061_D'),
        c('008_057_D','008_057_C','008_057_B','008_057_A'),
        c('008_057_D','008_057_A'),
        c('008_057_B','008_057_A'),
        c('008_057_D','008_057_B','008_057_A'),
        c('008_057_C','008_057_A'),
        c('008_057_D','008_057_C','008_057_A'),
        c('008_057_C','008_057_B','008_057_A'),
        c('008_057_B','008_061_B'),
        c('008_057_D','008_061_D'),
        c('008_061_B','008_057_C'),
        c('008_061_B','008_057_D'),
        c('008_061_D','008_057_B'),
        c('008_061_B','008_057_D','008_057_C','008_057_B'),
        c('008_061_D','008_057_C'),
        c('008_061_D','008_057_D','008_057_B'),
        c('008_061_B','008_057_D','008_057_B'),
        c('008_061_B','008_061_C','008_057_B'),
        c('008_061_B','008_061_C','008_057_C'),
        c('008_061_C','008_057_C'),
        c('008_061_B','008_061_C','008_061_D','008_057_B'),
        c('008_061_B','008_061_D','008_057_B'),
        c('008_061_C','008_057_B'),
        c('008_061_B','008_061_D','008_057_D'),
        c('008_061_B','008_061_C','008_061_D','008_057_C'),
        c('008_061_C','008_061_D','008_057_C'),
        c('008_061_B','008_057_D','008_057_C'),
        c('008_061_A','008_061_B','008_057_B'),
        c('008_061_C','008_061_D','008_057_D','008_057_B'),
        c('008_061_A','008_061_D','008_057_D','008_057_B'),
        c('008_061_C','008_057_D','008_057_B'),
        c('008_061_C','008_061_D','008_057_C','008_057_B'),
        c('008_061_A','008_061_B','008_061_D','008_057_C','008_057_B'),
        c('008_061_B','008_057_C','008_057_B'),
        c('008_061_D','008_057_D','008_057_C','008_057_B'),
        c('008_061_B','008_057_D','008_057_A'),
        c('008_061_B','008_061_C','008_061_D','008_057_C','008_057_A'),
        c('008_061_D','008_057_B','008_057_A'),
        c('008_061_D','008_057_D','008_057_C','008_057_B','008_057_A'),
        c('008_061_C','008_057_D','008_057_C','008_057_B','008_057_A'),
        c('008_061_A','008_057_A'),
        c('008_061_A','008_057_D'),    
        c('008_061_A','008_057_B'),
        c('008_061_A','008_057_D','008_057_C','008_057_B'),
        c('008_061_A','008_057_C'),
        c('008_061_A','008_061_D','008_057_A'),
        c('008_061_B','008_057_A'),
        c('008_061_A','008_057_B','008_057_A'),
        c('008_061_A','008_057_D','008_057_B','008_057_A'),
        c('008_061_A','008_057_D','008_057_C','008_057_B','008_057_A')),
    sort_sets=FALSE,
    sort_intersections=FALSE,
    width_ratio=0.1,
    name = "Overlap between TCR repertoires",
    base_annotations = list(
            'Intersection size'=intersection_size(
                text = list(size = 7, angle = 90, hjust = 0, vjust = 0.5)
            ) + ylab('Number of overlapping TCRs') 
            + ggtitle("Endemic KS - Endemic KS")
            + theme_classic() 
            + labs(tag = "A")
            + theme(axis.title.x = element_blank(),
                plot.title = element_text(size = 30, face = "bold", family = "NimbusSan", hjust = 0.5),
                axis.text.x = element_blank(),
                axis.ticks.x = element_blank(),
                axis.title.y = element_text(size = 24, face = "bold", family = "NimbusSan"),
                axis.text.y = element_text(size = 22, face = "bold", family = "NimbusSan"),
                plot.tag = element_text(size = 28, face = "bold", family = "NimbusSan"))),
    set_sizes=(upset_set_size()+ ylab('Number of\nunique TCRs') 
            + theme_void() 
            + theme(axis.title.y = element_blank(),
                axis.text.y = element_blank(),
                axis.ticks.y = element_blank(),
                axis.title.x = element_blank(),
                axis.text.x = element_text(size = 20, face = "bold", family = "NimbusSan", angle = 90, vjust = 0.5),
                axis.ticks.x = element_line(linewidth = 2, colour = 'black' ))
    )) +
    theme(axis.title.x = element_blank(),
        axis.text.y = element_text(size = 24, face = "bold", family = "NimbusSan"))


In [412]:
# UpSetR plots for epidemic KS comparisons
epidemic_upset_input_tibble <- alluvial_data |>
    filter(str_detect(repertoire_id, "008_021|008_048"))  |>
    left_join(study_metadata, by = "repertoire_id") |>
    dplyr::select(repertoire_id, junction_aa, tissue_type, phenotype) |>
    mutate(present = 1,
        row_id = str_c(repertoire_id, tissue_type, phenotype, sep = " ")) |>
    pivot_wider(id_cols = junction_aa, names_from = repertoire_id, values_from = present, values_fill = 0)

epidemic_upset_rownames <- epidemic_upset_input_tibble |>
    pull(junction_aa)
epidemic_upset_input_table <- epidemic_upset_input_tibble |>
    dplyr::select(-junction_aa) |>
    as.data.frame() 
rownames(epidemic_upset_input_table) <- epidemic_upset_rownames

epidemic_upset <- upset(epidemic_upset_input_tibble, 
    intersect = c("008_021_A", "008_021_B", "008_021_C", "008_021_D", "008_048_D", "008_048_C", "008_048_B", "008_048_A"),
    queries = list(
        upset_query(set = '008_021_A', fill = '#fb8072'),
        upset_query(set = '008_021_B', fill = '#e41a1c'),
        upset_query(set = '008_021_C', fill = '#e41a1c'),
        upset_query(set = '008_021_D', fill = '#e41a1c'),
        upset_query(set = '008_048_A', fill = '#fb8072'),
        upset_query(set = '008_048_B', fill = '#e41a1c'),
        upset_query(set = '008_048_C', fill = '#e41a1c'),
        upset_query(set = '008_048_D', fill = '#e41a1c'),
        upset_query(intersect = c('008_048_A','008_048_B','008_048_C','008_048_D'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_B','008_048_C','008_048_D'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_A','008_048_C','008_048_D'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_C','008_048_D'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_A','008_048_B','008_048_D'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_B','008_048_D'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_A','008_048_D'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_D'), color = '#e41a1c', fill = '#e41a1c', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_A','008_048_B','008_048_C'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_B','008_048_C'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_A','008_048_C'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_C'), color = '#e41a1c', fill = '#e41a1c', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_A','008_048_B'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_B'), color = '#e41a1c', fill = '#e41a1c', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_A'), color = '#fb8072', fill = '#fb8072', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_B','008_048_D','008_021_D'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_D','008_021_D'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_B','008_021_D'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_D'), color = '#e41a1c', fill = '#e41a1c', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_A','008_048_B','008_048_C','008_048_D','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_B','008_048_C','008_048_D','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_C','008_048_D','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_B','008_048_D','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_C','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_B','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_A','008_048_B','008_048_C','008_048_D','008_021_D','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_B','008_048_C','008_048_D','008_021_D','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_B','008_048_D','008_021_D','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_D','008_021_D','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_D','008_021_C'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_C'), color = '#e41a1c', fill = '#e41a1c', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_A','008_048_B','008_048_C','008_048_D','008_021_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_B','008_048_D','008_021_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_D','008_021_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_B','008_021_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_A','008_021_B'), color = '#606060', fill = '#606060', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_D','008_021_D','008_021_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_D','008_021_B'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_D','008_021_C','008_021_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_C','008_021_C','008_021_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_D','008_021_C','008_021_B'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_C','008_021_B'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_B'), color = '#e41a1c', fill = '#e41a1c', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_D','008_021_A'), color = '#606060', fill = '#606060', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_B','008_021_A'), color = '#606060', fill = '#606060', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_D','008_021_D','008_021_A'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_D','008_021_A'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_D','008_021_C','008_021_A'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_C','008_021_A'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_D','008_021_B','008_021_A'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_D','008_021_C','008_021_B','008_021_A'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_A','008_048_C','008_048_D','008_021_D','008_021_C','008_021_B','008_021_A'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_D','008_021_D','008_021_C','008_021_B','008_021_A'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_B','008_021_D','008_021_C','008_021_B','008_021_A'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_048_A','008_021_D','008_021_C','008_021_B','008_021_A'), color = '#606060', fill = '#606060', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_D','008_021_C','008_021_B','008_021_A'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_C','008_021_B','008_021_A'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_B','008_021_A'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_A'), color = '#fb8072', fill = '#fb8072', only_components=c('intersections_matrix', 'Intersection size'))
    ),
    intersections = list(
        '008_048_B','008_048_D','008_021_C','008_021_D',
        '008_048_A','008_021_B','008_048_C','008_021_A',
        c('008_048_B','008_048_D'),
        c('008_048_B','008_048_C','008_048_D'),
        c('008_048_C','008_048_D'),
        c('008_048_B','008_048_C'),
        c('008_021_D','008_021_C'),
        c('008_021_D','008_021_C','008_021_B'),
        c('008_021_C','008_021_B'),
        c('008_021_D','008_021_B'),
        c('008_048_A','008_048_B','008_048_C','008_048_D'),
        c('008_048_A','008_048_B','008_048_D'),
        c('008_048_A','008_048_B'),
        c('008_048_A','008_048_D'),
        c('008_048_A','008_048_B','008_048_C'),
        c('008_048_A','008_048_C','008_048_D'),
        c('008_048_A','008_048_C'),
        c('008_021_D','008_021_C','008_021_B','008_021_A'),
        c('008_021_D','008_021_C','008_021_A'),
        c('008_021_B','008_021_A'),
        c('008_021_C','008_021_A'),
        c('008_021_C','008_021_B','008_021_A'),
        c('008_021_D','008_021_B','008_021_A'),
        c('008_021_D','008_021_A'),
        c('008_048_B','008_021_C'),
        c('008_048_D','008_021_D'),
        c('008_048_B','008_021_D','008_021_C','008_021_B','008_021_A'),
        c('008_048_D','008_021_B'),
        c('008_048_A','008_048_B','008_048_C','008_048_D','008_021_D','008_021_C'),
        c('008_048_B','008_048_D','008_021_C'),
        c('008_048_B','008_048_D','008_021_B'),
        c('008_048_B','008_021_B'),
        c('008_048_B','008_048_D','008_021_D'),
        c('008_048_B','008_021_D'),
        c('008_048_A','008_048_B','008_048_C','008_048_D','008_021_C'),
        c('008_048_B','008_048_C','008_048_D','008_021_C'),
        c('008_048_C','008_048_D','008_021_C'),
        c('008_048_C','008_021_C'),
        c('008_048_B','008_048_C','008_048_D','008_021_D','008_021_C'),
        c('008_048_B','008_048_D','008_021_D','008_021_C'),
        c('008_048_D','008_021_D','008_021_C'),
        c('008_048_A','008_048_B','008_048_C','008_048_D','008_021_B'),
        c('008_048_D','008_021_D','008_021_B'),
        c('008_048_D','008_021_C','008_021_B'),
        c('008_048_C','008_021_C','008_021_B'),
        c('008_048_D','008_021_D','008_021_A'),
        c('008_048_D','008_021_C','008_021_B','008_021_A'),
        c('008_048_A','008_048_C','008_048_D','008_021_D','008_021_C','008_021_B','008_021_A'),
        c('008_048_D','008_021_D','008_021_C','008_021_B','008_021_A'),
        c('008_048_A','008_021_D','008_021_C','008_021_B','008_021_A'),
        c('008_048_A','008_021_B'),
        c('008_048_D','008_021_A'),
        c('008_048_B','008_021_A')),
    sort_sets=FALSE,
    sort_intersections=FALSE,
    width_ratio=0.1,
    name = "Overlap between TCR repertoires",
    base_annotations = list(
            'Intersection size'=intersection_size(
                text = list(size = 7, angle = 90, hjust = 0, vjust = 0.5)
            ) + ylab('Number of overlapping TCRs') 
            + ggtitle("Epidemic KS - Epidemic KS")
            + theme_classic() 
            + labs(tag = "B")
            + theme(axis.title.x = element_blank(),
                plot.title = element_text(size = 30, face = "bold", family = "NimbusSan", hjust = 0.5),
                axis.text.x = element_blank(),
                axis.ticks.x = element_blank(),
                axis.title.y = element_text(size = 24, face = "bold", family = "NimbusSan"),
                axis.text.y = element_text(size = 22, face = "bold", family = "NimbusSan"),
                plot.tag = element_text(size = 28, face = "bold", family = "NimbusSan"))),
    set_sizes=(upset_set_size()+ ylab('Number of\nunique TCRs') 
            + theme_void() 
            + theme(axis.title.y = element_blank(),
                axis.text.y = element_blank(),
                axis.ticks.y = element_blank(),
                axis.title.x = element_blank(),
                axis.text.x = element_text(size = 20, face = "bold", family = "NimbusSan", angle = 90, vjust = 0.5),
                axis.ticks.x = element_line(linewidth = 2, colour = 'black' ))
    )) +
    theme(axis.title.x = element_blank(),
        axis.text.y = element_text(size = 24, face = "bold", family = "NimbusSan"))



In [417]:
epiend_upset_input_tibble <- alluvial_data |>
    filter(str_detect(repertoire_id, "008_021|008_057"))  |>
    left_join(study_metadata, by = "repertoire_id") |>
    dplyr::select(repertoire_id, junction_aa, tissue_type, phenotype) |>
    mutate(present = 1,
        row_id = str_c(repertoire_id, tissue_type, phenotype, sep = " ")) |>
    pivot_wider(id_cols = junction_aa, names_from = repertoire_id, values_from = present, values_fill = 0)

epiend_upset_rownames <- epiend_upset_input_tibble |>
    pull(junction_aa)
epiend_upset_input_table <- epiend_upset_input_tibble |>
    dplyr::select(-junction_aa) |>
    as.data.frame() 
rownames(epiend_upset_input_table) <- epiend_upset_rownames

# UpSetR plots for epidemic - endemic KS data comparisons
epiend_upset_data <- upset_data(epiend_upset_input_table, 
    intersect = c("008_021_A", "008_021_B", "008_021_C", "008_021_D", "008_057_D", "008_057_C", "008_057_B", "008_057_A"))

combined_pal <- c("Epidemic KS - NAT" = "#fb8072", 
  "Epidemic KS - Tumor" = "#e41a1c", "Endemic KS - NAT" = "#80b1d3",
  "Endemic KS - Tumor" = "#377eb8", "Inter-subject\nNAT - NAT" = "#1b9e77", 
  "Intra-subject\nNAT - Tumor" = "#d95f02",
  "Intra-subject\nTumor - Tumor" = "#7570b3", 
  "Inter-subject\nTumor - Tumor" = "#e7298a",
  "Other" = "#606060")
epiend_pal_table <- epiend_upset_data$plot_intersections_subset |>
    as_tibble() |> 
    dplyr::rename(intersections = value) |>
    mutate(group = case_when(intersections == "008_021_A" ~ "Epidemic KS - NAT",
        intersections == "008_057_A" ~ "Endemic KS - NAT",
        intersections %in% c("008_057_B","008_057_C","008_057_D")~ "Endemic KS - Tumor",
        intersections %in% c("008_021_B","008_021_C","008_021_D")~ "Epidemic KS - Tumor",
        str_detect(intersections, "008_021") & !str_detect(intersections,"008_057") & str_detect(intersections,"A") & str_detect(intersections,'-') ~ "Intra-subject\nNAT - Tumor",
        !str_detect(intersections, "008_021") & str_detect(intersections,"008_057") & str_detect(intersections,"A") & str_detect(intersections,'[BCD]') ~ "Intra-subject\nNAT - Tumor",
        str_detect(intersections, "008_021") & !str_detect(intersections,"008_057") & !str_detect(intersections,"A") & str_detect(intersections,'[BCD]') ~ "Intra-subject\nTumor - Tumor",
        !str_detect(intersections, "008_021") & str_detect(intersections,"008_057") & !str_detect(intersections,"A") & str_detect(intersections,'[BCD]') ~ "Intra-subject\nTumor - Tumor",
        str_detect(intersections, "008_021") & str_detect(intersections,"008_057") & str_detect(intersections,"008_021_B|008_021_C|008_021_D") & str_detect(intersections,"008_057_B|008_057_C|008_057_D") ~ "Inter-subject\nTumor - Tumor",
        intersections == "008_021_A-008_057_A" ~ "Inter-subject\nNAT - NAT",
        str_detect(intersections,"008_021") & str_detect(intersections,"008_057") & !str_detect(intersections,"008_021_B") & !str_detect(intersections,"008_021_C") & !str_detect(intersections,"008_021_D") ~ "Other",
        str_detect(intersections,"008_021") & str_detect(intersections,"008_057") & !str_detect(intersections,"008_057_B") & !str_detect(intersections,"008_057_C") & !str_detect(intersections,"008_057_D") ~ "Other"),
        color = combined_pal[group])
epiend_pal <- epiend_pal_table |>
  pull(color) 
names(epiend_pal) <- epiend_pal_table |>
  pull(intersections) 

In [418]:

epiend_upset <- upset(epiend_upset_input_table, 
    intersect = c("008_021_A", "008_021_B", "008_021_C", "008_021_D", "008_057_D", "008_057_C", "008_057_B", "008_057_A"),
    queries = list(
        upset_query(set = '008_021_A', fill = '#fb8072'),
        upset_query(set = '008_021_B', fill = '#e41a1c'),
        upset_query(set = '008_021_C', fill = '#e41a1c'),
        upset_query(set = '008_021_D', fill = '#e41a1c'),
        upset_query(set = '008_057_A', fill = '#80b1d3'),
        upset_query(set = '008_057_B', fill = '#377eb8'),
        upset_query(set = '008_057_C', fill = '#377eb8'),
        upset_query(set = '008_057_D', fill = '#377eb8'),
        upset_query(intersect = c('008_057_A','008_057_B','008_057_C','008_057_D'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_B','008_057_C','008_057_D'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_A','008_057_C','008_057_D'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_C','008_057_D'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_A','008_057_B','008_057_D'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_B','008_057_D'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_A','008_057_D'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_D'), color = '#377eb8', fill = '#377eb8', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_A','008_057_B','008_057_C'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_B','008_057_C'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_A','008_057_C'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_C'), color = '#377eb8', fill = '#377eb8', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_A','008_057_B'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_B'), color = '#377eb8', fill = '#377eb8', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_A'), color = '#80b1d3', fill = '#80b1d3', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_B','008_057_C','008_057_D','008_021_D'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_D','008_021_D'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_C','008_021_D'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_B','008_021_D'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_A','008_021_D'), color = '#606060', fill = '#606060', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_D'), color = '#e41a1c', fill = '#e41a1c', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_C','008_057_D','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_A','008_057_B','008_057_D','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_B','008_057_D','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_D','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_B','008_057_C','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_C','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_B','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_A','008_021_C'), color = '#606060', fill = '#606060', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_B','008_057_D','008_021_D','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_D','008_021_D','008_021_C'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_D','008_021_C'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_C'), color = '#e41a1c', fill = '#e41a1c', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_B','008_057_D','008_021_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_D','008_021_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_B','008_021_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_C','008_021_D','008_021_B'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_D','008_021_B'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_A','008_021_C','008_021_B'), color = '#606060', fill = '#606060', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_D','008_021_C','008_021_B'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_C','008_021_B'), color = '#7570b3', fill = '#7570b3', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_B'), color = '#e41a1c', fill = '#e41a1c', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_D','008_021_A'), color = '#606060', fill = '#606060', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_D','008_021_A'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_D','008_021_C','008_021_A'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_C','008_021_A'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_D','008_021_B','008_021_A'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_A','008_057_B','008_057_C','008_057_D','008_021_D','008_021_C','008_021_B','008_021_A'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_A','008_057_B','008_057_D','008_021_D','008_021_C','008_021_B','008_021_A'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_D','008_021_D','008_021_C','008_021_B','008_021_A'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_057_B','008_021_D','008_021_C','008_021_B','008_021_A'), color = '#e7298a', fill = '#e7298a', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_D','008_021_C','008_021_B','008_021_A'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_C','008_021_B','008_021_A'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_B','008_021_A'), color = '#d95f02', fill = '#d95f02', only_components=c('intersections_matrix', 'Intersection size')),
        upset_query(intersect = c('008_021_A'), color = '#fb8072', fill = '#fb8072', only_components=c('intersections_matrix', 'Intersection size'))),
    intersections = list(
        '008_057_B','008_057_D','008_057_C','008_057_A',
        '008_021_C','008_021_D','008_021_B','008_021_A',
        c('008_057_B','008_057_D'),
        c('008_057_B','008_057_C','008_057_D'),
        c('008_057_B','008_057_C'),
        c('008_057_C','008_057_D'),
        c('008_021_D','008_021_C'),
        c('008_021_D','008_021_C','008_021_B'),
        c('008_021_C','008_021_B'),
        c('008_021_D','008_021_B'),
        c('008_057_A','008_057_B','008_057_C','008_057_D'),
        c('008_057_A','008_057_D'),
        c('008_057_A','008_057_B'),
        c('008_057_A','008_057_C'),
        c('008_057_A','008_057_B','008_057_D'),
        c('008_057_A','008_057_C','008_057_D'),
        c('008_057_A','008_057_B','008_057_C'),
        c('008_021_D','008_021_C','008_021_B','008_021_A'),
        c('008_021_D','008_021_C','008_021_A'),
        c('008_021_B','008_021_A'),
        c('008_021_C','008_021_A'),
        c('008_021_C','008_021_B','008_021_A'),
        c('008_021_D','008_021_A'),
        c('008_021_D','008_021_B','008_021_A'),
        c('008_057_B','008_057_D','008_021_C'),
        c('008_057_B','008_021_C'),
        c('008_057_D','008_021_D'),
        c('008_057_B','008_021_D'),
        c('008_057_D','008_021_C'),
        c('008_057_D','008_021_B'),
        c('008_057_B','008_021_B'),
        c('008_057_B','008_021_D','008_021_C','008_021_B','008_021_A'),
        c('008_057_B','008_057_C','008_057_D','008_021_D'),
        c('008_057_C','008_021_D'),
        c('008_057_C','008_057_D','008_021_C'),
        c('008_057_A','008_057_B','008_057_D','008_021_C'),
        c('008_057_B','008_057_C','008_021_C'),
        c('008_057_C','008_021_C'),
        c('008_057_B','008_057_D','008_021_D','008_021_C'),
        c('008_057_D','008_021_D','008_021_C'),
        c('008_057_B','008_057_D','008_021_B'),
        c('008_057_C','008_021_D','008_021_B'),
        c('008_057_A','008_057_B','008_057_C','008_057_D','008_021_D','008_021_C','008_021_B','008_021_A'),
        c('008_057_A','008_057_B','008_057_D','008_021_D','008_021_C','008_021_B','008_021_A'),
        c('008_057_D','008_021_D','008_021_C','008_021_B','008_021_A'),
        c('008_057_A','008_021_D'),
        c('008_057_A','008_021_C'),
        c('008_057_A','008_021_C','008_021_B'),
        c('008_057_D','008_021_A')

    ),
    sort_sets=FALSE,
    sort_intersections=FALSE,
    width_ratio=0.1,
    name = "Overlap between TCR repertoires",
    base_annotations = list(
            'Intersection size'=intersection_size(
                aes(fill = intersection),
                text = list(size = 7, angle = 90, hjust = 0, vjust = 0.5)
            ) + ylab('Number of overlapping TCRs') 
            + theme_classic() 
            + ggtitle("Endemic KS - Epidemic KS") 
            + labs(tag = "C") 
            + scale_fill_manual(values = epiend_pal)
            + theme(axis.title.x = element_blank(),
                plot.title = element_text(size = 30, face = "bold", family = "NimbusSan", hjust = 0.5),
                axis.text.x = element_blank(),
                axis.ticks.x = element_blank(),
                axis.title.y = element_text(size = 24, face = "bold", family = "NimbusSan"),
                axis.text.y = element_text(size = 22, face = "bold", family = "NimbusSan"),
                plot.tag = element_text(size = 28, face = "bold", family = "NimbusSan"),
                legend.position = "right")),
    set_sizes=(upset_set_size()+ ylab('Number of\nunique TCRs') 
            + theme_void() 
            + theme(axis.title.y = element_blank(),
                axis.text.y = element_blank(),
                axis.ticks.y = element_blank(),
                axis.title.x = element_text(size = 24, face = "bold", family = "NimbusSan"),
                axis.text.x = element_text(size = 20, face = "bold", family = "NimbusSan", angle = 90, vjust = 0.5),
                axis.ticks.x = element_line(linewidth = 2, colour = 'black' ))
    ), guides = "collect") +
    scale_fill_manual(values = epiend_pal) +
    theme(axis.title.x = element_text(size = 24, face = "bold", family = "NimbusSan"),
        axis.text.y = element_text(size = 24, face = "bold", family = "NimbusSan")) 


In [419]:
supplementary_figure_seven <- endemic_upset / epidemic_upset / epiend_upset + plot_layout(guides = "collect")  & theme(text = element_text('NimbusSan'))

ggsave(str_c(figures_path, "pdf", "Supplementary_figure_seven.pdf", sep = "/"), supplementary_figure_seven, width = 32, 
    height = 30, device = cairo_pdf, family = "Arial Unicode MS")
ggsave(str_c(figures_path, "png", "Supplementary_figure_seven.png", sep = "/"), supplementary_figure_seven, width = 32, 
    height = 30)
ggsave(str_c(figures_path, "svg", "Supplementary_figure_seven.svg", sep = "/"), supplementary_figure_seven, width = 32, 
    height = 30)


## Section 4: Temporal overlap between KS tumors


In [327]:
temporal_map_table <- study_metadata |>
    filter(patient_id %in% c("008_141", "008_008")) |>
    dplyr::select(patient_id, repertoire_id, phenotype, tissue_type, tumor_code, visit_code) |>
    mutate(visit_code = str_c("Visit", str_extract(visit_code, "\\d+"), sep = " "),
    label = if_else(tissue_type == "NAT", 
        str_c(tissue_type, "\n", visit_code, sep = ""),
        str_c(tissue_type, " ", tumor_code, "\n", visit_code, sep = ""))) |>
    dplyr::select(repertoire_id, label)
temporal_map_dict <- temporal_map_table |>
    pull(label)
names(temporal_map_dict) <- temporal_map_table |>
    pull(repertoire_id)

### Figure 4:


In [462]:
temopral_example_table <- study_annotated_nprod_table |>
  filter(patient_id %in% c("008_141", "008_008")) |>
  productiveSeq(aggregate = "junction_aa") 

temporal_overlap_samples <- study_metadata |> 
  filter(tissue_type %in% c( "Tumor") & 
    str_detect(repertoire_id, "008_\\d+")) |>  
  group_by(patient_id, hiv_status, visit_code) |> 
  summarize(count = length(unique(repertoire_id))) |> 
  pivot_wider(names_from = visit_code, values_from = count, values_fill = 0) |> 
  filter(V01 != 0 & (V05 != 0 | V08 != 0)) |>
  pull(patient_id)

visit_map <- study_metadata |>
  filter(tissue_type %in% c( "Tumor") & 
    str_detect(repertoire_id, "008_\\d+")) |>
  pull(visit_code)
names(visit_map) <- study_metadata |>
  filter(tissue_type %in% c( "Tumor") & 
    str_detect(repertoire_id, "008_\\d+")) |>
  pull(trb_repertoire_id)

endemic_ug <- temopral_example_table |> 
  filter(junction_aa %in% kstme_clustered_unknown_table$junction_aa & 
    str_detect(repertoire_id, "008_141")) |>
  mutate(color = "#377eb8") |>
  pull(color) 

endemic_alist <- temopral_example_table |> 
  filter(junction_aa %in% kstme_clustered_unknown_table$junction_aa & 
    str_detect(repertoire_id, "008_141")) |>
  mutate(color = "#377eb8") |>
  pull(junction_aa) 
names(endemic_ug) <- temopral_example_table |> 
  filter(junction_aa %in% kstme_clustered_unknown_table$junction_aa & 
    str_detect(repertoire_id, "008_141")) |>
  mutate(color = "#377eb8") |>
  pull(junction_aa) 

endemic_alluvial <- temopral_example_table |>
  filter(str_detect(repertoire_id, "008_141")) |>
  topSeqs(top = 500) |>
  cloneTrack() |>
  plotTrack(alist = endemic_alist, apal = endemic_ug) +
  scale_x_discrete(labels = temporal_map_dict) +
  labs(x = "Sample") + 
  theme_classic(base_size = 30) +
  theme(legend.position = "none", 
    axis.text.x = element_text(face = "bold", color = "black"))

epidemic_ug <- temopral_example_table |> 
  filter(junction_aa %in% kstme_clustered_unknown_table$junction_aa & 
    str_detect(repertoire_id, "008_008")) |>
  mutate(color = "#e41a1c") |>
  pull(color) 
epidemic_alist <- temopral_example_table |> 
  filter(junction_aa %in% kstme_clustered_unknown_table$junction_aa & 
    str_detect(repertoire_id, "008_008")) |>
  mutate(color = "#e41a1c") |>
  pull(junction_aa) 
names(epidemic_ug) <- temopral_example_table |> 
  filter(junction_aa %in% kstme_clustered_unknown_table$junction_aa & 
    str_detect(repertoire_id, "008_008")) |>
  mutate(color = "#e41a1c") |>
  pull(junction_aa) 

epidemic_alluvial <- temopral_example_table |>
  filter(str_detect(repertoire_id, "008_008")) |>
  topSeqs(top = 500) |>
  cloneTrack() |>
  plotTrack(alist = epidemic_alist, apal = epidemic_ug) +
  scale_x_discrete(labels = temporal_map_dict) +
  labs(x = "Sample") + 
  theme_classic(base_size = 30)  +
  theme(legend.position = "none",
   axis.text.x = element_text(face = "bold", color = "black"))

figure_four_layout <- "
11112222
11112222
11112222"
figure_four <- (endemic_alluvial + epidemic_alluvial +plot_layout(guides = "auto"))  +
  plot_layout(design = figure_four_layout) +
  plot_annotation(tag_levels = 'A') & theme(text = element_text('NimbusSan', face = "bold"))
ggsave(str_c(figures_path, "pdf", "Figure_four.pdf", sep = "/"), figure_four, 
  width = 30, height = 30)
ggsave(str_c(figures_path, "svg", "Figure_four.svg", sep = "/"), figure_four, 
  width = 30, height = 30)
ggsave(str_c(figures_path, "png", "Figure_four.png", sep = "/"), figure_four, 
  width =  30, height = 30)


Subsetting productive sequences [=>----------]  14% eta:  3s

Subsetting productive sequences [==>---------]  21% eta:  3s

Subsetting productive sequences [==>---------]  29% eta:  3s

Subsetting productive sequences [===>--------]  36% eta: 15s

Subsetting productive sequences [====>-------]  43% eta: 12s

Subsetting productive sequences [=====>------]  50% eta:  9s











## Section 5: Temporal sharing of TCRs in single-cell sequencing data

### Load 10X PBMC dataset

In [332]:
pbmc_objects <- list.files(path = seurat_input_path,
  pattern = "V\\d+", full.names = TRUE) |>
  map(LoadH5Seurat)|>
  map(~subset(.x, subset = discard == 0 & scDblFinder.class == "singlet")) |>
  map(NormalizeData) |>
  map(~FindVariableFeatures(.x, selection.method = "vst", nfeatures = 2000)) 
  
pbmc_features <- SelectIntegrationFeatures(object.list = pbmc_objects)

pbmc_objects <- pbmc_objects |>
  map(~ScaleData(.x, features = pbmc_features)) |>
  map(~RunPCA(.x, features = pbmc_features))

pbmc_anchors <- FindIntegrationAnchors(object.list = pbmc_objects, 
  anchor.features = pbmc_features, reduction = "rpca")
pbmc_integrated <- IntegrateData(anchorset = pbmc_anchors)
DefaultAssay(pbmc_integrated) <- "integrated"
pbmc_integrated <- ScaleData(pbmc_integrated, verbose = FALSE)
pbmc_integrated <- RunPCA(pbmc_integrated, npcs = 30, verbose = FALSE)
pbmc_integrated <- RunUMAP(pbmc_integrated, reduction = "pca", dims = 1:30)
pbmc_integrated <- RunTSNE(pbmc_integrated, reduction = "pca", dims = 1:30)
pbmc_integrated <- FindNeighbors(pbmc_integrated, reduction = "pca", dims = 1:30)
pbmc_integrated <- FindClusters(pbmc_integrated, resolution = 0.5)
DefaultAssay(pbmc_integrated) <- "RNA"

Validating h5Seurat file



Initializing RNA with data

Adding counts for RNA

Adding feature-level metadata for RNA

Adding miscellaneous information for RNA

Adding reduction PCA

Adding cell embeddings for PCA

Adding miscellaneous information for PCA

Adding reduction TSNE

Adding cell embeddings for TSNE

Adding miscellaneous information for TSNE

Adding command information

Adding cell-level metadata

Adding miscellaneous information

Adding tool-specific results

Validating h5Seurat file

Initializing RNA with data

Adding counts for RNA

Adding feature-level metadata for RNA

Adding miscellaneous information for RNA

Adding reduction PCA

Adding cell embeddings for PCA

Adding miscellaneous information for PCA

Adding reduction TSNE

Adding cell embeddings for TSNE

Adding miscellaneous information for TSNE

Adding command information

Adding cell-level metadata

Adding miscellaneous information

Adding tool-specific results

Validating h5Seurat file

Initializing RNA with data

Adding counts for RNA

Add

Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 246521
Number of edges: 9418092

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9311
Number of communities: 25
Elapsed time: 167 seconds


In [333]:
pbsample_table <- pbmc_integrated@meta.data |>
  as_tibble() |>
  dplyr::select(Sample) |>
  distinct() 

pbcohort_table <- read_csv(kstme_metadata_path) |> 
  filter(!is.na(gex_library)) |> 
  dplyr::select(patient_id, repertoire_id, hiv_status,response) |> 
  mutate(cohort = if_else(hiv_status == "Positive", "Epidemic KS", "Endemic KS")) |> 
  dplyr::select(patient_id, repertoire_id, cohort, response) |>
  filter(repertoire_id %in% pbsample_table$Sample) |>
  dplyr::select(repertoire_id, cohort, response) |>
  mutate(cohort = if_else(is.na(cohort), "Epidemic KS", cohort),
    response = factor(response, levels = c("CR", "PR" ,"SD", "PD")))

cohort_dict <- pbcohort_table |>
  pull(cohort) 
names(cohort_dict) <- pbcohort_table |>
  pull(repertoire_id)

pbmc_integrated@meta.data$cohort <- cohort_dict[pbmc_integrated@meta.data$Sample]
pbmc_tcells <- pbmc_integrated@meta.data |>
  as_tibble() |>
  mutate(isT = if_else(CTaa == "NA", "Other", "Tcell")) |>
  pull(isT)
pbmc_integrated@meta.data$is_tcell <- pbmc_tcells


[1mRows: [22m[34m682[39m [1mColumns: [22m[34m42[39m
[36m──[39m [1mColumn specification[22m [36m─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (38): cohort, patient_id, repertoire_id, tra_repertoire_id, trb_repertoi...
[32mdbl[39m  (4): sc_library_count, lession_number, age, fudy

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [338]:
#Convert all ensemble IDs to Gene symbols

feature_table <- pbmc_integrated[["RNA"]][[]]
feature_table$ensembl_id <- rownames(feature_table)
master_gene_table <- feature_table |>
  dplyr::mutate(symbol = mapIds(org.Hs.eg.db, keys = feature_table$ensembl_id, keytype = "ENSEMBL", column = "SYMBOL")) |>
  filter(!is.na(symbol)) |>
  group_by(symbol) |>
  slice_head(n =1) |>
  ungroup() 
master_gene_dict <- master_gene_table |>
  pull(symbol)
names(master_gene_dict) <- master_gene_table |>
  pull(ensembl_id)
pbmc_gs <- pbmc_integrated[rownames(pbmc_integrated) %in% master_gene_table$ensembl_id, ]
pbmc_gsc <- as.SingleCellExperiment(pbmc_integrated)
pbmc_gsc <- pbmc_gsc[rownames(pbmc_gsc) %in% master_gene_table$ensembl_id, ]
pbmc_gsc <- scuttle::logNormCounts(pbmc_gsc) 
rownames(pbmc_gsc) <- master_gene_dict[rownames(pbmc_gsc)]
pbmc_gs <- CreateSeuratObject(counts = assay(pbmc_gsc, "counts"), data = assay(pbmc_gsc, "logcounts"))
pbmc_gs@meta.data <- colData(pbmc_gsc) |> 
  as.data.frame()
pbmc_gs <- NormalizeData(pbmc_gs)
pbmc_gs <- FindVariableFeatures(pbmc_gs, selection.method = "vst", nfeatures = 2000)
pbmc_gs <- ScaleData(pbmc_gs, verbose = FALSE)
pbmc_gs <- RunPCA(pbmc_gs, npcs = 30, verbose = FALSE)
pbmc_gs <- RunUMAP(pbmc_gs, reduction = "pca", dims = 1:30)
pbmc_gs <- RunTSNE(pbmc_gs, reduction = "pca", dims = 1:30)
pbmc_gs <- FindNeighbors(pbmc_gs, reduction = "pca", dims = 1:30)
pbmc_gs <- FindClusters(pbmc_gs, resolution = 0.5)
DefaultAssay(pbmc_gs) <- "RNA"

'select()' returned 1:many mapping between keys and columns

Normalizing layer: counts

Finding variable features for layer counts

14:47:35 UMAP embedding parameters a = 0.9922 b = 1.112

Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'

Also defined by ‘spam’

14:47:35 Read 246521 rows and found 30 numeric columns

14:47:35 Using Annoy for neighbor search, n_neighbors = 30

Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'

Also defined by ‘spam’

14:47:35 Building Annoy index with metric = cosine, n_trees = 50

0%   10   20   30   40   50   60   70   80   90   100%

[----|----|----|----|----|----|----|----|----|----|

*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
|

14:47:59 Writing NN index file to temp file /loc/scratch/33598905/RtmpftM7Ky/file43f76579f75

14:47:59 Searching Annoy index using 1 thread, search_k = 3000

14:49:32 Annoy recall = 100%


Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 246521
Number of edges: 6572981

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9421
Number of communities: 38
Elapsed time: 119 seconds


10 singletons identified. 28 final clusters.



In [340]:
#Azimuth classification
pbmc_gs <- SCTransform(pbmc_gs, verbose = FALSE)
reference <- LoadH5Seurat(azimuth_reference_path)

anchors <- FindTransferAnchors(
  reference = reference,
  query = pbmc_gs,
  normalization.method = "SCT",
  reference.reduction = "spca",
  dims = 1:50
)
              
pbmc_gs <-  MapQuery(
  anchorset = anchors,
  query = pbmc_gs,
  reference = reference,
  refdata = list(
    celltype.l1 = "celltype.l1",
    celltype.l2 = "celltype.l2",
    predicted_ADT = "ADT"
  ),
  reference.reduction = "spca", 
  reduction.model = "wnn.umap"
)

pbmc_gs$cloneType <- pbmc_gs@meta.data |> 
  as_tibble() |>
  mutate(cloneType = if_else(cloneType == "NA", NA_character_, cloneType)) |>
  pull(cloneType)

clones <- pbmc_gs@meta.data |> 
  as_tibble(rownames = "cell_id") |>
  filter(!is.na(cloneType)) |>
  pull(cell_id)
DefaultAssay(pbmc_gs) <- "RNA"

Validating h5Seurat file

Initializing ADT with data

Adding counts for ADT

Adding variable feature information for ADT

Adding miscellaneous information for ADT

Initializing SCT with data

Adding counts for SCT

Adding variable feature information for SCT

Adding miscellaneous information for SCT

Adding reduction apca

Adding cell embeddings for apca

Adding feature loadings for apca

Adding miscellaneous information for apca

Adding reduction aumap

Adding cell embeddings for aumap

Adding miscellaneous information for aumap

Adding reduction pca

Adding cell embeddings for pca

Adding feature loadings for pca

Adding miscellaneous information for pca

Adding reduction spca

Adding cell embeddings for spca

Adding feature loadings for spca

Adding miscellaneous information for spca

Adding reduction umap

Adding cell embeddings for umap

Adding miscellaneous information for umap

Adding reduction wnn.umap

Adding cell embeddings for wnn.umap

Adding miscellaneous information for w

### Figure 5:

In [421]:
pbmc_gs$celltype <- pbmc_gs@meta.data |>
  as_tibble() |>
  mutate(celltype = case_when(
    predicted.celltype.l2 %in% c("ASDC", "cDC1", "cDC2", "pDC") ~ "DC",
    predicted.celltype.l2 %in% c("B intermediate", "B memory", "B naive") ~ "B cell",
    predicted.celltype.l2 %in% c("Doublet", "HSPC", "ILC", "Plasmablast", "Platelet") ~ "Other",
    TRUE ~ predicted.celltype.l2
  )) |>
  pull(celltype)

palette <- c("B cell" = "#e41a1c", "CD14 Mono" = "#377eb8", "CD16 Mono" = "#a6cee3", 
  "CD4 CTL" = "#ff7f00", "CD4 Naive" = "#fdbf6f", 
  "CD4 Proliferating" = "#fdae6b", "CD4 TCM" = "#a63603", "CD4 TEM" = "#f16913", 
  "CD8 Naive"= "#984ea3", 
  "CD8 Proliferating" = "#bcbddc", "CD8 TCM" = "#3f007d", "CD8 TEM" = "#807dba", "DC" = "#999999", 
  "dnT" = "#ffff33", "MAIT" = "#ffeda0", "gdT" = "#a65628", "NK" = "#f781bf",
  "NK Proliferating", "NK_CD56bright", "Treg" = "#4daf4a", "Other" = "grey")


celltype_umap <- DimPlot(pbmc_gs, reduction = "ref.umap", raster = FALSE, 
  group.by = "celltype", label = TRUE, cols = palette,
  label.size = 8 ,repel = TRUE) + theme_classic(base_size = 24) +
  guides(fill = guide_legend(override.aes = list(size = 16))) +
  theme(plot.title = element_blank())


persample_alluvial <- function(celltype_table) {
  ptid <- celltype_table |>
    pull(patient_id) |>
    unique()
  palette <- c("B cell" = "#e41a1c", "CD14 Mono" = "#377eb8", "CD16 Mono" = "#a6cee3", 
  "CD4 CTL" = "#ff7f00", "CD4 Naive" = "#fdbf6f", 
  "CD4 Proliferating" = "#fdae6b", "CD4 TCM" = "#a63603", "CD4 TEM" = "#f16913", 
  "CD8 Naive"= "#984ea3",
  "CD8 Proliferating" = "#bcbddc", "CD8 TCM" = "#3f007d", "CD8 TEM" = "#807dba", "DC" = "#999999", 
  "dnT" = "#ffff33", "MAIT" = "#ffeda0", "gdT" = "#a65628", "NK" = "#f781bf",
  "NK Proliferating", "NK_CD56bright", "Treg" = "#4daf4a", "Other" = "grey")
  celltype_table <- celltype_table |>
    mutate(Sample = str_extract(Sample, "V\\d+$"))
  celltype_alluvial <-  ggplot2::ggplot(
  celltype_table,
  aes(
      x = Sample, y = duplicate_frequency, stratum = celltype,
      alluvium = celltype, fill = celltype, 
      label = celltype
    )
  ) +
  ggalluvial::geom_alluvium() +
  ggalluvial::geom_stratum(ggplot2::aes(y = duplicate_frequency)) +
  ggplot2::xlab("Repertoire ID") +
  ggplot2::ylab("Frequency of cell type") +
  ggplot2::ggtitle(ptid) +
  ggplot2::scale_fill_manual(values = palette) +
  ggplot2::theme_classic(base_size =  24) +
  ggplot2::theme(
    legend.position = "none",
    axis.text.x = ggplot2::element_text(angle = -90)
  )
  return(celltype_alluvial)
}

celltype_table <- pbmc_gs@meta.data |>
  as_tibble() |>
  mutate(celltype = case_when(
    predicted.celltype.l2 %in% c("ASDC", "cDC1", "cDC2", "pDC") ~ "DC",
    predicted.celltype.l2 %in% c("B intermediate", "B memory", "B naive") ~ "B cell",
    predicted.celltype.l2 %in% c("Doublet", "HSPC", "ILC", "Plasmablast", "Platelet", "Eryth") ~ "Other",
    TRUE ~ predicted.celltype.l2
  )) |>
  group_by(Sample, celltype) |>
  summarize(duplicate_count = length(unique(Barcode))) |>
  group_by(Sample) |>
  mutate(duplicate_frequency = (duplicate_count*100) / sum(duplicate_count)) |>
  ungroup() |>
  mutate(patient_id = str_extract(Sample, "008_\\d+"))

celltype_alluvials <- celltype_table |>
  group_by(patient_id) |>
  group_split() |>
  map(persample_alluvial)

figure_one_layout <- "000
000
000
123
456
789"

figure_five <- celltype_umap + celltype_alluvials[[1]] +
  celltype_alluvials[[2]] + celltype_alluvials[[3]] +
  celltype_alluvials[[4]] + celltype_alluvials[[5]] +
  celltype_alluvials[[6]] + celltype_alluvials[[7]] +
  celltype_alluvials[[8]] + celltype_alluvials[[9]] +
  plot_layout(design = figure_one_layout, guides = "collect") +
  plot_annotation(tag_levels = 'A') & theme(text = element_text('NimbusSan'))


ggsave(str_c(figures_path, "pdf", "Figure_five.pdf", sep = "/"), figure_five, 
  width = 20, height = 30, device = cairo_pdf, family = "Arial Unicode MS")
ggsave(str_c(figures_path, "svg", "Figure_five.svg", sep = "/"), figure_five, 
  width = 20, height = 30)
ggsave(str_c(figures_path, "png", "Figure_five.png", sep = "/"), figure_five, 
  width = 20, height = 30)

## Section 6: Identify T-cells from peripheral blood that map to CLusteredUnknowns

In [344]:
kstil_data <- list.files(str_c(tcr_sequencing_path, "TRB/KS/", sep = "/"), 
  pattern = "008_216|008_217|008_220|008_235|008_241|008_236|008_252|008_255|008_256",
  full.names = TRUE) |>
  readImmunoSeq() |>
  productiveSeq() |> 
  dplyr::rename(trb_repertoire_id = repertoire_id) |>
  inner_join(study_metadata, by = "trb_repertoire_id") |>
  filter(tissue_type == "Tumor") |>
  dplyr::select(patient_id, junction_aa, duplicate_count)


Reading AIRR-Seq files [=>---------------------------------------] 2/41 (  5%) eta:  9s elapsed:  0s

Reading AIRR-Seq files [==>--------------------------------------] 3/41 (  7%) eta: 14s elapsed:  1s

Reading AIRR-Seq files [===>-------------------------------------] 4/41 ( 10%) eta: 18s elapsed:  2s

Reading AIRR-Seq files [====>------------------------------------] 5/41 ( 12%) eta: 22s elapsed:  3s

Reading AIRR-Seq files [=====>-----------------------------------] 6/41 ( 15%) eta: 22s elapsed:  4s








































Subsetting productive sequences [------------]   2% eta: 28s

Subsetting productive sequences [>-----------]   5% eta: 23s

Subsetting productive sequences [>-----------]   7% eta: 28s

Subsetting productive sequences [>-----------]  10% eta:  1m

Subsetting productive sequences [>-----------]  12% eta:  2m

Subsetting productive sequences [=>----------]  15% eta:  2m

Subsetting productive sequences [=>----------]  17% eta:  2m

Subsetting produ

In [346]:
kstme_clustered_unknown_tcrs <- kstme_clustered_unknown_table |>
    dplyr::select(pattern, junction_aa) |>
    distinct()

In [347]:
pbmc_tcells_type <- pbmc_gs@meta.data |>
  as_tibble(rownames = "cell_id") |>
  separate(CTaa, into = c("tra", "trb"), sep = "_", remove = F) |>
  mutate(tra = if_else(tra == "NA", str_replace_all(tra, "NA", NA_character_), tra),
    trb = if_else(trb == "NA", str_replace_all(trb, "NA", NA_character_), trb)) |>
  filter(!is.na(trb) ) |>
  inner_join(kstme_clustered_unknown_tcrs, by = c("trb" = "junction_aa"), relationship = "many-to-many") |>
  filter(trb %in% kstil_data$junction_aa) |>
  dplyr::rename(repertoire_id = Sample, 
    cell_type = predicted.celltype.l2) |>
  mutate(tcr_id = str_c(repertoire_id, trb,cell_type, sep = ";")) |>
  dplyr::select(tcr_id, pattern, repertoire_id, trb, cell_id, Barcode, Frequency)



“[1m[22mExpected 2 pieces. Missing pieces filled with `NA` in 184758 rows [2, 5, 6, 8, 9, 12, 13, 15, 18, 20, 24, 29, 30, 31, 34, 39, 41, 42, 44, 45, ...].”


In [348]:

packEmUp <- function(pbmc_tcells_type, gsize_filter = 3) {
    pbmc_pack_plot_table <- pbmc_tcells_type |>
        group_by(pattern, tcr_id) |>
        summarize(cell_count = length(unique(cell_id)),
            frequency_count = sum(Frequency)) |>
        group_by(pattern) |>
        mutate(ptid = str_extract(tcr_id, "008_\\d+"),
            gsize = length(unique(ptid))) |>
        filter(gsize >= gsize_filter )
    gcount_table <- pbmc_pack_plot_table |>
        group_by(pattern) |>
        summarize(count = sum(cell_count)) |>
        dplyr::rename(name = pattern)   
    tcount_table <- pbmc_pack_plot_table |>
        ungroup() |>
        dplyr::select(tcr_id, cell_count) |>
        dplyr::rename(count = cell_count,
            name = tcr_id)  
    count_table <- bind_rows(gcount_table, tcount_table)
    count_dict <- count_table |> pull(count)
    names(count_dict) <- count_table |> pull(name)
    sample_pal <- c("008_216" = "#08306b", "008_241" = "#4292c6", 
        "008_252" = "#c6dbef", "008_236" = "#08519c", 
        "008_217" = "#67000d", "008_220" = "#a50f15", 
        "008_255" = "#fb6a4a" , "008_256" = "#fcbba1",
        "008_235" = "#ef3b2c")
    gtable <- pbmc_pack_plot_table |>
        dplyr::select(pattern, tcr_id) |>
        dplyr::rename(from = pattern, to = tcr_id)  |>
        dplyr::select(from, to) 
    graph <- gtable |>
        group_by(from) |>
        group_split() |>
        map(buildGliphNetworks) |>
        bind_graphs() |> 
        mutate(ptid = str_extract(name, "008_\\d+"),
            ptid = factor(ptid, levels = c("008_216", "008_236", "008_241", "008_252",
            "008_217", "008_220", "008_235", "008_255", "008_256")),
            cell_type = str_remove(str_extract(name, ";\\w+\\s?\\w+$"), ";"),
            count = count_dict[name]) |>
        mutate(group_size = if_else(is.na(cell_type), NA_integer_, count))

    palette <- c("B cell" = "#e41a1c", "CD14 Mono" = "#377eb8", "CD16 Mono" = "#a6cee3", 
    "CD4 CTL" = "#ff7f00", "CD4 Naive" = "#fdbf6f", 
    "CD4 Proliferating" = "#fdae6b", "CD4 TCM" = "#a63603", "CD4 TEM" = "#f16913", 
    "CD8 Naive"= "#984ea3", 
    "CD8 Proliferating" = "#bcbddc", "CD8 TCM" = "#3f007d", "CD8 TEM" = "#807dba", "DC" = "#999999", 
    "dnT" = "#ffff33", "MAIT" = "#ffeda0", "gdT" = "#a65628", "NK" = "#f781bf",
    "NK Proliferating", "NK_CD56bright", "Treg" = "#4daf4a", "Other" = "grey")
    set.seed(12357)
    ptid_plot <- ggraph(graph, 'circlepack', weight = count) + 
        geom_node_circle(aes(fill = ptid), show.legend = TRUE) + 
        coord_fixed() +
        scale_fill_manual(values = sample_pal, na.value = alpha("#99d8c9", 0.2)) + 
        geom_text(aes(x = x, y = y, label = group_size)) +
        labs(fill = "Patient ID", size = "Cell count") +
        theme_void(base_size = 24) +
        theme(legend.position = "right")
    set.seed(12357)
    ctype_plot <- ggraph(graph, 'circlepack', weight = count) + 
        geom_node_circle(aes(fill = cell_type)) + 
        coord_fixed() +
        scale_fill_manual(values = palette, na.value = alpha("#99d8c9", 0.2)) +
        scale_size_binned() + 
        labs(fill = "Cell type", size = "Cell count") +
        theme_void(base_size = 24) 
    return(list(ptid_plot, ctype_plot))
}

buildGliphNetworks <- function(gtable) {
    graph <- gtable |>
        dplyr::select(from, to, everything()) |>
        igraph::graph_from_data_frame() |>
        as_tbl_graph()
    return(graph)
}

kspbmc_cell_pack_four <- packEmUp(pbmc_tcells_type, gsize_filter = 4)
kspbmc_cell_pack_three <- packEmUp(pbmc_tcells_type, gsize_filter = 3)


[1m[22mNon-leaf weights ignored
[1m[22mNon-leaf weights ignored
[1m[22mNon-leaf weights ignored
[1m[22mNon-leaf weights ignored


In [349]:
pbmc_logo_table <- pbmc_gs@meta.data |>
  as_tibble(rownames = "cell_id") |>
  separate(CTaa, into = c("tra", "trb"), sep = "_", remove = F) |>
  mutate(tra = if_else(tra == "NA", str_replace_all(tra, "NA", NA_character_), tra),
    trb = if_else(trb == "NA", str_replace_all(trb, "NA", NA_character_), trb)) |>
  filter(!is.na(trb) ) |>
  inner_join(kstme_clustered_unknown_tcrs, by = c("trb" = "junction_aa"), relationship = "many-to-many") |>
  filter(trb %in% kstil_data$junction_aa) |>
  dplyr::select(Sample, CTaa, pattern, tra, trb, cell_id, Frequency) |>
  group_by(Sample, pattern, CTaa, tra, trb) |>
  mutate(count = length(unique(cell_id))) |>
  left_join(study_metadata |> dplyr::select(repertoire_id, phenotype), by = c("Sample" = "repertoire_id"))
pbmc_logo_table

“[1m[22mExpected 2 pieces. Missing pieces filled with `NA` in 184758 rows [2, 5, 6, 8, 9, 12, 13, 15, 18, 20, 24, 29, 30, 31, 34, 39, 41, 42, 44, 45, ...].”


Sample,CTaa,pattern,tra,trb,cell_id,Frequency,count,phenotype
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<chr>
008_216_V01,CAVRDNDGYGQNFVF_CASSQDLGQGGDTQYF,SQDLGQG%DT,CAVRDNDGYGQNFVF,CASSQDLGQGGDTQYF,008_216_V01_AAACCTGTCACCGGGT-1,67,63,Endemic KS
008_216_V01,CALSAPDNYGQNFVF_CASSQPSGGTGELFF,S%PSGGTGE,CALSAPDNYGQNFVF,CASSQPSGGTGELFF,008_216_V01_AAACGGGTCAGGCAAG-1,161,151,Endemic KS
008_216_V01,CAASGNVYSGGGADGLTF_CATSLGTGITDTQYF,SLGTG%TDT,CAASGNVYSGGGADGLTF,CATSLGTGITDTQYF,008_216_V01_AAACGGGTCGTATCAG-1,152,138,Endemic KS
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
008_256_V01,CAIPSGGGADGLTF_CASSHTGGGSYEQYF,SH%GGGSYE,CAIPSGGGADGLTF,CASSHTGGGSYEQYF,008_256_V01_TTTGTCATCCAGAGGA-1,37,34,Epidemic KS
008_256_V01,CAIPSGGGADGLTF_CASSHTGGGSYEQYF,S%TGGGSYE,CAIPSGGGADGLTF,CASSHTGGGSYEQYF,008_256_V01_TTTGTCATCCAGAGGA-1,37,34,Epidemic KS


In [420]:
pbmc_logo_quant_table <- pbmc_logo_table |>
    mutate(Sample = str_extract(Sample, "008_\\d+")) |>
    group_by(pattern) |>
    mutate(nptids = length(unique(Sample)),
        nseqs = length(CTaa),
        nunique = length(unique(CTaa))) |>
    dplyr::select(-Sample) |>
    distinct() 

top_pbmc_groups <- pbmc_logo_quant_table |>
    filter(nptids > 2) |>
    dplyr::select(pattern, nptids, nseqs, nunique) |>
    ungroup() |>
    distinct() |>
    arrange(desc(nseqs)) |>
    slice_head(n = 2)

top_pbmc_groups

pattern,nptids,nseqs,nunique
<chr>,<int>,<int>,<int>
SP%NTGE,3,117,4
SLG%GE,4,46,5


In [369]:
logosGalore <- function(pattern_table) {
    pattern <- pattern_table |>
        pull(pattern) |>
        unique()
    tra_list <- pattern_table |>
        dplyr::select(tra) |>
        distinct() |>
        pull(tra)
    names(tra_list) <- pattern_table |>
        dplyr::select(tra) |>
        distinct() |>
        mutate(tra_id = str_c(tra, row_number(), sep = "_")) |>
        pull(tra_id)
    tra_logo <- seqlogo(AAStringSet(tra_list), color = "Chemistry_AA") +
        ggtitle(str_c("Specifcity group: ", pattern, sep = "")) 
    trb_list <- pattern_table |>
        dplyr::select(trb) |>
        distinct() |>
        pull(trb) 
    names(trb_list) <- pattern_table |>
        dplyr::select(trb) |>
        distinct() |>
        mutate(trb_id = str_c(trb, row_number(), sep = "_")) |>
        pull(trb_id) 
    trb_logo <- seqlogo(AAStringSet(trb_list), color = "Chemistry_AA") 
    return(list(tra_logo, trb_logo))
}

### Figure 6:

In [424]:
pbmc_top_patterns_logos <-  pbmc_logo_quant_table |>
    filter(pattern %in% top_pbmc_groups$pattern) |>
    group_by(pattern) |>
    group_split() |>
    map(logosGalore) |>
    wrap_plots(nrow = 5, ncol = 2)

figure_six <- ((kspbmc_cell_pack_three[[1]] + labs(tag = 'A')) |
    (kspbmc_cell_pack_three[[2]] + labs(tag = 'B'))) /
    ((pbmc_top_patterns_logos[[1]][[1]] + labs(tag = 'C')) +
     pbmc_top_patterns_logos[[1]][[2]] |
     pbmc_top_patterns_logos[[2]][[1]] +
     pbmc_top_patterns_logos[[2]][[2]]) + 
     plot_layout(nrow = 2, widths = c(20, 20, 10, 10,10,10),
        heights = c(20,20,5,5,5,5), guides = "collect") &
    theme(text = element_text('NimbusSan')) 


ggsave(str_c(figures_path, "pdf", "Figure_six.pdf", sep = "/"), figure_six, 
  width = 35, height = 30, device = cairo_pdf, family = "Arial Unicode MS")
ggsave(str_c(figures_path, "svg", "Figure_six.svg", sep = "/"), figure_six, 
  width = 35, height = 30)
ggsave(str_c(figures_path, "png", "Figure_six.png", sep = "/"), figure_six, 
  width = 35, height = 30)

“number of columns of result is not a multiple of vector length (arg 1)”
“number of columns of result is not a multiple of vector length (arg 1)”


“[1m[22mRemoved 105 rows containing missing values (`geom_text()`).”
“[1m[22mRemoved 105 rows containing missing values (`geom_text()`).”
“[1m[22mRemoved 105 rows containing missing values (`geom_text()`).”


# Section 7: Characterize all the cell from the 10X data

### Figure 7:

In [461]:
figure_seven <- FeaturePlot(pbmc_gs, 
  features = c("CD4", "CD8A", "CD8B", "NCAM1", 
  "GZMA", "GZMB", "PRF1",  "IFNG", 
  "PDCD1", "CTLA4", "LAG3", "HAVCR2", 
   "MKI67", "KLRG1", "ENTPD1", "CD68",
   "CD163", "IL10", "VEGFA", "MAF"), ncol = 4,
   keep.scale = "all", pt.size = 1, reduction = "ref.umap")

ggsave(str_c(figures_path, "pdf", "Figure_seven.pdf", sep = "/"), figure_seven, 
  width = 16, height = 20, device = cairo_pdf, family = "Arial Unicode MS")
ggsave(str_c(figures_path, "svg", "Figure_seven.svg", sep = "/"), figure_seven, 
  width = 16, height = 20)
ggsave(str_c(figures_path, "png", "Figure_seven.png", sep = "/"), figure_seven, 
  width = 16, height = 20)

Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`

Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`

Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`

Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`

Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`

Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`

Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`

Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`

Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`

Rasterizing points since number of points exceeds 100,000.
To di

## Section 8: Characterization of clustered unknown in AIRR-Seq data from KSHV+ PBMC datasets

### Load KS, KOBS, and U035 data

In [428]:
kspbmc_trb_path <- str_c(tcr_v3_sequencing_path, "ks", sep = "/")
kskobs_trb_path <- str_c(tcr_v3_sequencing_path, "kobs", sep = "/")
ksu035_trb_path <- str_c(tcr_sequencing_path, "TRB/U035_KSHV", sep "/")
kspbmc_trb_files <- list.files(kspbmc_trb_path, pattern = "^008_\\d+_V\\d+_\\d+",
    full.names = TRUE)
kskobs_trb_files <- list.files(kskobs_trb_path, pattern = "tsv",
    full.names = TRUE)
ksu035_trb_files <- list.files(ksu035_trb_path, pattern = "tsv",
    full.names = TRUE)

In [429]:
kspbmc_study_table <- readImmunoSeq(c(kspbmc_trb_files, kskobs_trb_files)) 
kspbmc_amino_acid_table <- productiveSeq(kspbmc_study_table)
kspbmc_nprod_table <- productiveSeq(kspbmc_study_table, aggregate = "junction")


Reading AIRR-Seq files [>---------------------------------------] 2/153 (  1%) eta:  2m elapsed:  1s

Reading AIRR-Seq files [>---------------------------------------] 3/153 (  2%) eta:  3m elapsed:  3s

Reading AIRR-Seq files [>---------------------------------------] 4/153 (  3%) eta:  2m elapsed:  3s

Reading AIRR-Seq files [>---------------------------------------] 5/153 (  3%) eta:  2m elapsed:  4s

Reading AIRR-Seq files [=>--------------------------------------] 6/153 (  4%) eta:  2m elapsed:  6s

Reading AIRR-Seq files [=>--------------------------------------] 7/153 (  5%) eta:  3m elapsed:  7s

Reading AIRR-Seq files [=>--------------------------------------] 8/153 (  5%) eta:  3m elapsed:  9s

Reading AIRR-Seq files [=>--------------------------------------] 9/153 (  6%) eta:  2m elapsed:  9s

Reading AIRR-Seq files [==>------------------------------------] 10/153 (  7%) eta:  3m elapsed: 11s

Reading AIRR-Seq files [==>------------------------------------] 11/153 (  7%) et

### Generate PBMC metadata with HLA information

In [430]:
kspbmc_metadata <- study_metadata |>
    filter(tissue_type == "PBMC") |>
    dplyr::select(-c('tra_repertoire_id','rna_libraries',
        'nanostring_libraries','gex_library','vdj_library','sc_library_count'))
print(str_c("Data : KS PBMC"))
print(str_c("Number of PTIDs : ", kspbmc_metadata |> pull(patient_id) |> unique() |> length(), sep = ""))
print(str_c("Number of repertoires : ", kspbmc_metadata |> pull(repertoire_id) |> unique() |> length(), sep = ""))

[1] "Data : KS PBMC"
[1] "Number of PTIDs : 48"
[1] "Number of repertoires : 118"


In [431]:
# Load KOBS sequencing metadata
kskobs_metadata_path <- str_c(analysis_path, "metadata/Control_metadata.csv", sep = "/")
kskobs_metadata <- read_csv(kskobs_metadata_path, show_col_types = FALSE) |>
    dplyr::select(-c('tra_repertoire_id','rna_libraries','nanostring_libraries',
        'gex_library','vdj_library','sc_library_count')) |>
    filter(!is.na(trb_repertoire_id)) |>
    mutate(ptid = str_pad(str_extract(patient_id, "\\d+$"), width = 3, side = "left", pad = "0")) |>
    dplyr::select(ptid, everything())
kskobs_metadata

ptid,patient_id,repertoire_id,trb_repertoire_id,tissue_type,visit_code,lession_number,tumor_code,age,gender,⋯,DPB1allele2,DQA1allele1,DQA1allele2,DQB1allele1,DQB1allele2,DRB1allele1,DRB1allele2,DRB345allele1,DRB345allele2,timestamp
<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<lgl>,<dbl>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
011,KOBSC_11,KOBSC_11,KOBSC_11,PBMC,,,,23,Female,⋯,DPB1*105:01:01,DQA1*04:01:02,DQA1*04:01:02,DQB1*03:19:01,DQB1*03:19:01,DRB1*08:04:01,DRB1*08:04:01,DRB345*Not_Present,DRB345*Not_Present,1/12/23
014,KOBSC_14,KOBSC_14,KOBSC_14,PBMC,,,,25,Female,⋯,DPB1*01:01:01,DQA1*01:01:02,DQA1*02:01:01,DQB1*02:02:01,DQB1*05:01:01,DRB1*07:01:01,DRB1*11:01:02,DRB3*03:01:01,DRB4*01:03:01,1/12/23
015,KOBSC_15,KOBSC_15,KOBSC_15,PBMC,,,,23,Female,⋯,DPB1*02:01:02,DQA1*01:02:01,DQA1*01:02:01,DQB1*05:01:01,DQB1*06:02:01,DRB1*13:01:01,DRB1*15:03:01,DRB3*03:01:01,DRB5*01:01:01,1/12/23
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
034,KOBSC_34,KOBSC_34,KOBSC_34,PBMC,,,,24,Female,⋯,DPB1*01:01:01,DQA1*02:01:01,DQA1*05:05:01,DQB1*02:02:01,DQB1*03:01:01,DRB1*07:01:01,DRB1*11:02:01,DRB3*02:02:01,DRB4*01:01:01,1/12/23
078,KOBSC_78,KOBSC_78,KOBSC_78,PBMC,,,,51,Male,⋯,DPB1*18:01:01,DQA1*05:01:01,DQA1*05:11,DQB1*02:01:01,DQB1*03:01:01,DRB1*03:01:01,DRB1*11:01:02,DRB3*02:02:01,DRB3*02:02:01,1/12/23


In [432]:
# Load KOBS clinical metadata
kskobs_clinical_path <- str_c(analysis_path, "metadata/KOBS_metadata.RData", sep = "/")
load(kskobs_clinical_path)
kskobs_clinical_metadata <- perperson |>
    dplyr::select(ptid, med_hhv8_logcopy_oral, CD4, rna)

In [433]:
kskobs_metadata <- kskobs_metadata |> left_join(kskobs_clinical_metadata) |>
    dplyr::mutate(phenotype = case_when(hiv_status == "Negative" & kshv_status == "Negative" ~ "Control",
        hiv_status == "Positive" & kshv_status == "Negative" ~ "HIV Positive",
        hiv_status == "Negative" & kshv_status == "Positive" ~ "KSHV Positive",
        hiv_status == "Positive" & kshv_status == "Positive" ~ "HIV & KSHV Positive")) |>
    dplyr::rename(KSHV_Oral = med_hhv8_logcopy_oral, 
        CD4_count = CD4, HIV = rna) |>
    dplyr::select(-ptid)
kskobs_metadata

[1m[22mJoining with `by = join_by(ptid)`


patient_id,repertoire_id,trb_repertoire_id,tissue_type,visit_code,lession_number,tumor_code,age,gender,response,⋯,DQB1allele2,DRB1allele1,DRB1allele2,DRB345allele1,DRB345allele2,timestamp,KSHV_Oral,CD4_count,HIV,phenotype
<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<lgl>,<dbl>,<chr>,<lgl>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<int>,<int>,<chr>
KOBSC_11,KOBSC_11,KOBSC_11,PBMC,,,,23,Female,,⋯,DQB1*03:19:01,DRB1*08:04:01,DRB1*08:04:01,DRB345*Not_Present,DRB345*Not_Present,1/12/23,,,,Control
KOBSC_14,KOBSC_14,KOBSC_14,PBMC,,,,25,Female,,⋯,DQB1*05:01:01,DRB1*07:01:01,DRB1*11:01:02,DRB3*03:01:01,DRB4*01:03:01,1/12/23,,,,Control
KOBSC_15,KOBSC_15,KOBSC_15,PBMC,,,,23,Female,,⋯,DQB1*06:02:01,DRB1*13:01:01,DRB1*15:03:01,DRB3*03:01:01,DRB5*01:01:01,1/12/23,,,,Control
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
KOBSC_34,KOBSC_34,KOBSC_34,PBMC,,,,24,Female,,⋯,DQB1*03:01:01,DRB1*07:01:01,DRB1*11:02:01,DRB3*02:02:01,DRB4*01:01:01,1/12/23,5.129206,616,194023,HIV & KSHV Positive
KOBSC_78,KOBSC_78,KOBSC_78,PBMC,,,,51,Male,,⋯,DQB1*03:01:01,DRB1*03:01:01,DRB1*11:01:02,DRB3*02:02:01,DRB3*02:02:01,1/12/23,2.867577,176,750000,HIV & KSHV Positive


In [434]:
kspbmc_study_metadata <- bind_rows(kspbmc_metadata, kskobs_metadata)

### Prep GLIPH2 inputs for KOBS and KS PBMC

In [435]:
kspbmc_gliph_trb_table <- kspbmc_study_metadata |>
    left_join(kspbmc_amino_acid_table, by = c("trb_repertoire_id" = "repertoire_id")) |>
    mutate(gliph_id = str_c(repertoire_id, phenotype, sep = ":"),
        CDR3a = NA_character_) |>
    dplyr::select(junction_aa, v_call, j_call, CDR3a, gliph_id, duplicate_count) 
kspbmc_gliph_trb_table

junction_aa,v_call,j_call,CDR3a,gliph_id,duplicate_count
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
CAAAADSYNSPLHF,TRBV10-3,TRBJ1-6,,008_216_V01:Endemic KS,2
CAAGLGRYYGYTF,TRBV30-1,TRBJ1-2,,008_216_V01:Endemic KS,1
CAAGPGQNTEAFF,TRBV7-5,TRBJ1-1,,008_216_V01:Endemic KS,1
⋮,⋮,⋮,⋮,⋮,⋮
YDSTDTQYF,TRBV5-1,TRBJ2-3,,KOBSC_78:HIV & KSHV Positive,1
YKDRVRINQPQHF,TRBV6-4,TRBJ1-5,,KOBSC_78:HIV & KSHV Positive,1


In [436]:
kspbmc_gliph_hla_table <- kspbmc_study_metadata |>
    dplyr::select(repertoire_id, aAllele1:DRB345allele2) |>
    distinct()
kspbmc_gliph_hla_table

repertoire_id,aAllele1,aAllele2,bAllele1,bAllele2,cAllele1,cAllele2,DPA1allele1,DPA1allele2,DPB1allele1,DPB1allele2,DQA1allele1,DQA1allele2,DQB1allele1,DQB1allele2,DRB1allele1,DRB1allele2,DRB345allele1,DRB345allele2
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
008_216_V01,A*29:02:01,A*34:02:01,B*41:01:01,B*82:01:01,C*03:02:02,C*07:01:01,DPA1*01:03:01,DPA1*03:01:01,DPB1*04:01:01,DPB1*105:01:01,DQA1*01:02:01,DQA1*01:05:01,DQB1*05:01:01,DQB1*06:02:01,DRB1*10:01:01,DRB1*15:03:01,DRB345*Not_Present,DRB5*01:01:01
008_216_V02,A*29:02:01,A*34:02:01,B*41:01:01,B*82:01:01,C*03:02:02,C*07:01:01,DPA1*01:03:01,DPA1*03:01:01,DPB1*04:01:01,DPB1*105:01:01,DQA1*01:02:01,DQA1*01:05:01,DQB1*05:01:01,DQB1*06:02:01,DRB1*10:01:01,DRB1*15:03:01,DRB345*Not_Present,DRB5*01:01:01
008_216_V09,A*29:02:01,A*34:02:01,B*41:01:01,B*82:01:01,C*03:02:02,C*07:01:01,DPA1*01:03:01,DPA1*03:01:01,DPB1*04:01:01,DPB1*105:01:01,DQA1*01:02:01,DQA1*01:05:01,DQB1*05:01:01,DQB1*06:02:01,DRB1*10:01:01,DRB1*15:03:01,DRB345*Not_Present,DRB5*01:01:01
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
KOBSC_34,A*34:02:01,A*80:01:01,B*18:01:01,B*58:02:01,C*02:02:02,C*06:02:01,DPA1*02:01:08,DPA1*02:02:02,DPB1*01:01:01,DPB1*01:01:01,DQA1*02:01:01,DQA1*05:05:01,DQB1*02:02:01,DQB1*03:01:01,DRB1*07:01:01,DRB1*11:02:01,DRB3*02:02:01,DRB4*01:01:01
KOBSC_78,A*02:01:01,A*30:02:01,B*08:01:01,B*45:01:01,C*07:01:01,C*16:01:01,DPA1*01:03:01,DPA1*02:01:08,DPB1*01:01:01,DPB1*18:01:01,DQA1*05:01:01,DQA1*05:11,DQB1*02:01:01,DQB1*03:01:01,DRB1*03:01:01,DRB1*11:01:02,DRB3*02:02:01,DRB3*02:02:01


In [437]:
kspbmc_gliph_trb_input_table <- str_c(table_path, "kspbmc_gliph", "gliph_trb_table.tsv", sep = "/")
kspbmc_gliph_hla_input_table <- str_c(table_path, "kspbmc_gliph", "gliph_hla_table.tsv", sep = "/")
write_tsv(kspbmc_gliph_trb_table, kspbmc_gliph_trb_input_table)
write_tsv(kspbmc_gliph_hla_table, kspbmc_gliph_hla_input_table)

### Read GLIPH2 outputs from PBMC data

In [440]:
kspbmc_gliph_clusters_table <- read_csv(kspbmc_gliph_clusters_path, show_col_types = FALSE)
kspbmc_gliph_hlapreds_table <- read_csv(kspbmc_gliph_hlapreds_path, show_col_types = FALSE)

### Get HLA matched TCRs from RAW PBMC AIRR-Seq data

In [441]:
kspbmc_annotated_nprod_table <- kspbmc_nprod_table |>
    dplyr::rename(trb_repertoire_id = repertoire_id) |>
    inner_join(kspbmc_study_metadata, by = c("trb_repertoire_id"))
kspbmc_annotated_nprod_table

trb_repertoire_id,junction,junction_aa,v_call,d_call,j_call,v_family,d_family,j_family,reading_frame,⋯,DRB345allele2,timestamp,HIV,CD4_count,KSHV_plasma,phenotype,DPBallele2,ebv_status,bl_status,KSHV_Oral
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>
008_216_V10_11_7_19_r,ATCGGACAGGCTCACTATGGCTACACCTTC,IGQAHYGYTF,TRBV21-1,TRBD1-1,TRBJ1-2,TRBV21,TRBD1,TRBJ1,in-frame,⋯,DRB5*01:01:01,1/12/23,,776,3854.72,Endemic KS,,,,
008_216_V10_11_7_19_r,ATCGGGGGGCAGGGGGCATACTATGGCTACACCTTC,IGGQGAYYGYTF,,TRBD1-1,TRBJ1-2,,TRBD1,TRBJ1,in-frame,⋯,DRB5*01:01:01,1/12/23,,776,3854.72,Endemic KS,,,,
008_216_V10_11_7_19_r,CCCCCAGGGACCTATGGCTACACCTTC,PPGTYGYTF,TRBV21-1,,TRBJ1-2,TRBV21,,TRBJ1,in-frame,⋯,DRB5*01:01:01,1/12/23,,776,3854.72,Endemic KS,,,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
KOBSC_9,TTTAGGGCCGCAGATACGCAGTATTTT,FRAADTQYF,TRBV29-1,,TRBJ2-3,TRBV29,,TRBJ2,in-frame,⋯,DRB5*01:01:01,1/12/23,,,,KSHV Positive,,,,4.4323
KOBSC_9,TTTATAACAAGGGAAAACGTTAGTGGCTACACCTTC,FITRENVSGYTF,TRBV7-3,,TRBJ1-2,TRBV7,,TRBJ1,in-frame,⋯,DRB5*01:01:01,1/12/23,,,,KSHV Positive,,,,4.4323


In [442]:
kspbmc_hla_matched_tcr <- kspbmc_annotated_nprod_table |>
    mutate(visit_code = if_else(is.na(visit_code), "V01", visit_code)) |>
    inner_join(antigen_db, by = c("junction_aa" = "trb_cdr3_aa"), 
        relationship = "many-to-many") |>
    dplyr::select(patient_id, repertoire_id, junction, junction_aa, epitope, pathology,
        antigen, mhc_allele, aAllele1:DRB345allele2) |> 
    pivot_longer(cols = aAllele1:DRB345allele2, names_to = "allele",
        values_to = "mhc_value") |> 
    mutate(mhc_allele = str_remove(mhc_allele, "HLA-"),
        hla_matched = if_else(mhc_allele == mhc_value |
            mhc_allele == str_extract(mhc_value, "\\w+\\*\\d+:\\d+") |
            mhc_allele == str_extract(mhc_value, "\\w+\\*\\d+"), TRUE, FALSE)) |>
    filter(hla_matched) |> 
    dplyr::select(patient_id:junction_aa, epitope:antigen) |>
    distinct() |> 
    group_by(patient_id, repertoire_id, junction, junction_aa) |>
    summarise(npath = length(unique(pathology)),
        pathology = str_c(unique(pathology), collapse = ";")) |>
    ungroup() |>
    mutate(pathology = if_else(npath > 1, "Multi-pathogen", pathology)) |>
    dplyr::select(patient_id, repertoire_id, junction, junction_aa, pathology) |>
    distinct() |>
    mutate(pathology = recode(pathology, 
        `HomoSapiens` = "Other", "Homo sapiens" = "Other", 
        "SelaginellaMoellendorffii" = "Other", "TriticumAestivum" = "Other", 
        "synthetic" = "Other", "MCPyV" = "Other"))
kspbmc_hla_matched_tcr

patient_id,repertoire_id,junction,junction_aa,pathology
<chr>,<chr>,<chr>,<chr>,<chr>
008_216,008_216_V01,TGTGCCAGCAGCTTAGAAGAGACCCAGTACTTC,CASSLEETQYF,M.tuberculosis
008_216,008_216_V01,TGTGCCAGCAGTTTAGAAGAGACCCAGTACTTC,CASSLEETQYF,M.tuberculosis
008_216,008_216_V10,TGTGCCAGCAGTTCCCGGGACTACGAGCAGTACTTC,CASSSRDYEQYF,Other
⋮,⋮,⋮,⋮,⋮
KOBSC_9,KOBSC_9,TGTGCCAGTAGTAGGACGCTGAACACTGAAGCTTTCTTT,CASSRTLNTEAFF,CMV
KOBSC_9,KOBSC_9,TGTGCCAGTAGTTTTATCACAGATACGCAGTATTTT,CASSFITDTQYF,CMV


In [443]:
kspbmc_path_annotated_nprod_table <- kspbmc_annotated_nprod_table |> 
    left_join(kspbmc_hla_matched_tcr) |>
    filter(str_detect(junction_aa, "^C") & str_detect(junction_aa, "F$")) |>
    dplyr::select(cohort, patient_id, repertoire_id, tissue_type, phenotype, pathology, junction_aa, duplicate_frequency) |>
    mutate(pathology = str_replace_na(pathology, "Unknown"),
        cohort = if_else(is.na(cohort), "KOBS", cohort)) |>
    group_by(cohort, repertoire_id, pathology, phenotype, tissue_type) |>
    summarize(frequency = sum(duplicate_frequency)) |>
    ungroup()

[1m[22mJoining with `by = join_by(junction, junction_aa, patient_id, repertoire_id)`


## Get HLA matched GLIPH groups with known antigenic specificity 

In [445]:
kspbmc_gliph_clusters_path <- str_c(analysis_path, "gliph/kspbmc_gliph_run/gliph_clusters_cluster.csv", sep = "/")
kspbmc_gliph_hlapreds_path <-str_c(analysis_path, "gliph/kspbmc_gliph_run/gliph_clusters_HLA.csv", sep = "/")
# KSPBMC GLIPH table
kspbmc_gliph_cluster_pred_table <- read_csv(kspbmc_gliph_clusters_path,
        show_col_types = FALSE) 
kspbmc_gliph_hla_pred_table <- read_csv(kspbmc_gliph_hlapreds_path, 
    show_col_types = FALSE)

In [446]:
options(repr.matrix.max.rows=5, repr.matrix.max.columns=5)
# Total number of GLIPH groups identified from the KSPBMC dataset
print(str_c("Total number of GLIPH groups in KSPBMC datasets",
    length(unique(kspbmc_gliph_cluster_pred_table$pattern)), sep = " : "))
print(str_c("Number of GLIPH groups of length >= 3",
    kspbmc_gliph_cluster_pred_table |> 
        filter(pattern != "single" & nchar(pattern) > 2) |>
        pull(pattern) |>
        unique() |>
        length(), sep = " : ")) 
print(str_c("Number of GLIPH groups of length >= 3, vb_score <= 0.01, unique CDR3s >= 3, and unique PTIDs >= 3",
    kspbmc_gliph_cluster_pred_table |> 
        filter(pattern != "single" & nchar(pattern) > 2) |>
        separate(Sample, into = c("repertoire_id", "cohort"), sep = ":") |>
        mutate(patient_id = if_else(str_detect(repertoire_id, "KOBS"), repertoire_id,
            str_extract(repertoire_id, "008_\\d+")),
            pattern = if_else(pattern == "single", TcRb, pattern)) |>
        group_by(pattern) |>
        mutate(number_of_unique_cdr = length(unique(TcRb)),
                number_of_unique_ptid = length(unique(patient_id))) |>
        filter(vb_score <= 0.01 & number_of_unique_cdr >= 3 &
                number_of_unique_ptid >= 3) |>
        pull(pattern) |>
        unique() |>
        length(), sep = " : ")) 


[1] "Total number of GLIPH groups in KSPBMC datasets : 2153193"
[1] "Number of GLIPH groups of length >= 3 : 2153152"
[1] "Number of GLIPH groups of length >= 3, vb_score <= 0.01, unique CDR3s >= 3, and unique PTIDs >= 3 : 315964"


In [447]:
# Filter out high confidence GLIPH groups with a pattern length of >2 where, 
# the pattern has at 3 unique CDR3s associated with it, at least 3 PTIDs 
# associated with it and with a vb_score <= 0.01
kspbmc_hc_gliph_cluster_table <- kspbmc_gliph_cluster_pred_table |>
  dplyr::select(pattern, vb_score, hla_score, expansion_score, TcRb, Sample, V, J,
    Freq) |>
  separate(Sample, into = c("repertoire_id", "cohort"), sep = ":") |>
  mutate(patient_id = if_else(str_detect(repertoire_id, "KOBS"), repertoire_id,
    str_extract(repertoire_id, "008_\\d+"))) |>
  group_by(pattern) |>
  mutate(number_of_unique_cdr = length(unique(TcRb)),
    number_of_unique_ptid = length(unique(patient_id))) |>
  ungroup() |>
  filter(vb_score <= 0.01 & number_of_unique_cdr >= 3 &
    number_of_unique_ptid >= 3 & pattern != "single" & nchar(pattern) > 2)
kspbmc_hc_gliph_cluster_table

pattern,vb_score,hla_score,expansion_score,TcRb,repertoire_id,cohort,V,J,Freq,patient_id,number_of_unique_cdr,number_of_unique_ptid
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<int>,<int>
RG%AGE,0.006,0.018,0.76,CASRGEAGELFF,008_229_V01,Epidemic KS,TRBV5-1,TRBJ2-2,1,008_229,52,47
RG%AGE,0.006,0.018,0.76,CASRGEAGELFF,008_229_V02,Epidemic KS,TRBV5-1,TRBJ2-2,3,008_229,52,47
RG%AGE,0.006,0.018,0.76,CASRGEAGELFF,008_229_V09,Epidemic KS,TRBV5-1,TRBJ2-2,3,008_229,52,47
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
SV%GGMNTE,0.001,0.0017,0.91,CASSVVGGMNTEAFF,KOBSC_37,Control,TRBV9-1,TRBJ1-1,1,KOBSC_37,15,25
SV%GGMNTE,0.001,0.0017,0.91,CASSVPGGMNTEAFF,KOBSC_26,Control,TRBV19-1,TRBJ1-1,1,KOBSC_26,15,25


In [448]:
# Filter out HLA predictions for high confidence GLIPH groups identified in the
# previous step
kspbmc_hc_gliph_hla_table <- kspbmc_gliph_hla_pred_table |>
  dplyr::rename(sig_allele = `'HLA allele with lowest Fisher Score'`) |>
  mutate(pattern = str_remove(pattern, "^[lg]")) |>
  filter(pattern %in% kspbmc_hc_gliph_cluster_table$pattern & Pvalue <= 0.05) |>
  dplyr::select(pattern, sig_allele, Pvalue)
kspbmc_hc_gliph_hla_table

pattern,sig_allele,Pvalue
<chr>,<chr>,<dbl>
RG%AGE,DPB1*105,0.036
RG%AGE,DRB3*02,0.049
RG%AGE,B*53,0.018
⋮,⋮,⋮
SV%GGMNTE,C*14,0.0280
SV%GGMNTE,B*44,0.0081


In [449]:
print(str_c("Total number of HLA restrictions predicted by GLIPH", 
    nrow(kspbmc_gliph_hla_pred_table), sep = " : "))
print(str_c("Total number of HLA restrictions for GLIPH gorups of length >= 3",
    kspbmc_gliph_hla_pred_table |> 
        mutate(pattern = str_remove(pattern, "^[lg]")) |>
        filter(pattern != "single" & nchar(pattern) > 2) |>
        nrow(), sep = " : ")) 
print(str_c("Total number of significant HLA restrictions for GLIPH gorups of length >= 3",
    kspbmc_gliph_hla_pred_table |> 
        mutate(pattern = str_remove(pattern, "^[lg]")) |>
        filter(pattern != "single" & nchar(pattern) > 2 & Pvalue <= 0.05) |>
        nrow(), sep = " : ")) 

[1] "Total number of HLA restrictions predicted by GLIPH : 7584840"
[1] "Total number of HLA restrictions for GLIPH gorups of length >= 3 : 7237204"
[1] "Total number of significant HLA restrictions for GLIPH gorups of length >= 3 : 3892975"


In [450]:
# Merge GLIPH clusters with significant HLA association
kspbmc_high_confidence_gliph_groups <- kspbmc_hc_gliph_cluster_table |>
  left_join(kspbmc_hc_gliph_hla_table, by = "pattern",
    relationship = "many-to-many") 
kspbmc_high_confidence_gliph_groups

pattern,vb_score,hla_score,expansion_score,TcRb,repertoire_id,cohort,V,J,Freq,patient_id,number_of_unique_cdr,number_of_unique_ptid,sig_allele,Pvalue
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<int>,<int>,<chr>,<dbl>
RG%AGE,0.006,0.018,0.76,CASRGEAGELFF,008_229_V01,Epidemic KS,TRBV5-1,TRBJ2-2,1,008_229,52,47,DPB1*105,0.036
RG%AGE,0.006,0.018,0.76,CASRGEAGELFF,008_229_V01,Epidemic KS,TRBV5-1,TRBJ2-2,1,008_229,52,47,DRB3*02,0.049
RG%AGE,0.006,0.018,0.76,CASRGEAGELFF,008_229_V01,Epidemic KS,TRBV5-1,TRBJ2-2,1,008_229,52,47,B*53,0.018
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
SV%GGMNTE,0.001,0.0017,0.91,CASSVPGGMNTEAFF,KOBSC_26,Control,TRBV19-1,TRBJ1-1,1,KOBSC_26,15,25,C*14,0.0280
SV%GGMNTE,0.001,0.0017,0.91,CASSVPGGMNTEAFF,KOBSC_26,Control,TRBV19-1,TRBJ1-1,1,KOBSC_26,15,25,B*44,0.0081


In [451]:
# Annotate GLIPH groups with antigenic specificity based on the CDR3 sequences
# they contain
kspbmc_path_annotated_gliph_clusters <- kspbmc_hla_matched_tcr |>
  distinct() |>
  inner_join(kspbmc_high_confidence_gliph_groups, by = c("repertoire_id", "junction_aa" = "TcRb"),
    relationship = "many-to-many") |>
  dplyr::select(repertoire_id, pathology, pattern) |>
  distinct()
kspbmc_path_annotated_gliph_clusters

repertoire_id,pathology,pattern
<chr>,<chr>,<chr>
008_216_V01,M.tuberculosis,SLEE%
008_216_V01,M.tuberculosis,%LEET
008_216_V01,M.tuberculosis,SLE%T
⋮,⋮,⋮
KOBSC_9,CMV,SRT%NTE
KOBSC_9,CMV,SF%TDT


In [452]:
# Annotate GLIPH groups by antigenic specificity of CDR3s goruped within
kspbmc_path_annotated_gliph_groups <- kspbmc_high_confidence_gliph_groups |>
  dplyr::select(repertoire_id, pattern, TcRb, V, J) |>
  left_join(kspbmc_path_annotated_gliph_clusters, by = c("repertoire_id", "pattern"), relationship = "many-to-many") |>
  distinct() |>
  mutate(pathology = if_else(is.na(pathology), "Clustered\nUnknown", pathology)) |>
  group_by(TcRb, V, J) |>
  mutate(npath = length(unique(pathology)),
    pathology = str_c(unique(pathology), collapse =";")) |>
  ungroup() |>
  mutate(pathology = if_else(npath > 1, "Multi-pathogen", pathology)) 
kspbmc_path_annotated_gliph_groups |>
  group_by(pathology) |> 
  summarize(count = length(unique(TcRb))) |>
  print(n = 11)


[90m# A tibble: 11 × 2[39m
   pathology              count
   [3m[90m<chr>[39m[23m                  [3m[90m<int>[39m[23m
[90m 1[39m [90m"[39mCMV[90m"[39m                    194
[90m 2[39m [90m"[39mClustered\nUnknown[90m"[39m 1[4m0[24m[4m8[24m[4m9[24m540
[90m 3[39m [90m"[39mEBV[90m"[39m                     26
[90m 4[39m [90m"[39mHCV[90m"[39m                     60
[90m 5[39m [90m"[39mHIV-1[90m"[39m                    2
[90m 6[39m [90m"[39mHSV-2[90m"[39m                    4
[90m 7[39m [90m"[39mInfluenzaA[90m"[39m             645
[90m 8[39m [90m"[39mM.tuberculosis[90m"[39m          28
[90m 9[39m [90m"[39mMulti-pathogen[90m"[39m        [4m8[24m471
[90m10[39m [90m"[39mOther[90m"[39m                  797
[90m11[39m [90m"[39mSARS-CoV-2[90m"[39m              14


In [453]:
kspbmc_gliph_nprod_table <- kspbmc_annotated_nprod_table |>
    left_join(kspbmc_path_annotated_gliph_groups, by = c("repertoire_id", 
        "junction_aa" = "TcRb", "v_call" = "V",
        "j_call" = "J"), relationship = "many-to-many") |>
    distinct() |>
    dplyr::select(cohort, patient_id, repertoire_id, tissue_type, phenotype, 
        pathology, junction_aa, v_call, j_call, duplicate_frequency) |>
    mutate(pathology = if_else(is.na(pathology), "Unclustered\nUnknown", pathology),
        phenotype = if_else(is.na(phenotype) , "Epidemic KS", phenotype)) |>
    group_by(junction_aa, v_call, j_call ) |>
    mutate(npath = length(unique(pathology)),
        pathology = str_c(unique(pathology), collapse =";")) |>
    ungroup() |>
    mutate(pathology = if_else(npath > 1, "Multi-pathogen", pathology)) |>
    distinct() |>
    mutate(cohort = if_else(is.na(cohort), "KOBS", cohort))
kspbmc_gliph_nprod_table

cohort,patient_id,repertoire_id,tissue_type,phenotype,pathology,junction_aa,v_call,j_call,duplicate_frequency,npath
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<int>
Hippos,008_216,008_216_V10,PBMC,Endemic KS,Clustered Unknown,IGQAHYGYTF,TRBV21-1,TRBJ1-2,3.365813e-05,1
Hippos,008_216,008_216_V10,PBMC,Endemic KS,Unclustered Unknown,IGGQGAYYGYTF,,TRBJ1-2,1.682907e-05,1
Hippos,008_216,008_216_V10,PBMC,Endemic KS,Clustered Unknown,PPGTYGYTF,TRBV21-1,TRBJ1-2,1.682907e-05,1
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
KOBS,KOBSC_9,KOBSC_9,PBMC,KSHV Positive,Clustered Unknown,FRAADTQYF,TRBV29-1,TRBJ2-3,9.970786e-06,1
KOBS,KOBSC_9,KOBSC_9,PBMC,KSHV Positive,Unclustered Unknown,FITRENVSGYTF,TRBV7-3,TRBJ1-2,9.970786e-06,1


In [454]:
kspbmc_path_annotated_gliph_nprod_table <- kspbmc_gliph_nprod_table |>
    group_by(cohort, repertoire_id, pathology, phenotype, tissue_type) |>
    summarize(frequency = sum(duplicate_frequency)) |>
    ungroup() 
kspbmc_path_annotated_gliph_nprod_table

cohort,repertoire_id,pathology,phenotype,tissue_type,frequency
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
Hippos,008_216_V01,Clustered Unknown,Endemic KS,PBMC,0.592600906
Hippos,008_216_V01,Multi-pathogen,Endemic KS,PBMC,0.008366322
Hippos,008_216_V01,Unclustered Unknown,Endemic KS,PBMC,0.398461413
⋮,⋮,⋮,⋮,⋮,⋮
KOBS,KOBSC_9,Multi-pathogen,KSHV Positive,PBMC,0.01916385
KOBS,KOBSC_9,Unclustered Unknown,KSHV Positive,PBMC,0.58731915


### Figure 8

In [463]:
kspbmc_public_tcrs_plot <- kspbmc_path_annotated_nprod_table |>
  mutate(frequency = frequency * 100) |>
  ggboxplot(x = "pathology", y = "frequency", color = "phenotype", 
    add = "jitter", order = c("CMV", "DENV1", "DENV3/4", "EBV", "HCV", "HIV-1",
       "HSV-2",
      "InfluenzaA", "M.tuberculosis", "SARS-CoV-2", "Other", "Multi-pathogen",
      "Unknown")) |>
  ggpar(xlab = FALSE, ylab = "Cummulative frequency", 
    legend.title = "Cohort", 
    format.scale = T, legend = "right",
    ggtheme = theme_classic(base_size = 32),
    palette = c("HIV & KSHV Positive" = "#fb8072", 
  "Epidemic KS" = "#e41a1c", "KSHV Positive" = "#80b1d3",
  "Endemic KS" = "#377eb8", "HIV Positive" = "#756bb1", "Control" = "#addd8e")) +
  geom_hline(yintercept = 1, linetype = 2) + 
  geom_hline(yintercept = 25, linetype = 2) + 
  geom_hline(yintercept = 75, linetype = 2) + 
  geom_hline(yintercept = 90, linetype = 2) + 
  scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  annotation_logticks(sides = "l", short = unit(2,"mm"),
    mid = unit(4,"mm"),long = unit(8,"mm"), size = 1) +
  theme_classic(base_size = 26) +
  theme(axis.text = element_text(face = "bold"))
  
kspbmc_public_gliph_plot <- kspbmc_path_annotated_gliph_nprod_table |>
  mutate(frequency = frequency * 100) |>
  ggboxplot(x = "pathology", y = "frequency", color = "phenotype", 
    add = "jitter", order = c("CMV", "EBV", "HCV", "HIV-1",
       "HSV-2", "InfluenzaA", "M.tuberculosis", "SARS-CoV-2", "Other", 
       "Multi-pathogen",   "Clustered\nUnknown", "Unclustered\nUnknown")) |>
  ggpar(xlab = FALSE, ylab = "Cummulative frequency", 
    legend.title = "Cohort", 
    format.scale = T, legend = "right",
    ggtheme = theme_classic(base_size = 32),
    palette = c("HIV & KSHV Positive" = "#fb8072", 
  "Epidemic KS" = "#e41a1c", "KSHV Positive" = "#80b1d3",
  "Endemic KS" = "#377eb8", "HIV Positive" = "#756bb1", "Control" = "#429f03")) +
  geom_hline(yintercept = 1, linetype = 2) +
  geom_hline(yintercept = 25, linetype = 2) +
  geom_hline(yintercept = 75, linetype = 2) +  
  geom_hline(yintercept = 90, linetype = 2) +  
  scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  annotation_logticks(sides = "l", short = unit(2,"mm"),
    mid = unit(4,"mm"),long = unit(8,"mm"), size = 1) + 
  theme_classic(base_size = 26) +
  theme(axis.text = element_text(face = "bold"))


figure_eight_layout = "
1111
1111
2222
2222"
figure_eight <- kspbmc_public_tcrs_plot +  
    kspbmc_public_gliph_plot + 
  plot_layout(design = figure_eight_layout, guides = "collect") +
  plot_annotation(tag_levels = 'A') & theme(text = element_text('NimbusSan'),
    legend.position = "right",
    axis.title.x = element_blank())
ggsave(str_c(figures_path, "pdf", "Figure_eight.pdf", sep = "/"), figure_eight, width = 36, 
    height = 12, device = cairo_pdf, family = "Arial Unicode MS")
ggsave(str_c(figures_path, "svg", "Figure_eight.svg", sep = "/"), figure_eight, width = 36, 
    height = 12)
ggsave(str_c(figures_path, "png", "Figure_eight.png", sep = "/"), figure_eight, width = 36, 
    height = 12)