# Figure 1 visualization

- kernel: r_env, R 4.1.3
- date: 2023-12-26

## load

In [None]:
library(tidyverse)
library(tools)
library(logging)
library(ggpubr)
library(ggsci)
library(patchwork)
library(igraph)
library(ggraph)
library(Seurat)

# Project helper functions (create_dir() below presumably comes from here --
# TODO confirm).
source("../scripts/r_funcs.r")

# Session defaults: publication theme, root logger, warnings silenced.
# NOTE(review): warn = -1 hides every warning for the rest of the session.
ggplot2::theme_set(ggpubr::theme_pubr())
logging::basicConfig()
options(warn = -1)

# Output folder for the figure-1 panels.
outdir <- "../figures/fig1"
create_dir(outdir)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.0     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.1     [32m✔[39m [34mtibble   [39m 3.2.0
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: ‘igraph’


The following objects are masked from ‘package:lubridate’:

    %--%, union


The following objects are masked from ‘package:dplyr’:

    

In [3]:
# Patient-level clinical table and the grouping column used by the panels.
f_pat_gp <- "../tables/patient_info.tsv"
gp <- "response"

# Register levels and pairwise comparisons for the response grouping.
# gp_lvls / gp_comp_map must already exist (presumably created by r_funcs.r).
gp_lvls[["response"]] <- c("R", "PR")
gp_comp_map[["response"]] <- list(
    c("R-pre", "R-post"),
    c("PR-pre", "PR-post"),
    c("R-pre", "PR-pre"),
    c("R-post", "PR-post")
)

# Display order of the group x time-point combinations.
comb_order <- c("R-pre", "R-post", "PR-pre", "PR-post")

## 1b: clinical info

- date: 2024-01-02
- desc: WES includes only tumor-normal-paired (TN-paired), fresh, non-LN samples

In [4]:
# Input tables for panel 1b.
f_sc_clin <- '../../assets/clinical/sc_merge_clinical-230925.xlsx'  # scRNA workbook (sheet 'id_mapping' is read below)
f_wes_clin <- '../../assets/clinical/wes_merge_clin-230921.xlsx'    # WES workbook (sheet 'id_mapping' is read below)
f_mode <- '../../s1_flow/wes-new/tumor_samples_analysis_mode.tsv'   # per-tumor pairing mode table
# No trailing slash: output paths are built as '{outdir}/<file>', so a trailing
# slash would produce '../figures/fig1//<file>'. Same value as in the load cell.
outdir <- '../figures/fig1'

In [5]:
# Sample tables for both assays: one row per distinct
# (sample_id, patient_id, patient), tagged with the data category.

# scRNA: sample_id is assembled from the three id components of the sheet.
df_samp_sc <- f_sc_clin %>%
    readxl::read_xlsx(sheet = 'id_mapping') %>%
    mutate(sample_id = paste(sample_type_id, patient_id, tissue_id, sep = '-')) %>%
    distinct(sample_id, patient_id, patient) %>%
    mutate(data_cat = 'single-cell')
loginfo('sc samples: %g', nrow(df_samp_sc))

# WES: sample_id is taken from the sheet as-is.
df_samp_wes <- f_wes_clin %>%
    readxl::read_xlsx(sheet = 'id_mapping') %>%
    distinct(sample_id, patient_id, patient) %>%
    mutate(data_cat = 'WES')
loginfo('WES samples: %g', nrow(df_samp_wes))

[0m2024-03-01 10:30:30 INFO::sc samples: 81[0m[22m[23m[24m[27m[28m[29m[39m[49m[0m[0m[22m[23m[24m[27m[28m[29m[39m[49m
[0m2024-03-01 10:30:30 INFO::WES samples: 142[0m[22m[23m[24m[27m[28m[29m[39m[49m[0m[0m[22m[23m[24m[27m[28m[29m[39m[49m


In [6]:
# WES sample filtering: keep tumor-normal-paired, fresh runs whose tumor id
# does not contain '-LN'.
wes_mode_valid <- f_mode %>%
    read_tsv(show_col_types = FALSE) %>%
    filter(pair_mode == 'tumor-normal-paired') %>%
    filter(pair_cat == 'Fresh') %>%
    filter(!grepl('-LN', tumor_sample_id))

# Retain both the tumor samples and their paired normals.
df_samp_wes <- df_samp_wes %>%
    filter(sample_id %in% c(wes_mode_valid$tumor_sample_id, wes_mode_valid$paired_normal))
loginfo('TN-pair, fresh, nonLN WES samples: %g', nrow(df_samp_wes))

[0m2024-03-01 10:30:34 INFO::TN-pair, fresh, nonLN WES samples: 98[0m[22m[23m[24m[27m[28m[29m[39m[49m[0m[0m[22m[23m[24m[27m[28m[29m[39m[49m


In [7]:
# Stack both assays, attach the patient-level clinical columns, then derive
# per-sample annotations from the sample id:
#   - ids starting with '01-' are Baseline, everything else is Treat
#   - the id suffix gives the tissue ('-T' tumor, '-B' blood)
df_samp <- add_clin_info(
    rbind.data.frame(df_samp_sc, df_samp_wes),
    ftsv = f_pat_gp, columns = NULL, merge_by = 'patient_id'
) %>%
    mutate(sample_type = if_else(str_starts(sample_id, '01-'), 'Baseline', 'Treat')) %>%
    mutate(tissue_cat = case_when(
        str_ends(sample_id, '-T') ~ 'Tumor',
        str_ends(sample_id, '-B') ~ 'Blood',
        TRUE ~ 'Others'
    ))
write_tsv(df_samp, str_glue('{outdir}/fig1b-clin_info-wes_tn_pair_fresh_t.tsv'))
loginfo('total valid samples: %g', nrow(df_samp))
# Cross-tabulate tissue category against data category as a sanity check.
table(df_samp$tissue_cat, df_samp$data_cat)

[0m2024-03-01 10:30:34 INFO::these clinial info will be added: mandard_group, response_degree, treatment_group, patient_gp, patient_gp_v2, response[0m[22m[23m[24m[27m[28m[29m[39m[49m[0m[0m[22m[23m[24m[27m[28m[29m[39m[49m
[0m2024-03-01 10:30:34 INFO::total valid samples: 179[0m[22m[23m[24m[27m[28m[29m[39m[49m[0m[0m[22m[23m[24m[27m[28m[29m[39m[49m


       
        single-cell WES
  Blood           0  45
  Tumor          81  53

In [8]:
# WES & scRNA tumor samples overlap: one Venn diagram per time point, drawn
# side by side into a single PDF.
pdf(str_glue('{outdir}/fig1b-venn_clin-wes_vs_sc.pdf'), width = 5, height = 3)
# tryCatch(finally = ) guarantees the PDF device is closed even when plotting
# fails; otherwise later plots would silently go to the orphaned device.
invisible(tryCatch({
    par(mfrow = c(1, 2), mar = c(0, 0, 0, 0))
    for (stype in c('Baseline', 'Treat')) {
        venn::venn(
            x = list(WES = filter(df_samp, sample_type == stype, data_cat == 'WES', tissue_cat == 'Tumor') %>% pull(sample_id),
                     scRNA = filter(df_samp, sample_type == stype, data_cat == 'single-cell', tissue_cat == 'Tumor') %>% pull(sample_id)),
            zcolor = 'style', ilcs = 1, sncs = 1, main = stype, line = -2, box = FALSE, par = FALSE
        )
    }
}, finally = dev.off()))

In [11]:
# scRNA clinical info barplot: number of patients per response group, split by
# which single-cell time points (Baseline / Treat) each patient has.
dp <- df_samp %>% 
    filter(data_cat == 'single-cell') %>%
    # `gp` holds a column name; all_of() makes tidyselect treat it as an
    # external character vector instead of an ambiguous bare symbol
    pivot_wider(id_cols = all_of(c('patient_id', gp)),
                names_from = 'sample_type',
                values_from = 'sample_id') %>% 
    mutate(sample_cat = case_when(
        !is.na(Baseline) & !is.na(Treat) ~ 'Paired',
        is.na(Baseline) ~ 'Post-only',
        is.na(Treat) ~ 'Pre-only'
    )) %>% 
    count(sample_cat, .data[[gp]], name = 'n_patient') %>% 
    # patients without a group label are shown as 'unknown'
    mutate(pat_gp = if_else(is.na(.data[[gp]]), 'unknown', .data[[gp]]),
           sample_cat = factor(sample_cat, levels = c('Paired', 'Pre-only', 'Post-only')))

p <- ggbarplot(data = dp, x = 'pat_gp', y = 'n_patient', fill = 'sample_cat', 
               palette = 'nejm', order = c('R', 'PR', 'unknown'),
               position = position_dodge(0.7), label = TRUE, lab.vjust = -0.1) +
    labs(x = '', y = 'Number of patients', fill = '')
# NOTE(review): the file name says 'mandard_group' while gp is 'response' -- confirm.
ggsave(filename = str_glue('{outdir}/fig1b-bar_clin-mandard_group.pdf'), plot = p, width = 3, height = 4)

## 1c: umap of whole data

In [None]:
# Panel 1c setup: plotting theme, output folder, and the object that is
# downsampled and plotted with DimPlot() in the next cell.
theme_set(theme_pubr())
# No trailing slash: paths are built as '{outdir}/<file>', so a trailing slash
# would produce '../figures/fig1//<file>'.
outdir <- '../figures/fig1'

scrna <- readRDS('../../stage4/a01_data/seuobj/whole.rds')

In [None]:
# UMAP of the whole dataset, coloured by 'cellgp', drawn on a downsampled
# object (downsample = 10000 is passed to Seurat's subset()).
subdata <- subset(x = scrna, downsample = 10000)
p <- DimPlot(subdata, group.by = 'cellgp', pt.size = 0.8, label = TRUE) +
    NoLegend() + 
    scale_color_jama() +
    theme(plot.title = element_blank())
# Save alongside the other figure-1 panels. The previous hard-coded '../fig1/'
# folder is not the directory this notebook creates ('../figures/fig1').
ggsave(str_glue('{outdir}/fig1-umap_celltype-whole-v3.pdf'), plot = p, height = 4, width = 5)

## 1d: major cell type cell composition

In [51]:
# Panel 1d setup.
theme_set(theme_pubr())
# No trailing slash: paths are built as '{outdir}/<file>'.
outdir <- '../figures/fig1'

# Per-cell metadata table.
f_obs <- '../../stage4/a01_data/h5ad/whole_obs.csv'

# Number of cells per sample.
# NOTE(review): this reuses the name `df_samp`, replacing the panel-1b sample
# table; re-running the 1b cells after this one needs df_samp rebuilt first.
df_samp <- read_csv(file = f_obs, show_col_types = FALSE) %>% 
    count(sample, name = 'n_cell_samp')

[1m[22mNew names:
[36m•[39m `` -> `...1`


In [52]:
# Major cell-type composition table: per-cell metadata -> clinical group added
# -> cal_cell_comp() -> relabelled cell types and time points.
comp <- f_obs %>%
    read_csv(show_col_types = FALSE) %>%
    select(sample, patient, sample_type, cell_type = cell_type_correct) %>%
    add_clin_info(ftsv = f_pat_gp, columns = gp, merge_by = 'patient') %>%
    cal_cell_comp(samp_cnt = df_samp, clinical_cols = c('patient', gp)) %>%
    mutate(
        # spell out the one-letter lineage labels; leave the rest untouched
        cell_type = case_when(
            cell_type == 'T' ~ 'T cells',
            cell_type == 'B' ~ 'B cells',
            TRUE ~ cell_type
        ),
        sample_type = case_match(sample_type, 'Baseline' ~ 'pre', 'Treat' ~ 'post')
    )

write_tsv(comp, str_glue('{outdir}/fig1d-cc-major.tsv'))

[1m[22mNew names:
[36m•[39m `` -> `...1`


[0m2024-02-04 23:30:54 INFO::these clinial info will be added: response[0m[22m[23m[24m[27m[28m[29m[39m[49m[0m[0m[22m[23m[24m[27m[28m[29m[39m[49m


In [57]:
# Boxplots of major cell-type percentages per response x time-point group,
# read back from the table written by the previous cell.
ctype_order <- c('Endothelial', 'Fibroblast', 'Myeloid', 'Plasma', 'T cells')
p <- str_glue('{outdir}/fig1d-cc-major.tsv') %>%
    read_tsv(show_col_types = FALSE) %>%
    # keep known response groups and the cell types listed above
    filter(.data[[gp]] %in% gp_lvls[[gp]], cell_type %in% ctype_order) %>%
    mutate(cell_type = factor(cell_type, levels = ctype_order)) %>%
    cell_comp_boxplot(
        x = c(gp, 'sample_type'), y = 'pct', pt_fill = gp,
        xorder = comb_order, fill_order = gp_lvls[[gp]], xangle = 60, ncol = 7
    ) +
    # test pre vs post within each response group
    stat_compare_means(comparisons = list(c('R-pre', 'R-post'), c('PR-pre', 'PR-post'))) +
    labs(fill = 'Response', y = 'Cell percent in sample') +
    theme(legend.position = 'right')
ggsave(
    filename = str_glue('{outdir}/fig1d-box_cc-major.pdf'),
    plot = p, width = 14, height = 4
)

## 1e: overview of all subtypes

In [12]:
# Panel 1e setup: per-cell metadata path and an empty (void) default theme.
f_cell_info <- file.path('../../stage4/a01_data/h5ad', 'whole_obs.csv')
ggplot2::theme_set(ggplot2::theme_void())

In [13]:
# Per-cell metadata with a three-level hierarchy for the cell-type tree:
# celltype1 (root) > celltype2 (major type) > celltype3 (subtype).
# Reads f_cell_info (defined in the previous cell) instead of repeating the path.
cell_info <- read_csv(f_cell_info, show_col_types = FALSE) %>% 
    mutate(
        celltype1 = 'All cells', 
        celltype2 = case_when(
            cell_type_correct %in% c('B', 'Plasma') ~ 'B & Plasma cells',
            cell_type_correct == 'T' ~ 'T cells',
            TRUE ~ cell_type_correct
        ),
        celltype3 = case_when(
            subtype == 'T_Prolif' ~ 'T_MKI67',
            TRUE ~ subtype
        )
    )
# color tree edges & cell count/fraction barplots by major cell type
uniq_major_ctype <- unique(cell_info[['celltype2']])
major_color_map <- setNames(pal_nejm()(length(uniq_major_ctype)), uniq_major_ctype)
# color tree leaves by detailed cell type
# NOTE(review): assumes scanpy_pal$godsnot_102 has at least as many colours as
# there are leaves; extra leaves would get NA -- confirm.
uniq_leaf <- unique(cell_info[['celltype3']])
leaf_color_map <- setNames(scanpy_pal$godsnot_102[seq_along(uniq_leaf)], uniq_leaf)

[1m[22mNew names:
[36m•[39m `` -> `...1`


In [14]:
# Cell-type tree: edge and node tables for igraph.
## Edges: subtype-level edges (major type -> subtype) stacked on top of the
## root-level edges (root -> major type); both carry the major type as colour key.
tree_edge <- rbind.data.frame(
    cell_info %>%
        transmute(from = celltype2, to = celltype3, edge_color = celltype2) %>%
        distinct(),
    cell_info %>%
        transmute(from = celltype1, to = celltype2, edge_color = celltype2) %>%
        distinct()
) %>%
    arrange(from, to)
loginfo('%g edges', nrow(tree_edge))
## Nodes: every name appearing in an edge; only leaves get a colour key.
tree_node <- data.frame(name = unique(c(tree_edge[['from']], tree_edge[['to']])))
tree_node$node_color <- if_else(
    tree_node$name %in% names(leaf_color_map), tree_node$name, NA_character_
)
loginfo('%g nodes', nrow(tree_node))

[0m2024-03-01 10:33:12 INFO::96 edges[0m[22m[23m[24m[27m[28m[29m[39m[49m[0m[0m[22m[23m[24m[27m[28m[29m[39m[49m
[0m2024-03-01 10:33:12 INFO::97 nodes[0m[22m[23m[24m[27m[28m[29m[39m[49m[0m[0m[22m[23m[24m[27m[28m[29m[39m[49m


In [15]:
# Dendrogram of the cell-type hierarchy, flipped so the tree runs horizontally.
p_tree <- tree_edge %>%
    graph_from_data_frame(vertices = tree_node) %>%
    ggraph(layout = 'dendrogram') +
    geom_edge_link(aes(edge_color = edge_color), edge_width = 1) +
    geom_node_point(aes(color = node_color), size = 5, alpha = 0.7) +
    # Leaf labels are nudged by 0.5, inner nodes by 0.
    # NOTE(review): relies on layout rows following tree_node order -- confirm.
    geom_node_text(
        aes(label = name), repel = TRUE, vjust = 0.5, hjust = 0,
        nudge_y = 0.5 * (tree_node$name %in% names(leaf_color_map))
    ) +
    scale_color_manual(values = leaf_color_map, na.value = 'black') +
    scale_edge_color_manual(values = major_color_map, na.value = 'black') +
    scale_y_reverse() +
    coord_flip() +
    theme(legend.position = 'none')

In [16]:
# Position of every leaf in the tree layout. The plot uses coord_flip(), so the
# layout's `x` is what the bar panels use as their y value.
leaf_coord <- p_tree[['data']] %>%
    filter(name %in% names(leaf_color_map)) %>%
    transmute(leaf = name, yval = x)

In [17]:
# Cell count per subtype, placed at the subtype's leaf position.
# color_by should be the macro class (or the same class) of count_by.
cell_cnt <- cell_info %>%
    count(color_by = celltype2, count_by = celltype3, name = 'cell_num') %>%
    merge(leaf_coord, by.x = 'count_by', by.y = 'leaf')
# Horizontal bars, filled by major cell type.
p_cell_cnt <- ggplot(cell_cnt, aes(x = cell_num, y = yval, fill = color_by)) +
    geom_col(orientation = 'y', show.legend = FALSE) +
    scale_fill_manual(values = major_color_map) +
    # scale_x_continuous(trans = 'log10') +
    ggtitle('cell count')

In [18]:
# Share of each subtype within its major cell type (percent), placed at the
# subtype's leaf position.
# color_by should be the macro class (or the same class) of subtype.
cell_frac <- cell_info %>%
    select(macrotype = celltype2, subtype = celltype3, color_by = celltype2) %>%
    # total cells of the major type, carried along as a grouping column
    add_count(macrotype, name = 'n_cell_macro') %>%
    count(macrotype, subtype, color_by, n_cell_macro, name = 'n_cell_subtype') %>%
    mutate(fraction = 100 * n_cell_subtype / n_cell_macro) %>%
    merge(leaf_coord, by.x = 'subtype', by.y = 'leaf')
# Horizontal bars, filled by major cell type.
p_cell_frac <- ggplot(cell_frac, aes(x = fraction, y = yval, fill = color_by)) +
    geom_col(orientation = 'y', show.legend = FALSE) +
    scale_fill_manual(values = major_color_map) +
    # scale_x_continuous(trans = 'log10') +
    ggtitle('cell fraction')

In [19]:
# Clinical (response) group composition of every subtype.
pat_color_map <- c('R' = '#0073C2FF', 'PR' = '#EFC000FF', 'unknown' = 'gray50')
clin_gp_composition <- cell_info %>% 
    add_clin_info(ftsv = f_pat_gp, columns = gp, merge_by = 'patient') %>%
    # all_of(gp): `.data[[gp]]` inside select() is deprecated tidyselect usage
    select(celltype = celltype3, color_by = all_of(gp)) %>% 
    # missing or out-of-scope labels are pooled as 'unknown'
    mutate(color_by = case_when(
        is.na(color_by) ~ 'unknown',
        color_by %in% c('Out', 'NA') ~ 'unknown',
        TRUE ~ color_by
    )) %>% 
    count(celltype, color_by, name = 'n_cell') %>%
    merge(leaf_coord, by.x = 'celltype', by.y = 'leaf') %>%
    mutate(color_by = factor(color_by, names(pat_color_map)))
# Stacked bars normalised to 1 (position = 'fill'), one per subtype.
p_pat_cc <- ggplot(data = clin_gp_composition, aes(x = n_cell, y = yval, fill = color_by)) +
    geom_bar(stat = 'identity', orientation = 'y', position = 'fill') +
    scale_fill_manual(values = pat_color_map) +
    labs(title = 'response group', fill = '') +
    guides(fill = guide_legend(ncol = 1)) +
    theme(legend.position = 'top', legend.justification = 'left')

[0m2024-03-01 10:33:13 INFO::these clinial info will be added: response[0m[22m[23m[24m[27m[28m[29m[39m[49m[0m[0m[22m[23m[24m[27m[28m[29m[39m[49m


In [20]:
# Sample-type (pre / post) composition of every subtype.
samp_gp_composition <- cell_info %>%
    transmute(
        celltype = celltype3,
        color_by = factor(
            case_match(sample_type, 'Baseline' ~ 'pre', 'Treat' ~ 'post'),
            levels = c('pre', 'post')
        )
    ) %>%
    count(celltype, color_by, name = 'n_cell') %>%
    merge(leaf_coord, by.x = 'celltype', by.y = 'leaf')
# Stacked bars normalised to 1 (position = 'fill'), one per subtype.
p_samp_cc <- ggplot(samp_gp_composition, aes(x = n_cell, y = yval, fill = color_by)) +
    geom_col(orientation = 'y', position = 'fill') +
    scale_fill_nejm() +
    guides(fill = guide_legend(ncol = 1)) +
    labs(title = 'sample type', fill = '') +
    theme(legend.justification = 'left', legend.position = 'top')

In [25]:
# Assemble the tree and the four bar panels in a single row and save.
p <- wrap_plots(
    p_tree, p_cell_cnt, p_cell_frac, p_pat_cc, p_samp_cc,
    nrow = 1, widths = c(3.5, 1, 1, 1.5, 1.5)
)
ggsave(filename = str_glue('{outdir}/fig1e-subtype_overview.pdf'), plot = p, width = 12, height = 25)