In [1]:
# Silence all warnings for the rest of the session (a negative `warn` value
# makes R ignore warnings entirely).
options(warn = -1)
# Attach the tidyverse packages without printing their startup banner.
suppressPackageStartupMessages(library(tidyverse))

In [7]:
# Positional command-line arguments, in order:
#   1. CSV of per-barcode counts (read below into `raw_counts`)
#   2. CSV of insert-linked barcode counts (read below into `linked_counts`)
#   3. minimum `umi_count` a barcode needs to be kept (converted to numeric)
#   4. path the combined statistics CSV is written to
args <- commandArgs(trailingOnly = TRUE)

raw_count_file <- args[1]
linked_count_file <- args[2]
output_file <- args[4]
umi_cutoff <- as.numeric(args[3])

# Example values; uncomment to override the arguments when running by hand.
# raw_count_file <- "../data/barcode_umi_and_read_counts/pn_219p19_219p27.csv"
# linked_count_file <- "../data/subpool_barcode_counts/219p27.csv"
# umi_cutoff <- 20
# output_file <- "../data/library_statistics/219p27.csv"

### Load sample annotations

In [3]:
# Read the sample annotation table. `FALSE` is spelled out because the `F`
# shorthand is an ordinary variable that can be reassigned.
sample_annotations <- read_csv(
  "../annotations/sample_annotations.csv",
  show_col_types = FALSE
)

# Display the table for inspection.
sample_annotations

sample_number,owner,sample_name,library_type,plasmid_snapgene_map,amplicon_snapgene_map,illumina_sample_id,sample_id,barcode1_read,barcode1_start,⋯,umi_read,umi_start,umi_length,linkage_ref,linkage_ref_barcode_number,run_id,read1_length,read2_length,read3_length,subpool_barcode
<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<chr>
1,pn,ptc_bc1_total,mrna,pHPHS949,lPNHS23,pn_179p1_179p2_179p3_179p4,179p1,1,1,⋯,2,1,7,182p1,1,230104_VH01189_46_AACH77JM5,60,7,60,CGTGAT
2,pn,ntc_bc1_total,mrna,pHPHS950,lPNHS23,pn_179p1_179p2_179p3_179p4,179p2,1,1,⋯,2,1,7,182p1,1,230104_VH01189_46_AACH77JM5,60,7,60,CACTGT
3,pn,eyfpwt_bc1_total,mrna,pHPHS951,lPNHS23,pn_179p1_179p2_179p3_179p4,179p3,1,1,⋯,2,1,7,182p1,1,230104_VH01189_46_AACH77JM5,60,7,60,ATTGGC
4,pn,eyfpdeopt_bc1_total,mrna,pHPHS952,lPNHS23,pn_179p1_179p2_179p3_179p4,179p4,1,1,⋯,2,1,7,182p1,1,230104_VH01189_46_AACH77JM5,60,7,60,GATCTC
13,pn,ptc_bc2_total,mrna,pHPHS949,lPNHS24,pn_179p7_179p8,179p7,1,1,⋯,2,1,7,182p1,2,230202_VH01189_62_AACJKHVM5,60,7,60,CGTGAT
14,pn,eyfpdeopt_bc2_total,mrna,pHPHS952,lPNHS24,pn_179p7_179p8,179p8,1,1,⋯,2,1,7,182p1,2,230202_VH01189_62_AACJKHVM5,60,7,60,GATCTC
15,pn,ntc_bc1_i2_retained,mrna,pHPHS950,lPNHS26,pn_188p1,188p1,1,1,⋯,2,1,7,182p1,1,230202_VH01189_62_AACJKHVM5,60,7,60,CACTGT
16,pn,ptc_bc1_i2_retained,mrna,pHPHS949,lPNHS26,pn_188p2,188p2,1,1,⋯,2,1,7,182p1,1,230302_VH00319_324_AACKHCLM5,30,7,80,ATCACG
17,pn,ntc_bc1_cpa,mrna,pHPHS950,lPNHS27,pn_191p23,191p2,1,1,⋯,2,1,7,182p1,1,230302_VH00319_324_AACKHCLM5,30,7,80,ACAGTG
18,pn,eyfpwt_bc1_cpa,mrna,pHPHS951,lPNHS27,pn_191p23,191p3,1,1,⋯,2,1,7,182p1,1,230302_VH00319_324_AACKHCLM5,30,7,80,GCCAAT


In [8]:
# Load both count tables named on the command line, suppressing readr's
# column-type message.
raw_counts <- raw_count_file %>%
  read_csv(show_col_types = FALSE)
linked_counts <- linked_count_file %>%
  read_csv(show_col_types = FALSE)

In [9]:
# Totals over every barcode whose UMI count meets the cutoff: number of
# barcodes, summed UMI counts and summed read counts. The one-row summary is
# reshaped to long form with one (name, value) row per statistic.
raw_stats <- raw_counts %>%
  filter(umi_count >= umi_cutoff) %>%
  summarise(
    total_barcodes = dplyr::n(),
    total_umi_count = sum(umi_count),
    total_read_count = sum(read_count)
  ) %>%
  pivot_longer(cols = everything(), names_to = "name", values_to = "value")

### Load insert annotations

In [10]:
# Read the insert annotations from the linkage experiment and keep one row per
# insert with its gene and a combined sgRNA label. `FALSE` is spelled out
# because the `F` shorthand is an ordinary variable that can be reassigned.
insert_annotations <- read_csv(
  "../../20221229_exp182_rbp_dual_sgrna_linkage/annotations/insert_annotations/rbp_dual_sgrna_linkage.csv",
  show_col_types = FALSE
) %>%
  # Label = sgrna1_name followed by the last two characters of sgrna2_name.
  mutate(sgrna = str_c(sgrna1_name, str_extract(sgrna2_name, "..$"))) %>%
  select(insert_num, gene_name, sgrna)

# Display the table for inspection.
insert_annotations

insert_num,gene_name,sgrna
<dbl>,<chr>,<chr>
0,A1CF,A1CF_1_2
1,A1CF,A1CF_2_3
2,A1CF,A1CF_3_4
3,A1CF,A1CF_4_1
4,AAR2,AAR2_1_2
5,AAR2,AAR2_2_3
6,AAR2,AAR2_3_4
7,AAR2,AAR2_4_1
8,AARS1,AARS1_1_2
9,AARS1,AARS1_2_3


In [18]:
# Statistics for barcodes linked to an insert. Barcodes passing the UMI cutoff
# are annotated with their gene, collapsed to one row per insert, and then
# summarised across inserts. The one-row summary is reshaped to long form with
# one (name, value) row per statistic.
# NOTE(review): an insert_num missing from insert_annotations gets an NA gene
# after the left join, and n_distinct() counts NA as a distinct value by
# default -- confirm that is intended for total_genes.
linked_stats <- linked_counts %>%
  filter(umi_count >= umi_cutoff) %>%
  left_join(insert_annotations, by = "insert_num") %>%
  group_by(insert_num) %>%
  # Per-insert tallies.
  summarise(
    gene = first(gene_name),
    barcodes = dplyr::n(),
    umis = sum(umi_count),
    reads = sum(read_count),
    .groups = "drop"
  ) %>%
  # Library-wide totals and per-insert medians.
  summarise(
    total_genes = dplyr::n_distinct(gene),
    total_inserts = dplyr::n(),
    total_linked_read_count = sum(reads),
    total_linked_umi_count = sum(umis),
    total_linked_barcodes = sum(barcodes),
    median_barcodes_per_insert = median(barcodes),
    median_reads_per_insert = median(reads),
    median_umis_per_insert = median(umis)
  ) %>%
  pivot_longer(cols = everything())

In [22]:
# Annotated inserts that have no barcode passing the UMI cutoff, counted per
# gene and listed with the most-affected genes first. The join key is given
# explicitly so the result cannot change if the two tables ever share another
# column name, and so dplyr does not print a "Joining, by = ..." message.
insert_annotations %>%
  anti_join(
    filter(linked_counts, umi_count >= umi_cutoff),
    by = "insert_num"
  ) %>%
  count(gene_name, sort = TRUE)

[1m[22mJoining, by = "insert_num"


gene_name,n
<chr>,<int>
RPL14,3
DDX28,2
RPS5,2
ZCCHC4,2
ACAA2,1
ACO2,1
ALKBH8,1
APOBEC3A,1
ARF1,1
ARHGEF1,1


In [29]:
# Ad hoc percentage check. 8676 matches the `total_inserts` value in the
# statistics output shown in this notebook.
# NOTE(review): 8760 is presumably the number of rows in insert_annotations --
# confirm, and consider computing both numbers from the data instead.
8676/8760*100

In [17]:
# Stack the raw and linked statistics (raw rows first) and write them to the
# output path as a CSV with one statistic per row.
write_csv(bind_rows(raw_stats, linked_stats), output_file)

name,value
<chr>,<dbl>
total_barcodes,219606.0
total_umi_count,51620163.0
total_read_count,68392283.0
total_genes,2186.0
total_inserts,8676.0
total_linked_read_count,26795393.0
total_linked_umi_count,20543294.0
total_linked_barcodes,76004.0
median_barcodes_per_insert,8.0
median_reads_per_insert,2533.0
