# Filter out barcodes that align to multiple inserts or to a second barcode

Arvind Rasi Subramaniam

27 Oct 2021

**Edit this Rscript only in the accompanying .ipynb file. The `snakemake` workflow will automatically export it as a .R script.**

## Load libraries

In [66]:
library(Biostrings)
library(GenomicAlignments)
library(plyranges)
library(tidyverse)

## Define analysis-specific variables

In [67]:
# Positional command-line arguments (everything after the script name):
#   1. BAM file of barcode-vs-barcode alignments for barcode 1
#   2. CSV file of insert-barcode pair counts
#   3. minimum read count required to keep an insert-barcode pair
#   4. path of the CSV file written at the end of this script
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
args <- commandArgs(trailingOnly = TRUE)
barcode1_alignment_file <- args[1]
barcode_insert_file <- args[2]
# kept as a character string here; converted with as.integer() where it is used
read_count_cutoff <- args[3]
output_file <- args[4]

## Read insert-barcode pair counts 

In [68]:
# Load the insert-barcode pair counts, keep pairs whose count reaches the
# read-count cutoff, and suffix each barcode number with its sample name.
# NOTE(review): barcode_num is parsed as a double; confirm that large round
# values (e.g. 100000) are not rendered in scientific notation when they are
# pasted into the name, since these names are later matched against BAM names.
insert_barcodes <- barcode_insert_file %>%
  read_csv() %>%
  filter(count >= as.integer(read_count_cutoff)) %>%
  rename(read_count = count) %>%
  mutate(barcode_num = str_c(barcode_num, sample, sep = "_")) %>%
  arrange(barcode_num) %>%
  print()

Rows: 75741 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): barcode, sample
dbl (3): insert_num, barcode_num, count

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


[90m# A tibble: 75,741 × 5[39m
   insert_num barcode_num barcode                  sample read_count
        [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m       [3m[90m<chr>[39m[23m                    [3m[90m<chr>[39m[23m       [3m[90m<dbl>[39m[23m
[90m 1[39m         77 10_89lib1   TGGACTGCCCACCCCGGTGGCTCT 89lib1        741
[90m 2[39m        174 10_89lib2   AGTTATGATATGCGCTGGTATTGT 89lib2        757
[90m 3[39m         58 10_89lib4   GCCGGTCCGTCTAGTCATGTGTGG 89lib4        545
[90m 4[39m        222 10_89lib5   GTGGAGTCTAGCTTTGAGTGTAGG 89lib5        257
[90m 5[39m        109 10_89lib6   AGTTTTACTTAGGCCGCGACGGTC 89lib6        587
[90m 6[39m         99 100_89lib1  GATGGTTCGTGTCAGGCCTTCATC 89lib1        546
[90m 7[39m        180 100_89lib2  TGGGTGATTAGGCAGGGTGGTATC 89lib2        552
[90m 8[39m        234 100_89lib3  TATTGGTTGCACTGGCTGGGGATT 89lib3        440
[90m 9[39m        195 100_89lib4  CTTACGTGTTGCAATTTGGAGGGC 89lib4        397
[90m10[39m        163

## How many barcode_1 have multiple inserts?

In [69]:
# Collect every row whose barcode sequence occurs more than once in the
# insert-barcode table; `n1` records how many rows share that barcode.
many_to_one_barcode_combinations <- insert_barcodes %>%
  add_count(barcode, name = "n1") %>%
  filter(n1 > 1) %>%
  print()

# A tibble: 0 × 6
# … with 6 variables: insert_num <dbl>, barcode_num <chr>, barcode <chr>,
#   sample <chr>, read_count <dbl>, n1 <int>


## Fields to read from BAM file

In [70]:
# Fields and filters used when reading the barcode-vs-barcode BAM file.
# Only the mismatch count (XM tag) is requested; no total-edit tag is read.
param <- ScanBamParam(
  # read name and SAM flag of each alignment
  # (what = scanBamWhat() would request every available field instead)
  what = c("qname", "flag"),
  # extract number of mismatches
  tag = c("XM"),
  # keep only alignments whose CIGAR consists solely of matches/mismatches (M),
  # i.e. include SNPs but exclude indels
  simpleCigar = TRUE
)

## Read barcode-vs-barcode alignments for barcode 1

In [71]:
# Load the barcode 1 alignments and reduce them to a plain tibble holding the
# reference name, query name, SAM flag, and XM tag of each alignment.
bamfile1 <- BamFile(barcode1_alignment_file)
alns1 <- bamfile1 %>%
  readGAlignments(param = param) %>%
  as_tibble() %>%
  # seqnames holds the reference each query aligned to; expose it as rname
  select(rname = seqnames, qname, flag, XM) %>%
  mutate(rname = as.character(rname)) %>%
  type_convert() %>%
  print()


── Column specification ────────────────────────────────────────────────────────
cols(
  rname = col_character(),
  qname = col_character()
)



[90m# A tibble: 154,225 × 4[39m
   rname    qname         flag    XM
   [3m[90m<chr>[39m[23m    [3m[90m<chr>[39m[23m        [3m[90m<int>[39m[23m [3m[90m<int>[39m[23m
[90m 1[39m 2_89lib1 2_89lib1         0     0
[90m 2[39m 2_89lib1 6691_89lib1    256     1
[90m 3[39m 2_89lib1 12020_89lib1   256     1
[90m 4[39m 2_89lib1 12022_89lib1   256     1
[90m 5[39m 2_89lib1 12023_89lib1   256     1
[90m 6[39m 3_89lib1 3_89lib1         0     0
[90m 7[39m 3_89lib1 5921_89lib1    256     1
[90m 8[39m 3_89lib1 5923_89lib1    256     1
[90m 9[39m 3_89lib1 7594_89lib1    256     1
[90m10[39m 3_89lib1 7596_89lib1    256     1
[90m# … with 154,215 more rows[39m


## Find barcode_1 sequences that are linked to distinct inserts or might be sequencing errors

In [72]:
# Build the list of query barcodes to discard, based on alignments between
# different barcodes. Each alignment is annotated with the insert and read
# count of its reference barcode (r*) and of its query barcode (q*).
exclude1 <- alns1 %>%
  # ignore each barcode's alignment to itself
  filter(rname != qname) %>%
  left_join(
    select(insert_barcodes, rname = barcode_num, rinsert = insert_num, rcount = read_count),
    by = "rname"
  ) %>%
  right_join(
    select(insert_barcodes, qname = barcode_num, qinsert = insert_num, qcount = read_count),
    by = "qname"
  ) %>%
  # this excludes:
  # 1. barcodes that map to two distinct inserts
  # 2. barcodes that got lower count than another homologous barcode with same insert
  # (a query tied with its reference on count is also listed; rows with a
  # missing reference or query annotation evaluate to NA and are dropped here)
  filter(qinsert != rinsert | qcount <= rcount) %>%
  distinct(qname) %>%
  arrange(qname) %>%
  print()

# A tibble: 32,563 × 1
   qname       
   <chr>       
 1 10000_89lib1
 2 10000_89lib2
 3 10000_89lib3
 4 10000_89lib4
 5 10000_89lib5
 6 10000_89lib6
 7 10000_89lib7
 8 10001_89lib1
 9 10001_89lib2
10 10001_89lib3
# … with 32,553 more rows


## Write barcodes that do not clash to output

In [76]:
# Keep the insert-barcode pairs that survive both exclusion lists, renumber
# them by decreasing read count, and write the result to `output_file`.
filtered_barcodes <- insert_barcodes %>%
  # drop barcodes flagged from the barcode-vs-barcode alignments
  anti_join(select(exclude1, qname), by = c("barcode_num" = "qname")) %>%
  # drop barcodes whose sequence occurs in more than one row of the table
  anti_join(select(many_to_one_barcode_combinations, barcode_num), by = "barcode_num") %>%
  select(insert_num, barcode_num, barcode, read_count, sample) %>%
  arrange(desc(read_count)) %>%
  rename(linkage_count = read_count) %>%
  # seq_len() gives an empty vector for a zero-row table, whereas 1:n() would
  # give c(1, 0) and make mutate() fail when no barcode survives filtering
  mutate(barcode_num = seq_len(dplyr::n())) %>%
  write_csv(output_file) %>%
  print()

[90m# A tibble: 43,178 × 5[39m
   sample insert_num barcode_num barcode                  linkage_count
   [3m[90m<chr>[39m[23m       [3m[90m<dbl>[39m[23m       [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m                            [3m[90m<dbl>[39m[23m
[90m 1[39m 89lib2         99           1 GAGGGCTTTGAGGCTGACTGTGGG           843
[90m 2[39m 89lib2        200           2 GGCGAGTACGGGGCCAGTGTTCTC           830
[90m 3[39m 89lib1        188           3 GGTTAGGACCACGGCATGTGCAGT           821
[90m 4[39m 89lib2        238           4 GTGGCCTGTTCCTAGCGCGCCGCT           815
[90m 5[39m 89lib2         92           5 GACAATGAGCGGGACGGGTTTAAT           814
[90m 6[39m 89lib2        201           6 GATGACCTGGTTGCTATGTGCGGT           792
[90m 7[39m 89lib2        222           7 GGGCATTGTGCTAAGTATGGGATG           784
[90m 8[39m 89lib1         43           8 TTGACGCCCTGGTGTTGTCGTGGG           778
[90m 9[39m 89lib1         21           9 GAGTGCGTGATGAGGTGGGGGGGT     