# Filter barcodes to remove ones aligning to multiple inserts or to a second barcode

Arvind Rasi Subramaniam

27 Oct 2021

**Edit this Rscript only in the accompanying .ipynb file. The `snakemake` workflow will automatically export it as a .R script.**

## Load libraries

In [None]:
library(Biostrings)
library(GenomicAlignments)
library(plyranges)
library(tidyverse)

## Define analysis-specific variables

In [None]:
# Positional command-line arguments:
#   1. BAM file of barcode-vs-barcode alignments for barcode 1
#   2. CSV file of insert-barcode pair counts
#   3. minimum read count for an insert-barcode pair to be kept
#   4. path of the CSV file to write
args <- commandArgs(trailingOnly = TRUE)
barcode1_alignment_file <- args[1]
barcode_insert_file <- args[2]
# still a character string at this point; it is converted to an integer
# where the cutoff is applied
read_count_cutoff <- args[3]
output_file <- args[4]

# For interactive use, uncomment and adjust these example values:
# barcode1_alignment_file <- "../data/ref_vs_ref_alignments/70lib5/alignment_barcode1.bam"
# barcode_insert_file <- "../data/annotated_insert_barcode_counts/70lib5.csv"
# read_count_cutoff <- 1

## Read insert-barcode pair counts 

In [None]:
# Parse the read-count cutoff once and fail with a clear message if it is
# missing or not numeric. Without this check the filter below would compare
# against NA and silently drop every row.
min_read_count <- suppressWarnings(as.integer(read_count_cutoff))
if (is.na(min_read_count)) {
  stop(
    "read_count_cutoff must be an integer, got: ", read_count_cutoff,
    call. = FALSE
  )
}

# Read the insert-barcode pair counts, rename `count` to `read_count`, and
# keep only the pairs with at least `min_read_count` reads.
insert_barcodes <- read_csv(barcode_insert_file) %>%
  rename(read_count = count) %>%
  filter(read_count >= min_read_count) %>%
  print()

# show the filtered table as the cell output
insert_barcodes

## How many barcode_1 have multiple inserts?

In [None]:
# Annotate every row with the number of rows that share its barcode (`n1`)
# and keep only the rows whose barcode appears more than once.
many_to_one_barcode_combinations <- insert_barcodes %>%
  add_count(barcode, name = "n1") %>%
  filter(n1 > 1) %>%
  print()

## Fields to read from BAM file

In [None]:
# Fields and filters used when reading the BAM file.
# Only the mismatch-count tag (XM) is requested; no edit-distance tag is read.
param <- ScanBamParam(
  # what = scanBamWhat(),
  # read name and SAM flag of each alignment
  what = c("qname", "flag"),
  # extract number of mismatches
  tag = c("XM"),
  # keep only alignments with a simple CIGAR (matches / mismatches only),
  # i.e. include SNPs but exclude alignments containing indels
  simpleCigar = TRUE
)

## Read barcode vs barcode alignments for barcode 1

In [None]:
# Open the barcode-1 BAM file and load its alignments using the fields
# requested in `param`.
bamfile1 <- BamFile(barcode1_alignment_file)
alns1 <- readGAlignments(bamfile1, param = param)
alns1 <- as_tibble(alns1)
# reference name as plain text, then keep only the four columns of interest
alns1 <- mutate(alns1, rname = as.character(seqnames))
alns1 <- select(alns1, rname, qname, flag, XM)
# let readr re-guess the types of the character columns
alns1 <- type_convert(alns1)
print(alns1)

## Find barcode_1 that are linked to distinct inserts or might be sequencing errors

In [None]:
# Build the list of barcodes (qname) to exclude: barcodes that align to a
# different barcode (rname) and are either linked to a different insert or do
# not have a strictly higher read count than that barcode.
exclude1 <- alns1 %>% 
  # drop alignments of a barcode to itself
  filter(rname != qname) %>%
  # attach the insert and read count of the reference barcode (rname)
  left_join(select(insert_barcodes, insert_num, barcode_num, read_count), by = c("rname" = "barcode_num")) %>%
  rename(rinsert = insert_num, rcount = read_count) %>%
  # attach the insert and read count of the query barcode (qname); the right
  # join also adds barcodes from insert_barcodes that have no remaining
  # alignment, with NA in the reference columns
  right_join(select(insert_barcodes, insert_num, barcode_num, read_count), by = c("qname" = "barcode_num")) %>%
  rename(qinsert = insert_num, qcount = read_count) %>%
  # this excludes:
  # 1. barcodes that map to two distinct inserts
  # 2. barcodes that do not have a higher count than another homologous
  #    barcode with the same insert (on a tie, both barcodes are excluded)
  # When the reference columns are NA the condition evaluates to NA and
  # filter() drops the row, so barcodes without a reference partner in
  # insert_barcodes are not excluded.
  filter(!(qinsert == rinsert & qcount > rcount)) %>%
  # reduce to one row per excluded barcode
  arrange(qname) %>% 
  distinct(qname) %>%
  print()

## Write barcodes that do not clash to output

In [None]:
# Remove the clashing barcodes, order the remaining ones by read count,
# renumber them, and write the table to `output_file`.
filtered_barcodes <- insert_barcodes %>%
  # drop barcodes flagged from the barcode-vs-barcode alignments
  anti_join(select(exclude1, qname), by = c("barcode_num" = "qname")) %>%
  # drop barcodes that occur in more than one row of insert_barcodes
  anti_join(select(many_to_one_barcode_combinations, barcode_num), by = "barcode_num") %>%
  select(insert_num, barcode_num, barcode, read_count) %>%
  arrange(desc(read_count)) %>%
  rename(linkage_count = read_count) %>%
  # renumber the barcodes by rank; seq_len() gives an empty vector when no
  # rows remain, whereas 1:n() would give c(1, 0) and make mutate() fail
  mutate(barcode_num = seq_len(dplyr::n())) %>%
  write_csv(output_file) %>%
  print()