# Search for some barcodes in the PacBio data

Import Python modules:

In [1]:
import io
import re

import Bio.Seq
import Bio.SeqIO

import pandas as pd

import pysam

Define the barcodes of interest.
These are barcodes with high invalid counts:

In [2]:
# Load the invalid-barcode counts for one sample, keep the first five rows
# (the default for `head()`), and add a column holding each barcode's
# reverse complement.
barcodes_of_interest = (
    pd.read_csv("../results/barcode_runs/counts_invalid_by_sample/libA_220121_1_no-antibody_control_100x_1.csv")
    .head()
    .assign(
        reverse_complement=lambda df: [
            str(Bio.Seq.Seq(barcode).reverse_complement())
            for barcode in df["barcode"]
        ]
    )
)

barcodes_of_interest

Unnamed: 0,barcode,count,library,sample,reverse_complement
0,TGCTCTTTGAAAATGC,2081446,libA,220121_1_no-antibody_control_100x_1,GCATTTTCAAAGAGCA
1,ATGCGGGAACGAAACG,1618441,libA,220121_1_no-antibody_control_100x_1,CGTTTCGTTCCCGCAT
2,ATTTGTGCTTACCTTT,1532973,libA,220121_1_no-antibody_control_100x_1,AAAGGTAAGCACAAAT
3,CCGAACCGGCATGTAT,1380771,libA,220121_1_no-antibody_control_100x_1,ATACATGCCGGTTCGG
4,AGGCACTTGAACCTAT,1211934,libA,220121_1_no-antibody_control_100x_1,ATAGGTTCAAGTGCCT


Build a regex for all the barcodes of interest and their reverse complements:

In [3]:
# One alternation pattern listing, row by row, each barcode followed by its
# reverse complement, so a sequence containing either orientation matches.
barcodes_of_interest_regex = re.compile(
    "|".join(
        seq
        for orientations in zip(
            barcodes_of_interest["barcode"],
            barcodes_of_interest["reverse_complement"],
        )
        for seq in orientations
    )
)

Get the PacBio runs in which to search for these barcodes:

In [4]:
# Table of PacBio runs: library, run, path to the CCS FASTQ, and amplicon.
pacbio_runs = pd.read_csv(
    "../data/pacbio_runs.csv",
)
pacbio_runs

Unnamed: 0,library,run,fastq,amplicon
0,libA,210730,/fh/fast/bloom_j/computational_notebooks/fwels...,plasmid
1,libB,210730,/fh/fast/bloom_j/computational_notebooks/fwels...,plasmid
2,libA,211026,/fh/fast/bloom_j/SR/ngs/pacbio/211026_SequelII...,plasmid
3,libB,211026,/fh/fast/bloom_j/SR/ngs/pacbio/211026_SequelII...,plasmid
4,libA,220218,/fh/fast/bloom_j/SR/ngs/pacbio/220218_WelshLoe...,plasmid
5,libB,220218,/fh/fast/bloom_j/SR/ngs/pacbio/220218_WelshLoe...,plasmid


Search these files for matches of the barcodes of interest:

In [5]:
# For every PacBio FASTQ, count the CCSs and how many of them contain any
# barcode of interest (in either orientation), then report the tally.
for fastq in pacbio_runs["fastq"]:
    n = n_matched = 0  # stay at zero if the file has no reads
    with pysam.FastxFile(fastq) as ccs_file:
        # `enumerate(..., start=1)` leaves `n` equal to the number of reads seen
        for n, ccs in enumerate(ccs_file, start=1):
            if barcodes_of_interest_regex.search(ccs.sequence) is not None:
                n_matched += 1
    print(f"{n_matched} / {n} CCSs matched a barcode of interest for {fastq}")

0 / 168497 CCSs matched a barcode of interest for /fh/fast/bloom_j/computational_notebooks/fwelsh/2021/210730_ccs/lib3_210730_ccs.fastq.gz
0 / 151179 CCSs matched a barcode of interest for /fh/fast/bloom_j/computational_notebooks/fwelsh/2021/210730_ccs/lib4_210730_ccs.fastq.gz
0 / 44018 CCSs matched a barcode of interest for /fh/fast/bloom_j/SR/ngs/pacbio/211026_SequelIIe/Frances_Pool-Cell1/fastx_files/demultiplex.bc1003_BAK8A_OA--bc1003_BAK8A_OA.hifi_reads.fastq.gz
0 / 54799 CCSs matched a barcode of interest for /fh/fast/bloom_j/SR/ngs/pacbio/211026_SequelIIe/Frances_Pool-Cell1/fastx_files/demultiplex.bc1008_BAK8A_OA--bc1008_BAK8A_OA.hifi_reads.fastq.gz
0 / 476728 CCSs matched a barcode of interest for /fh/fast/bloom_j/SR/ngs/pacbio/220218_WelshLoes/Welsh_Pool-Cell1/fastx_files/demultiplex.bc1003_BAK8A_OA--bc1003_BAK8A_OA.hifi_reads.fastq.gz
0 / 560194 CCSs matched a barcode of interest for /fh/fast/bloom_j/SR/ngs/pacbio/220218_WelshLoes/Welsh_Pool-Cell1/fastx_files/demultiplex.bc100