In [1]:
## Notebook env: sequencing_trblsht_py (sequencing_trblsht_py kernel)
## this notebook troubleshoots the low guide count in the cellranger output of the repeated CRISPRa screen

In [2]:
## possibilities to work through:
# contamination of sequences from other guide libraries of Brian
# the library actually transfected is different than what Brian thought and the list of guides he gave me
# I messed up in cell ranger guide library input (not true, Brian confirmed)


### plan is to load in the feature barcode reads, pull out the sequences at the expected barcode location and count them
### next plan is to match library sequences (both the given library and other libraries Brian uses) to feature barcode reads
### in the end need to explain why only ~1.5% of the reads match the input library


In [3]:
import dnaio
import pandas as pd
import os
import multiprocessing as mp
import re
import regex
from collections import Counter
from rapidfuzz import process, fuzz
import numpy as np

In [4]:
%%bash

# Trim the 5' adapter from the R2 reads of both samples (SL424, SL425) with cutadapt.
#   -j20              run on 20 cores
#   -g ^ANGCAG...     adapter anchored (^) at the 5' end of the read
#   --trimmed-only    discard reads in which the adapter was not found
#   -m 61 -M 61       after trimming keep only reads that are exactly 61 nt long
#   -o                trimmed reads are written to low_guides_trblsht/<sample>.trim.fastq.gz
cutadapt -j20 --trimmed-only -m 61 -M 61 -g ^ANGCAGTGGTATCAACGCAGAGTACATGG -o /home/ssobti/projects/heterogeneity_brian/output_data/CRISPRa_repeat_screen/low_guides_trblsht/SL424_S3_L004_R2_001.trim.fastq.gz /home/ssobti/projects/heterogeneity_brian/data/072623_MD231_CRISPRa_repeat_screen/BWHG37/SL424_S3_L004_R2_001.fastq.gz
cutadapt -j20 --trimmed-only -m 61 -M 61 -g ^ANGCAGTGGTATCAACGCAGAGTACATGG -o /home/ssobti/projects/heterogeneity_brian/output_data/CRISPRa_repeat_screen/low_guides_trblsht/SL425_S4_L004_R2_001.trim.fastq.gz /home/ssobti/projects/heterogeneity_brian/data/072623_MD231_CRISPRa_repeat_screen/BWHG37/SL425_S4_L004_R2_001.fastq.gz

This is cutadapt 4.4 with Python 3.10.12
Command line parameters: -j20 --trimmed-only -m 61 -M 61 -g ^ANGCAGTGGTATCAACGCAGAGTACATGG -o /home/ssobti/projects/heterogeneity_brian/output_data/CRISPRa_repeat_screen/low_guides_trblsht/SL424_S3_L004_R2_001.trim.fastq.gz /home/ssobti/projects/heterogeneity_brian/data/072623_MD231_CRISPRa_repeat_screen/BWHG37/SL424_S3_L004_R2_001.fastq.gz
Processing single-end reads on 20 cores ...
Finished in 279.301 s (1.203 µs/read; 49.87 M reads/minute).

=== Summary ===

Total reads processed:             232,134,564
Reads with adapters:               228,579,935 (98.5%)

== Read fate breakdown ==
Reads that were too short:           1,247,588 (0.5%)
Reads that were too long:            6,755,635 (2.9%)
Reads discarded as untrimmed:                0 (0.0%)
Reads written (passing filters):   224,131,341 (96.6%)

Total basepairs processed: 20,892,110,760 bp
Total written (filtered):  13,672,011,801 bp (65.4%)

=== Adapter 1 ===

Sequence: ANGCAGTGGTATCAACGC

In [5]:
## guide sequences of the library that was given to cellranger as the feature reference
guide_barcodes = pd.read_csv(
    '/home/ssobti/projects/heterogeneity_brian/data/072623_MD231_CRISPRa_repeat_screen/cellranger/feature_ref_arg/GEM1_GEM2_CRISPRa_features.csv'
)['sequence'].values

## second library checked as a possible contamination source (header-less csv, sequences in column 1)
guide_barcodes_possible_contamination_lib = pd.read_csv(
    '/home/ssobti/projects/heterogeneity_brian/data/072623_MD231_CRISPRa_repeat_screen/cellranger/feature_ref_arg/OPL37_PS1_PS2_separated.csv',
    header=None,
)[1].values

## trimmed feature barcode fastq files (the -o targets of the cutadapt cell)
path = '/home/ssobti/projects/heterogeneity_brian/output_data/CRISPRa_repeat_screen/low_guides_trblsht/'
sample_names = [
    'SL424_S3_L004_R2_001.trim.fastq.gz',
    'SL425_S4_L004_R2_001.trim.fastq.gz',
]

## folder that receives the csv tables written further down
save_path = '/home/ssobti/projects/heterogeneity_brian/output_data/CRISPRa_repeat_screen/low_guides_trblsht/'

In [6]:
def obtain_reads(path, sample_name, start = None, end = None):
    """Load the sequences of a read file into a list of strings.

    Parameters
    ----------
    path : str
        Directory prefix. It is concatenated directly with ``sample_name``,
        so it must already end with a path separator.
    sample_name : str
        Name of the file that is opened through ``dnaio.open``.
    start, end : int or None
        Optional slice bounds applied to every record before its sequence
        is taken. When both are ``None`` the full sequence is returned.

    Returns
    -------
    list of str
        One entry per record, in file order.
    """
    # the context manager guarantees the file handle is closed, also when
    # iteration fails part-way (the handle was previously left open)
    with dnaio.open(path + sample_name) as reader:
        if start is None and end is None:
            return [str(record.sequence) for record in reader]
        return [str(record[start:end].sequence) for record in reader]

In [7]:
## full-length trimmed reads of each GEM well
SL424_GEM1 = obtain_reads(path, sample_names[0])
SL425_GEM2 = obtain_reads(path, sample_names[1])

## region of interest = first 20 bases of every read
## sliced from the reads already held in memory, so each fastq.gz is decompressed and parsed only once
SL424_ROI_GEM1 = [read[0:20] for read in SL424_GEM1]
SL425_ROI_GEM2 = [read[0:20] for read in SL425_GEM2]

In [8]:
## tally identical 20-mers per GEM well; most_common() without an argument
## returns every (kmer, count) pair ordered from most to least frequent
counts_GEM1_ranked, counts_GEM2_ranked = (
    Counter(roi_reads).most_common()
    for roi_reads in (SL424_ROI_GEM1, SL425_ROI_GEM2)
)

In [9]:
## same ranking, with each count expressed as a percentage of all reads of that GEM well
pct_GEM1_ranked = [
    (kmer, 100*hits/len(SL424_ROI_GEM1))
    for kmer, hits in counts_GEM1_ranked
]
pct_GEM2_ranked = [
    (kmer, 100*hits/len(SL425_ROI_GEM2))
    for kmer, hits in counts_GEM2_ranked
]

## whitelist analysis

In [10]:
def fuzzy_barcode_counts(counted_data, cutoff, barcodes):
    """Sum, for every library barcode, the counts of all observed kmers that fuzzily match it.

    Parameters
    ----------
    counted_data : iterable of (kmer, count)
        Observed kmers with their read counts, e.g. ``Counter.most_common()``.
    cutoff : float
        ``score_cutoff`` handed to ``rapidfuzz.process.extract`` (scorer ``fuzz.WRatio``).
    barcodes : iterable of str
        Library sequences to look for.

    Returns
    -------
    pandas.DataFrame
        One row per barcode, in input order, with columns ``kmer`` (the barcode),
        ``count`` (summed count of matching kmers) and ``pct`` (``count`` as a
        percentage of the summed counts of all kmers). ``pct`` is NaN when
        the total count is zero.

    Notes
    -----
    A kmer that passes the cutoff for several barcodes is counted for each of them.
    """
    # imports stay local, as before, so the function does not rely on notebook globals for them
    from rapidfuzz import process, fuzz
    import numpy as np

    counted_data = list(counted_data)
    if counted_data:
        kmers, counts = zip(*counted_data)
        kmers = np.array(kmers)
        counts = np.array(counts)
    else:
        # zip(*[]) cannot be unpacked into two names; fall back to empty arrays
        kmers = np.array([], dtype=str)
        counts = np.array([], dtype=np.int64)
    total = counts.sum()

    # collect plain records and build the frame once; growing a DataFrame with
    # pd.concat inside the loop is quadratic in the number of barcodes
    records = []
    for barcode in barcodes:
        if len(kmers) > 0:
            hits = process.extract(barcode, kmers, scorer=fuzz.WRatio, score_cutoff=cutoff, limit = None)
            # each hit is (choice, score, index); only the index is needed
            match_idxs = [hit[2] for hit in hits]
        else:
            match_idxs = []
        records.append({'kmer': barcode, 'count': counts[match_idxs].sum()})

    consolidated_kmers = pd.DataFrame(records, columns=['kmer', 'count'])
    if total:
        consolidated_kmers['pct'] = consolidated_kmers['count']*100/total
    else:
        # no reads at all: a percentage is undefined
        consolidated_kmers['pct'] = float('nan')
    return consolidated_kmers

In [11]:
## match the observed 20-mers of both GEM wells against the supplied guide library (score cutoff 95)
fuzzy_bc_counts_GEM1, fuzzy_bc_counts_GEM2 = (
    fuzzy_barcode_counts(ranked_counts, 95, guide_barcodes)
    for ranked_counts in (counts_GEM1_ranked, counts_GEM2_ranked)
)

## same comparison against the library suspected as a contamination source
fuzzy_alt_bc_counts_GEM1, fuzzy_alt_bc_counts_GEM2 = (
    fuzzy_barcode_counts(ranked_counts, 95, guide_barcodes_possible_contamination_lib)
    for ranked_counts in (counts_GEM1_ranked, counts_GEM2_ranked)
)

In [12]:
## write the four whitelist-matching tables into save_path
for _csv_name, _table in (
    ('cadpt_fuzzy_bc_counts_GEM1.csv', fuzzy_bc_counts_GEM1),
    ('cadpt_fuzzy_bc_counts_GEM2.csv', fuzzy_bc_counts_GEM2),
    ('cadpt_fuzzy_alt_bc_counts_GEM1.csv', fuzzy_alt_bc_counts_GEM1),
    ('cadpt_fuzzy_alt_bc_counts_GEM2.csv', fuzzy_alt_bc_counts_GEM2),
):
    _table.to_csv(save_path + _csv_name)

## kmer analysis (consolidate the counted kmers that differ by 1 base)

In [13]:
def fuzzy_read_consolidate_counts(counted_data, cutoff, total_counts = None):
    """Greedily merge kmers that fuzzily match each other and sum their counts.

    The first remaining kmer is used as seed; every remaining kmer that
    ``rapidfuzz.process.extract`` (scorer ``fuzz.WRatio``) reports for it at
    ``cutoff`` joins its group. The group is recorded and removed, and the
    procedure repeats until no kmer is left.

    Parameters
    ----------
    counted_data : iterable of (kmer, count)
        Observed kmers with their read counts, e.g. ``Counter.most_common()``.
    cutoff : float
        ``score_cutoff`` handed to ``process.extract``.
    total_counts : number or None
        Denominator for the ``pct`` column. Defaults to the summed counts of
        ``counted_data``.

    Returns
    -------
    pandas.DataFrame
        One row per group, in the order the groups were formed, with columns
        ``kmer`` (highest-count member of the group), ``count`` (summed count
        of the group) and ``pct`` (``count`` as a percentage of the denominator).
        An empty input gives an empty frame with these columns.
    """
    # imports stay local, as before, so the function does not rely on notebook globals for them
    from rapidfuzz import process, fuzz
    import numpy as np

    counted_data = list(counted_data)
    if not counted_data:
        # zip(*[]) cannot be unpacked into two names; nothing to consolidate
        return pd.DataFrame({
            'kmer': pd.Series(dtype=object),
            'count': pd.Series(dtype='int64'),
            'pct': pd.Series(dtype='float64'),
        })

    kmers, counts = zip(*counted_data)
    kmers = np.array(kmers)
    counts = np.array(counts)

    initial_counts = counts.sum() if total_counts is None else total_counts

    # collect plain records and build the frame once instead of pd.concat per group
    records = []
    while len(kmers) > 0:
        hits = process.extract(kmers[0], kmers, scorer=fuzz.WRatio, score_cutoff=cutoff, limit = None)
        # each hit is (choice, score, index); only the index is needed
        match_idxs = [hit[2] for hit in hits]
        if 0 not in match_idxs:
            # the seed must always belong to its own group; otherwise an empty
            # hit list makes argmax fail and the seed would never be removed
            match_idxs.insert(0, 0)
        group_counts = counts[match_idxs]
        ## name the group after its most common kmer; np.argmax returns the first
        ## maximum, so ties go to the member that comes first in match_idxs
        max_name = kmers[match_idxs][np.argmax(group_counts)]
        records.append({'kmer': str(max_name), 'count': group_counts.sum()})
        kmers = np.delete(kmers, match_idxs)
        counts = np.delete(counts, match_idxs)

    consolidated_kmers = pd.DataFrame(records, columns=['kmer', 'count'])
    consolidated_kmers['pct'] = consolidated_kmers['count']*100/initial_counts
    return consolidated_kmers

In [14]:
## total number of reads per GEM well = sum over the counts of all distinct 20-mers
sum_counts_GEM1 = sum(n_reads for _, n_reads in counts_GEM1_ranked)
sum_counts_GEM2 = sum(n_reads for _, n_reads in counts_GEM2_ranked)

## consolidate only the 300 most frequent 20-mers; percentages stay relative to all reads of the well
fuzzy_protospacer_consolidated_counts_GEM1 = fuzzy_read_consolidate_counts(
    counts_GEM1_ranked[:300], 95, sum_counts_GEM1
)
fuzzy_protospacer_consolidated_counts_GEM2 = fuzzy_read_consolidate_counts(
    counts_GEM2_ranked[:300], 95, sum_counts_GEM2
)

In [15]:
## write the consolidated kmer tables of both GEM wells into save_path
for _csv_name, _table in (
    ('cadpt_fuzzy_protospacer_consolidated_counts_GEM1.csv', fuzzy_protospacer_consolidated_counts_GEM1),
    ('cadpt_fuzzy_protospacer_consolidated_counts_GEM2.csv', fuzzy_protospacer_consolidated_counts_GEM2),
):
    _table.to_csv(save_path + _csv_name)