In [2]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import duckdb
from concurrent.futures import ThreadPoolExecutor, as_completed
import dask.dataframe as dd
import sys
import glob
import os
from Bio.Seq import Seq

# Make the TREBL checkout importable so `from scripts import ...` resolves.
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
# NOTE(review): this is an absolute, user-specific path — consider making the
# project root configurable.
_TREBL_ROOT = "/global/scratch/projects/fc_mvslab/OpenProjects/Sanjana/TREBL/"
if _TREBL_ROOT not in sys.path:
    sys.path.append(_TREBL_ROOT)
from scripts import initial_map, map_refiner, complexity, finder, preprocess, error_correct, plotting, umi_deduplicate


from tqdm import tqdm  # progress bar

# Looking at smallest file from new NKX2-2 data to see if automatic threshold detection is reasonable

In [6]:
# Assembled FASTQ used as the sequencing input for this test run.
# The literal is split across two lines for readability; the value is unchanged.
file_path = (
    "/global/scratch/projects/fc_mvslab/OpenProjects/Caitlin/TL4B2/Sanj_Test/"
    "r18_RTBC_Puro_Only_R2_T0_S115.assembled.fastq"
)


In [5]:
# Database path handed to the mapper and refiner (relative to the notebook's
# working directory). A single-argument os.path.join returns its argument
# unchanged, so the plain literal is equivalent.
db_path = "../../duckdb/NKX2-2_test.db"
db_path

'../../duckdb/NKX2-2_test.db'

In [8]:
# Barcode definition for the RTBC: expected length 16, located between the
# flanking sequences "GCCCC" (before) and "GCGG" (after).
step1_RTBC = finder.Barcode(
    name="RTBC",
    preceder="GCCCC",
    post="GCGG",
    length=16,
)

RTBC_objects = [step1_RTBC]

# Extract UMIs and barcodes from the reads in file_path and build the initial
# map for step "trebl_exp" in the database at db_path.
umi_mapper = initial_map.InitialMapper(
    db_path=db_path,
    step_name="trebl_exp",  # plain literal: the former f-string had no placeholders
    seq_file=file_path,
    design_file_path=None,
    bc_objects=RTBC_objects,
    reverse_complement=True,
    # UMI has no flanking sequences; the recorded log reports it is taken as
    # the last 12 bases of each read.
    umi_object=finder.Barcode(name="UMI", preceder="", post="", length=12),
)
umi_mapper.create_map()

Reading 1 FASTQ/TXT file(s)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Done in 3.75 seconds.

Reverse complement of sequences...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Done in 2.73 seconds.

Extracting 1 barcodes...
Regex for RTBC: GCCCC(.*)GCGG
Done in 0.37 seconds.

Extracting UMI...
UMI: extracting last 12 bases
Done in 0.30 seconds.

Done in 0.61 seconds.

Mapping complete.


In [15]:
# Refine the initial map in two steps: a quality filter, then error correction.
# Per the logged output, the quality step keeps rows that are TRUE in every
# *_qual column (presumably this encodes the barcode-length check — confirm in
# map_refiner).
#
# NOTE(review): the recorded run printed "Whitelist already exists, skipping"
# and reported only 2 unique canonical barcodes from ~1.4M reads. The reused
# whitelist file may be stale from an earlier run; consider removing it and
# re-running before trusting the error-corrected table.
refiner = map_refiner.MapRefiner(
    db_path=db_path,
    bc_objects=RTBC_objects,
    column_pairs=[],
    reads_threshold=0,
    map_order=["quality", "error_corrected"],
    step_name="trebl_exp",  # plain literal: the former f-string had no placeholders
    descriptor="",
    output_figures_path="../../output/",
    # None presumably lets the refiner pick the error-correction threshold
    # automatically — TODO confirm against MapRefiner.
    manual_ec_threshold=None,
)
refiner.refine_map_from_db()

Base prefix (stable across descriptors): trebl_exp_RTBC_
Full prefix for this instance: trebl_exp_RTBC_

Using the following step order:
1. initial
2. quality
3. error_corrected


Filtering to high-quality reads...
Created table: trebl_exp_RTBC_quality — filtered for TRUE in all *_qual columns.
Done in 0.75 seconds.


=== Running error correction step on trebl_exp_RTBC_quality ===

=== Applying whitelist for trebl_exp ===
Generating FASTQ: /global/scratch/projects/fc_mvslab/OpenProjects/Sanjana/TREBL/output/trebl_exp_RTBC_filtered_barcodes_extracted.fastq
Wrote 1409837 reads to /global/scratch/projects/fc_mvslab/OpenProjects/Sanjana/TREBL/output/trebl_exp_RTBC_filtered_barcodes_extracted.fastq
Done in 0.66 seconds.

Whitelist already exists, skipping:
  /global/scratch/projects/fc_mvslab/OpenProjects/Sanjana/TREBL/output/trebl_exp_RTBC_filtered_barcodes_extracted_whitelist.txt
Unique canonical barcodes: 2
Done in 0.12 seconds.

Whitelist application complete for trebl_exp at trebl_ex

In [None]:
# Diagnostic plots for the refinement run; requires `refiner` from the cell above.
# NOTE(review): judging by the method names, these show read loss across the
# refinement steps and a summary of the error-correction step — confirm in
# map_refiner. This cell has no recorded execution count or output.
refiner.plot_loss()
refiner.plot_error_correction()