# Initial preparation

## Load scripts

In [1]:
from IPython.display import display, HTML
# Widen the notebook container so the long log lines below stay readable.
display(HTML("<style>.container { width:100% !important; }</style>"))

import os

# Resolve the project root exactly once per kernel session.
# An unconditional os.chdir("..") climbs one directory further every time this
# cell is re-run, which silently breaks `dir_root` and the "script" lookup.
if "dir_root" in globals():
    os.chdir(dir_root)
else:
    os.chdir("..")
    dir_root = os.getcwd()
print("Root directory:", dir_root)

# Load functions
# The working directory is switched to "script" only for the duration of the
# load, and always restored, even if function.py or the import raises.
try:
    os.chdir(os.path.join(dir_root, "script"))
    with open("function.py") as f:
        code = f.read()
    # exec (not import) so every name defined in function.py lands directly in
    # the notebook namespace; later cells call those names unqualified.
    exec(code)

    # Load customized multicore functions
    # (resolved relative to the current directory, hence inside this block)
    import custom_multicore as cmulti
finally:
    os.chdir(dir_root)

# NOTE(review): `datetime` is not imported in this cell; it is presumably
# provided by function.py via the exec above - confirm.
print('Loaded functions:', datetime.now())

Root directory: /workdir
Loaded functions: 2024-01-18 13:37:57.712980


## [Specify each time!] database name and sources
- `database_name`: arbitrary; a convenient choice is `[scientific_name]_[genome_version]`
- `url_genome`, `url_rna`: URLs of the RefSeq data
  - Browse the folder given by `database_path` in the cell below to pick the newest version, or the data for another species
  - `url_genome`: usually ends in "_genomic.fna.gz" and is the largest file in the folder
  - `url_rna`: usually ends in "_rna_from_genomic.fna.gz"

In [2]:
# Bigelowiella natans (RefSeq assembly ASM245v1): a very small data set,
# convenient for a quick end-to-end run of the pipeline.
database_name = "Bigelowiella_natans_ASM245v1"
database_path = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/002/455/GCF_000002455.1_ASM245v1/GCF_000002455.1_ASM245v1"

# Both downloads share the same assembly prefix and differ only in suffix.
url_genome = f"{database_path}_genomic.fna.gz"
url_rna = f"{database_path}_rna_from_genomic.fna.gz"

# All pipeline output for this database goes under <root>/database/<name>.
dir_database = os.path.join(dir_root, "database", database_name)
print("Database directory:", dir_database)

Database directory: /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1


## Make directories and download the data

In [3]:
# Always start from the project root before touching the file system.
os.chdir(dir_root)

# Create the directory layout for this database.
f_setup_directory(dir_database)

# Download each source URL to the given file name
# (the log shows them being saved under <dir_database>/source/).
f_initial_directory_setup(
    dir_database,
    {url_genome: "genome.gz", url_rna: "rna.gz"},
)

##############################################
2023-12-30 16:20:08.225923 Download source data
Download from: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/002/455/GCF_000002455.1_ASM245v1/GCF_000002455.1_ASM245v1_genomic.fna.gz
Size: 128680 bytes
Download to: /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/source/genome.gz
.Download from: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/002/455/GCF_000002455.1_ASM245v1/GCF_000002455.1_ASM245v1_rna_from_genomic.fna.gz
Size: 116518 bytes
Download to: /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/source/rna.gz
.

## Cut into chromosomal sequences
- This makes many smaller files which are easier to handle, compared to the large single genome file

In [4]:
# Split the downloaded genome into one FASTA file per full chromosome
# (chr_N.fasta) and save the chromosome ID/description lists, as reported
# in the function's log output.
f_parse_sequence(dir_database)

##############################################
2023-12-30 16:20:12.954466 Extract genome sequence
Loading genome data...
2023-12-30 16:20:12.984048 4 records were found. Picking out full chromosomes only...
NC_010004.1 NC_010004.1 Bigelowiella natans nucleomorph chromosome 1, complete sequence
2023-12-30 16:20:13.022786 Saved chr_1.fasta
NC_010005.1 NC_010005.1 Bigelowiella natans nucleomorph chromosome 2, complete sequence
2023-12-30 16:20:13.044355 Saved chr_2.fasta
NC_010006.1 NC_010006.1 Bigelowiella natans nucleomorph chromosome 3, complete sequence
2023-12-30 16:20:13.056826 Saved chr_3.fasta
NC_008408.1 NC_008408.1 Bigelowiella natans plastid, complete genome
2023-12-30 16:20:13.065201 Saved chr_4.fasta
4 Chromosome(s) found

2023-12-30 16:20:13.067723 Saved list_chr_id.csv
2023-12-30 16:20:13.070543 Saved list_chr_desc.csv
2023-12-30 16:20:13.073209 Saved to dict_chr.pickle


## Extract mRNA info

In [5]:
# Parse the RNA annotation and keep only the mRNA records that pass the
# function's filters; the log prints the record count after each filter.
f_parse_annotation(dir_database)

##############################################
2023-12-30 16:20:13.079400 Extract mRNA annotation
2023-12-30 16:20:13.100937 361 records were found
mRNA: 283
On a full chromosome: 283
Not 5prime-partial:  283
Not 3prime-partial:  283
No special annotation:  283
No NM_ accession was found
Using XM_ (predicted mRNA model) tag instead
gene-coding:  283

All entries in the data are formatted as expected

Make sure again that the source is full chromosome in `dict_chr`
Passed all filters: 178

Example data:
           chr                 ID    gene  chr_file  \
0  NC_010004.1   XM_001712699.1_4   sf3b4         1   
1  NC_010004.1  XM_001712704.1_11    tcpD         1   
2  NC_010004.1  XM_001712705.1_12  rpl13A         1   
3  NC_010004.1  XM_001712706.1_13    rpoF         1   
4  NC_010004.1  XM_001712708.1_15   sf3b5         1   

                                            mRNA_pos  is_on_revcom  
0  [[8969, 9055], [9075, 9123], [9142, 9230], [92...         False  
1  [[15869, 15979], [15

# Target candidate extraction & rudimentary selection

## Extract target candidate NGGs

In [6]:
# Collect NGG target candidates located on exonal regions, chromosome by
# chromosome; per the log, "common" and "union" tables are written as
# CSV and pickle under result/NGG_target_candidate/all/.
f_extract_NGG_exonal(dir_database)

##############################################
2023-12-30 16:20:13.443069 Extract NGG on exonal regions
Loading data...
2023-12-30 16:20:13.454808 Process chr No. 1
Length =  140590
Saved to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/NGG_target_candidate/all/common/NGG_common_chr_1.csv
Saved to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/NGG_target_candidate/all/common/NGG_common_chr_1.pickle
Saved to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/NGG_target_candidate/all/union/NGG_union_chr_1.csv
Saved to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/NGG_target_candidate/all/union/NGG_union_chr_1.pickle
2023-12-30 16:20:13.538162 Process chr No. 2
Length =  134144
Saved to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/NGG_target_candidate/all/common/NGG_common_chr_2.csv
Saved to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/NGG_target_candidate/all/common/NGG_com

## Sieve the target candidates, part 1
- The very minimal requirements: duplicates are omitted and entries containing N are dropped

In [7]:
# First sieve: per the log, concatenates the per-chromosome common/union
# tables, merges them, converts to the actual gRNA sequence, then omits
# duplicates and entries containing N.
f_sieve_target_1(dir_database)

##############################################
2023-12-30 16:20:13.729484 Sieve the target candidates, part 1
2023-12-30 16:20:13.729512 Concatenate csv files for exonal common parts
Load NGG_common_chr_1.csv
Load NGG_common_chr_2.csv
Load NGG_common_chr_3.csv
Load NGG_common_chr_4.csv

2023-12-30 16:20:13.770156 Concatenate csv files for exonal union parts
Load NGG_union_chr_1.csv
Load NGG_union_chr_2.csv
Load NGG_union_chr_3.csv
Load NGG_union_chr_4.csv

2023-12-30 16:20:13.795795 Merge
2023-12-30 16:20:13.804954 Convert to actual gRNA sequence by changing the initial to G
2023-12-30 16:20:13.806525 Omit duplicates
2023-12-30 16:20:13.808492 Drop entries containing N
4919 entries are found
Result contained targets for 177 genes
Number of genes with [1-2, 3-5, 6-] targets: [0, 12, 165]
2023-12-30 16:20:13.823052 Saved to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/NGG_target_candidate/pd_merged_before_structure.pickle


## Calculate sgRNA secondary structure

In [8]:
# NOTE(review): 100 is presumably the percentage of cores to use (compare
# `use_percent=100` in the later cells) - confirm in function.py.
n_core_available = get_n_core(100)
# Compute sgRNA secondary structure in parallel on the available cores.
f_add_RNA_structure(dir_database, n_core_available)
# Second sieve: per the log, flags GC > 60%, TTTT, dimerization with the
# vector sequence, and score > 60.
f_sieve_target_2(dir_database)

All cores: 4
Idle cores: 3
##############################################
2023-12-30 16:20:14.832532 Calculate sgRNA secondary structure
Add stem-loop sequences
Estimating 1 0:00:00.048001
Estimating 2 0:00:00.452141
Estimating 3 0:00:04.896978
Estimated completion time: 2023-12-30 16:20:28
2023-12-30 16:20:20.237928 Start processing 3 jobs in total
Use 3 cores in parallel


  return bound(*args, **kwds)


2023-12-30 16:20:37.789178 Done parallel processing.
2023-12-30 16:20:37.801719 Saved to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/NGG_target_candidate/pd_merged_with_structure.pickle
2023-12-30 16:20:37.817166 Flag if GC > 60%
2023-12-30 16:20:37.837655 Flag if containing TTTT
2023-12-30 16:20:37.839505 Flag if dimerizing with vector sequence
Estimating 1 0:00:00.006043
Estimating 2 0:00:00.058249
Estimating 3 0:00:00.603694
Estimating 4 0:00:02.280756
Estimated completion time: 2023-12-30 16:20:41
Flag if score > 60
2023-12-30 16:20:42.757932 Saved to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/NGG_target_candidate/pd_merged_all_before_offtarget.pickle


## Cut in smaller chunks and export for off-target & qPCR search

In [9]:
# Build the candidate list for the off-target and qPCR searches: per the log,
# starts from the filter-passing set, salvages extra candidates for genes with
# few targets, then exports the selection in chunks.
f_prepare_massive_search(dir_database)

2023-12-30 16:20:42.764504 Prepare the list of target candidates for off-target search

Show stats for the `whole` dataset with all candidates:
4919 entries are found
Result contained targets for 177 genes
Number of genes with [1-2, 3-5, 6-] targets: [0, 12, 165]

Show stats for the `strict` dataset with candidates passing filters:
1896 entries are found
Result contained targets for 172 genes
Number of genes with [1-2, 3-5, 6-] targets: [19, 39, 114]

For the genes with only < 6 + 1 candidates, try salvaging candidates from the `whole` list:
Salvage applied to 79 genes in combination

Show stats for the salvaged dataset:
2515 entries are found
Result contained targets for 177 genes
Number of genes with [1-2, 3-5, 6-] targets: [0, 12, 165]

Save as .pickle
2023-12-30 16:20:42.816365 Saved to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/NGG_target_candidate/pd_selected_before_offtarget.pickle
2023-12-30 16:20:42.817887 Cut into chunks & export as pseudo-binary string

# Extract genomic NAG/NGG

In [10]:
# Extract every [20nt]NAG/NGG site on the genome (forward and rev-com strands,
# one chromosome at a time) and export them in chunks, per the log output.
f_extract_NGG_genomic(dir_database)

##############################################
2023-12-30 16:20:42.878368 Extract all [20nt]NAG/NGG on genome
2023-12-30 16:20:42.881256 Process chr No. 1 (accession: NC_010004.1 )
10525 (forward), 10668 (rev-com) NAG/NGG were found
Output to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/NGG_genomic/pickle
2023-12-30 16:20:43.128889 Cut into chunks & export as pseudo-binary string
Done.
2023-12-30 16:20:43.169947 Cut into chunks & export as pseudo-binary string
Done.
2023-12-30 16:20:43.216500 Process chr No. 2 (accession: NC_010005.1 )
10461 (forward), 10145 (rev-com) NAG/NGG were found
Output to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/NGG_genomic/pickle
2023-12-30 16:20:43.421888 Cut into chunks & export as pseudo-binary string
Done.
2023-12-30 16:20:43.459064 Cut into chunks & export as pseudo-binary string
Done.
2023-12-30 16:20:43.492719 Process chr No. 3 (accession: NC_010006.1 )
7729 (forward), 7760 (rev-com) NAG/NGG were found
Outpu

# Off-target score

## Parallel calculation

In [11]:
# Prepare the job list for the off-target search. The log labels each job with
# a target chunk and a genome chunk, so jobs appear to be chunk pairs.
list_args, n_core_available = f_offtarget_prepare_args(
    dir_database,
    use_percent=100,
)

# Run the scoring jobs in parallel; each job writes its own pickle file.
result = cmulti.main_multiprocessing(
    cmulti.calc_offtarget_score,
    list_args,
    core_num=n_core_available,
)

All cores: 4
Idle cores: 4
2023-12-30 16:20:44.747042 Start processing 8 jobs in total
Use 4 cores in parallel
2023-12-30 16:20:44.7651922023-12-30 16:20:44.765721  Calculate off-target scores forCalculate off-target scores for  targets: targets: 000000000000000 000000000000000genome: 001_r_000000000000000 genome:  in process 001_f_00000000000000024578 in process 
24577
2023-12-30 16:20:46.754959 Output to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/off_target/offtarget_score_000000000000000_x_001_f_000000000000000.pickle
2023-12-30 16:20:46.775097 Calculate off-target scores for targets: 000000000000000 genome: 002_f_000000000000000 in process 2023-12-30 16:20:46.80901224577 Output to
 /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/off_target/offtarget_score_000000000000000_x_001_r_000000000000000.pickle
2023-12-30 16:20:46.825192 Calculate off-target scores for targets: 000000000000000 genome: 002_r_000000000000000 in process 24578
2023-12-30 

## Sum up

In [12]:
# Sum up the per-chunk off-target score files found in result/off_target.
f_offtarget_result_sum(dir_database)

2023-12-30 16:20:51.423775 Sum up the calculated off-target scores
Process files in /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/off_target
8 files are found
2023-12-30 16:20:51.427657 Processing file 1 : offtarget_score_000000000000000_x_001_f_000000000000000
2023-12-30 16:20:51.436184 Processing file 2 : offtarget_score_000000000000000_x_001_r_000000000000000
2023-12-30 16:20:51.445237 Processing file 3 : offtarget_score_000000000000000_x_002_f_000000000000000
2023-12-30 16:20:51.454673 Processing file 4 : offtarget_score_000000000000000_x_002_r_000000000000000
2023-12-30 16:20:51.466389 Processing file 5 : offtarget_score_000000000000000_x_003_f_000000000000000
2023-12-30 16:20:51.473947 Processing file 6 : offtarget_score_000000000000000_x_003_r_000000000000000
2023-12-30 16:20:51.482386 Processing file 7 : offtarget_score_000000000000000_x_004_f_000000000000000
2023-12-30 16:20:51.490092 Processing file 8 : offtarget_score_000000000000000_x_004_r_0000000000000

# qPCR primers

## Parallel calculation

In [13]:
# Prepare one qPCR primer-search job per target chunk.
list_args, n_core_available = f_qPCR_prepare_args(
    dir_database,
    use_percent=100,
)

# Run the primer search in parallel. This is by far the slowest step: in the
# recorded run it took over two hours even for this small genome.
result = cmulti.main_multiprocessing(
    cmulti.calc_qPCR,
    list_args,
    core_num=n_core_available,
)

All cores: 4
Idle cores: 4
2023-12-30 16:20:52.516466 Start processing 11 jobs in total
Use 4 cores in parallel
2023-12-30 16:20:52.5322602023-12-30 16:20:52.533016 Calculate qPCR primers for  Calculate qPCR primers fortargets:  targets:000000000000000  000000000000502 in processin process 24935
 24936
Load new
Load new
2023-12-30 16:43:16.527561 Output to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/qPCR/qPCR_primer_000000000000502.pickle
2023-12-30 16:43:16.545574 Calculate qPCR primers for targets: 000000000000753 in process 24936
Load new
2023-12-30 16:51:26.992918 Output to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/qPCR/qPCR_primer_000000000000000.pickle
2023-12-30 16:51:27.010443 Calculate qPCR primers for targets: 000000000000251 in process 24935
Load new
Load new
2023-12-30 17:14:44.700465 Output to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/qPCR/qPCR_primer_000000000000753.pickle
2023-12-30 17:14:44.719633 Cal

## Sum up

In [14]:
# Aggregate the per-chunk qPCR primer files found in result/qPCR.
# NOTE(review): the function's log header reads "Sum up the calculated
# off-target scores" - looks like a copy-pasted message in function.py; confirm.
f_qPCR_result_aggregate(dir_database)

2023-12-30 18:42:15.955735 Sum up the calculated off-target scores
Process files in /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/qPCR
11 files are found
2023-12-30 18:42:15.961842 Processing file 1 : qPCR_primer_000000000000000
2023-12-30 18:42:15.979474 Processing file 2 : qPCR_primer_000000000000251
2023-12-30 18:42:15.989672 Processing file 3 : qPCR_primer_000000000000502
2023-12-30 18:42:15.997609 Processing file 4 : qPCR_primer_000000000000753
2023-12-30 18:42:16.004454 Processing file 5 : qPCR_primer_000000000001004
2023-12-30 18:42:16.010909 Processing file 6 : qPCR_primer_000000000001255
2023-12-30 18:42:16.019487 Processing file 7 : qPCR_primer_000000000001506
2023-12-30 18:42:16.027929 Processing file 8 : qPCR_primer_000000000001757
2023-12-30 18:42:16.036833 Processing file 9 : qPCR_primer_000000000002008
2023-12-30 18:42:16.046173 Processing file 10 : qPCR_primer_000000000002259
2023-12-30 18:42:16.054594 Processing file 11 : qPCR_primer_000000000002510

# Final processing
Combine the results of all analyses and export
- Transform off-target scores to the final form
- Flag where qPCR primers are missing

In [15]:
# Combine the candidate table with the off-target scores and qPCR primers,
# then export the final tables under result/final/ (pickle, CSV, chunks).
# NOTE(review): the sort step emits pandas SettingWithCopyWarning - the
# assignment in function.py should probably use .loc; confirm.
f_final_processing(dir_database)

2023-12-30 18:42:16.078231 Start final processing
Reading data...
Everything else than off-target search and qPCR search: /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/NGG_target_candidate/pd_selected_before_offtarget.pickle
Off-target scores: /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/off_target_score_sum.pickle
qPCR primers: /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/qPCR_primer_aggregated.pickle
2023-12-30 18:42:16.127071 Final calculation of off-target scores...
2023-12-30 18:42:16.142878 Final calculation of qPCR primers...
2023-12-30 18:42:16.173446 Add primer sequences for AAV construction...
2515 entries are found
Result contained targets for 177 genes
Number of genes with [1-2, 3-5, 6-] targets: [0, 12, 165]
2023-12-30 18:42:16.189293 Sort...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



2023-12-30 18:42:17.238482 Export the final version
2023-12-30 18:42:17.266010 Saved to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/final/final_sort_pos.pickle
2023-12-30 18:42:17.339924 Saved to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/final/final_sort_pos.csv
2023-12-30 18:42:17.339993 Cut into chunks & export to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/final/final_sort_pos
2023-12-30 18:42:17.407090 Completed export to 1 file(s)
2023-12-30 18:42:17.411745 Exported summary to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/final/final_sort_pos_summary.csv

2023-12-30 18:42:18.506523 Saved to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/final/final_sort_name.pickle
2023-12-30 18:42:18.582379 Saved to /workdir/temp/CRISPR/database/Bigelowiella_natans_ASM245v1/result/final/final_sort_name.csv
2023-12-30 18:42:18.582452 Cut into chunks & export to /workdir/temp/CRISPR/database/