# Initial preparation

## Load scripts

In [None]:
# Inject CSS so that the notebook's `.container` element uses 100% of the width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import os
# The project root is taken to be the parent of the current working directory.
# NOTE: this step is relative, and the cell ends with the working directory set
# to dir_root, so re-running the cell moves one more level up each time.
os.chdir("..")
dir_root = os.getcwd()
print("Root directory:", dir_root)

# Load functions
# function.py is executed with exec() rather than imported, so the names it
# defines land directly in this namespace; the f_* helpers called in later
# cells are presumably defined there -- confirm against script/function.py.
os.chdir(os.path.join(dir_root, "script"))
with open("function.py") as f:
    code = f.read()
    exec(code)

# Load customized multicore functions
# Imported while the working directory is still <dir_root>/script.
import custom_multicore as cmulti

# `datetime` is not imported in this cell; presumably it is brought in by
# function.py -- confirm.
print('Loaded functions:', datetime.now())
# Restore the working directory to the project root for the following cells.
os.chdir(dir_root)

## [Specify each time!] database name and sources
- `database_name`: an arbitrary label, e.g. `[scientific_name]_[genome_version]`
- `url_genome`, `url_rna`: URLs of the RefSeq data
  - you may look through the folder specified below
  - and retrieve the newest version, or the data for another species
  - `url_genome`: usually ends in "_genomic.fna.gz"; it is usually the largest file in the folder
  - `url_rna`: usually ends in "_rna_from_genomic.fna.gz"

In [None]:
# Mouse, an old version
# Active configuration: these three names are read by the cells below.
database_name = "Mus_musculus_GRCm38_p4"
url_genome = 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Mus_musculus/all_assembly_versions/GCF_000001635.24_GRCm38.p4/GCF_000001635.24_GRCm38.p4_genomic.fna.gz'
url_rna = 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Mus_musculus/all_assembly_versions/GCF_000001635.24_GRCm38.p4/GCF_000001635.24_GRCm38.p4_rna_from_genomic.fna.gz'

# # Budding yeast
# (Alternative configuration, kept disabled: uncomment the three assignments
# below and comment out the mouse ones above to switch species.)
# database_name = "Saccharomyces_cerevisiae_R64"
# url_genome = "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/fungi/Saccharomyces_cerevisiae/all_assembly_versions/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.fna.gz"
# url_rna = "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/fungi/Saccharomyces_cerevisiae/all_assembly_versions/GCF_000146045.2_R64/GCF_000146045.2_R64_rna_from_genomic.fna.gz"

# Database directory: <dir_root>/database/<database_name>; it is the argument
# passed to every processing step in the cells below.
dir_database = os.path.join(dir_root, "database", database_name)
print("Database directory:", dir_database)

## Make directories and download the data

In [None]:
# Work from the project root, then set up the database directory and hand the
# two RefSeq URLs to the setup helper, each mapped to the file name
# ("genome.gz" / "rna.gz") it is paired with. Per the section heading this
# step makes the directories and downloads the data.
os.chdir(dir_root)
f_setup_directory(dir_database)
f_initial_directory_setup(
    dir_database,
    {
        url_genome: "genome.gz",
        url_rna: "rna.gz",
    },
)

## Cut into chromosomal sequences
- This makes many smaller files which are easier to handle, compared to the large single genome file

In [None]:
# Per the section heading: cut the genome into chromosomal sequences, producing
# many smaller files instead of the single large genome file.
f_parse_sequence(dir_database)

## Extract mRNA info

In [None]:
# Per the section heading: extract the mRNA information for this database
# (presumably from the downloaded RNA file -- confirm in function.py).
f_parse_annotation(dir_database)

# Target candidate extraction & rudimentary selection

## Extract target candidate NGGs

In [None]:
# Per the section heading: extract the target-candidate NGG sites
# (the function name suggests these are restricted to exons -- confirm).
f_extract_NGG_exonal(dir_database)

## Sieve the target candidates, part 1
- The very minimal requirements

In [None]:
# First sieving pass over the target candidates; a second pass
# (f_sieve_target_2) runs later, after the RNA secondary-structure step.
f_sieve_target_1(dir_database)

## Calculate sgRNA secondary structure

In [None]:
# Core count to hand to the structure calculation, as returned by get_n_core().
n_core_available = get_n_core()
# Per the section heading: calculate the sgRNA secondary structure.
f_add_RNA_structure(dir_database, n_core_available)
# Second sieving pass, run after the structure step
# (presumably it filters on the structure results -- confirm).
f_sieve_target_2(dir_database)

## Cut in smaller chunks and export for off-target & qPCR search

In [None]:
# Per the section heading: cut the candidates into smaller chunks and export
# them for the off-target and qPCR searches run in later cells.
f_prepare_massive_search(dir_database)

# Extract genomic NAG/NGG

In [None]:
# Per the section heading: extract the genomic NAG/NGG sites
# (presumably the search space for the off-target scoring below -- confirm).
f_extract_NGG_genomic(dir_database)

# Off-target score

## Parallel calculation

In [None]:
# Get the list of argument sets for the off-target calculation together with
# the number of cores to use (this rebinds n_core_available).
list_args, n_core_available = f_offtarget_prepare_args(dir_database)
# Run cmulti.calc_offtarget_score over list_args via the custom multiprocessing
# driver. `result` is not read by any later cell shown here; the summing step
# takes only dir_database, so results are presumably written to disk -- confirm.
result = cmulti.main_multiprocessing(cmulti.calc_offtarget_score, list_args, core_num=n_core_available)

## Sum up

In [None]:
# Per the section heading: sum up the results of the parallel off-target
# score calculation.
f_offtarget_result_sum(dir_database)

# qPCR primers

## Parallel calculation

In [None]:
# Get the list of argument sets for the qPCR primer calculation together with
# the number of cores to use (this rebinds list_args and n_core_available).
list_args, n_core_available = f_qPCR_prepare_args(dir_database)
# Run cmulti.calc_qPCR over list_args via the custom multiprocessing driver.
# `result` is not read by any later cell shown here; the aggregation step takes
# only dir_database, so results are presumably written to disk -- confirm.
result = cmulti.main_multiprocessing(cmulti.calc_qPCR, list_args, core_num=n_core_available)

## Sum up

In [None]:
# Per the section heading: sum up (aggregate) the results of the parallel
# qPCR primer calculation.
f_qPCR_result_aggregate(dir_database)

# Final processing
Combine the results of all analyses and export
- Transform off-target scores to the final form
- Flag where qPCR primers are missing

In [None]:
# Per the section text: combine the results of all analyses and export them,
# transforming the off-target scores to their final form and flagging where
# qPCR primers are missing.
f_final_processing(dir_database)