In [1]:
import os
import multiprocessing as mp
import sys

sys.path.insert(0, '..')
from sequencing_process.plot_fastq_gz_or_bam import plot_fastq_gz_or_bam
from sequencing_process.process_fastq_gz import align_fastq_gzs_using_bwa
from sequencing_process.process_bam import call_variants_on_bam_using_freebayes_and_multiprocess
from sequencing_process.process_vcf_gz import annotate_vcf_gz_using_snpeff, annotate_vcf_gz_using_bcftools, filter_vcf_gz_using_bcftools

# Reload imported modules automatically before each cell executes, so edits
# to the local sequencing_process package are picked up without restarting.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [9]:
# Directory holding the reference data used by this notebook. Defaults to the
# original local volume; set the GRCH_DIRECTORY_PATH environment variable to
# point at a different location without editing the notebook.
GRCH_DIRECTORY_PATH = (os.environ.get('GRCH_DIRECTORY_PATH')
                       or '/Volumes/Jumis/data/grch')
assert os.path.isdir(GRCH_DIRECTORY_PATH), (
    'Reference directory not found: {}'.format(GRCH_DIRECTORY_PATH))

# One-time reference setup step, intentionally left disabled:
# make_reference_genome_for_sequencing_process(GRCH_DIRECTORY_PATH)

# Compressed reference FASTA passed to the alignment step.
FASTA_GZ_FILE_PATH = os.path.join(
    GRCH_DIRECTORY_PATH,
    'GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa.gz')
assert os.path.isfile(FASTA_GZ_FILE_PATH), (
    'Reference FASTA not found: {}'.format(FASTA_GZ_FILE_PATH))

# ClinVar VCF passed to the bcftools annotation step.
CLINVAR_VCF_GZ_FILE_PATH = os.path.join(GRCH_DIRECTORY_PATH, 'clinvar.vcf.gz')
assert os.path.isfile(CLINVAR_VCF_GZ_FILE_PATH), (
    'ClinVar VCF not found: {}'.format(CLINVAR_VCF_GZ_FILE_PATH))

# Forwarded to the pipeline steps as `processes=` / `n_jobs=`.
N_JOBS = 2
# Forwarded to every pipeline step as `overwrite=`.
OVERWRITE = False

In [10]:
# Input FASTQ files, resolved relative to the current working directory.
FASTQ_GZ_0_FILE_PATH = 'simulation_0.fq.gz'
assert os.path.isfile(FASTQ_GZ_0_FILE_PATH)

FASTQ_GZ_1_FILE_PATH = 'simulation_1.fq.gz'
assert os.path.isfile(FASTQ_GZ_1_FILE_PATH)

# Plot both FASTQ files in parallel worker processes.
with mp.Pool(processes=N_JOBS) as p:

    r0 = p.apply_async(
        plot_fastq_gz_or_bam,
        args=[FASTQ_GZ_0_FILE_PATH],
        kwds=dict(overwrite=OVERWRITE))

    r1 = p.apply_async(
        plot_fastq_gz_or_bam,
        args=[FASTQ_GZ_1_FILE_PATH],
        kwds=dict(overwrite=OVERWRITE))

    # Leaving the `with` block calls Pool.terminate(), which stops the
    # workers immediately, so block here until both tasks have finished.
    # get() also re-raises any exception raised inside a worker instead of
    # letting it disappear silently.
    r0.get()
    r1.get()

fastqp --output simulation_1.fq.gz.plot --text simulation_1.fq.gz.plot.tsv simulation_1.fq.gz
<multiprocessing.pool.ApplyResult object at 0x114ce0978>
<multiprocessing.pool.ApplyResult object at 0x114ce0ef0>


In [None]:
# Set this to an existing BAM path to skip alignment; leave it as None (or
# any falsy value) to align the two FASTQ files below.
BAM_FILE_PATH = None

if not BAM_FILE_PATH:
    # No BAM supplied: align both FASTQ files against the compressed
    # reference, keep the returned value as the BAM path, then plot it.
    fastq_gz_file_paths = [FASTQ_GZ_0_FILE_PATH, FASTQ_GZ_1_FILE_PATH]

    BAM_FILE_PATH = align_fastq_gzs_using_bwa(
        FASTA_GZ_FILE_PATH,
        fastq_gz_file_paths,
        n_jobs=N_JOBS,
        overwrite=OVERWRITE)

    plot_fastq_gz_or_bam(BAM_FILE_PATH, overwrite=OVERWRITE)

else:
    # A BAM was supplied: only check that it exists.
    assert os.path.isfile(BAM_FILE_PATH)

In [None]:
# Set this to an existing .vcf.gz path to skip variant calling; leave it as
# None (or any falsy value) to run the call -> annotate -> filter steps below.
VCF_GZ_FILE_PATH = None

if VCF_GZ_FILE_PATH:
    assert os.path.isfile(VCF_GZ_FILE_PATH)

else:
    # The variant caller is handed the reference as FASTA_FILE_PATH, which no
    # earlier cell defines, so a fresh kernel would fail with NameError.
    # NOTE(review): this assumes the decompressed reference sits next to the
    # .fa.gz with the '.gz' suffix dropped -- confirm. If the caller accepts
    # the compressed file, pass FASTA_GZ_FILE_PATH instead.
    FASTA_FILE_PATH = os.path.splitext(FASTA_GZ_FILE_PATH)[0]
    assert os.path.isfile(FASTA_FILE_PATH), (
        'Decompressed reference FASTA not found: {}'.format(FASTA_FILE_PATH))

    # Regions to call variants on: chr1..chr22, then chrX, chrY, chrM.
    REGIONS = ['chr{}'.format(i) for i in range(1, 23)]
    REGIONS += ['chrX', 'chrY', 'chrM']

    VCF_GZ_FILE_PATH = call_variants_on_bam_using_freebayes_and_multiprocess(
        BAM_FILE_PATH,
        FASTA_FILE_PATH,
        REGIONS,
        n_jobs=N_JOBS,
        overwrite=OVERWRITE)

    # Each step takes the previous step's result and its return value
    # replaces VCF_GZ_FILE_PATH.
    VCF_GZ_FILE_PATH = annotate_vcf_gz_using_snpeff(
        VCF_GZ_FILE_PATH, n_jobs=N_JOBS, overwrite=OVERWRITE)

    VCF_GZ_FILE_PATH = annotate_vcf_gz_using_bcftools(
        VCF_GZ_FILE_PATH,
        CLINVAR_VCF_GZ_FILE_PATH,
        '--columns =ID,INFO',
        n_jobs=N_JOBS,
        overwrite=OVERWRITE)

    VCF_GZ_FILE_PATH = filter_vcf_gz_using_bcftools(
        VCF_GZ_FILE_PATH, n_jobs=N_JOBS, overwrite=OVERWRITE)

In [None]:
# Scratch cell: displays the labels '1'..'22' followed by 'X', 'Y', 'MT'
# (presumably chromosome names without a 'chr' prefix).
list(map(str, range(1, 23))) + ['X', 'Y', 'MT']