In [1]:
import os
import multiprocessing as mp
import sys

sys.path.insert(0, '..')
from sequencing_process.make_reference_genome_for_sequencing_process import make_reference_genome_for_sequencing_process
from sequencing_process.plot_fastq_gz_or_bam import plot_fastq_gz_or_bam
from sequencing_process.process_fastq_gz import align_fastq_gzs_using_bwa
from sequencing_process.process_bam import sort_and_index_bam_using_samtools, call_variants_on_bam_using_freebayes_and_multiprocess
from sequencing_process.process_vcf_gz import annotate_vcf_gz_using_snpeff, annotate_vcf_gz_using_bcftools, filter_vcf_gz_using_bcftools

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# --- Reference data ---------------------------------------------------------
# Directory holding the GRCh38 reference files and the ClinVar VCF used below.
GRCH_DIRECTORY_PATH = '/Volumes/Jumis/data/grch'
assert os.path.isdir(GRCH_DIRECTORY_PATH)

# Left disabled on purpose; presumably a one-time step that prepares the
# reference files under GRCH_DIRECTORY_PATH -- confirm before re-enabling.
# make_reference_genome_for_sequencing_process(GRCH_DIRECTORY_PATH)

# Compressed reference FASTA (handed to the aligner later in the notebook).
FASTA_GZ_FILE_PATH = os.path.join(
    GRCH_DIRECTORY_PATH,
    'GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa.gz',
)
assert os.path.isfile(FASTA_GZ_FILE_PATH)

# Same path with the trailing '.gz' dropped; the uncompressed file must
# already exist next to the compressed one (handed to the variant caller).
FASTA_FILE_PATH = FASTA_GZ_FILE_PATH[:-len('.gz')]
assert os.path.isfile(FASTA_FILE_PATH)

# ClinVar VCF used as the annotation source for bcftools.
CLINVAR_VCF_GZ_FILE_PATH = os.path.join(GRCH_DIRECTORY_PATH, 'clinvar.vcf.gz')
assert os.path.isfile(CLINVAR_VCF_GZ_FILE_PATH)

# --- Run options ------------------------------------------------------------
# Forwarded as `n_jobs=` to every pipeline step.
N_JOBS = 2
# Forwarded as `overwrite=` to every pipeline step.
OVERWRITE = True

# Gates every plotting step in the notebook (and the worker pool they use).
PLOT = True

In [3]:
# Read-1 FASTQ input; it must already exist on disk.
FASTQ_GZ_0_FILE_PATH = './5k/simulation.bwa.read1.fastq.gz'
assert os.path.isfile(FASTQ_GZ_0_FILE_PATH)

# Read-2 FASTQ input; it must already exist on disk.
FASTQ_GZ_1_FILE_PATH = './5k/simulation.bwa.read2.fastq.gz'
assert os.path.isfile(FASTQ_GZ_1_FILE_PATH)

if PLOT:
    # Two workers, one per FASTQ file. The jobs are submitted asynchronously
    # so this cell returns immediately; `p` is reused by a later cell and
    # `r0` / `r1` are the AsyncResult handles for read 1 / read 2.
    p = mp.Pool(processes=2)

    r0, r1 = (
        p.apply_async(
            plot_fastq_gz_or_bam,
            args=[fastq_gz_file_path],
            kwds={'overwrite': OVERWRITE},
        )
        for fastq_gz_file_path in (FASTQ_GZ_0_FILE_PATH, FASTQ_GZ_1_FILE_PATH)
    )

fastqp --output ./5k/simulation.bwa.read2.fastq.gz.plot --text ./5k/simulation.bwa.read2.fastq.gz.plot.tsv ./5k/simulation.bwa.read2.fastq.gz
fastqp --output ./5k/simulation.bwa.read1.fastq.gz.plot --text ./5k/simulation.bwa.read1.fastq.gz.plot.tsv ./5k/simulation.bwa.read1.fastq.gz


In [4]:
# Leave as None to run the alignment below, or set it to the path of an
# existing BAM to reuse that file instead.
BAM_FILE_PATH = None

if not BAM_FILE_PATH:
    # No BAM supplied: align both FASTQ files against the compressed
    # reference, then sort and index the result.
    BAM_FILE_PATH = align_fastq_gzs_using_bwa(
        FASTA_GZ_FILE_PATH,
        [FASTQ_GZ_0_FILE_PATH, FASTQ_GZ_1_FILE_PATH],
        n_jobs=N_JOBS,
        overwrite=OVERWRITE,
    )

    BAM_FILE_PATH = sort_and_index_bam_using_samtools(
        BAM_FILE_PATH,
        n_jobs=N_JOBS,
        overwrite=OVERWRITE,
    )

    # The BAM is only plotted when it was produced here; a user-supplied BAM
    # is not plotted.
    if PLOT:
        plot_fastq_gz_or_bam(BAM_FILE_PATH, overwrite=OVERWRITE)

else:
    # A BAM path was supplied: it only needs to exist.
    assert os.path.isfile(BAM_FILE_PATH)

bwa mem -t 2 /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa.gz ./5k/simulation.bwa.read1.fastq.gz ./5k/simulation.bwa.read2.fastq.gz | samtools view -Sb --threads 2 > ./5k/align_fastq_gzs_using_bwa.bam
samtools sort --threads 2 ./5k/align_fastq_gzs_using_bwa.bam > ./5k/sort_and_index_bam_using_samtools.bam
Consider removing unsorted .bam file ./5k/align_fastq_gzs_using_bwa.bam.
samtools index -@ 2 ./5k/sort_and_index_bam_using_samtools.bam
fastqp --output ./5k/sort_and_index_bam_using_samtools.bam.plot --text ./5k/sort_and_index_bam_using_samtools.bam.plot.tsv ./5k/sort_and_index_bam_using_samtools.bam


In [5]:
# Leave as None to call variants below, or set it to the path of an existing
# .vcf.gz to reuse that file instead.
VCF_GZ_FILE_PATH = None

if not VCF_GZ_FILE_PATH:
    # Call variants per region: chr1..chr22 followed by chrX, chrY and chrM.
    VCF_GZ_FILE_PATH = call_variants_on_bam_using_freebayes_and_multiprocess(
        BAM_FILE_PATH,
        FASTA_FILE_PATH,
        list(map('chr{}'.format, range(1, 23))) + ['chrX', 'chrY', 'chrM'],
        n_jobs=N_JOBS,
        overwrite=OVERWRITE,
    )

    # Each following step takes the previous step's output path and returns
    # the path of the file it produced.
    VCF_GZ_FILE_PATH = annotate_vcf_gz_using_snpeff(
        VCF_GZ_FILE_PATH,
        n_jobs=N_JOBS,
        overwrite=OVERWRITE,
    )

    # Annotate with ClinVar; the extra argument list is passed through to the
    # helper unchanged.
    VCF_GZ_FILE_PATH = annotate_vcf_gz_using_bcftools(
        VCF_GZ_FILE_PATH,
        CLINVAR_VCF_GZ_FILE_PATH,
        ['--columns =ID,INFO'],
        n_jobs=N_JOBS,
        overwrite=OVERWRITE,
    )

    VCF_GZ_FILE_PATH = filter_vcf_gz_using_bcftools(
        VCF_GZ_FILE_PATH,
        n_jobs=N_JOBS,
        overwrite=OVERWRITE,
    )

else:
    # A VCF path was supplied: it only needs to exist.
    assert os.path.isfile(VCF_GZ_FILE_PATH)

freebayes --fasta-reference /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr5 ./5k/sort_and_index_bam_using_samtools.bam > ./5k/call_variants_on_bam_using_freebayes.--region_chr5.vcf
freebayes --fasta-reference /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr1 ./5k/sort_and_index_bam_using_samtools.bam > ./5k/call_variants_on_bam_using_freebayes.--region_chr1.vcf
bgzip --threads 1 --force ./5k/call_variants_on_bam_using_freebayes.--region_chr5.vcf; tabix --force ./5k/call_variants_on_bam_using_freebayes.--region_chr5.vcf.gz
freebayes --fasta-reference /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr6 ./5k/sort_and_index_bam_using_samtools.bam > ./5k/call_variants_on_bam_using_freebayes.--region_chr6.vcf
bgzip --threads 1 --force ./5k/call_variants_on_bam_using_freebayes.--region_chr1.vcf; tabix --force ./5k/call_variants_on_bam_usin

freebayes --fasta-reference /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chrX ./5k/sort_and_index_bam_using_samtools.bam > ./5k/call_variants_on_bam_using_freebayes.--region_chrX.vcf
bgzip --threads 1 --force ./5k/call_variants_on_bam_using_freebayes.--region_chr19.vcf; tabix --force ./5k/call_variants_on_bam_using_freebayes.--region_chr19.vcf.gz
freebayes --fasta-reference /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr20 ./5k/sort_and_index_bam_using_samtools.bam > ./5k/call_variants_on_bam_using_freebayes.--region_chr20.vcf
bgzip --threads 1 --force ./5k/call_variants_on_bam_using_freebayes.--region_chr20.vcf; tabix --force ./5k/call_variants_on_bam_using_freebayes.--region_chr20.vcf.gz
freebayes --fasta-reference /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chrM ./5k/sort_and_index_bam_using_samtools.bam > ./5k/call_variants_on

In [6]:
if PLOT:
    # Shut the plotting pool down gracefully instead of calling
    # `p.terminate()`: terminate stops the workers immediately without
    # completing outstanding work, so a FASTQ plot that was still running
    # would be killed half-way, and any exception raised inside a worker
    # would never be seen because the results were never retrieved.
    p.close()  # no further tasks will be submitted to this pool

    try:
        # Block until each plot job has finished; `.get()` re-raises here any
        # exception that occurred in the worker.
        r0.get()
        r1.get()

    finally:
        # Always wait for the workers to exit, even when a job failed.
        p.join()