In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to the local package imported in this notebook
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys

sys.path.insert(0, '..')
from sequencing_process.support.support.path import clean_path
from sequencing_process.make_reference_genome import make_reference_genome
from sequencing_process.download_clinvar_vcf_gz import download_clinvar_vcf_gz
from sequencing_process.process_fastq_gz import check_fastq_gzs_using_fastqc, trim_fastq_gzs_using_skewer, align_fastq_gzs_using_bwa_mem
from sequencing_process.process_bam import sort_and_index_bam_using_samtools_sort_and_index, mark_duplicates_in_bam_using_picard_markduplicates, check_bam_using_samtools_flagstat, get_variants_from_bam_using_freebayes_and_multiprocess, get_variants_from_bam_using_strelka
from sequencing_process.process_vcf_gz import annotate_vcf_gz_using_snpeff, rename_chromosome_of_vcf_gz_using_bcftools_annotate, annotate_vcf_gz_using_bcftools_annotate, filter_vcf_gz_using_bcftools_view

In [None]:
# Directory handed to make_reference_genome / download_clinvar_vcf_gz; it must
# already exist. (clean_path presumably expands '~' and normalises the path.)
GRCH_DIRECTORY_PATH = clean_path('~/Jumis/sequencing_process')
assert os.path.isdir(GRCH_DIRECTORY_PATH)

# Directory holding the input FASTQ files; explicitly named outputs of the
# later cells are written here as well. It must already exist.
DIRECTORY_PATH = clean_path('../simulation')
assert os.path.isdir(DIRECTORY_PATH)

# Contigs of interest: chr1 ... chr22 followed by chrX, chrY and chrM.
REGIONS = list(map('chr{}'.format, range(1, 23))) + ['chrX', 'chrY', 'chrM']

# Settings forwarded to the pipeline steps below.
N_JOB = 4
MEMORY = '12G'
OVERWRITE = True

# Variant caller to use; the variant-calling cell accepts 'freebayes' or 'strelka'.
VARIANT_METHOD = 'freebayes'

# ClinVar release passed to download_clinvar_vcf_gz.
CLINVAR_VERSION = '20180128'

In [None]:
# Reference genome: build it, or fall back to the expected file name when
# make_reference_genome raises FileExistsError (presumably because the
# reference is already present in GRCH_DIRECTORY_PATH).
try:
    FASTA_GZ_FILE_PATH = make_reference_genome(GRCH_DIRECTORY_PATH)

except FileExistsError:
    FASTA_GZ_FILE_PATH = os.path.join(
        GRCH_DIRECTORY_PATH,
        'GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa.gz')

# Derive the uncompressed FASTA path on BOTH branches above. The variant
# callers in a later cell read FASTA_FILE_PATH, so defining it only inside the
# except branch left it undefined (NameError) after a fresh build.
FASTA_FILE_PATH = os.path.splitext(FASTA_GZ_FILE_PATH)[0]
assert os.path.isfile(FASTA_FILE_PATH), (
    'Uncompressed reference FASTA not found: {}'.format(FASTA_FILE_PATH))

# ClinVar VCF: download it, or reuse a copy already in GRCH_DIRECTORY_PATH.
try:
    CLINVAR_VCF_GZ_FILE_PATH = download_clinvar_vcf_gz(
        GRCH_DIRECTORY_PATH, version=CLINVAR_VERSION)

except FileExistsError:
    # os.listdir returns names in arbitrary order, so sort to make the choice
    # deterministic when several ClinVar files are present.
    clinvar_gz_file_names = sorted(
        file_name for file_name in os.listdir(GRCH_DIRECTORY_PATH)
        if 'clinvar' in file_name and file_name.endswith('.gz'))

    if not clinvar_gz_file_names:
        raise FileNotFoundError(
            'No ClinVar .gz file found in {}.'.format(GRCH_DIRECTORY_PATH))

    # Prefer the configured release; otherwise fall back to any ClinVar file,
    # as before.
    clinvar_gz_file_names_of_version = [
        file_name for file_name in clinvar_gz_file_names
        if CLINVAR_VERSION in file_name
    ]

    CLINVAR_VCF_GZ_FILE_PATH = os.path.join(
        GRCH_DIRECTORY_PATH,
        (clinvar_gz_file_names_of_version or clinvar_gz_file_names)[0])

In [None]:
# Locate the two paired FASTQ files; each must exist before continuing.
fastq_gz_1_file_path = os.path.join(DIRECTORY_PATH, '10k.bwa.read1.fastq.gz')
assert os.path.isfile(fastq_gz_1_file_path)
fastq_gz_2_file_path = os.path.join(DIRECTORY_PATH, '10k.bwa.read2.fastq.gz')
assert os.path.isfile(fastq_gz_2_file_path)

# Trim the pair with skewer, writing into a dedicated sub-directory.
trimmed_fastq_gz_directory_path = os.path.join(DIRECTORY_PATH,
                                               'trimmed_fastq_gz')
(fastq_gz_1_trimmed_file_path,
 fastq_gz_2_trimmed_file_path) = trim_fastq_gzs_using_skewer(
     (fastq_gz_1_file_path, fastq_gz_2_file_path),
     output_directory_path=trimmed_fastq_gz_directory_path,
     n_job=N_JOB,
     overwrite=OVERWRITE)

# Run FastQC on the reads both before and after trimming.
fastq_gz_file_paths_to_check = (
    fastq_gz_1_file_path,
    fastq_gz_2_file_path,
    fastq_gz_1_trimmed_file_path,
    fastq_gz_2_trimmed_file_path,
)
check_fastq_gzs_using_fastqc(
    fastq_gz_file_paths_to_check, n_job=N_JOB, overwrite=OVERWRITE)

In [None]:
# Align the trimmed read pair against the compressed reference with bwa mem.
aligned_bam_file_path = os.path.join(DIRECTORY_PATH, 'aligned.bam')
bam_file_path = align_fastq_gzs_using_bwa_mem(
    (fastq_gz_1_trimmed_file_path, fastq_gz_2_trimmed_file_path),
    FASTA_GZ_FILE_PATH,
    n_job=N_JOB,
    output_bam_file_path=aligned_bam_file_path,
    overwrite=OVERWRITE)

# Sort and index the alignment; the unsorted input BAM is asked to be removed.
sorted_and_indexed_bam_file_path = sort_and_index_bam_using_samtools_sort_and_index(
    bam_file_path, remove_input_bam_file_path=True, n_job=N_JOB,
    overwrite=OVERWRITE)

# Mark and remove duplicates with Picard; the sorted input BAM and its index
# are asked to be removed afterwards.
deduplicated_bam_output_file_path = os.path.join(DIRECTORY_PATH,
                                                 'duplicate_removed.bam')
duplicate_removed_bam_file_path = mark_duplicates_in_bam_using_picard_markduplicates(
    sorted_and_indexed_bam_file_path,
    memory=MEMORY,
    remove_duplicates=True,
    remove_input_bam_file_path_and_its_index=True,
    n_job=N_JOB,
    output_bam_file_path=deduplicated_bam_output_file_path,
    overwrite=OVERWRITE)

# Run samtools flagstat on the final BAM.
check_bam_using_samtools_flagstat(
    duplicate_removed_bam_file_path,
    n_job=N_JOB,
    overwrite=OVERWRITE)

In [None]:
# Reject an unsupported caller up front, before any work is started.
if VARIANT_METHOD not in ('freebayes', 'strelka'):
    raise ValueError('Unknown VARIANT_METHOD: {}.'.format(VARIANT_METHOD))

if VARIANT_METHOD == 'freebayes':
    # Call variants with freebayes across REGIONS.
    freebayes_vcf_file_path = os.path.join(DIRECTORY_PATH, 'freebayes.vcf')
    vcf_gz_file_path = get_variants_from_bam_using_freebayes_and_multiprocess(
        duplicate_removed_bam_file_path,
        FASTA_FILE_PATH,
        REGIONS,
        n_job=N_JOB,
        output_vcf_file_path=freebayes_vcf_file_path,
        overwrite=OVERWRITE)

    # No FILTER-based selection for freebayes; downstream filtering relies on
    # the include expression instead.
    keep_filters = ()
    include_expression = '10<DP & 30<QUAL & 10<(QUAL/AO) & 1<SRF & 1<SRR & 1<SAF & 1<SAR & 1<RPR & 1<RPL'

else:
    # Call variants with strelka, using a dedicated output directory.
    strelka_directory_path = os.path.join(DIRECTORY_PATH, 'strelka')
    vcf_gz_file_path = get_variants_from_bam_using_strelka(
        duplicate_removed_bam_file_path,
        FASTA_FILE_PATH,
        strelka_directory_path,
        n_job=N_JOB,
        overwrite=OVERWRITE)

    # For strelka, downstream filtering keeps PASS records and uses no
    # include expression.
    keep_filters = ('PASS', )
    include_expression = None

# Filter the called variants to REGIONS using the caller-specific criteria
# (keep_filters / include_expression) chosen above.
filtered_vcf_file_path = os.path.join(DIRECTORY_PATH, 'filtered.vcf')
filtered_vcf_gz_file_path = filter_vcf_gz_using_bcftools_view(
    vcf_gz_file_path,
    regions=REGIONS,
    keep_filters=keep_filters,
    include_expression=include_expression,
    n_job=N_JOB,
    output_vcf_file_path=filtered_vcf_file_path,
    overwrite=OVERWRITE)

# Rename chromosomes with bcftools annotate before annotation.
chromosome_renamed_vcf_file_path = os.path.join(DIRECTORY_PATH,
                                                'chromosome_renamed.vcf')
chromosome_renamed_vcf_gz_file_path = rename_chromosome_of_vcf_gz_using_bcftools_annotate(
    filtered_vcf_gz_file_path,
    n_job=N_JOB,
    output_vcf_file_path=chromosome_renamed_vcf_file_path,
    overwrite=OVERWRITE)

# Annotate with SnpEff using the 'GRCh38.86' database; the renamed input and
# its index are asked to be removed.
snpeff_annotated_vcf_gz_file_path = annotate_vcf_gz_using_snpeff(
    chromosome_renamed_vcf_gz_file_path,
    'GRCh38.86',
    memory=MEMORY,
    remove_input_vcf_gz_file_path_and_its_index=True,
    n_job=N_JOB,
    overwrite=OVERWRITE)

# Annotate with the ClinVar VCF via bcftools annotate, passing the
# '--columns =ID,INFO' argument; the SnpEff-annotated input and its index are
# asked to be removed.
annotated_vcf_file_path = os.path.join(DIRECTORY_PATH, 'annotated.vcf')
bcftools_annotate_arguments = ('--columns =ID,INFO', )
clinvar_annotated_vcf_gz_file_path = annotate_vcf_gz_using_bcftools_annotate(
    snpeff_annotated_vcf_gz_file_path,
    CLINVAR_VCF_GZ_FILE_PATH,
    bcftools_annotate_arguments,
    remove_input_vcf_gz_file_path_and_its_index=True,
    n_job=N_JOB,
    output_vcf_file_path=annotated_vcf_file_path,
    overwrite=OVERWRITE)