In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to the sequencing_process package are picked
# up without restarting the kernel.
# NOTE: keep comments on their own lines - text after a line magic is parsed
# as part of the magic's arguments.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

sys.path.insert(0, '..')
from sequencing_process.support.support.path import clean_path
from sequencing_process.make_reference_genome import make_reference_genome
from sequencing_process.download_clinvar_vcf_gz import download_clinvar_vcf_gz
from sequencing_process.process_fastq_gz import check_fastq_gzs_using_fastqc, trim_fastq_gzs_using_skewer, align_fastq_gzs_using_bwa_mem
from sequencing_process.process_bam import sort_and_index_bam_using_samtools_sort_and_index, mark_duplicates_in_bam_using_picard_markduplicates, check_bam_using_samtools_flagstat, call_variants_on_bam_using_freebayes_and_multiprocess
from sequencing_process.process_vcf_gz import annotate_vcf_gz_using_snpeff, rename_chromosomes_of_vcf_gz_using_bcftools_annotate, annotate_vcf_gz_using_bcftools_annotate, filter_vcf_gz_using_bcftools_view

In [3]:
# Directory that holds the reference genome and the ClinVar VCF.
GRCH_DIRECTORY_PATH = clean_path('~/Downloads')
assert os.path.isdir(GRCH_DIRECTORY_PATH)

# Run each resource-preparation step in turn. A FileExistsError is swallowed
# on purpose so that re-running this cell continues with what is on disk
# (presumably the helpers raise it when their outputs already exist - confirm
# against their implementation).
for prepare_resource in (make_reference_genome, download_clinvar_vcf_gz):
    try:
        prepare_resource(GRCH_DIRECTORY_PATH)
    except FileExistsError:
        pass

# Compressed reference FASTA expected inside GRCH_DIRECTORY_PATH.
FASTA_GZ_FILE_PATH = os.path.join(
    GRCH_DIRECTORY_PATH,
    'GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa.gz',
)
assert os.path.isfile(FASTA_GZ_FILE_PATH)

# Same path without the trailing '.gz' extension; this file must exist too.
FASTA_FILE_PATH = os.path.splitext(FASTA_GZ_FILE_PATH)[0]
assert os.path.isfile(FASTA_FILE_PATH)

# ClinVar VCF expected inside GRCH_DIRECTORY_PATH.
CLINVAR_VCF_GZ_FILE_PATH = os.path.join(GRCH_DIRECTORY_PATH, 'clinvar.vcf.gz')
assert os.path.isfile(CLINVAR_VCF_GZ_FILE_PATH)

In [4]:
# Run-wide settings shared by the pipeline cells below.

# Forwarded as `overwrite=` to every pipeline step in this notebook.
OVERWRITE = True

# Forwarded as `n_job=` to the pipeline steps that accept it.
N_JOBS = 2

In [5]:
# Working directory containing the input FASTQ files.
DIRECTORY_PATH = clean_path('~/Downloads')

# First read file of the pair; fail early if it is missing.
fastq_gz_1_file_path = os.path.join(
    DIRECTORY_PATH, 'simulation.bwa.read1.fastq.gz')
assert os.path.isfile(fastq_gz_1_file_path)

# Second read file of the pair; fail early if it is missing.
fastq_gz_2_file_path = os.path.join(
    DIRECTORY_PATH, 'simulation.bwa.read2.fastq.gz')
assert os.path.isfile(fastq_gz_2_file_path)

# Run FastQC on both raw read files.
raw_fastq_gz_file_paths = (fastq_gz_1_file_path, fastq_gz_2_file_path)
check_fastq_gzs_using_fastqc(
    raw_fastq_gz_file_paths,
    n_job=N_JOBS,
    overwrite=OVERWRITE,
)


fastqc --threads 2 /Users/k/Downloads/simulation.bwa.read1.fastq.gz /Users/k/Downloads/simulation.bwa.read2.fastq.gz


In [9]:
# Trim the raw read pair with skewer; outputs go to a 'trimmed_fastq_gz'
# sub-directory of the working directory.
trimmed_output_directory_path = os.path.join(DIRECTORY_PATH, 'trimmed_fastq_gz')
trimmed_fastq_gz_file_paths = trim_fastq_gzs_using_skewer(
    (fastq_gz_1_file_path, fastq_gz_2_file_path),
    end_quality=10,
    output_directory_path=trimmed_output_directory_path,
    overwrite=OVERWRITE,
)

# The trimmer is expected to hand back exactly two paths (one per mate).
fastq_gz_1_trimmed_file_path, fastq_gz_2_trimmed_file_path = trimmed_fastq_gz_file_paths

# Re-run FastQC on the trimmed reads.
check_fastq_gzs_using_fastqc(
    (fastq_gz_1_trimmed_file_path, fastq_gz_2_trimmed_file_path),
    n_job=N_JOBS,
    overwrite=OVERWRITE,
)


skewer -x ../resource/general_bad_sequences.fasta -r 0 -d 0 --end-quality 10 --min 30 -n --output /Users/k/Downloads/trimmed_fastq_gz/ --masked-output --excluded-output --threads 1 -m pe -y ../resource/general_bad_sequences.fasta /Users/k/Downloads/simulation.bwa.read1.fastq.gz /Users/k/Downloads/simulation.bwa.read2.fastq.gz
/Users/k/Downloads/trimmed_fastq_gz/trimmed.log:
skewer v0.2.2 [April 4, 2016]
COMMAND LINE:	skewer -x ../resource/general_bad_sequences.fasta -r 0 -d 0 --end-quality 10 --min 30 -n --output /Users/k/Downloads/trimmed_fastq_gz/ --masked-output --excluded-output --threads 1 -m pe -y ../resource/general_bad_sequences.fasta /Users/k/Downloads/simulation.bwa.read1.fastq.gz /Users/k/Downloads/simulation.bwa.read2.fastq.gz
Input file:	/Users/k/Downloads/simulation.bwa.read1.fastq.gz
Paired file:	/Users/k/Downloads/simulation.bwa.read2.fastq.gz
trimmed:	/Users/k/Downloads/trimmed_fastq_gz/trimmed-pair1.fastq, /Users/k/Downloads/trimmed_fastq_gz/trimmed-pair2.fastq

Para

In [None]:
# Align the trimmed read pair against the compressed reference with bwa mem.
aligned_bam_file_path = os.path.join(DIRECTORY_PATH, 'aligned.bam')
bam_file_path = align_fastq_gzs_using_bwa_mem(
    (fastq_gz_1_trimmed_file_path, fastq_gz_2_trimmed_file_path),
    FASTA_GZ_FILE_PATH,
    n_job=N_JOBS,
    output_bam_file_path=aligned_bam_file_path,
    overwrite=OVERWRITE,
)

# Sort and index the alignment; the unsorted input BAM is requested to be
# removed afterwards.
sorted_and_indexed_bam_file_path = sort_and_index_bam_using_samtools_sort_and_index(
    bam_file_path,
    remove_input_bam_file_path=True,
    n_job=N_JOBS,
    overwrite=OVERWRITE,
)

# Mark duplicates with Picard and drop them (remove_duplicates=True); the
# sorted input BAM and its index are requested to be removed afterwards.
deduplicated_bam_output_file_path = os.path.join(
    DIRECTORY_PATH, 'duplicate_removed.bam')
duplicate_removed_bam_file_path = mark_duplicates_in_bam_using_picard_markduplicates(
    sorted_and_indexed_bam_file_path,
    remove_duplicates=True,
    remove_input_bam_file_path_and_its_index=True,
    n_job=N_JOBS,
    output_bam_file_path=deduplicated_bam_output_file_path,
    overwrite=OVERWRITE,
)

# Summarise the final BAM with samtools flagstat.
check_bam_using_samtools_flagstat(
    duplicate_removed_bam_file_path,
    n_job=N_JOBS,
    overwrite=OVERWRITE,
)


bwa index /Users/k/Downloads/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa.gz


In [None]:
# Chromosome names handed to the variant caller: chr1..chr22 followed by
# chrX, chrY and chrM.
# The original expression added a generator expression to a tuple, which
# raises "TypeError: unsupported operand type(s) for +: 'generator' and
# 'tuple'" before the caller is ever invoked. Materialise the generator as a
# tuple first so the concatenation is valid.
chromosomes = tuple('chr{}'.format(i) for i in range(1, 23)) + (
    'chrX',
    'chrY',
    'chrM', )

# Call variants on the de-duplicated BAM against the uncompressed reference
# FASTA. The chromosome names are passed as the third positional argument
# (presumably the regions processed in parallel - confirm against
# call_variants_on_bam_using_freebayes_and_multiprocess).
vcf_gz_file_path = call_variants_on_bam_using_freebayes_and_multiprocess(
    duplicate_removed_bam_file_path,
    FASTA_FILE_PATH,
    chromosomes,
    n_job=N_JOBS,
    overwrite=OVERWRITE)

# Rename the chromosomes in the called VCF with bcftools annotate; the input
# VCF and its index are requested to be removed afterwards.
chromosomes_renamed_vcf_gz_file_path = rename_chromosomes_of_vcf_gz_using_bcftools_annotate(
    vcf_gz_file_path,
    remove_input_vcf_gz_file_path_and_its_index=True,
    n_job=N_JOBS,
    overwrite=OVERWRITE,
)

# Annotate with snpEff using the 'GRCh38.86' database identifier.
snpeff_annotated_vcf_gz_file_path = annotate_vcf_gz_using_snpeff(
    chromosomes_renamed_vcf_gz_file_path,
    'GRCh38.86',
    remove_input_vcf_gz_file_path_and_its_index=True,
    n_job=N_JOBS,
    overwrite=OVERWRITE,
)

# Annotate with ClinVar via bcftools annotate, passing the extra
# '--columns =ID,INFO' argument through to the helper.
annotated_vcf_output_file_path = os.path.join(DIRECTORY_PATH, 'annotated.vcf')
clinvar_annotated_vcf_gz_file_path = annotate_vcf_gz_using_bcftools_annotate(
    snpeff_annotated_vcf_gz_file_path,
    CLINVAR_VCF_GZ_FILE_PATH,
    ('--columns =ID,INFO', ),
    remove_input_vcf_gz_file_path_and_its_index=True,
    n_job=N_JOBS,
    output_vcf_file_path=annotated_vcf_output_file_path,
    overwrite=OVERWRITE,
)

# Filter the fully annotated VCF with bcftools view. Unlike the steps above,
# the annotated input is kept (no removal flag is passed).
filtered_vcf_output_file_path = os.path.join(DIRECTORY_PATH, 'filtered.vcf')
filtered_vcf_gz_file_path = filter_vcf_gz_using_bcftools_view(
    clinvar_annotated_vcf_gz_file_path,
    n_job=N_JOBS,
    output_vcf_file_path=filtered_vcf_output_file_path,
    overwrite=OVERWRITE,
)