In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

sys.path.insert(0, '..')
from sequencing_process.support.support.path import clean_path
from sequencing_process.make_reference_genome import make_reference_genome
from sequencing_process.download_clinvar_vcf_gz import download_clinvar_vcf_gz
from sequencing_process.process_fastq_gz import check_fastq_gzs_using_fastqc, trim_fastq_gzs_using_skewer, align_fastq_gzs_using_bwa_mem
from sequencing_process.process_bam import sort_and_index_bam_using_samtools_sort_and_index, mark_duplicates_in_bam_using_picard_markduplicates, check_bam_using_samtools_flagstat, get_variants_from_bam_using_freebayes_and_multiprocess, get_variants_from_bam_using_strelka
from sequencing_process.process_vcf_gz import annotate_vcf_gz_using_snpeff, rename_chromosomes_of_vcf_gz_using_bcftools_annotate, annotate_vcf_gz_using_bcftools_annotate, filter_vcf_gz_using_bcftools_view

In [3]:
# Directory expected to hold the reference genome and ClinVar files; it must
# already exist.
GRCH_DIRECTORY_PATH = clean_path('~/Jumis/grch/sequencing_process')
assert os.path.isdir(GRCH_DIRECTORY_PATH)

# Directory with the simulated reads; the pipeline outputs are written here too.
DIRECTORY_PATH = clean_path('../simulation')
assert os.path.isdir(DIRECTORY_PATH)

# chr1 .. chr22 followed by chrX, chrY and chrM, in that order.
regions = [
    'chr{}'.format(name) for name in list(range(1, 23)) + ['X', 'Y', 'M']
]

# Passed as n_job= to every pipeline step.
N_JOB = 2
# Passed as overwrite= to every pipeline step except make_reference_genome.
OVERWRITE = True

# Variant caller to use; the variant-calling cell handles 'freebayes' and
# 'strelka'.
VARIANT_METHOD = 'freebayes'

In [4]:
# Build the reference genome only when it is missing: overwrite=False makes
# the helper raise FileExistsError for an existing build, which is ignored.
try:
    make_reference_genome(GRCH_DIRECTORY_PATH, overwrite=False)
except FileExistsError:
    pass

# Fetch the ClinVar VCF (and its index, per the logged output); an already
# existing file is tolerated when OVERWRITE is off.
try:
    download_clinvar_vcf_gz(GRCH_DIRECTORY_PATH, overwrite=OVERWRITE)
except FileExistsError:
    pass

# Gzipped reference FASTA.
FASTA_GZ_FILE_PATH = os.path.join(
    GRCH_DIRECTORY_PATH,
    'GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa.gz',
)
assert os.path.isfile(FASTA_GZ_FILE_PATH)

# Same path without the trailing '.gz'; the uncompressed FASTA must exist too.
FASTA_FILE_PATH = os.path.splitext(FASTA_GZ_FILE_PATH)[0]
assert os.path.isfile(FASTA_FILE_PATH)

# ClinVar VCF used later for annotation.
CLINVAR_VCF_GZ_FILE_PATH = os.path.join(
    GRCH_DIRECTORY_PATH,
    'clinvar.vcf.gz',
)
assert os.path.isfile(CLINVAR_VCF_GZ_FILE_PATH)

Downloading ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz =(into)=> /Users/k/Jumis/grch/sequencing_process/clinvar.vcf.gz ...
Downloading ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz.tbi =(into)=> /Users/k/Jumis/grch/sequencing_process/clinvar.vcf.gz.tbi ...


In [5]:
# Paired-end input reads; both files must be present before trimming.
fastq_gz_1_file_path = os.path.join(DIRECTORY_PATH, '10k.bwa.read1.fastq.gz')
fastq_gz_2_file_path = os.path.join(DIRECTORY_PATH, '10k.bwa.read2.fastq.gz')
assert os.path.isfile(fastq_gz_1_file_path)
assert os.path.isfile(fastq_gz_2_file_path)

# Trim the pair with skewer; the helper returns the two trimmed file paths.
(fastq_gz_1_trimmed_file_path,
 fastq_gz_2_trimmed_file_path) = trim_fastq_gzs_using_skewer(
     (fastq_gz_1_file_path, fastq_gz_2_file_path),
     end_quality=10,
     output_directory_path=os.path.join(DIRECTORY_PATH, 'trimmed_fastq_gz'),
     overwrite=OVERWRITE,
 )


skewer -x ../resource/general_bad_sequences.fasta -r 0 -d 0 --end-quality 10 --min 30 -n --output /Users/k/Jumis/github_kwatme/sequencing_process/simulation/trimmed_fastq_gz/ --masked-output --excluded-output --threads 1 -m pe -y ../resource/general_bad_sequences.fasta /Users/k/Jumis/github_kwatme/sequencing_process/simulation/10k.bwa.read1.fastq.gz /Users/k/Jumis/github_kwatme/sequencing_process/simulation/10k.bwa.read2.fastq.gz
/Users/k/Jumis/github_kwatme/sequencing_process/simulation/trimmed_fastq_gz/trimmed.log:
skewer v0.2.2 [April 4, 2016]
COMMAND LINE:	skewer -x ../resource/general_bad_sequences.fasta -r 0 -d 0 --end-quality 10 --min 30 -n --output /Users/k/Jumis/github_kwatme/sequencing_process/simulation/trimmed_fastq_gz/ --masked-output --excluded-output --threads 1 -m pe -y ../resource/general_bad_sequences.fasta /Users/k/Jumis/github_kwatme/sequencing_process/simulation/10k.bwa.read1.fastq.gz /Users/k/Jumis/github_kwatme/sequencing_process/simulation/10k.bwa.read2.fastq.g

In [7]:
# Run FastQC on the raw pair followed by the trimmed pair (same order as the
# original 4-tuple) so the reports can be compared before/after trimming.
check_fastq_gzs_using_fastqc(
    (fastq_gz_1_file_path, fastq_gz_2_file_path) +
    (fastq_gz_1_trimmed_file_path, fastq_gz_2_trimmed_file_path),
    n_job=N_JOB,
    overwrite=OVERWRITE,
)


fastqc --threads 2 /Users/k/Jumis/github_kwatme/sequencing_process/simulation/10k.bwa.read1.fastq.gz /Users/k/Jumis/github_kwatme/sequencing_process/simulation/10k.bwa.read2.fastq.gz /Users/k/Jumis/github_kwatme/sequencing_process/simulation/trimmed_fastq_gz/trimmed-pair1.fastq.gz /Users/k/Jumis/github_kwatme/sequencing_process/simulation/trimmed_fastq_gz/trimmed-pair2.fastq.gz


In [None]:
# Align the trimmed read pair with bwa mem and write aligned.bam.
# Note that the gzipped FASTA is passed here, whereas the variant callers
# later receive the uncompressed FASTA_FILE_PATH.
bam_file_path = align_fastq_gzs_using_bwa_mem(
    (
        fastq_gz_1_trimmed_file_path,
        fastq_gz_2_trimmed_file_path, ),
    FASTA_GZ_FILE_PATH,
    n_job=N_JOB,
    output_bam_file_path=os.path.join(DIRECTORY_PATH, 'aligned.bam'),
    overwrite=OVERWRITE)

# Sort and index the alignment. remove_input_bam_file_path=True deletes
# aligned.bam afterwards (the logged commands show the `rm -rf`), so
# bam_file_path must not be reused after this call.
sorted_and_indexed_bam_file_path = sort_and_index_bam_using_samtools_sort_and_index(
    bam_file_path,
    remove_input_bam_file_path=True,
    n_job=N_JOB,
    overwrite=OVERWRITE)

# Run Picard MarkDuplicates with remove_duplicates=True and write
# duplicate_removed.bam. The sorted input BAM and its index are requested to
# be removed as well, per the remove_input_bam_file_path_and_its_index flag.
duplicate_removed_bam_file_path = mark_duplicates_in_bam_using_picard_markduplicates(
    sorted_and_indexed_bam_file_path,
    remove_duplicates=True,
    remove_input_bam_file_path_and_its_index=True,
    n_job=N_JOB,
    output_bam_file_path=os.path.join(DIRECTORY_PATH, 'duplicate_removed.bam'),
    overwrite=OVERWRITE)

# Run samtools flagstat on the final BAM; the return value is not used here.
check_bam_using_samtools_flagstat(
    duplicate_removed_bam_file_path, n_job=N_JOB, overwrite=OVERWRITE)


bwa mem -t 2 -v 3 /Users/k/Jumis/grch/sequencing_process/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa.gz /Users/k/Jumis/github_kwatme/sequencing_process/simulation/trimmed_fastq_gz/trimmed-pair1.fastq.gz /Users/k/Jumis/github_kwatme/sequencing_process/simulation/trimmed_fastq_gz/trimmed-pair2.fastq.gz | ../resource/k8-0.2.3/k8-darwin ../resource/bwa-postalt.js /Users/k/Jumis/grch/sequencing_process/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa.gz.alt | samtools view -Sb --threads 2 > /Users/k/Jumis/github_kwatme/sequencing_process/simulation/aligned.bam

samtools sort --threads 2 /Users/k/Jumis/github_kwatme/sequencing_process/simulation/aligned.bam > /Users/k/Jumis/github_kwatme/sequencing_process/simulation/sort_and_index_bam_using_samtools_sort_and_index.bam

rm -rf /Users/k/Jumis/github_kwatme/sequencing_process/simulation/aligned.bam

samtools index -@ 2 /Users/k/Jumis/github_kwatme/sequencing_process/simulation/sort_and_index_bam_using_samtools

In [None]:
# Call variants with the caller selected by VARIANT_METHOD. Each branch sets
# the three names the filtering step needs:
#   vcf_gz_file_path   - path returned by the variant caller
#   keep_filters       - forwarded as keep_filters= to the bcftools filter step
#   include_expression - forwarded as include_expression= to the same step
if VARIANT_METHOD == 'freebayes':

    # One freebayes run per region (see the logged commands), using the
    # uncompressed reference FASTA.
    vcf_gz_file_path = get_variants_from_bam_using_freebayes_and_multiprocess(
        duplicate_removed_bam_file_path,
        FASTA_FILE_PATH,
        regions,
        n_job=N_JOB,
        output_vcf_file_path=os.path.join(DIRECTORY_PATH, 'freebayes.vcf'),
        overwrite=OVERWRITE)

    # No FILTER-based selection; rely on the threshold expression instead.
    # NOTE(review): DP/AO/SRF/SRR/SAF/SAR/RPR/RPL are presumably freebayes
    # INFO tags - confirm against the freebayes VCF header.
    keep_filters = ()
    include_expression = '10<DP & 30<QUAL & 10<(QUAL/AO) & 1<SRF & 1<SRR & 1<SAF & 1<SAR & 1<RPR & 1<RPL'

elif VARIANT_METHOD == 'strelka':

    # Strelka writes into its own 'strelka' sub-directory.
    vcf_gz_file_path = get_variants_from_bam_using_strelka(
        duplicate_removed_bam_file_path,
        FASTA_FILE_PATH,
        os.path.join(DIRECTORY_PATH, 'strelka'),
        n_job=N_JOB,
        overwrite=OVERWRITE)

    # Keep only records whose FILTER is PASS; no extra expression.
    keep_filters = ('PASS', )
    include_expression = None

else:

    # Fail here with a clear message instead of letting the filtering step
    # crash with a NameError on the unbound vcf_gz_file_path.
    raise ValueError(
        "Unknown VARIANT_METHOD {!r}; expected 'freebayes' or 'strelka'.".
        format(VARIANT_METHOD))

# Filter the called variants with bcftools view, restricted to `regions` and
# using the caller-specific keep_filters / include_expression set above.
filtered_vcf_gz_file_path = filter_vcf_gz_using_bcftools_view(
    vcf_gz_file_path,
    regions=regions,
    keep_filters=keep_filters,
    include_expression=include_expression,
    n_job=N_JOB,
    output_vcf_file_path=os.path.join(DIRECTORY_PATH, 'filtered.vcf'),
    overwrite=OVERWRITE)

# Rename chromosomes with bcftools annotate.
# NOTE(review): presumably needed so contig names match the snpEff/ClinVar
# naming - the mapping lives inside the helper; confirm there.
chromosomes_renamed_vcf_gz_file_path = rename_chromosomes_of_vcf_gz_using_bcftools_annotate(
    filtered_vcf_gz_file_path,
    n_job=N_JOB,
    output_vcf_file_path=os.path.join(DIRECTORY_PATH,
                                      'chromosomes_renamed.vcf'),
    overwrite=OVERWRITE)

# Annotate with snpEff; 'GRCh38.86' is presumably the snpEff genome database
# name (verify against the helper). The renamed input VCF and its index are
# requested to be removed afterwards, so chromosomes_renamed_vcf_gz_file_path
# must not be reused after this call.
snpeff_annotated_vcf_gz_file_path = annotate_vcf_gz_using_snpeff(
    chromosomes_renamed_vcf_gz_file_path,
    'GRCh38.86',
    remove_input_vcf_gz_file_path_and_its_index=True,
    n_job=N_JOB,
    overwrite=OVERWRITE)

# Annotate from the ClinVar VCF, forwarding '--columns =ID,INFO' as an
# additional argument, and write annotated.vcf. The snpEff-annotated input and
# its index are requested to be removed afterwards.
clinvar_annotated_vcf_gz_file_path = annotate_vcf_gz_using_bcftools_annotate(
    snpeff_annotated_vcf_gz_file_path,
    CLINVAR_VCF_GZ_FILE_PATH, ('--columns =ID,INFO', ),
    remove_input_vcf_gz_file_path_and_its_index=True,
    n_job=N_JOB,
    output_vcf_file_path=os.path.join(DIRECTORY_PATH, 'annotated.vcf'),
    overwrite=OVERWRITE)



freebayes --fasta-reference /Users/k/Jumis/grch/sequencing_process/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr1 /Users/k/Jumis/github_kwatme/sequencing_process/simulation/duplicate_removed.bam > /Users/k/Jumis/github_kwatme/sequencing_process/simulation/get_variants_from_bam_using_freebayes.--region_chr1.vcf
freebayes --fasta-reference /Users/k/Jumis/grch/sequencing_process/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr5 /Users/k/Jumis/github_kwatme/sequencing_process/simulation/duplicate_removed.bam > /Users/k/Jumis/github_kwatme/sequencing_process/simulation/get_variants_from_bam_using_freebayes.--region_chr5.vcf
