In [1]:
import os
import sys

sys.path.insert(0, '..')
from sequencing_process.support.support.path import clean_path
from sequencing_process.process_fastq_gz import check_fastq_gzs_using_fastqc, trim_fastq_gzs_using_skewer, align_fastq_gzs_using_bwa_mem
from sequencing_process.process_bam import sort_and_index_bam_using_samtools, mark_duplicates_in_bam_using_picard_markduplicates, check_bam_using_samtools_flagstat, call_variants_on_bam_using_freebayes, call_variants_on_bam_using_freebayes_and_multiprocess
from sequencing_process.process_vcf_gz import annotate_vcf_gz_using_snpeff, annotate_vcf_gz_using_bcftools, filter_vcf_gz_using_bcftools

%load_ext autoreload
%autoreload 2
%matplotlib inline

## Set reference file paths

In [2]:
GRCH_DIRECTORY_PATH = clean_path('/Volumes/Jumis/data/grch')
assert os.path.isdir(GRCH_DIRECTORY_PATH)

FASTA_GZ_FILE_PATH = os.path.join(
    GRCH_DIRECTORY_PATH,
    'GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa.gz')
assert os.path.isfile(FASTA_GZ_FILE_PATH)

FASTA_FILE_PATH = FASTA_GZ_FILE_PATH[:-3]
assert os.path.isfile(FASTA_FILE_PATH)

CLINVAR_VCF_GZ_FILE_PATH = os.path.join(GRCH_DIRECTORY_PATH, 'clinvar.vcf.gz')
assert os.path.isfile(CLINVAR_VCF_GZ_FILE_PATH)

## Set parameters

In [3]:
N_JOBS = 4
OVERWRITE = True

## Check .fastq.gz

In [4]:
DIRECTORY_PATH = clean_path('../simulation/1k')

fastq_gz_1_file_path = os.path.join(DIRECTORY_PATH,
                                    'simulation.bwa.read1.fastq.gz')
assert os.path.isfile(fastq_gz_1_file_path)

fastq_gz_2_file_path = os.path.join(DIRECTORY_PATH,
                                    'simulation.bwa.read2.fastq.gz')
assert os.path.isfile(fastq_gz_2_file_path)

check_fastq_gzs_using_fastqc(
    [fastq_gz_1_file_path, fastq_gz_2_file_path],
    n_jobs=N_JOBS,
    overwrite=OVERWRITE)

Using paired .fastq.gz file paths ...
fastqc --threads 4 /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/simulation.bwa.read1.fastq.gz /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/simulation.bwa.read2.fastq.gz


## Trim and recheck .fastq.gz

In [5]:
fastq_gz_1_trimmed_file_path, fastq_gz_2_trimmed_file_path = trim_fastq_gzs_using_skewer(
    [fastq_gz_1_file_path, fastq_gz_2_file_path],
    end_quality=3,
    overwrite=OVERWRITE)

check_fastq_gzs_using_fastqc(
    [fastq_gz_1_trimmed_file_path, fastq_gz_2_trimmed_file_path],
    n_jobs=N_JOBS,
    overwrite=OVERWRITE)

Using paired .fastq.gz file paths ...
skewer -x /Users/k/Jumis/github_kwatme/sequencing_process/resource/general_bad_sequence.fasta -r 0 -d 0 --end-quality 3 --min 30 -n --output /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/trim_fastq_gzs_using_skewer --masked-output --excluded-output --threads 1 -y /Users/k/Jumis/github_kwatme/sequencing_process/resource/general_bad_sequence.fasta /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/simulation.bwa.read1.fastq.gz /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/simulation.bwa.read2.fastq.gz
/Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/trim_fastq_gzs_using_skewer-trimmed.log:
skewer v0.2.2 [April 4, 2016]
COMMAND LINE:	skewer -x /Users/k/Jumis/github_kwatme/sequencing_process/resource/general_bad_sequence.fasta -r 0 -d 0 --end-quality 3 --min 30 -n --output /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/trim_fastq_gzs_using_skewer --masked-output --excluded-output --

## Align to the reference genome

In [6]:
bam_file_path = align_fastq_gzs_using_bwa_mem(
    [fastq_gz_1_trimmed_file_path, fastq_gz_2_trimmed_file_path],
    FASTA_GZ_FILE_PATH,
    n_jobs=N_JOBS,
    overwrite=OVERWRITE)

sorted_and_indexed_bam_file_path = sort_and_index_bam_using_samtools(
    bam_file_path,
    remove_input_bam_file_path=True,
    n_jobs=N_JOBS,
    overwrite=OVERWRITE)

check_bam_using_samtools_flagstat(
    sorted_and_indexed_bam_file_path, n_jobs=N_JOBS, overwrite=OVERWRITE)

duplicate_removed_bam_file_path = mark_duplicates_in_bam_using_picard_markduplicates(
    sorted_and_indexed_bam_file_path,
    remove_duplicates=True,
    remove_input_bam_file_path_and_its_index=True,
    n_jobs=N_JOBS,
    overwrite=OVERWRITE)

check_bam_using_samtools_flagstat(
    duplicate_removed_bam_file_path, n_jobs=N_JOBS, overwrite=OVERWRITE)

Using paired .fastq.gz file paths ...
bwa mem -t 4 /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa.gz /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/trim_fastq_gzs_using_skewer-trimmed-pair1.fastq.gz /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/trim_fastq_gzs_using_skewer-trimmed-pair2.fastq.gz | /Users/k/Jumis/github_kwatme/sequencing_process/resource/k8-0.2.3/k8-darwin /Users/k/Jumis/github_kwatme/sequencing_process/resource/bwa-postalt.js /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa.gz.alt | samtools view -Sb --threads 4 > /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/align_fastq_gzs_using_bwa_mem.bam
samtools sort --threads 4 /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/align_fastq_gzs_using_bwa_mem.bam > /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/sort_and_index_bam_using_samtools.bam
Consider removing unsorted .bam f

## Detect variant

In [8]:
vcf_gz_file_path = call_variants_on_bam_using_freebayes_and_multiprocess(
    clean_bam_file_path,
    FASTA_FILE_PATH, ['chr{}'.format(i)
                      for i in range(1, 23)] + ['chrX', 'chrY', 'chrM'],
    n_jobs=N_JOBS,
    overwrite=OVERWRITE)

snpeff_annotated_vcf_gz_file_path = annotate_vcf_gz_using_snpeff(
    vcf_gz_file_path,
    'GRCh38.86',
    remove_input_vcf_gz_file_path_and_its_index=True,
    n_jobs=N_JOBS,
    output_vcf_file_path=os.path.join(DIRECTORY_PATH, 'snpeff.vcf'),
    overwrite=OVERWRITE)

clinvar_annotated_vcf_gz_file_path = annotate_vcf_gz_using_bcftools(
    snpeff_annotated_vcf_gz_file_path,
    CLINVAR_VCF_GZ_FILE_PATH, ['--columns =ID,INFO'],
    remove_input_vcf_gz_file_path_and_its_index=True,
    n_jobs=N_JOBS,
    output_vcf_file_path=os.path.join(DIRECTORY_PATH, 'annotated.vcf'),
    overwrite=OVERWRITE)

filtered_vcf_gz_file_path = filter_vcf_gz_using_bcftools(
    clinvar_annotated_vcf_gz_file_path,
    n_jobs=N_JOBS,
    output_vcf_file_path=os.path.join(DIRECTORY_PATH, 'filtered.vcf'),
    overwrite=OVERWRITE)

freebayes --fasta-reference /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr5 /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/clean.bam > /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/call_variants_on_bam_using_freebayes.--region_chr5.vcf
freebayes --fasta-reference /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr3 /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/clean.bam > /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/call_variants_on_bam_using_freebayes.--region_chr3.vcf
freebayes --fasta-reference /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr7 /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/clean.bam > /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/call_variants_on_bam_using_freebayes.--region_chr7.vcf
freebayes --fasta-reference /Volumes/

freebayes --fasta-reference /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr17 /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/clean.bam > /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/call_variants_on_bam_using_freebayes.--region_chr17.vcf
bgzip --threads 1 --force /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/call_variants_on_bam_using_freebayes.--region_chr15.vcf && tabix --force /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/call_variants_on_bam_using_freebayes.--region_chr15.vcf.gz
freebayes --fasta-reference /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr16 /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/clean.bam > /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/call_variants_on_bam_using_freebayes.--region_chr16.vcf
bgzip --threads 1 --force /Users/k/Jumis/github_kwatme/sequencing_proces

CalledProcessError: Command 'snpEff -Xmx12G -htmlStats /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/snpeff.vcf.stats.html -csvStats /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/snpeff.vcf.stats.csv -t -verbose -noLog GRCh38.86 /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/concatenate_vcf_gzs_using_bcftools.vcf.gz > /Users/k/Jumis/github_kwatme/sequencing_process/simulation/5k/snpeff.vcf' returned non-zero exit status 255.