In [1]:
import os
import multiprocessing as mp
import sys

sys.path.insert(0, '..')
from sequencing_process.support.support.path import clean_path
from sequencing_process.make_reference_genome_for_sequencing_process import make_reference_genome_for_sequencing_process
from sequencing_process.plot_fastq_gz_or_bam import plot_fastq_gz_or_bam
from sequencing_process.process_fastq_gz import align_fastq_gzs_using_bwa
from sequencing_process.process_bam import sort_and_index_bam_using_samtools, mark_duplicates_in_bam_using_picard, call_variants_on_bam_using_freebayes_and_multiprocess
from sequencing_process.process_vcf_gz import annotate_vcf_gz_using_snpeff, annotate_vcf_gz_using_bcftools, filter_vcf_gz_using_bcftools

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Directory holding the reference files used below. It defaults to the
# original external-volume location; set the GRCH_DIRECTORY_PATH environment
# variable to point the notebook elsewhere without editing this cell.
GRCH_DIRECTORY_PATH = clean_path(
    os.environ.get('GRCH_DIRECTORY_PATH') or '/Volumes/Jumis/data/grch')
assert os.path.isdir(GRCH_DIRECTORY_PATH), GRCH_DIRECTORY_PATH

# One-time setup, left disabled. Presumably it creates the reference files
# asserted below -- confirm before enabling.
# make_reference_genome_for_sequencing_process(GRCH_DIRECTORY_PATH)

FASTA_GZ_FILE_PATH = os.path.join(
    GRCH_DIRECTORY_PATH,
    'GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa.gz')
assert os.path.isfile(FASTA_GZ_FILE_PATH), FASTA_GZ_FILE_PATH

# Same path with the trailing '.gz' dropped, i.e. the '.fa' file that must
# sit next to the '.fa.gz'.
FASTA_FILE_PATH = FASTA_GZ_FILE_PATH[:-3]
assert os.path.isfile(FASTA_FILE_PATH), FASTA_FILE_PATH

CLINVAR_VCF_GZ_FILE_PATH = os.path.join(GRCH_DIRECTORY_PATH, 'clinvar.vcf.gz')
assert os.path.isfile(CLINVAR_VCF_GZ_FILE_PATH), CLINVAR_VCF_GZ_FILE_PATH

# Passed as n_jobs= to the pipeline steps in the cells below.
N_JOBS = 4
# Passed as overwrite= to the pipeline steps in the cells below.
OVERWRITE = True

# When True, the cells below also plot the input FASTQs and the final BAM.
PLOT = True

In [3]:
# Working directory of this run; every input and output below lives in it.
DIRECTORY_PATH = clean_path('1k')

# The two input FASTQ files (read1 / read2).
fastq_gz_0_file_path = os.path.join(
    DIRECTORY_PATH, 'simulation.bwa.read1.fastq.gz')
fastq_gz_1_file_path = os.path.join(
    DIRECTORY_PATH, 'simulation.bwa.read2.fastq.gz')

# Fail early if either input is missing.
for fastq_gz_file_path in (fastq_gz_0_file_path, fastq_gz_1_file_path):
    assert os.path.isfile(fastq_gz_file_path)

if PLOT:
    # Plot each FASTQ in a background worker so the following cells can start
    # right away. The pool stays bound to `p` so a later cell can shut it down.
    p = mp.Pool(processes=2)

    for fastq_gz_file_path in (fastq_gz_0_file_path, fastq_gz_1_file_path):
        p.apply_async(
            plot_fastq_gz_or_bam,
            args=[fastq_gz_file_path],
            kwds=dict(overwrite=OVERWRITE))

fastqp --output /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/simulation.bwa.read1.fastq.gz.plot --text /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/simulation.bwa.read1.fastq.gz.plot.tsv /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/simulation.bwa.read1.fastq.gz
fastqp --output /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/simulation.bwa.read2.fastq.gz.plot --text /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/simulation.bwa.read2.fastq.gz.plot.tsv /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/simulation.bwa.read2.fastq.gz


In [4]:
# Step 1: align both FASTQs against the gzipped reference FASTA.
aligned_bam_file_path = align_fastq_gzs_using_bwa(
    FASTA_GZ_FILE_PATH,
    [fastq_gz_0_file_path, fastq_gz_1_file_path],
    n_jobs=N_JOBS,
    overwrite=OVERWRITE)

# Step 2: sort and index the alignment, then drop the unsorted intermediate.
sorted_and_indexed_bam_file_path = sort_and_index_bam_using_samtools(
    aligned_bam_file_path,
    n_jobs=N_JOBS,
    overwrite=OVERWRITE)
os.remove(aligned_bam_file_path)

# Step 3: mark duplicates, writing the final BAM of this stage to
# 'aligned.bam' inside the working directory.
final_bam_file_path = os.path.join(DIRECTORY_PATH, 'aligned.bam')
duplicate_marked_bam_file_path = mark_duplicates_in_bam_using_picard(
    sorted_and_indexed_bam_file_path,
    n_jobs=N_JOBS,
    output_bam_file_path=final_bam_file_path,
    overwrite=OVERWRITE)

# The sorted BAM and its '.bai' index are intermediates as well.
for intermediate_file_path in (sorted_and_indexed_bam_file_path,
                               sorted_and_indexed_bam_file_path + '.bai'):
    os.remove(intermediate_file_path)

if PLOT:
    plot_fastq_gz_or_bam(duplicate_marked_bam_file_path, overwrite=OVERWRITE)

bwa mem -t 4 /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa.gz /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/simulation.bwa.read1.fastq.gz /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/simulation.bwa.read2.fastq.gz | /Users/k/Jumis/github_kwatme/sequencing_process/sequencing_process/k8-0.2.3/k8-darwin /Users/k/Jumis/github_kwatme/sequencing_process/sequencing_process/bwa-postalt.js /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa.gz.alt | samtools view -Sb --threads 4 > /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/align_fastq_gzs_using_bwa.bam
samtools sort --threads 4 /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/align_fastq_gzs_using_bwa.bam > /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/sort_and_index_bam_using_samtools.bam
Consider removing unsorted .bam file /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/align_fastq_gz

In [5]:
# Regions passed to the variant caller: chr1 .. chr22, then chrX, chrY, chrM.
regions = ['chr{}'.format(i) for i in range(1, 23)]
regions.extend(['chrX', 'chrY', 'chrM'])

# Step 1: call variants on the duplicate-marked BAM against the '.fa'
# reference.
vcf_gz_file_path = call_variants_on_bam_using_freebayes_and_multiprocess(
    duplicate_marked_bam_file_path,
    FASTA_FILE_PATH,
    regions,
    n_jobs=N_JOBS,
    overwrite=OVERWRITE)

# Step 2: annotate with snpEff, then delete the raw calls and their '.tbi'
# index.
snpeff_vcf_file_path = os.path.join(DIRECTORY_PATH, 'snpeff.vcf')
snpeff_annotated_vcf_gz_file_path = annotate_vcf_gz_using_snpeff(
    vcf_gz_file_path,
    n_jobs=N_JOBS,
    output_vcf_file_path=snpeff_vcf_file_path,
    overwrite=OVERWRITE)
for suffix in ('', '.tbi'):
    os.remove(vcf_gz_file_path + suffix)

# Step 3: annotate from the ClinVar VCF using the extra bcftools argument
# below, then delete the snpEff-only file and its '.tbi' index.
annotated_vcf_file_path = os.path.join(DIRECTORY_PATH, 'annotated.vcf')
clinvar_annotated_vcf_gz_file_path = annotate_vcf_gz_using_bcftools(
    snpeff_annotated_vcf_gz_file_path,
    CLINVAR_VCF_GZ_FILE_PATH,
    ['--columns =ID,INFO'],
    n_jobs=N_JOBS,
    output_vcf_file_path=annotated_vcf_file_path,
    overwrite=OVERWRITE)
for suffix in ('', '.tbi'):
    os.remove(snpeff_annotated_vcf_gz_file_path + suffix)

# Step 4: filter the annotated calls. The annotated file is kept alongside
# the filtered one.
filtered_vcf_file_path = os.path.join(DIRECTORY_PATH, 'filtered.vcf')
filtered_vcf_gz_file_path = filter_vcf_gz_using_bcftools(
    clinvar_annotated_vcf_gz_file_path,
    n_jobs=N_JOBS,
    output_vcf_file_path=filtered_vcf_file_path,
    overwrite=OVERWRITE)

freebayes --fasta-reference /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr1 /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/aligned.bam > /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/call_variants_on_bam_using_freebayes.--region_chr1.vcf
freebayes --fasta-reference /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr5 /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/aligned.bam > /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/call_variants_on_bam_using_freebayes.--region_chr5.vcf
freebayes --fasta-reference /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr7 /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/aligned.bam > /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/call_variants_on_bam_using_freebayes.--region_chr7.vcf
freebayes --fasta-reference /Volumes/Jumis/

bgzip --threads 1 --force /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/call_variants_on_bam_using_freebayes.--region_chr15.vcf; tabix --force /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/call_variants_on_bam_using_freebayes.--region_chr15.vcf.gz
freebayes --fasta-reference /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr16 /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/aligned.bam > /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/call_variants_on_bam_using_freebayes.--region_chr16.vcf
bgzip --threads 1 --force /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/call_variants_on_bam_using_freebayes.--region_chr14.vcf; tabix --force /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/call_variants_on_bam_using_freebayes.--region_chr14.vcf.gz
freebayes --fasta-reference /Volumes/Jumis/data/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region ch

bgzip --threads 4 --force /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/snpeff.vcf; tabix --force /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/snpeff.vcf.gz
bcftools annotate --annotations /Volumes/Jumis/data/grch/clinvar.vcf.gz --threads 4 --columns =ID,INFO /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/snpeff.vcf.gz > /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/annotated.vcf
bgzip --threads 4 --force /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/annotated.vcf; tabix --force /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/annotated.vcf.gz
bcftools view --include '10<=DP & 10<=QUAL & 10<=(QUAL/AO) & 1<=SRF & 1<=SRR & 1<=SAF & 1<=SAR & 1<=RPR & 1<=RPL' --threads 4 /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/annotated.vcf.gz > /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/filtered.vcf
bgzip --threads 4 --force /Users/k/Jumis/github_kwatme/sequencing_process/notebook/1k/filte

In [6]:
if PLOT:
    # Shut the background plotting pool down gracefully: stop accepting new
    # jobs, then wait for any plot still queued or running to finish.
    # terminate() would stop the workers immediately and could discard an
    # unfinished plot.
    p.close()
    p.join()