In [1]:
# Standard library: used below for path joins and existence checks.
import os

import sys
# Put the parent directory first on the module search path.
# NOTE(review): presumably this is where the local `sequencing_process`
# package lives — confirm against the repository layout.
sys.path.insert(0, '..')

# Pipeline stages called by the cells below: alignment (FASTQ -> BAM), variant
# calling (BAM -> VCF), and VCF annotation / filtering.
from sequencing_process.process_fastq_gz import align_fastq_gzs_using_bwa
from sequencing_process.process_bam import call_variants_on_bam_using_freebayes_and_multiprocess
from sequencing_process.process_vcf_gz import annotate_vcf_gz_using_snpeff, annotate_vcf_gz_using_bcftools, filter_vcf_gz_using_bcftools

# IPython magics: load the autoreload extension in mode 2 (re-import modules
# before each cell execution) and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Directory holding the GRCh reference resources. The default is the original
# workstation path; set the GRCH_DIRECTORY_PATH environment variable to run
# this notebook on another machine without editing the cell. An unset or empty
# variable falls back to the default.
GRCH_DIRECTORY_PATH = (os.environ.get('GRCH_DIRECTORY_PATH')
                       or '/media/k/Chocolate/Jumis/data/grch')
assert os.path.isdir(GRCH_DIRECTORY_PATH), (
    'GRCh directory not found: {}'.format(GRCH_DIRECTORY_PATH))

# Uncompressed reference FASTA (passed to the variant caller below).
FASTA_FILE_PATH = os.path.join(GRCH_DIRECTORY_PATH,
                               'Homo_sapiens.GRCh38.dna.primary_assembly.fa')
assert os.path.isfile(FASTA_FILE_PATH), (
    'Reference FASTA not found: {}'.format(FASTA_FILE_PATH))

# Compressed copy of the same reference (passed to the aligner below).
FASTA_GZ_FILE_PATH = FASTA_FILE_PATH + '.gz'
assert os.path.isfile(FASTA_GZ_FILE_PATH), (
    'Compressed reference FASTA not found: {}'.format(FASTA_GZ_FILE_PATH))

# ClinVar VCF used as the annotation source for bcftools.
CLINVAR_VCF_GZ_FILE_PATH = os.path.join(GRCH_DIRECTORY_PATH, 'clinvar.vcf.gz')
assert os.path.isfile(CLINVAR_VCF_GZ_FILE_PATH), (
    'ClinVar VCF not found: {}'.format(CLINVAR_VCF_GZ_FILE_PATH))

In [3]:
# The two gzipped FASTQ inputs handed to the aligner.
# NOTE(review): presumably the two mates of a paired-end simulation — confirm.
FASTQ_GZ_0_FILE_PATH, FASTQ_GZ_1_FILE_PATH = (
    'simulation_0.fq.gz',
    'simulation_1.fq.gz',
)

# Value passed as `n_jobs` to the pipeline stages in the cells below.
N_JOBS = 7

In [4]:
# BAM expected from a previous alignment run; reused when it is already on disk.
BAM_FILE_PATH = 'sort_bam_using_samtools.bam'

# Guard on the file actually existing rather than on the truthiness of the
# (always non-empty) path string: the old `if not BAM_FILE_PATH:` could never
# be true, so alignment was unreachable and a missing BAM only surfaced later
# as a variant-calling failure.
if not os.path.isfile(BAM_FILE_PATH):

    # Align the two FASTQ files against the compressed reference; the aligner
    # returns the path of the BAM it produced, which replaces the cached path.
    BAM_FILE_PATH = align_fastq_gzs_using_bwa(
        FASTA_GZ_FILE_PATH, [
            FASTQ_GZ_0_FILE_PATH,
            FASTQ_GZ_1_FILE_PATH,
        ],
        n_jobs=N_JOBS)

In [5]:
# Manual switch: put the path of an existing .vcf.gz here to skip the pipeline
# below; while it stays None every stage runs.
VCF_GZ_FILE_PATH = None

if not VCF_GZ_FILE_PATH:

    # Each stage returns the path of the file it wrote, and that path is the
    # input of the next stage, so the name is rebound four times in a row.

    # 1. Call variants on the BAM against the uncompressed reference.
    VCF_GZ_FILE_PATH = call_variants_on_bam_using_freebayes_and_multiprocess(
        BAM_FILE_PATH,
        FASTA_FILE_PATH,
        n_jobs=N_JOBS,
    )

    # 2. Annotate with SnpEff.
    VCF_GZ_FILE_PATH = annotate_vcf_gz_using_snpeff(
        VCF_GZ_FILE_PATH,
        n_jobs=N_JOBS,
    )

    # 3. Annotate with bcftools using ClinVar as the annotation source.
    #    NOTE(review): the '--columns =ID,INFO' string is presumably forwarded
    #    verbatim to `bcftools annotate` — confirm in process_vcf_gz.
    VCF_GZ_FILE_PATH = annotate_vcf_gz_using_bcftools(
        VCF_GZ_FILE_PATH,
        CLINVAR_VCF_GZ_FILE_PATH,
        '--columns =ID,INFO',
        n_jobs=N_JOBS,
    )

    # 4. Filter the annotated VCF with bcftools.
    VCF_GZ_FILE_PATH = filter_vcf_gz_using_bcftools(
        VCF_GZ_FILE_PATH,
        n_jobs=N_JOBS,
    )

freebayes --fasta-reference /media/k/Chocolate/Jumis/data/grch/Homo_sapiens.GRCh38.dna.primary_assembly.fa --region 7 sort_bam_using_samtools.bam > call_variants_on_bam_using_freebayes.--region_7.vcf
freebayes --fasta-reference /media/k/Chocolate/Jumis/data/grch/Homo_sapiens.GRCh38.dna.primary_assembly.fa --region 2 sort_bam_using_samtools.bam > call_variants_on_bam_using_freebayes.--region_2.vcf
freebayes --fasta-reference /media/k/Chocolate/Jumis/data/grch/Homo_sapiens.GRCh38.dna.primary_assembly.fa --region 6 sort_bam_using_samtools.bam > call_variants_on_bam_using_freebayes.--region_6.vcf
freebayes --fasta-reference /media/k/Chocolate/Jumis/data/grch/Homo_sapiens.GRCh38.dna.primary_assembly.fa --region 5 sort_bam_using_samtools.bam > call_variants_on_bam_using_freebayes.--region_5.vcf
freebayes --fasta-reference /media/k/Chocolate/Jumis/data/grch/Homo_sapiens.GRCh38.dna.primary_assembly.fa --region 1 sort_bam_using_samtools.bam > call_variants_on_bam_using_freebayes.--region_1.vcf


bgzip --force --threads 1 call_variants_on_bam_using_freebayes.--region_22.vcf; tabix --force call_variants_on_bam_using_freebayes.--region_22.vcf.gz
bgzip --force --threads 1 call_variants_on_bam_using_freebayes.--region_Y.vcf; tabix --force call_variants_on_bam_using_freebayes.--region_Y.vcf.gz
bgzip --force --threads 1 call_variants_on_bam_using_freebayes.--region_X.vcf; tabix --force call_variants_on_bam_using_freebayes.--region_X.vcf.gz
bcftools concat --allow-overlaps --threads 7 call_variants_on_bam_using_freebayes.--region_1.vcf.gz call_variants_on_bam_using_freebayes.--region_2.vcf.gz call_variants_on_bam_using_freebayes.--region_3.vcf.gz call_variants_on_bam_using_freebayes.--region_4.vcf.gz call_variants_on_bam_using_freebayes.--region_5.vcf.gz call_variants_on_bam_using_freebayes.--region_6.vcf.gz call_variants_on_bam_using_freebayes.--region_7.vcf.gz call_variants_on_bam_using_freebayes.--region_8.vcf.gz call_variants_on_bam_using_freebayes.--region_9.vcf.gz call_variants_