In [1]:
import os
import multiprocessing as mp
import sys

sys.path.insert(0, '..')
from sequencing_process.plot_fastq_gz_or_bam import plot_fastq_gz_or_bam
from sequencing_process.process_fastq_gz import align_fastq_gzs_using_bwa
from sequencing_process.process_bam import call_variants_on_bam_using_freebayes_and_multiprocess
from sequencing_process.process_vcf_gz import annotate_vcf_gz_using_snpeff, annotate_vcf_gz_using_bcftools, filter_vcf_gz_using_bcftools

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [11]:
GRCH_DIRECTORY_PATH = '/home/k/Downloads/grch'
assert os.path.isdir(GRCH_DIRECTORY_PATH)

# make_reference_genome_for_sequencing_process(GRCH_DIRECTORY_PATH)

FASTA_GZ_FILE_PATH = os.path.join(
    GRCH_DIRECTORY_PATH,
    'GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa.gz')
assert os.path.isfile(FASTA_GZ_FILE_PATH)

CLINVAR_VCF_GZ_FILE_PATH = os.path.join(GRCH_DIRECTORY_PATH, 'clinvar.vcf.gz')
assert os.path.isfile(CLINVAR_VCF_GZ_FILE_PATH)

N_JOBS = 30
OVERWRITE = True

In [3]:
FASTQ_GZ_0_FILE_PATH = '/home/k/Downloads/t1.peripheral_blood.genome.dna.0.fastq.gz'
assert os.path.isfile(FASTQ_GZ_0_FILE_PATH)

FASTQ_GZ_1_FILE_PATH = '/home/k/Downloads/t1.peripheral_blood.genome.dna.0.fastq.gz'
assert os.path.isfile(FASTQ_GZ_1_FILE_PATH)

with mp.Pool(processes=N_JOBS) as p:

    r0 = p.apply_async(
        plot_fastq_gz_or_bam,
        args=[FASTQ_GZ_0_FILE_PATH],
        kwds=dict(overwrite=OVERWRITE))
    print(r0)

    r1 = p.apply_async(
        plot_fastq_gz_or_bam,
        args=[FASTQ_GZ_1_FILE_PATH],
        kwds=dict(overwrite=OVERWRITE))
    print(r1)

fastqp --output simulation_0.fq.gz.plot --text simulation_0.fq.gz.plot.tsv simulation_0.fq.gz
<multiprocessing.pool.ApplyResult object at 0x7f83c00c9f28>
<multiprocessing.pool.ApplyResult object at 0x7f83c0057128>


In [4]:
BAM_FILE_PATH = None

if BAM_FILE_PATH:
    assert os.path.isfile(BAM_FILE_PATH)

else:
    BAM_FILE_PATH = align_fastq_gzs_using_bwa(
        FASTA_GZ_FILE_PATH, [
            FASTQ_GZ_0_FILE_PATH,
            FASTQ_GZ_1_FILE_PATH,
        ],
        n_jobs=N_JOBS,
        overwrite=OVERWRITE)
    
    plot_fastq_gz_or_bam(BAM_FILE_PATH, overwrite=OVERWRITE)

bwa mem -t 2 /home/k/Downloads/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa.gz simulation_0.fq.gz simulation_1.fq.gz | samtools view -Sb --threads 2 > align_fastq_gzs_using_bwa.bam
samtools sort --threads 2 align_fastq_gzs_using_bwa.bam > sort_bam_using_samtools.bam
samtools index -@ 2 sort_bam_using_samtools.bam
fastqp --output sort_bam_using_samtools.bam.plot --text sort_bam_using_samtools.bam.plot.tsv sort_bam_using_samtools.bam


In [8]:
VCF_GZ_FILE_PATH = None

if VCF_GZ_FILE_PATH:
    assert os.path.isfile(VCF_GZ_FILE_PATH)

else:
    VCF_GZ_FILE_PATH = call_variants_on_bam_using_freebayes_and_multiprocess(
        BAM_FILE_PATH,
        FASTA_FILE_PATH, ['chr{}'.format(i)
                          for i in range(1, 23)] + ['chrX', 'chrY', 'chrM'],
        n_jobs=N_JOBS,
        overwrite=OVERWRITE)

    VCF_GZ_FILE_PATH = annotate_vcf_gz_using_snpeff(
        VCF_GZ_FILE_PATH, n_jobs=N_JOBS, overwrite=OVERWRITE)

    VCF_GZ_FILE_PATH = annotate_vcf_gz_using_bcftools(
        VCF_GZ_FILE_PATH,
        CLINVAR_VCF_GZ_FILE_PATH,
        '--columns =ID,INFO',
        n_jobs=N_JOBS,
        overwrite=OVERWRITE)

    VCF_GZ_FILE_PATH = filter_vcf_gz_using_bcftools(
        VCF_GZ_FILE_PATH, n_jobs=N_JOBS, overwrite=OVERWRITE)

freebayes --fasta-reference /home/k/Downloads/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr5 sort_bam_using_samtools.bam > call_variants_on_bam_using_freebayes.--region_chr5.vcf
freebayes --fasta-reference /home/k/Downloads/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr1 sort_bam_using_samtools.bam > call_variants_on_bam_using_freebayes.--region_chr1.vcf

HLA-A*01:01:01:01	HLA00001	3503	3261547044	72	73

freebayes --fasta-reference /home/k/Downloads/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr9 sort_bam_using_samtools.bam > call_variants_on_bam_using_freebayes.--region_chr9.vcf

HLA-A*01:01:01:01	HLA00001	3503	3261547044	72	73

freebayes --fasta-reference /home/k/Downloads/grch/GCA_000001405.15_GRCh38_full_plus_hs38DH-extra_analysis_set.fa --region chr13 sort_bam_using_samtools.bam > call_variants_on_bam_using_freebayes.--region_chr13.vcf

HLA-A*01:01:01:01	HLA00001	3503	3261547044	

MaybeEncodingError: Error sending result: '<multiprocessing.pool.ExceptionWithTraceback object at 0x7f8414bcf630>'. Reason: 'TypeError("cannot serialize '_io.TextIOWrapper' object",)'

In [6]:
[str(i) for i in range(1, 23)] + ['X', 'Y', 'MT']

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 'X',
 'Y',
 'MT']