In [None]:
import multiprocessing as mp
import os
import sys

import pandas as pd

sys.path.insert(0, '..')
from sequencing_process.plot_fastq_gz_or_bam import plot_fastq_gz_or_bam
from sequencing_process.process_fastq_gz import count_transcripts_using_kallisto

In [None]:
# --- Input files (paths are relative to the notebook's working directory) ---
CDNA_FASTA_GZ_FILE_PATH = '../../../data/grch/Homo_sapiens.GRCh38.cdna.all.fa.gz'
ENST_INFORMATION_FILE_PATH = '../../../data/enst_information.tsv'
FASTQ_GZ_0_FILE_PATH = '../data/fastq/t1.tumor.rna.0.fastq.gz'
FASTQ_GZ_1_FILE_PATH = '../data/fastq/t1.tumor.rna.1.fastq.gz'

# Fail fast, naming the missing file, before any expensive step starts.
for required_file_path in (
        CDNA_FASTA_GZ_FILE_PATH,
        ENST_INFORMATION_FILE_PATH,
        FASTQ_GZ_0_FILE_PATH,
        FASTQ_GZ_1_FILE_PATH,
):
    assert os.path.exists(required_file_path), required_file_path

# --- Outputs ---
KALLISTO_OUTPUT_DIRECTORY_PATH = '../output/kallisto/'

# --- Run options ---
N_JOBS = 2

# Every later cell reads this flag as OVERWRITE; it was previously defined only
# as OVER_WRITE, so those cells raised NameError on a fresh kernel.
OVERWRITE = False
# Old spelling kept as an alias in case anything still refers to it.
OVER_WRITE = OVERWRITE

In [None]:
# Run plot_fastq_gz_or_bam on both FASTQ files in separate worker processes.
with mp.Pool(processes=N_JOBS) as p:

    # Submit both jobs before waiting on either so they run concurrently.
    r0 = p.apply_async(
        plot_fastq_gz_or_bam,
        args=[FASTQ_GZ_0_FILE_PATH],
        kwds=dict(overwrite=OVERWRITE))

    r1 = p.apply_async(
        plot_fastq_gz_or_bam,
        args=[FASTQ_GZ_1_FILE_PATH],
        kwds=dict(overwrite=OVERWRITE))

    # Leaving the `with` block terminates the pool, which would kill jobs that
    # are still running. Wait for both here; get() also re-raises any exception
    # raised inside a worker instead of silently dropping it.
    r0.get()
    r1.get()

In [None]:
# Call count_transcripts_using_kallisto only when the output directory does not
# already contain an abundance table.
# NOTE(review): because of this guard, OVERWRITE=True has no effect once
# abundance.tsv exists — confirm whether the guard should also honor OVERWRITE.
if not os.path.exists(
        os.path.join(KALLISTO_OUTPUT_DIRECTORY_PATH, 'abundance.tsv')):

    count_transcripts_using_kallisto(
        CDNA_FASTA_GZ_FILE_PATH, [
            FASTQ_GZ_0_FILE_PATH,
            FASTQ_GZ_1_FILE_PATH,
        ],
        # Was the undefined name OUTPUT_DIRECTORY_PATH (NameError); the
        # configured constant is KALLISTO_OUTPUT_DIRECTORY_PATH.
        KALLISTO_OUTPUT_DIRECTORY_PATH,
        n_jobs=N_JOBS,
        overwrite=OVERWRITE)

In [None]:
# Load the abundance table, indexed by its first column, and keep only the
# 'tpm' column.
abundance_file_path = os.path.join(
    KALLISTO_OUTPUT_DIRECTORY_PATH, 'abundance.tsv')
abundance = pd.read_table(abundance_file_path, index_col=0)
tpm = abundance['tpm']
tpm

In [None]:
# Annotation table indexed by its second column (presumably the ENST
# transcript ID — verify against the file).
enst_information = pd.read_table(
    ENST_INFORMATION_FILE_PATH,
    index_col=1,
)
# Lookup from that index to the 'Gene name' column.
gene_names = enst_information.loc[:, 'Gene name']
enst_to_gene_name = gene_names.to_dict()

In [None]:
# Relabel each TPM value with its gene name.
# Work on a copy: the previous version overwrote `tpm.index` in place, so
# re-running the cell mapped gene names through the ENST lookup and turned
# every label into a missing value.
tpm_by_gene = tpm.copy()
# Index labels absent from the lookup become None (dict.get's default).
tpm_by_gene.index = tpm.index.map(enst_to_gene_name.get)
tpm_by_gene.index.name = 'Gene'
tpm_by_gene

In [None]:
# Collapse rows sharing an index label to a single value: the maximum TPM
# among them.
grouped_by_gene = tpm_by_gene.groupby(level=0)
tpm_by_gene_max = grouped_by_gene.max().rename('TPM by Gene Max')
tpm_by_gene_max

In [None]:
# Save the per-gene maximum TPM as a tab-separated file alongside the kallisto
# output; an existing file is only replaced when OVERWRITE is set.
fp = os.path.join(KALLISTO_OUTPUT_DIRECTORY_PATH, 'tpm_by_gene_max.tsv')
should_write = not os.path.exists(fp) or OVERWRITE
if should_write:
    tpm_by_gene_max.to_csv(fp, sep='\t', header=True)