In [1]:
## Processing proteomic and genomic data for jurkat ##
# 0. Prepare Mapping Files 
# 1. Prepare Isoform Information Table from sqanti Output
# 2. Make Gene Length Statistics File
# 3. Abundance Data by Gene
# 4. Include PolyA Tail Info
# 5. Proteomics Analysis

#### Import Modules ####
# Python Modules #
import numpy as np
import pandas as pd 
from pathlib import Path

# Custom Modules #
from m_gen_maps import GenMap
from m_sqantitable import sqtab
from m_make_gene_length_table import GenLenTab 
from m_MMprocess import MMproc

#### Input Files ####
# Handed to sqtab in Part 1.
sqanti_out = "./../jurkat_analysis/a_SQANTI3_out/jurkat_classification.txt"
# Tab-delimited table read in Part 3; its 'gene' and 'tpm' columns are used.
tpm_file = "../jurkat_analysis/a_jurkat_tpms_kallisto/c_gene_and_iso_kallisto_tables/a_jurkat_gene_kallisto.tsv"
# Tab-delimited table read in Part 4; expects normalized data.
ribodep_tpm = "../jurkat_analysis/a_polyA-_data/b_ribodeplete_jurkat_geo/d_kallist_table_rdeplete_jurkat.tsv"
pbacc_to_gene_file = "./uniprot_acc_to_gencode_gene.tsv"


## Optional Inputs ##
# Each path may act as an input or as an output location:
# if the file exists it is used as input; if it does not, the path customizes where the output is written.
ensg_to_gene = "./ensg_to_gene.tsv"
enst_to_trans = "./enst_to_trans.tsv"

gene_len_stats_tab = "./gen_len_stats.tsv"


#### Output Files ####
sqantitab = "./sqanti_isoform_tab.tsv"
gen_level_tab = "./gene_based_info.tsv"
comparison_tab = "./comparison_table.tsv"

#### TODO ###
# Separate Transcriptomic Data from Genomic Data
# Clean up inputs and outputs list 

####



In [3]:
#### Part 1 : Prepare Isoform Information Table from sqanti Output ####
# sqtab receives, in order: the sqanti classification file, the two mapping-file
# locations, and the path configured for the isoform table. Its result is kept as
# sq_isotab and indexed by column ('gene', 'cpm') in Part 3.
# TODO: Some pb_acc map to more than 1 gene
sq_isotab = sqtab(
    sqanti_out,
    ensg_to_gene,
    enst_to_trans,
    sqantitab,
)

Isoform Table from sqanti output has been prepared


In [4]:
#### Part 2: Make Gene Length Statistics File ####
# NOTE(review): `fa_file` is not assigned in any cell visible here (execution counts
# jump from In [1] to In [3]) — define it with the other input paths so that
# Restart Kernel -> Run All works on a fresh kernel.
#
# The statistics-table path is taken from the `gene_len_stats_tab` setting in the
# configuration cell instead of repeating the literal, so editing the setting
# actually changes where GenLenTab looks / writes.
gene_len_stats = GenLenTab(
    fa_file,
    gen_len_stats_tab=gene_len_stats_tab,
    gen_isolen_tab='./gen_len.tsv',
)


The gene statistics table has already been prepared. Skipping this step


In [5]:
#### Part 3: Abundance Data by Gene ####
# Builds `abund` (per-gene cpm + the Kallisto gene table) and `gen_lenab`
# (`abund` outer-joined onto the gene length statistics from Part 2).
# TODO: Make into function?

## PacBio abundance from sqanti output ##
# sum(cpm per isoform) -> cpm per gene
cpm_data = sq_isotab[['gene', 'cpm']]
cpm_by_gene = cpm_data.groupby('gene').agg(cpm=('cpm', 'sum')).reset_index()

## Illumina abundance from Kallisto output ##
tpm_by_gene = pd.read_csv(tpm_file, delimiter='\t')
# Literal '-' -> '_' substitution in gene names; regex=False keeps it a plain
# string replacement regardless of the pandas default for `regex`.
tpm_by_gene['gene'] = tpm_by_gene['gene'].str.replace('-', '_', regex=False)

## Merge abundance data ##
# Right join: every row of the Kallisto table is kept; genes present only in
# cpm_by_gene do not appear in `abund`.
abund = pd.merge(cpm_by_gene, tpm_by_gene, how='right', on='gene')

# Make sure there are no gene mapping issues: `pb_only` holds the genes that have
# cpm data but no match in the Kallisto table (i.e. the ones the right join drops).
test = pd.merge(cpm_by_gene, tpm_by_gene, how='inner', on='gene')
pb_only = cpm_by_gene[~cpm_by_gene['gene'].isin(test['gene'])]
if not pb_only.empty:
    # Report instead of silently discarding the check result.
    print('WARNING: {} gene(s) with cpm data have no match in the tpm table '
          'and are absent from the merged abundance table (see pb_only)'.format(len(pb_only)))

## Merge with gene length data ##
# Outer join keeps genes that appear in either table. Nothing is written to disk
# here; the gene-level table is saved after the polyA step.
gen_lenab = pd.merge(gene_len_stats, abund, how="outer", on='gene')

In [6]:
#### Part 4: Include PolyA Tail Info ####
# Flags each gene in `gen_lenab` with `r_ispolyA` and writes the gene-level table.

# Cutoff applied to log10(rtpm+1) / log10(tpm+1); genes above it get r_ispolyA = False.
NON_POLYA_RATIO_CUTOFF = 120

## Ribodepletion Data ##
ribo = pd.read_csv(ribodep_tpm, sep='\t')
# Sum tpm over all rows sharing a gene -> one rtpm value per gene.
rgen = ribo.groupby(['gene']).agg(rtpm=('tpm', 'sum')).reset_index()
rgen['log(rtpm+1)'] = np.log10(rgen['rtpm'] + 1)

# log10(tpm + 1) for the Kallisto table. This adds a column to the `tpm_by_gene`
# frame created in Part 3 (re-running the cell just overwrites the same column).
tpm_by_gene['log(tpm+1)'] = np.log10(tpm_by_gene['tpm'] + 1)

# Merge and take the quotient of the two log-transformed values.
# NOTE(review): this is log10(rtpm+1) / log10(tpm+1), not log10((rtpm+1)/(tpm+1)) —
# confirm this is the intended statistic for the cutoff above.
pA = pd.merge(rgen, tpm_by_gene, how='outer', on='gene')
pA['ratio'] = pA['log(rtpm+1)'] / pA['log(tpm+1)']
# A NaN ratio (gene missing from one table, or 0/0) compares False and is not selected.
npA = pA[pA['ratio'] > NON_POLYA_RATIO_CUTOFF].reset_index()

# Add PolyA info to Gene Length table: True unless the gene is in the npA set.
pA_gen = gen_lenab.assign(r_ispolyA=~gen_lenab.gene.isin(npA.gene))

# Output Gene-Level Table
# Missing values are written as "0"; na_rep is documented as a string, so pass '0' rather than the int 0.
pA_gen.to_csv(gen_level_tab, sep='\t', index=False, na_rep='0')