In [2]:
%pylab inline
import scipy
import os 
from collections import OrderedDict
import glob
from riboraptor.helpers import path_leaf
import pandas as pd
import seaborn as sns
# Global seaborn plotting defaults: 'paper' context with font_scale=2, and the 'white' style.
sns.set_context('paper', font_scale=2)
sns.set_style('white')
def counts_to_tpm(counts, sizes):
    """Convert raw counts to transcripts per million (TPM).

    Each count is divided by its region size to obtain a rate, and the rates
    are rescaled so that they sum to one million.  The arithmetic is carried
    out in log space:
    ``tpm = exp(log(counts) - log(sizes) - log(sum(rate)) + log(1e6))``.

    Parameters
    ----------
    counts: array like
            Series/array/list of counts
    sizes: array like
           Series/array/list of region sizes, one per entry of ``counts``

    Returns
    -------
    tpm: array like
         TPM values. A pandas Series when either input is a Series (labels
         are aligned as in ordinary Series arithmetic), otherwise a numpy
         array.
    """
    # log(0) = -inf is expected for zero counts and turns into a TPM of 0
    # after exponentiation, so numpy's divide-by-zero warning is suppressed
    # while the logs are taken.
    with np.errstate(divide='ignore'):
        # The `-` operator (instead of the pandas-only `.subtract` method)
        # makes plain numpy arrays and lists work as well as Series.
        rate = np.log(counts) - np.log(sizes)
    denom = np.log(np.sum(np.exp(rate)))
    tpm = np.exp(rate - denom + np.log(1e6))
    return tpm

def featurecounts_to_tpm(fc_f, outfile):
    """Convert a featureCounts output table to TPM

    The input is read as a tab-separated table with a ``Geneid`` column, the
    annotation columns ``Chr``, ``Start``, ``End``, ``Strand`` and ``Length``,
    and one count column per sample.  Every sample column is converted to
    TPM using ``Length`` as the region size, and the result is written as a
    tab-separated file indexed by ``Geneid``.

    Parameters
    ----------
    fc_f: string
             Path to featureCounts output
    outfile: string
             Path to output file with tpm values
    """
    # Raw featureCounts output starts with a '# Program:featureCounts ...'
    # line; comment='#' skips it so the real header row is used.  Files
    # without that line are parsed exactly as before.
    feature_counts = pd.read_csv(fc_f, sep='\t', comment='#')
    feature_counts = feature_counts.set_index('Geneid')
    feature_counts = feature_counts.drop(
        columns=["Chr", "Start", "End", "Strand"]
    )
    # Keep the lengths aside: they are the normalising sizes, not a sample.
    lengths = feature_counts["Length"]
    feature_counts = feature_counts.drop(columns=["Length"])
    # Column-wise conversion: each remaining column is one sample's counts.
    tpm = feature_counts.apply(lambda x: counts_to_tpm(x, lengths), axis=0)
    # Strip the 'bams_unique/' prefix and '.bam' suffix from the column names.
    tpm.columns = [col.replace('bams_unique/', '').replace('.bam', '') for col in tpm.columns]
    tpm.to_csv(outfile, sep="\t", index=True, header=True)


Populating the interactive namespace from numpy and matplotlib


# Read Orthologs

In [1]:
# Load the tab-separated ribotricer candidate-ORF annotation and preview it.
ribotricer_index_hg38 = pd.read_csv(
    '/home/cmb-panasas2/skchoudh/genomes/hg38/ribotricer_v96_annotation_longest_candidate_orfs.tsv',
    sep='\t',
)
ribotricer_index_hg38.head()

NameError: name 'pd' is not defined

In [2]:
# Human/macaque ortholog table: keep one-to-one orthologs only, reduce to the
# two gene-ID columns, and index the human gene IDs by the macaque gene ID.
orthologs_Mmul8 = (
    pd.read_csv('../../re-ribo-smk/data//orthologs/human_macaque_ortholog.tsv', sep='\t')
    .loc[
        lambda table: table["Macaque homology type"] == "ortholog_one2one",
        ["Gene stable ID", "Macaque gene stable ID"],
    ]
    .rename(columns={"Gene stable ID": "gene_id", "Macaque gene stable ID": "ortholog"})
    .drop_duplicates()
    .set_index('ortholog')
)

In [3]:
# Preview the first rows of the one-to-one ortholog table (indexed by macaque gene ID).
orthologs_Mmul8.head()

Unnamed: 0_level_0,gene_id
ortholog,Unnamed: 1_level_1
ENSMMUG00000028699,ENSG00000198888
ENSMMUG00000028695,ENSG00000198763
ENSMMUG00000028689,ENSG00000198804
ENSMMUG00000028686,ENSG00000198712
ENSMMUG00000028684,ENSG00000228253


In [4]:
# Human/chimpanzee ortholog table: keep one-to-one orthologs only, reduce to
# the two gene-ID columns, and index the human gene IDs by the chimp gene ID.
orthologs_panTro3 = (
    pd.read_csv('../../re-ribo-smk/data//orthologs/human_chimp_ortholog.tsv', sep='\t')
    .loc[
        lambda table: table["Chimpanzee homology type"] == "ortholog_one2one",
        ["Gene stable ID", "Chimpanzee gene stable ID"],
    ]
    .rename(columns={"Gene stable ID": "gene_id", "Chimpanzee gene stable ID": "ortholog"})
    .drop_duplicates()
    .set_index('ortholog')
)
orthologs_panTro3.head()

Unnamed: 0_level_0,gene_id
ortholog,Unnamed: 1_level_1
ENSPTRG00000042641,ENSG00000198888
ENSPTRG00000042626,ENSG00000198763
ENSPTRG00000042642,ENSG00000210127
ENSPTRG00000042657,ENSG00000198804
ENSPTRG00000042660,ENSG00000198712


In [5]:

# Re-key both ortholog tables by the human gene ID and keep only the genes
# present in both (inner join), giving one chimp and one macaque column.
three_way_ortholog = (
    orthologs_panTro3
    .reset_index()
    .rename(columns={'ortholog': 'panTro3'})
    .set_index('gene_id')
    .join(
        orthologs_Mmul8
        .reset_index()
        .rename(columns={'ortholog': 'Mmul8'})
        .set_index('gene_id'),
        how='inner',
    )
)
three_way_ortholog.head()

Unnamed: 0_level_0,panTro3,Mmul8
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000198888,ENSPTRG00000042641,ENSMMUG00000028699
ENSG00000198763,ENSPTRG00000042626,ENSMMUG00000028695
ENSG00000198804,ENSPTRG00000042657,ENSMMUG00000028689
ENSG00000198712,ENSPTRG00000042660,ENSMMUG00000028686
ENSG00000228253,ENSPTRG00000042653,ENSMMUG00000028684


# Read Metadata

In [None]:
# Read the two metadata tables (SRP062129 -> ribo, SRP028612 -> rna) and stack them.
metadata_ribo = pd.read_csv(
    '../../re-ribo-smk/data/ortho-datasets-metadata/SRP062129_metadata.tsv', sep='\t'
)
metadata_rna = pd.read_csv(
    '../../re-ribo-smk/data/ortho-datasets-metadata/SRP028612_metadata.tsv', sep='\t'
)
metadata = pd.concat([metadata_ribo, metadata_rna])

# Per-species subsets; the GRCh38 rows are ordered by assay, then sex.
metadata_panTro3 = metadata.loc[metadata['species'] == 'panTro3']
metadata_GRCh38 = metadata.loc[metadata['species'] == 'GRCh38'].sort_values(by=['assay', 'sex'])

# Experiment accessions for each sex/assay combination of the GRCh38 samples.
female_ribo = metadata_GRCh38.query("sex == 'female' and assay== 'ribo'")['experiment_accession']
female_rna = metadata_GRCh38.query("sex == 'female' and assay== 'rna'")['experiment_accession']
male_ribo = metadata_GRCh38.query("sex == 'male' and assay== 'ribo'")['experiment_accession']
male_rna = metadata_GRCh38.query("sex == 'male' and assay== 'rna'")['experiment_accession']

# Pair every ribo accession with every rna accession of the same sex;
# female pairs come first, then male pairs.
female_combinations = [
    (ribo_acc, rna_acc) for ribo_acc in female_ribo for rna_acc in female_rna
]
male_combinations = [
    (ribo_acc, rna_acc) for ribo_acc in male_ribo for rna_acc in male_rna
]
ribo_rna_pairs_GRCh38 = [*female_combinations, *male_combinations]

