In [44]:
%pylab inline
import re
import pandas as pd
from collections import OrderedDict
import matplotlib
import seaborn as sns
import pyupset as pyu
from riboraptor.helpers import path_leaf


# Font type 42 makes matplotlib write TrueType fonts into PDF/PS output
# (the alternative, 3, writes Type 3 fonts).
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

# Seaborn look-and-feel: plain white background, "paper" sizing with
# fonts scaled up 2x.
sns.set_style('white')
sns.set_context('paper', font_scale=2)

def strip_tx_version(txid):
    """Return `txid` with every ``.<digits>`` run removed.

    Intended for dropping the version suffix of a versioned identifier,
    e.g. ``'ENST00000456328.2' -> 'ENST00000456328'``. Note that the
    pattern is not anchored, so a ``.<digits>`` run in the middle of the
    string is removed as well.

    Parameters
    ----------
    txid : str
        Identifier, with or without a version suffix.

    Returns
    -------
    str
        The identifier with all ``.<digits>`` runs stripped.
    """
    # Raw string: '\.' and '\d' are not valid Python string escapes, so a
    # plain literal triggers an invalid-escape-sequence warning.
    return re.sub(r'\.\d+', '', txid)


Populating the interactive namespace from numpy and matplotlib


In [2]:
# Transcription-factor lists, one identifier per line and no header row.
# Going by the file names, the human list holds Ensembl gene ids and the
# mouse list holds gene symbols, hence the different column names.
human_tfs = pd.read_csv(
    '/home/cmb-06/as/skchoudh/github_projects/ribocop-results/tf-data/human_TFs_ensembl.txt',
    sep='\t',
    header=None,
    names=['gene_id'],
)
mouse_tfs = pd.read_csv(
    '/home/cmb-06/as/skchoudh/github_projects/ribocop-results/tf-data/mouse_TFs_symbol.txt',
    sep='\t',
    header=None,
    names=['gene_name'],
)

In [68]:
def get_human_tf_status(gene_id):
    """Classify a human gene as ``'TF'`` or ``'non-TF'``.

    The gene counts as a TF when `gene_id` appears in the ``gene_id``
    column of the module-level ``human_tfs`` table. The column is turned
    into a list and scanned on every call.
    """
    known_tf_ids = human_tfs.gene_id.tolist()
    return 'TF' if gene_id in known_tf_ids else 'non-TF'

def get_mouse_tf_status(gene_name):
    """Classify a mouse gene as ``'TF'`` or ``'non-TF'``.

    The gene counts as a TF when `gene_name` appears in the ``gene_name``
    column of the module-level ``mouse_tfs`` table. The column is turned
    into a list and scanned on every call.
    """
    known_tf_names = mouse_tfs.gene_name.tolist()
    return 'TF' if gene_name in known_tf_names else 'non-TF'


In [69]:
# Per-species gene annotation tables (gene_id, gene_name, gene_type),
# indexed by gene_id so gene names can be looked up with .loc.
mouse_annotation = pd.read_csv(
    '/home/cmb-panasas2/skchoudh/genomes/mm10/annotation/mm10_gene_names_stripped.tsv',
    sep='\t',
    names=['gene_id', 'gene_name', 'gene_type'],
).set_index('gene_id')
human_annotation = pd.read_csv(
    '/home/cmb-panasas2/skchoudh/genomes/hg38/annotation/hg38_gene_names_stripped.tsv',
    sep='\t',
    names=['gene_id', 'gene_name', 'gene_type'],
).set_index('gene_id')


In [70]:
# uORF candidate gene ids from the "all samples" lists (one id per line),
# each annotated with the gene name looked up in the species annotation.
human_uorf_candidates = pd.read_csv(
    '/home/cmb-06/as/skchoudh/github_projects/ribocop-results/real/uorfs/human_uorf_all_samples.txt',
    sep='\t',
    header=None,
    names=['gene_id'],
).assign(
    gene_name=lambda cands: human_annotation.loc[cands['gene_id'], 'gene_name'].tolist()
)
mouse_uorf_candidates = pd.read_csv(
    '/home/cmb-06/as/skchoudh/github_projects/ribocop-results/real/uorfs/mouse_uorf_all_samples.txt',
    sep='\t',
    header=None,
    names=['gene_id'],
).assign(
    gene_name=lambda cands: mouse_annotation.loc[cands['gene_id'], 'gene_name'].tolist()
)

In [71]:
# Write the candidate gene names as bare one-column lists
# (no header row, no index column).
mouse_uorf_candidates[['gene_name']].to_csv(
    '/home/cmb-06/as/skchoudh/github_projects/ribocop-results/real/uorfs/mouse_uorf_all_samples_gene_names.txt',
    index=False,
    header=False,
)
human_uorf_candidates[['gene_name']].to_csv(
    '/home/cmb-06/as/skchoudh/github_projects/ribocop-results/real/uorfs/human_uorf_all_samples_gene_names.txt',
    index=False,
    header=False,
)

In [72]:
# Number of distinct mouse candidate gene names that are in the mouse TF list.
len(set(mouse_uorf_candidates.gene_name) & set(mouse_tfs.gene_name))


30

In [28]:
# Number of distinct mouse candidate gene names.
len(set(mouse_uorf_candidates.gene_name))

384

In [29]:
# Number of distinct human candidate gene ids that are in the human TF list.
len(set(human_uorf_candidates.gene_id) & set(human_tfs.gene_id))


18

In [30]:
# Number of distinct human candidate gene ids.
len(set(human_uorf_candidates['gene_id']))

222

In [31]:
# Fraction of distinct human uORF candidate gene ids that are in the human
# TF list. Computed from the loaded tables rather than from hand-copied
# counts (previously the literal 18/222), so it stays correct when the
# input lists change.
human_uorf_candidates.gene_id.drop_duplicates().isin(human_tfs.gene_id).mean()

0.08108108108108109

In [83]:
# Reload the candidate lists from the "two samples" (human) and
# "three samples" (mouse) files, replacing the all-samples tables above.
# Each table gets gene_name (annotation lookup), gene_type ('TF'/'non-TF')
# and a constant species label, added in that column order.
human_uorf_candidates = (
    pd.read_csv(
        '/home/cmb-06/as/skchoudh/github_projects/ribocop-results/real/uorfs/human_uorf_two_samples.txt',
        sep='\t',
        header=None,
        names=['gene_id'],
    )
    .assign(gene_name=lambda cands: human_annotation.loc[cands['gene_id'], 'gene_name'].tolist())
    # The human TF list is keyed by gene id ...
    .assign(gene_type=lambda cands: cands['gene_id'].apply(get_human_tf_status))
    .assign(species='human')
)
mouse_uorf_candidates = (
    pd.read_csv(
        '/home/cmb-06/as/skchoudh/github_projects/ribocop-results/real/uorfs/mouse_uorf_three_samples.txt',
        sep='\t',
        header=None,
        names=['gene_id'],
    )
    .assign(gene_name=lambda cands: mouse_annotation.loc[cands['gene_id'], 'gene_name'].tolist())
    # ... while the mouse TF list is keyed by gene name.
    .assign(gene_type=lambda cands: cands['gene_name'].apply(get_mouse_tf_status))
    .assign(species='mouse')
)



In [84]:
# Stack the human and mouse candidate tables.
human_mouse_combined_uorf = pd.concat([human_uorf_candidates, mouse_uorf_candidates])

# Wide count table: one row per species, one column per gene_type
# ('TF' / 'non-TF'), zero-filled where a combination is absent.
df = (
    human_mouse_combined_uorf[['species', 'gene_type']]
    .groupby(['species', 'gene_type'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)

# Long form of the same counts (species, gene_type, value), written to disk.
df1 = df.melt(id_vars=['species'])
df1.to_csv(
    '/home/cmb-panasas2/skchoudh/github_projects/ribocop-results/real/uorfs/human_mouse_uorf_two_samples_combined.summary.tsv',
    sep='\t',
    index=False,
)

In [85]:
# NOTE(review): at this point `df` is the wide species x gene_type count
# table, not the per-gene `human_mouse_combined_uorf` table that the file
# name suggests -- confirm which one was meant to be written here.
df.to_csv(
    '/home/cmb-panasas2/skchoudh/github_projects/ribocop-results/real/uorfs/human_mouse_uorf_two_samples_combined.tsv',
    sep='\t',
    index=False,
)

In [86]:
# Number of distinct mouse candidate gene names that are in the mouse TF list.
len(set(mouse_uorf_candidates.gene_name) & set(mouse_tfs.gene_name))


192

In [87]:
# Number of distinct mouse candidate gene names.
len(set(mouse_uorf_candidates.gene_name))

1951

In [88]:
# Fraction of distinct mouse uORF candidate gene names that are in the mouse
# TF list. Computed from the loaded tables rather than from hand-copied
# counts (previously the literal 192/1951), so it stays correct when the
# input lists change.
mouse_uorf_candidates.gene_name.drop_duplicates().isin(mouse_tfs.gene_name).mean()

0.09841107124551512

In [89]:
# Number of distinct human candidate gene ids that are in the human TF list.
len(set(human_uorf_candidates.gene_id) & set(human_tfs.gene_id))


173

In [90]:
# Number of distinct human candidate gene ids.
len(set(human_uorf_candidates['gene_id']))

1801

In [91]:
# Fraction of distinct human uORF candidate gene ids that are in the human
# TF list. Computed from the loaded tables rather than from hand-copied
# counts (previously the literal 173/1801), so it stays correct when the
# input lists change.
human_uorf_candidates.gene_id.drop_duplicates().isin(human_tfs.gene_id).mean()


0.09605774569683509

# How many uORF candidate genes are orthologous between human and mouse?

In [105]:
# Mouse -> human ortholog pairs, kept only when BOTH the mouse gene and its
# human homolog are uORF candidates.
# The pair table has its own name (`ortholog_pairs`) instead of reusing
# `mm_hg_ortho`: that name ends up holding the stacked per-species table
# below, which no longer has an 'ensembl_gene_id' column, so re-running
# part of this cell against it fails with a KeyError.
ortholog_pairs = pd.read_table('../../svd-project/ortholog_tsv/mmusculus_hsapiens.tsv')[['ensembl_gene_id', 'hsapiens_homolog_ensembl_gene']]
ortholog_pairs = ortholog_pairs.loc[
    ortholog_pairs.ensembl_gene_id.isin(mouse_uorf_candidates.gene_id)
    & ortholog_pairs.hsapiens_homolog_ensembl_gene.isin(human_uorf_candidates.gene_id)
]
# NOTE(review): if the ortholog table contains one-to-many pairs, the same
# gene_id appears on several rows here and is counted more than once by the
# species/gene_type groupby in the summary cell -- consider
# drop_duplicates('gene_id') per species if gene counts are what is wanted.

# Mouse side of each pair.
mm_hg_ortho_mm = ortholog_pairs[['ensembl_gene_id']].rename(columns={'ensembl_gene_id': 'gene_id'})
mm_hg_ortho_mm['gene_name'] = mouse_annotation.loc[mm_hg_ortho_mm.gene_id, 'gene_name'].tolist()
mm_hg_ortho_mm['species'] = 'mouse'

# Human side of each pair. gene_name is filled in here as well so both
# frames carry the same columns in the same order; previously the human
# frame had no gene_name, which left a NaN column after concat and
# triggered the pandas "sort" FutureWarning about misaligned columns.
mm_hg_ortho_hg = ortholog_pairs[['hsapiens_homolog_ensembl_gene']].rename(columns={'hsapiens_homolog_ensembl_gene': 'gene_id'})
mm_hg_ortho_hg['gene_name'] = human_annotation.loc[mm_hg_ortho_hg.gene_id, 'gene_name'].tolist()
mm_hg_ortho_hg['species'] = 'human'

# TF status: the mouse TF list is keyed by gene name, the human one by gene id.
mm_hg_ortho_mm['gene_type'] = mm_hg_ortho_mm.gene_name.apply(get_mouse_tf_status)
mm_hg_ortho_hg['gene_type'] = mm_hg_ortho_hg.gene_id.apply(get_human_tf_status)

# Stack human rows on top of mouse rows. sort=False pins the column order
# explicitly instead of relying on the version-dependent default.
mm_hg_ortho = pd.concat([mm_hg_ortho_hg, mm_hg_ortho_mm], sort=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [107]:
# Wide count table for the ortholog subset: one row per species, one column
# per gene_type, zero-filled. Note this rebinds `df`.
df = (
    mm_hg_ortho[['species', 'gene_type']]
    .groupby(['species', 'gene_type'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)

# Long form of the same counts, written to disk.
df2 = df.melt(id_vars=['species'])
df2.to_csv(
    '/home/cmb-panasas2/skchoudh/github_projects/ribocop-results/real/uorfs/human_mouse_uorf_two_samples_ortholog.summary.tsv',
    sep='\t',
    index=False,
)

KeyError: "['ensembl_gene_id'] not in index"

In [100]:
# Preview only the first rows instead of rendering the whole table.
mm_hg_ortho.head(10)

Unnamed: 0,gene_id,gene_type
169,ENSG00000165669,non-TF
186,ENSG00000234857,non-TF
1029,ENSG00000196233,non-TF
1688,ENSG00000013561,non-TF
1909,ENSG00000067900,non-TF
2093,ENSG00000101574,non-TF
2239,ENSG00000119820,non-TF
2498,ENSG00000162144,non-TF
3103,ENSG00000151292,non-TF
3295,ENSG00000099194,non-TF
