### Xena Query Outlier Example

Xena exposes a flexible query interface that lets you choose both which data to retrieve and the shape in which it is returned.

A higher-level Python library with examples can be found at:

https://github.com/ucscXena/ucsc-xena-server/tree/master/python

Each hub also exposes a console that you can use to develop and test queries directly:

http://toil.xenahubs.net/console.html

Paste:

(query {:select [:name] :from [:dataset] :where [:like :name "%target%"]})

to get a list of all the datasets with 'target' in their name.

In [15]:
import numpy as np
import pandas as pd
import xena_query as xena

In [2]:
# https://xenabrowser.net/datapages/ has a list of all datasets; for this example we're going to
# dig around in the Treehouse TARGET ALL data.
# `hub` is the base URL passed as the first argument to every xena_query call below.
# NOTE(review): the intro links to this hub's console (toil.xenahubs.net) -- confirm this is
# the hub the recorded outputs were produced against.
hub = "https://toil.xenahubs.net"


In [3]:
# Ask the hub which cohorts it hosts, then display the result
cohorts = xena.all_cohorts(hub)
cohorts

[u'GTEX',
 u'TCGA Pan-Cancer (PANCAN)',
 u'TCGA TARGET GTEx',
 u'TARGET Pan-Cancer (PANCAN)',
 u'TCGA and TARGET Pan-Cancer (PANCAN)']

In [4]:
# List every dataset that belongs to the TARGET Pan-Cancer (PANCAN) cohort
target_cohort = "TARGET Pan-Cancer (PANCAN)"
xena.datasets_list_in_cohort(hub, target_cohort)

[u'filter_TARGET_Neuroblastoma',
 u'filter_TARGET_Recurrent_Blood_Derived_Cancer_Bone_Marrow',
 u'filter_TARGET_Clear_Cell_Sarcoma_of_the_Kidney',
 u'filter_TARGET_AML',
 u'filter_TARGET_Acute_Lymphoblastic_Leukemia',
 u'filter_TARGET_Post_treatment_Blood_Cancer_Bone_Marrow',
 u'target_rsem_isopct',
 u'filter_TARGET_Kidney_Rhabdoid_Tumor',
 u'filter_TARGET_Primary_Blood_Derived_Cancer_Bone_Marrow',
 u'filter_TARGET_Solid_Tissue_Normal',
 u'TARGET_donor_allprojects_transfer_to_sample',
 u'filter_TARGET_NormalsWithRNAdata',
 u'target_RSEM_isoform_fpkm',
 u'filter_TARGET_Recurrent_Solid_Tumor',
 u'target_Kallisto_est_counts',
 u'filter_TARGET_Primary_Solid_Tumor',
 u'target_expected_count',
 u'target_RSEM_Hugo_norm_count',
 u'filter_TARGET_Wilms_Tumor',
 u'TARGET_phenotype',
 u'filter_TARGET_TumorsWithRNAdata',
 u'target_Kallisto_tpm',
 u'target_gene_expected_count',
 u'target_rsem_isoform_tpm',
 u'target_RSEM_gene_tpm',
 u'target_RSEM_gene_fpkm',
 u'filter_TARGET_Primary_Blood_Derived_Ca

In [5]:
# Get a list of all the features in the expression dataset as a one-column frame
all_features = pd.DataFrame(
    xena.dataset_field(hub, "target_RSEM_Hugo_norm_count"),
    columns=["Gene Symbol"],
)
# Single-argument print(...) emits the same text under Python 2 and Python 3
print("Total number of features: {}".format(len(all_features)))
all_features.head()

Total number of features: 58582


Unnamed: 0,Gene Symbol
0,5S_rRNA
1,5_8S_rRNA
2,7SK
3,A1BG
4,A1BG-AS1


In [6]:
# Load the list of cancer genes to use as features.
# The path is relative, so cancer_genes.tsv is resolved against the working directory.
cancer_genes = pd.read_table("cancer_genes.tsv")
# Single-argument print(...) emits the same text under Python 2 and Python 3
print("Number of Cancer Genes: {}".format(len(cancer_genes)))
cancer_genes.head()

Number of Cancer Genes: 602


Unnamed: 0,Gene Symbol,Name,Entrez GeneId,Genome Location,Chr Band,Somatic,Germline,Tumour Types(Somatic),Tumour Types(Germline),Cancer Syndrome,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms
0,ABI1,abl-interactor 1,10006,10:26748570-26860863,10p11.2,yes,,AML,,,L,Dom,TSG,T,KMT2A,,,"ABI1,E3B1,ABI-1,SSH3BP1,10006"
1,ABL1,v-abl Abelson murine leukemia viral oncogene h...,25,9:130835447-130885683,9q34.1,yes,,"CML, ALL, T-ALL",,,L,Dom,oncogene,"T, Mis","BCR, ETV6, NUP214",,,"ABL1,p150,ABL,c-ABL,JTK7,bcr/abl,v-abl,P00519,..."
2,ABL2,"c-abl oncogene 2, non-receptor tyrosine kinase",27,1:179107718-179143044,1q24-q25,yes,,AML,,,L,Dom,oncogene,T,ETV6,,,"ABL2,ARG,RP11-177A2_3,ABLL,P42684,ENSG00000143..."
3,ACKR3,atypical chemokine receptor 3,57007,2:-,2q37.3,yes,,lipoma,,,M,Dom,oncogene,T,HMGA2,,,
4,ACSL3,acyl-CoA synthetase long-chain family member 3,2181,2:222908773-222941654,2q36,yes,,prostate,,,E,Dom,,T,ETV1,,,"2181,PRO2194,ACS3,FACL3,O95573,ENSG00000123983..."


In [7]:
# Keep only the cancer genes that are also present as features in the dataset.
# The inner join on "Gene Symbol" drops any cancer gene the dataset does not contain.
filtered_features = cancer_genes.merge(all_features, how="inner", on="Gene Symbol")
# Single-argument print(...) emits the same text under Python 2 and Python 3
print("Cancer gene features in dataset: {}".format(len(filtered_features)))
filtered_features.head()

Cancer gene features in dataset: 589


Unnamed: 0,Gene Symbol,Name,Entrez GeneId,Genome Location,Chr Band,Somatic,Germline,Tumour Types(Somatic),Tumour Types(Germline),Cancer Syndrome,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms
0,ABI1,abl-interactor 1,10006,10:26748570-26860863,10p11.2,yes,,AML,,,L,Dom,TSG,T,KMT2A,,,"ABI1,E3B1,ABI-1,SSH3BP1,10006"
1,ABL1,v-abl Abelson murine leukemia viral oncogene h...,25,9:130835447-130885683,9q34.1,yes,,"CML, ALL, T-ALL",,,L,Dom,oncogene,"T, Mis","BCR, ETV6, NUP214",,,"ABL1,p150,ABL,c-ABL,JTK7,bcr/abl,v-abl,P00519,..."
2,ABL2,"c-abl oncogene 2, non-receptor tyrosine kinase",27,1:179107718-179143044,1q24-q25,yes,,AML,,,L,Dom,oncogene,T,ETV6,,,"ABL2,ARG,RP11-177A2_3,ABLL,P42684,ENSG00000143..."
3,ACKR3,atypical chemokine receptor 3,57007,2:-,2q37.3,yes,,lipoma,,,M,Dom,oncogene,T,HMGA2,,,
4,ACSL3,acyl-CoA synthetase long-chain family member 3,2181,2:222908773-222941654,2q36,yes,,prostate,,,E,Dom,,T,ETV1,,,"2181,PRO2194,ACS3,FACL3,O95573,ENSG00000123983..."


In [8]:
# Get a list of all the sample identifiers in the expression dataset as a one-column frame
samples = pd.DataFrame(
    xena.dataset_samples(hub, "target_RSEM_Hugo_norm_count"),
    columns=["Sample ID"],
)
# Single-argument print(...) emits the same text under Python 2 and Python 3
print("Samples found: {}".format(len(samples)))
samples.head()

Samples found: 734


Unnamed: 0,Sample ID
0,TARGET-30-PAMEZH-01
1,TARGET-30-PATEKG-01
2,TARGET-52-PASDLA-11
3,TARGET-10-PAPECF-09
4,TARGET-30-PASWFB-01


In [16]:
# Fetch expression levels for the filtered gene list across all of the samples.
# The same two lists select the values from the hub and label the resulting frame:
# genes become the row index, sample IDs become the columns.
sample_ids = list(samples["Sample ID"].values)
gene_symbols = list(filtered_features["Gene Symbol"].values)
probe_values = xena.dataset_probe_values(
    hub, "target_RSEM_Hugo_norm_count", sample_ids, gene_symbols
)
expression = pd.DataFrame(
    probe_values, index=gene_symbols, columns=sample_ids, dtype=np.float32
)
expression.head()

Unnamed: 0,TARGET-30-PAMEZH-01,TARGET-30-PATEKG-01,TARGET-52-PASDLA-11,TARGET-10-PAPECF-09,TARGET-30-PASWFB-01,TARGET-30-PALUYS-01,TARGET-20-PANLIR-09,TARGET-20-PARUBT-09,TARGET-30-PASXRJ-01,TARGET-10-PAPDWT-04,...,TARGET-50-PAKYLT-01,TARGET-50-PAKFYV-01,TARGET-10-PAPLDM-04,TARGET-10-PAPISG-04,TARGET-10-PAPEJN-09,TARGET-10-PANCVR-04,TARGET-21-PATKWH-09,TARGET-10-PASDYK-03,TARGET-50-PAKXWB-01,TARGET-50-PALERC-01
ABI1,9.8204,10.8253,11.0938,11.5639,10.4566,10.063,11.9426,13.3771,11.1769,14.4451,...,11.5538,11.2825,12.8582,11.9172,12.6953,14.2776,13.4626,14.2672,11.0018,11.094
ABL1,11.9789,11.2616,11.8218,11.6569,12.1429,12.2606,11.3195,9.4311,11.8375,14.0254,...,13.0998,12.8914,12.5844,12.4331,11.9663,15.1095,7.8118,13.0625,12.5424,12.921
ABL2,10.9827,11.043,10.0031,9.5065,11.2968,10.8239,9.4916,8.167,10.6976,10.0839,...,11.4743,10.9515,11.6621,9.555,12.0088,12.0256,9.5932,11.4478,10.9206,10.9928
ACKR3,8.6053,9.4424,8.9924,9.5515,8.3144,10.674,6.8925,5.6306,9.7664,7.1259,...,10.9786,10.3091,8.8574,2.8383,9.6128,4.5885,4.3741,10.2907,10.6948,11.0695
ACSL3,10.4631,11.25,11.6645,8.462,11.1604,10.4821,11.0054,12.3537,11.2083,11.6697,...,12.7122,12.9794,10.6162,9.9544,10.9331,12.0244,10.3556,11.8332,11.2223,11.2098


In [17]:
# Show the genes (rows) whose expression is exactly zero in every sample
zero_in_all_samples = (expression == 0).all(axis=1)
expression[zero_in_all_samples]

Unnamed: 0,TARGET-30-PAMEZH-01,TARGET-30-PATEKG-01,TARGET-52-PASDLA-11,TARGET-10-PAPECF-09,TARGET-30-PASWFB-01,TARGET-30-PALUYS-01,TARGET-20-PANLIR-09,TARGET-20-PARUBT-09,TARGET-30-PASXRJ-01,TARGET-10-PAPDWT-04,...,TARGET-50-PAKYLT-01,TARGET-50-PAKFYV-01,TARGET-10-PAPLDM-04,TARGET-10-PAPISG-04,TARGET-10-PAPEJN-09,TARGET-10-PANCVR-04,TARGET-21-PATKWH-09,TARGET-10-PASDYK-03,TARGET-50-PAKXWB-01,TARGET-50-PALERC-01
DUX4L1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Summary statistics for each sample (one column per sample), computed across the genes
per_sample_stats = expression.describe()
per_sample_stats

Unnamed: 0,TARGET-30-PAMEZH-01,TARGET-30-PATEKG-01,TARGET-52-PASDLA-11,TARGET-10-PAPECF-09,TARGET-30-PASWFB-01,TARGET-30-PALUYS-01,TARGET-20-PANLIR-09,TARGET-20-PARUBT-09,TARGET-30-PASXRJ-01,TARGET-10-PAPDWT-04,...,TARGET-50-PAKYLT-01,TARGET-50-PAKFYV-01,TARGET-10-PAPLDM-04,TARGET-10-PAPISG-04,TARGET-10-PAPEJN-09,TARGET-10-PANCVR-04,TARGET-21-PATKWH-09,TARGET-10-PASDYK-03,TARGET-50-PAKXWB-01,TARGET-50-PALERC-01
count,589.0,589.0,589.0,589.0,589.0,589.0,589.0,589.0,589.0,589.0,...,589.0,589.0,589.0,589.0,589.0,589.0,589.0,589.0,589.0,589.0
mean,9.645804,9.678156,9.958623,9.157219,9.703074,9.714006,9.46464,9.003629,10.07783,9.802085,...,10.598978,10.329492,10.01228,9.213649,9.932474,11.343916,8.765203,11.155754,10.262547,10.288921
std,3.290569,3.313647,3.35387,4.272143,3.554448,3.516336,4.024158,4.224857,3.239847,4.69112,...,3.198588,3.454184,4.562861,4.477618,4.421628,4.054626,4.531193,3.509086,3.298093,3.501954
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8.3625,8.6642,8.7754,7.6008,8.1877,8.5121,7.4079,7.0636,9.2703,7.4378,...,9.4822,9.0379,7.8083,7.0092,8.0518,9.155,6.2604,8.8655,9.1572,9.1133
50%,10.3829,10.4784,10.7805,10.2426,10.7755,10.5143,10.7324,9.939,10.864,11.2668,...,11.4216,11.2389,11.3077,10.6138,11.2964,12.5426,9.9861,11.8232,11.0169,11.1911
75%,11.6826,11.6079,11.9825,11.884,12.0746,11.9537,12.2392,11.8571,11.9491,13.1017,...,12.6542,12.6109,13.3127,12.227,12.9833,14.123,12.09,13.597,12.2778,12.5898
max,17.6313,17.257799,16.6401,19.1518,16.8806,17.8258,17.564699,18.628799,16.750799,19.573099,...,16.8965,16.234699,17.7087,18.494699,18.561501,18.940901,18.021,19.0483,17.580099,16.384001


In [19]:
# Get some stats per gene (one row per gene, computed across all samples).
# describe() summarises each column independently, so taking the first five genes
# before describing yields the same table as describing every gene and then
# calling head(), without computing statistics that are immediately discarded.
expression.head().T.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ABI1,734.0,11.928934,1.206492,9.4488,10.9251,11.74595,12.877825,15.6091
ABL1,734.0,12.089824,0.962255,7.8118,11.566075,12.1249,12.708775,15.1095
ABL2,734.0,10.631668,0.954671,6.4885,10.04135,10.73695,11.233225,13.256
ACKR3,734.0,7.77858,2.58461,0.0,6.111875,8.22825,9.6953,14.2935
ACSL3,734.0,10.931127,1.036644,7.1345,10.339275,11.0144,11.595975,13.7378


In [20]:
# Find all the genes this sample expresses above the cohort's per-gene 75th percentile
sample = expression["TARGET-30-PAMEZH-01"]
# Per-gene upper quartile across all samples; describe() labels that statistic "75%"
cutoff = expression.T.describe().T["75%"]
# Both series are indexed by gene symbol, so the comparison lines up gene by gene
up = sample[sample > cutoff]
# Single-argument print(...) emits the same text under Python 2 and Python 3
print("Outliers: {}".format(len(up)))
up.head()

Outliers: 111


ACVR1    10.9343
AKT1     12.8083
ALDH2    12.6925
ALK      11.0032
AMER1     9.5687
Name: TARGET-30-PAMEZH-01, dtype: float32