## Examples of pulling expression data directly out of the MedBook MongoDB database

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Connect to the MedBook database on the MongoDB host named "mongo"
import pymongo
db = pymongo.MongoClient("mongo").MedBook
# Single-argument print(...) prints the same text on Python 2 and Python 3.
# NOTE(review): Collection.count() is deprecated in newer pymongo releases in
# favour of count_documents({}) - confirm against the installed version.
print("Found {} data sets".format(db.data_sets.count()))

Found 28 data sets


In [4]:
def get_expression_levels(data_set_name, limit=0):
    """
    Returns a pandas dataframe with rows as features and columns as samples

    Parameters:
        data_set_name: value of the "name" field used to look up a single
            document in db.data_sets.
        limit: maximum number of genomic_expression documents to fetch.
            Passed straight through to pymongo's find(); the default of 0
            applies no limit.

    Returns:
        A float32 DataFrame indexed by "feature_label", with one column per
        entry of the data set's "sample_labels".

    Raises:
        ValueError: if no data set named data_set_name exists.
    """
    data_set = db.data_sets.find_one({"name": data_set_name})
    if data_set is None:
        # find_one() returns None when nothing matches; fail with a clear
        # message instead of a TypeError on the subscripts below.
        raise ValueError("No data set named {!r} found".format(data_set_name))

    sample_labels = data_set["sample_labels"]
    print("Found {} samples in {}".format(len(sample_labels), data_set_name))

    features = db.genomic_expression.find({"data_set_id": data_set["_id"]}, limit=limit)
    features = features.sort([("feature_label", pymongo.ASCENDING)])

    # Each feature's "values" list becomes one column indexed by the sample
    # labels (samples x features); transpose to get features x samples.
    return pd.DataFrame({f["feature_label"]: f["values"] for f in features},
                        index=sample_labels, dtype=np.float32).T

In [8]:
# Get expression levels for all samples in a study
# This can take a few seconds if you pick a big dataset
samples = get_expression_levels("Treehouse Prospectives 2016.10.14")
# The frame is features x samples, so shape[0] counts features and shape[1]
# counts samples. Single-argument print(...) works on Python 2 and Python 3.
print("Found {} samples with {} features".format(samples.shape[1], samples.shape[0]))
print("First 5 features")
samples.head()

Found 11 samples in Treehouse Prospectives 2016.10.14
Found 11 samples with 58581 features
First 5 features


Unnamed: 0,ckcc/TH03_0112_S01_RNASeq,ckcc/TH03_0114_S01_RNASeq,ckcc/TH03_0118_S01_RNASeq,ckcc/TH03_0010_S02_RNASeq,ckcc/TH03_0113_S01_RNASeq,ckcc/TH03_0115_S01_RNASeq,ckcc/TH03_0113_S02_RNASeq,ckcc/TH03_0025_S02_RNASeq,ckcc/TH03_0117_S01_RNASeq,ckcc/TH03_0116_S01_RNASeq,ckcc/TH03_0112_S02_RNASeq
5S_rRNA,0.0,0.0,0.0,0.0,0.0,0.034135,0.0,0.0,0.0,0.0,0.0
5_8S_rRNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7SK,0.137901,0.0,0.0,0.0,0.0,0.182055,0.0,0.0,0.0,0.0,0.0
A1BG,6.38863,10.72381,8.41336,7.476323,9.012438,8.226566,10.706632,7.350043,6.907166,14.760183,6.608877
A1BG-AS1,6.96807,2.785655,8.379173,7.006485,7.645007,8.136575,8.594339,7.54425,5.962975,6.243307,6.57355


In [9]:
# Select the column for a single sample and preview its first few features
samples.loc[:, "ckcc/TH03_0118_S01_RNASeq"].head()

5S_rRNA      0.000000
5_8S_rRNA    0.000000
7SK          0.000000
A1BG         8.413360
A1BG-AS1     8.379173
Name: ckcc/TH03_0118_S01_RNASeq, dtype: float32

In [13]:
# Compute stats on expression for just JAK2 across all samples.
# Features are the row index, so select the row directly with .loc instead
# of transposing the whole frame first.
samples.loc["JAK2"].describe()

count    11.000000
mean      9.363747
std       1.097198
min       6.494574
25%       9.270689
50%       9.702656
75%      10.048644
max      10.142463
Name: JAK2, dtype: float64