# initialization

In [1]:
from pymongo import MongoClient
client = MongoClient()
db = client['test']

# retrieving all biosample ids for a given ontology

In [2]:
biosampleIds = db.biosamples.distinct("id", {"bio_characteristics.ontology_terms.term_id" : "NCIT:C4017"})
for i in range(10):
    print(biosampleIds[i])

PGX_AM_BS_GSM217472
PGX_AM_BS_BrCa-fri-S0052
PGX_AM_BS_BrCa-fri-S0081
PGX_AM_BS_GSM217554
PGX_AM_BS_GSM217530
PGX_AM_BS_GSM217450
PGX_AM_BS_GSM217411
PGX_AM_BS_GSM217489
PGX_AM_BS_GSM217434
PGX_AM_BS_BrCa-fri-S1522


# biosample ids are used to get callset ids

In [3]:
callsetIds = db.callsets.distinct("id", { 'biosample_id' : { '$in' : biosampleIds } })
for i in range(10):
    print(callsetIds[i])

PGX_AM_CS_GSM217530
PGX_AM_CS_GSM217442
PGX_AM_CS_BrCa-fri-S1504
PGX_AM_CS_GSM217419
PGX_AM_CS_GSM217411
PGX_AM_CS_GSM217497
PGX_AM_CS_GSM217479
PGX_AM_CS_GSM217436
PGX_AM_CS_BrCa-fri-S1524
PGX_AM_CS_BrCa-fri-S0257


# variant query


In [4]:
variantIds = db.variants.distinct("id", 
	{ '$and' : [
			{ 'reference_name': '1'},
			{ 'start'	:	{ '$gte': 10000 }},
			{ 'end' 	:	{ '$lte': 10000000 }},
		],
	}
)
for i in range(10):
    print(variantIds[i])

PGX_AM_V_109994
PGX_AM_V_109884
PGX_AM_V_111075
PGX_AM_V_110476
PGX_AM_V_110050
PGX_AM_V_110391
PGX_AM_V_109609
PGX_AM_V_9681
PGX_AM_V_109161
PGX_AM_V_110906


# Using variantIds to retrieve the callset ids

 These can then be intersected with the callsetIds from a metadata query like above.

In [5]:
callsetIds_from_variants = db.variants.distinct("calls.call_set_id", { 'id' : { '$in' : variantIds } })
for i in range(10):
    print(callsetIds[i])

PGX_AM_CS_GSM217530
PGX_AM_CS_GSM217442
PGX_AM_CS_BrCa-fri-S1504
PGX_AM_CS_GSM217419
PGX_AM_CS_GSM217411
PGX_AM_CS_GSM217497
PGX_AM_CS_GSM217479
PGX_AM_CS_GSM217436
PGX_AM_CS_BrCa-fri-S1524
PGX_AM_CS_BrCa-fri-S0257
