# initialization

In [9]:
from pymongo import MongoClient
client = MongoClient()
db = client['arraymap_ga4gh']

# retrieving all biosample ids for a given ontology

In [10]:
biosampleIds = db.biosamples.distinct("id", {"bio_characteristics.ontology_terms.term_id" : "NCIT:C4017"})
for i in range(10):
    print(biosampleIds[i])

PGX_AM_BS_GSM514646
PGX_AM_BS_GSM510744
PGX_AM_BS_GSM303729
PGX_AM_BS_GSM481747
PGX_AM_BS_TCGA-AR-A0TX-01A-11D-A087-01
PGX_AM_BS_GSM182856
PGX_AM_BS_GSM511255
PGX_AM_BS_GSM783942
PGX_AM_BS_GSM255268
PGX_AM_BS_GSM481821


# biosample ids are used to get callset ids

In [11]:
callsetIds = db.callsets.distinct("id", { 'biosample_id' : { '$in' : biosampleIds } })
for i in range(10):
    print(callsetIds[i])

PGX_AM_CS_GSM231342
PGX_AM_CS_GSM321941
PGX_AM_CS_GSM481374
PGX_AM_CS_TCGA-A1-A0SK-01A-12D-A087-01
PGX_AM_CS_GSM217530
PGX_AM_CS_GSM510699
PGX_AM_CS_GSM149983
PGX_AM_CS_TCGA-BH-A0EA-01A-11D-A111-01
PGX_AM_CS_GSM933800
PGX_AM_CS_GSM303756


# variant query


In [12]:
variantIds = db.variants.distinct("id", 
	{ '$and' : [
			{ 'reference_name': '1'},
			{ 'start'	:	{ '$gte': 10000 }},
			{ 'end' 	:	{ '$lte': 100000 }},
		],
	}
)
for i in range(10):
    print(variantIds[i])

PGX_AM_V_1043499
PGX_AM_V_620100
PGX_AM_V_1962374
PGX_AM_V_1107284
PGX_AM_V_1253061
PGX_AM_V_2258777
PGX_AM_V_1419401
PGX_AM_V_1270893
PGX_AM_V_1880600
PGX_AM_V_1435065


# Using variantIds to retrieve the callset ids

 These can then be intersected with the callsetIds from a metadata query like above.

In [13]:
callsetIds_from_variants = db.variants.distinct("calls.call_set_id", { 'id' : { '$in' : variantIds } })
for i in range(10):
    print(callsetIds[i])

PGX_AM_CS_GSM231342
PGX_AM_CS_GSM321941
PGX_AM_CS_GSM481374
PGX_AM_CS_TCGA-A1-A0SK-01A-12D-A087-01
PGX_AM_CS_GSM217530
PGX_AM_CS_GSM510699
PGX_AM_CS_GSM149983
PGX_AM_CS_TCGA-BH-A0EA-01A-11D-A111-01
PGX_AM_CS_GSM933800
PGX_AM_CS_GSM303756
