In [1]:
from omicidx.geo import parser as gp

In [2]:
?gp.get_geo_accessions

[0;31mSignature:[0m
[0mgp[0m[0;34m.[0m[0mget_geo_accessions[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0metyp[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbatch_size[0m[0;34m=[0m[0;36m1000[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0madd_term[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0memail[0m[0;34m=[0m[0;34m'user@example.com'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Get GEO accessions by etyp

Useful for getting all the ETYP accessions for
later bulk processing

Parameters
----------
etyp: str
    One of GSE, GPL, GSM, GDS
batch_size: int 
    the number of accessions to return in one batch. 
    Transparent to the user, as this returns an iterator.
add_term: str
    Add a search term for the query. Useful to limit
    by date or search for specific text. For example, 
    to limit by date: '2007/01/01:2007/03/01[PDAT]'
email: str
    user email (not imp

In [3]:
import datetime
import logging
logging.basicConfig(level=logging.INFO)

logger = logging.getLogger()

def yielder():
    start_date = datetime.date(2020, 1, 1)
    end_date = datetime.date.today()
    delta = datetime.timedelta(days=1)
    while start_date < end_date:
        logger.info(f"date: {start_date}")
        d1 = start_date.strftime('%Y/%m/%d')
        start_date = start_date+delta
        d2 = start_date.strftime('%Y/%m/%d')
        term = f"{d1}:{d2}[PDAT]"
        geo_acc_iterator = gp.get_geo_accessions(add_term=term)
        for a in geo_acc_iterator:
            yield a

In [75]:
import httpx
import asyncio

queue = asyncio.Queue(15)


async def get_url(GEO):
    url = f"https://geo-serverless-whnnxetv4q-uc.a.run.app/geo/{GEO}"
    async with httpx.AsyncClient() as client:
        try:
            resp = await client.get(url)
            print(GEO)
            print(resp.json())
            return resp.json()
        except:
            print('error')
    
async def producer(queue):
    for geo in yielder():
        await queue.put(geo)
    
async def consumer(queue):
    while True:
        val = await queue.get()
        ret = await get_url(val)
        queue.task_done()
        return(ret)

async def main():
    consumers = []
    for _ in range(15):
        consumer1 = asyncio.create_task(consumer(queue))
        consumers.append(consumer1)
    try:
        for geo in yielder():
            await queue.put(geo)
        await queue.join()
    finally:
        for t in consumers:
            t.cancel()


In [73]:
await main()

INFO:root:date: 2020-01-01
INFO:root:found 8552 records for None database


GSE84351
{'GSE84351': {'title': 'Reprogramming Hutchinson-Gilford Progeria Syndrome fibroblasts resets epigenomic landscape in patient-derived induced pluripotent stem cells [Affymetrix]', 'status': 'Public on Jan 02 2020', 'submission_date': '2016-07-13', 'last_update_date': '2020-01-04', 'accession': 'GSE84351', 'subseries': [], 'bioprojects': ['PRJNA340427'], 'sra_studies': [], 'contact': {'city': 'Ottawa', 'name': {'first': 'Carol', 'middle': '', 'last': 'Perez-Iratxeta'}, 'email': 'ogicinfo@ohri.ca', 'state': 'ON', 'address': '501 Smyth Rd.', 'department': 'Ottawa Hospital Research Institute', 'country': 'Canada', 'web_link': None, 'institute': 'Ontario Genomics Innovation Centre (OGIC)', 'zip_postal_code': None, 'phone': '(613) 737-8899    -73255'}, 'type': ['Expression profiling by array'], 'summary': 'Hutchinson-Gilford Progeria Syndrome (HGPS) is a segmental premature aging disorder caused by the accumulation of the truncated form of Lamin A known as Progerin within the nuclea

CancelledError: 