In [1]:
import sys
import os
from urllib.request import urlopen, urlretrieve
import gzip
HOMEDIR = os.getcwd()

import numpy as np
import pandas as pd
from Bio import Entrez, Medline

In [2]:
Entrez.email = 'sowmyamanojna@gmail.com'
handle = Entrez.esearch(db='gds', term="PCOS", retmax=90, usehistory="n")

In [3]:
results = Entrez.read(handle)
newhandle = Entrez.esummary(db='gds', retmax=10, webenv=results['WebEnv'], query_key=results['QueryKey'])
summary = Entrez.read(newhandle)

In [4]:
def get_soft_url(file_id, datatype):
    """Get soft file depending on file type and ID"""
    if datatype == 'GSE':
        # soft_file = "ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/by_series/" + datatype + file_id + "/" + datatype + file_id + "_family.soft.gz"
        soft_file = "https://ftp.ncbi.nlm.nih.gov/geo/series/" + "GSE" + file_id[:-3] + "nnn/GSE" + file_id + "/soft/GSE" + file_id + "_family.soft.gz"
    else:
        soft_file = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/" + "GSE" + file_id[:-3] + "nnn/GSE" + file_id + "/soft/GSE" + file_id + "_family.soft.gz"
        # soft_file = "ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/by_series/" + "GSE" + file_id + "/" + "GSE" + file_id + "_family.soft.gz"
        #soft_file = "ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/" + datatype + "/" + datatype + file_id + ".soft.gz"
    return soft_file

In [5]:
hits = []
for i in range(len(summary)):
    samples = []
    geo_data = {'id':summary[i]['Id'],
                'n_samples':summary[i]['n_samples'],
                'pubdate':summary[i]['PDAT'],
                'platform':summary[i]['PlatformTitle'],
                'suppfile':summary[i]['suppFile'],
                'taxon':summary[i]['taxon'],
                'entry_type':summary[i]['entryType'],
                'gpl':summary[i]['GPL'],
                'gse':summary[i]['GSE'],
                'pubmed_ids':summary[i]['PubMedIds'],
                'title':summary[i]['title'],
                'gds_type':summary[i]['gdsType'],
                'summary':summary[i]['summary'],
                'soft_file': "https" + summary[i]["FTPLink"][3:] + "/soft/" + summary[i]["Accession"] + "_family.soft.gz"
                }
    hits.append(geo_data)

In [21]:
def parse_soft(softzip):
    """ Take a SOFT gz file and parse it for specific data parse_soft
        Dictionary with platform and available suppfile(s) when available
        Type: results['platform'] (string)
        Type: results['suppfiles'] (list)
    """
    print("Softzip:", softzip)
    z = gzip.open(softzip, 'r')
    results = {}
    suppfiles = []
    platform = ''
    for line in z:
        line = str(line)
        # Get platform title
        if line.startswith(b'!Platform_title'):
            print("Found Platform!")
            index = line.find(b'= ')
            platform = str(line[index+2:])
            continue
            
        # Get link to supplementary data file(s)
        if line.startswith(b'!Series_supplementary_file'):
            print("Found Supplementary")
            index = line.find(b'= ')
            suppfiles.append(str(line[index+2:]).strip())
            continue
        
        # If platform and suppfiles found, stop reading (long!) file.
        if platform != '' and suppfiles != []:
            break
        else:
            continue
    results['platform'] = platform.strip()
    results['suppfiles'] = suppfiles    
    return results

In [22]:
def dl_geo(data):
    """Download geo data and log info in the following format
    """
    for item in data:
        if item['suppfile'] != "":
            # raw_data = 'ftp://ftp.ncbi.nih.gov/geo/series/GSE' + str(item['gse'][:-3]) + '/GSE' + str(item['gse']) + '/suppl/GSE' + str(item['gse']) + '_RAW.tar'
            raw_data = "https://ftp.ncbi.nlm.nih.gov/geo/series/" + "GSE" + str(item['gse'][:-3]) + "nnn/GSE" + str(item['gse']) + "/suppl/"
            print(raw_data)
        else:
            raw_data = ''
        
        uniqueid =  "GSE" + str(item['gse'])
        newdir = HOMEDIR + "/" + uniqueid
        
        """ Check to see if the directory already exists.
            If so, skip to next item.
        """
        if os.path.isdir(newdir) == True:
            print("Skipped " + uniqueid)
            continue
        else:
            os.makedirs(newdir) # make new directory for sample
            os.chdir(newdir) # change to the created directory
            """ Download the SOFT file for series
            """
            print("uniqueid:", uniqueid)
            save_soft = uniqueid + "_family.soft.gz"
            print("URL retrieve:", urlretrieve(item['soft_file'], save_soft))
            if urlretrieve(item['soft_file'], save_soft) == False:
                print("No " + str(save_soft) + " file available.")
                break
            else:
        
                """ Parse the downloaded SOFT file for platform & possible suppfile(s)
                    Dictionary returned with platform and suppfiles (when available.)
                """
                print("save_soft:", save_soft)
                soft_res = parse_soft(save_soft)
                print(soft_res)
                """ Download the raw file if available
                    First attempts with constructed url, then falls back on SOFT file.
                    If then not available, nothing is stored.
                """
                if raw_data != '': # if generated _RAW.tar.gz generated (CEL, TXT, avail)
                    save_raw = uniqueid + "_RAW.tar" # create local filename
                    print("RAW data:", raw_data)
                    if urlretrieve(raw_data, save_raw) == False: # if dl attempt fails try soft for urls
                        for supf in soft_res['suppfiles']: # check for multiple suppfile urls
                            if supf != '': # verify if supplementary files entry is not empty or not available
                                if urlretrieve(supf, os.path.basename(supf)) == False: # if dl attempt fails give up
                                    pass
                                else:
                                    print("supf:", supf)
                                    urlretrieve(supf, os.path.basename(supf)) # download the file, finally!
                            else:
                                pass
                    else:
                        pass
                else:
                    pass
            
            """ Write text file with related information about the current
                downloaded series. Save to filename with GSEID (ex. GSE0000.txt)
            """
            if item['platform'] == '':
                platf_name = soft_res['platform']
            else:
                platf_name = item['platform']
            geo_results = "GSE ID: " + str(item['gse']) + '\n' + \
                        "PubDate: " + str(item['pubdate']) + '\n' + \
                        "Number of Samples: " + str(item['n_samples']) + '\n' + \
                        "Title: " + item['title'] + '\n' + \
                        "Taxonomy: " + item['taxon'] + '\n' + \
                        "PubMed: " + str(item['pubmed_ids']) + '\n' + \
                        "SOFT downloaded from: " + item['soft_file'] + '\n' + \
                        "Platform: " + platf_name + '\n' + \
                        "Raw Data downloaded from: " + raw_data + "\n"
            filename = uniqueid + ".txt"
            f = open(filename, 'w')
            f.write(geo_results)
            f.close()
            os.chdir(HOMEDIR)
            
dl_geo(hits)

https://ftp.ncbi.nlm.nih.gov/geo/series/GSE146nnn/GSE146856/suppl/
Skipped GSE146856
https://ftp.ncbi.nlm.nih.gov/geo/series/GSE155nnn/GSE155314/suppl/
Skipped GSE155314
https://ftp.ncbi.nlm.nih.gov/geo/series/GSE145nnn/GSE145461/suppl/
uniqueid: GSE145461
URL retrieve: ('GSE145461_family.soft.gz', <http.client.HTTPMessage object at 0x7ff33851a880>)
save_soft: GSE145461_family.soft.gz
Softzip: GSE145461_family.soft.gz


TypeError: startswith first arg must be str or a tuple of str, not bytes

In [12]:
df = pd.DataFrame(summary)
# print(df.columns)
display(df)
print(df["suppFile"])

Unnamed: 0,Item,Id,Accession,GDS,title,summary,GPL,GSE,taxon,entryType,...,ExtRelations,n_samples,SeriesTitle,PlatformTitle,PlatformTaxa,SamplesTaxa,PubMedIds,Projects,FTPLink,GEO2R
0,[],200146856,GSE146856,,Novel PGK1 determines AR stability dependent o...,Ovulation disorder is the main characteristic ...,20795,146856,Homo sapiens,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP2...",6,,,,,"[IntegerElement(33096483, attributes={})]",[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE146nn...,no
1,[],200155314,GSE155314,,Prenatal Androgenization Alters the Developmen...,Polycystic ovary syndrome (PCOS) is the most c...,21103,155314,Mus musculus,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP2...",48,,,,,"[IntegerElement(33095238, attributes={})]",[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE155nn...,no
2,[],200145461,GSE145461,,Next Generation Sequencing Analysis of exosome...,Purpose: the goals of this study are to identi...,20301,145461,Homo sapiens,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP2...",10,,,,,"[IntegerElement(32651991, attributes={})]",[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE145nn...,no
3,[],200155489,GSE155489,,Comprehensive molecular features of polycystic...,We aimed to elucidate the molecular features a...,20795,155489,Homo sapiens,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP2...",20,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE155nn...,no
4,[],200156961,GSE156961,,Ovarian tissues of prenatally androgenized (PN...,This SuperSeries is composed of the SubSeries ...,13112,156961,Mus musculus,GSE,...,[],15,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE156nn...,no
5,[],200156960,GSE156960,,MBD-seq in ovarian tissues of prenatally andro...,To investigate the molecular mechanism of PCOS...,13112,156960,Mus musculus,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP2...",10,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE156nn...,no
6,[],200156895,GSE156895,,RNA-seq in ovarian tissues of prenatally andro...,To investigate the molecular mechanism of PCOS...,13112,156895,Mus musculus,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP2...",5,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE156nn...,no
7,[],200156140,GSE156140,,ATAC-Seq Data of Adipose Stem Cells from Subcu...,"In previous studies, multiple animal models (r...",20301,156140,Homo sapiens,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP2...",18,,,,,"[IntegerElement(33228780, attributes={})]",[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE156nn...,no
8,[],200156067,GSE156067,,RNA-seq Data of Newly-Formed Adipocytes from A...,Normal-weight polycystic ovary syndrome (PCOS)...,20301,156067,Homo sapiens,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP2...",18,,,,,"[IntegerElement(33228780, attributes={})]",[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE156nn...,no
9,[],200154274,GSE154274,,"RRBS, RNA-seq of placenta in women with PCOS a...",Polycystic ovarian syndrome (PCOS) is characte...,16791;18694,154274,Homo sapiens; Rattus norvegicus,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP2...",24,,,,,"[IntegerElement(32949999, attributes={})]",[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE154nn...,no


0                TXT
1                TXT
2               XLSX
3                CSV
4                   
5               XLSX
6               XLSX
7    BED, NARROWPEAK
8                TXT
9               XLSX
Name: suppFile, dtype: object


In [24]:
df = pd.DataFrame(summary)
df

Unnamed: 0,Item,Id,Accession,GDS,title,summary,GPL,GSE,taxon,entryType,...,ExtRelations,n_samples,SeriesTitle,PlatformTitle,PlatformTaxa,SamplesTaxa,PubMedIds,Projects,FTPLink,GEO2R
0,[],200146856,GSE146856,,Novel PGK1 determines AR stability dependent o...,Ovulation disorder is the main characteristic ...,20795,146856,Homo sapiens,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP2...",6,,,,,"[IntegerElement(33096483, attributes={})]",[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE146nn...,no
1,[],200155314,GSE155314,,Prenatal Androgenization Alters the Developmen...,Polycystic ovary syndrome (PCOS) is the most c...,21103,155314,Mus musculus,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP2...",48,,,,,"[IntegerElement(33095238, attributes={})]",[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE155nn...,no
2,[],200145461,GSE145461,,Next Generation Sequencing Analysis of exosome...,Purpose: the goals of this study are to identi...,20301,145461,Homo sapiens,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP2...",10,,,,,"[IntegerElement(32651991, attributes={})]",[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE145nn...,no
3,[],200155489,GSE155489,,Comprehensive molecular features of polycystic...,We aimed to elucidate the molecular features a...,20795,155489,Homo sapiens,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP2...",20,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE155nn...,no
4,[],200156961,GSE156961,,Ovarian tissues of prenatally androgenized (PN...,This SuperSeries is composed of the SubSeries ...,13112,156961,Mus musculus,GSE,...,[],15,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE156nn...,no
5,[],200156960,GSE156960,,MBD-seq in ovarian tissues of prenatally andro...,To investigate the molecular mechanism of PCOS...,13112,156960,Mus musculus,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP2...",10,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE156nn...,no
6,[],200156895,GSE156895,,RNA-seq in ovarian tissues of prenatally andro...,To investigate the molecular mechanism of PCOS...,13112,156895,Mus musculus,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP2...",5,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE156nn...,no
7,[],200156140,GSE156140,,ATAC-Seq Data of Adipose Stem Cells from Subcu...,"In previous studies, multiple animal models (r...",20301,156140,Homo sapiens,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP2...",18,,,,,"[IntegerElement(33228780, attributes={})]",[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE156nn...,no
8,[],200156067,GSE156067,,RNA-seq Data of Newly-Formed Adipocytes from A...,Normal-weight polycystic ovary syndrome (PCOS)...,20301,156067,Homo sapiens,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP2...",18,,,,,"[IntegerElement(33228780, attributes={})]",[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE156nn...,no
9,[],200154274,GSE154274,,"RRBS, RNA-seq of placenta in women with PCOS a...",Polycystic ovarian syndrome (PCOS) is characte...,16791;18694,154274,Homo sapiens; Rattus norvegicus,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP2...",24,,,,,"[IntegerElement(32949999, attributes={})]",[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE154nn...,no
