# In [1]:

import os as os

import requests as req
import pandas as pd

# Directory that the local output paths of this script are built under.
out_folder = '/TL/deep/fhgfs/projects/pebert/thesis/projects/cross_species/rawdata/validation'

# Tab-separated run table that drives the script.
tsv_table = '/home/pebert/work/code/mpggit/crossspecies/annotation/datasrc/sra/20180805_ENA_PRJEB6906_Villar2015.tsv'

# Path of the cleaned table: same location as the input, "_clean" suffix.
clean_tsv = tsv_table.replace('.tsv', '_clean.tsv')


def download_file(src, trg):
    """Stream the resource at ``src`` into the local file ``trg``.

    The payload is written to ``trg + '.part'`` first and renamed to
    ``trg`` only after the whole body has been received, so ``trg``
    never holds a partially downloaded file.

    :param src: URL handed to ``requests.get``
    :param trg: destination path on the local filesystem
    :return: always True; failures surface as exceptions
    :raises requests.exceptions.HTTPError: if the server responds with a
        4xx/5xx status code
    """
    tmp_trg = trg + '.part'
    response = req.get(src, stream=True)
    try:
        # Without this check the body of an HTTP error response would be
        # saved and renamed as if it were the requested file.
        response.raise_for_status()
        with open(tmp_trg, 'wb') as handle:
            for chunk in response.iter_content(chunk_size=4096):
                if chunk:  # filter out keep-alive new chunks
                    handle.write(chunk)
    finally:
        # With stream=True the connection is only released once the body
        # is fully consumed or the response is closed explicitly.
        response.close()
    os.rename(tmp_trg, trg)
    return True


# Load the run table (tab-separated, first line is the header).
tsv = pd.read_csv(tsv_table, delimiter='\t', header=0)

# tax_id values to keep, paired by position with the assembly label that is
# assigned to each of them.
species_ids = [9606, 9913, 9615, 9685, 9544, 13616, 10092, 9986, 10116, 9823]
assm = ['hg19', 'bosTau7', 'canFam3', 'felCat5', 'rheMac2',
        'monDom5', 'mm9', 'oryCun2', 'rn5', 'susScr2']
assm_lut = dict(zip(species_ids, assm))

# Restrict the table to the listed tax_ids; work on a copy so the column
# assignments below do not touch a view of the original frame.
is_known_species = tsv['tax_id'].isin(species_ids)
tsv = tsv.loc[is_known_species, :].copy()

# Derive the mark from the run alias, then normalise the spelling variants.
# Rows whose alias matches none of the alternatives are labelled 'Input'.
alias_hits = tsv['run_alias'].str.extract('(?P<MARK>(input|H3K4me3|H3K27Ac|Input))', expand=False)
tsv['mark'] = alias_hits['MARK']
tsv['mark'] = tsv['mark'].replace({'H3K27Ac': 'H3K27ac', 'input': 'Input'})
tsv['mark'] = tsv['mark'].fillna('Input')

# Order rows by species, then sample, then mark, and attach the assembly.
tsv = tsv.sort_values(['scientific_name', 'sample_title', 'mark'])
tsv['assembly'] = tsv['tax_id'].replace(assm_lut)

# Replicate counter plus the species/sample seen on the previous row. The
# numbering relies on rows arriving grouped by species and then by sample.
biorep = 0
last_species = None
last_sample = None
dl_list = []      # (url, local path) pairs for files not yet on disk
local_files = []  # one local file name per table row, in row order
# naming scheme follows e.g. bp_mm9_ERX1489062_se-b3_ncd4_H3K27ac.bam
name_template = 'vl_{assm}_{exp}_se-b{biorep}_liver_{mark}.fastq.gz'
for row in tsv.itertuples():
    species = row.scientific_name
    sample = row.sample_title
    if last_species is None:
        # very first row: seed the trackers
        last_species, last_sample = species, sample
    if species != last_species:
        # new species: restart replicate numbering at zero
        biorep = 0
        last_species, last_sample = species, sample
    if sample != last_sample:
        # new sample within the same species: next replicate
        last_sample = sample
        biorep += 1
    out_name = name_template.format(assm=row.assembly,
                                    exp=row.experiment_accession,
                                    biorep=biorep,
                                    mark=row.mark)
    out_path = os.path.join(out_folder, out_name)
    local_files.append(out_name)
    if os.path.isfile(out_path):
        # already present locally, nothing to fetch
        continue
    dl_url = 'http://' + row.fastq_ftp
    dl_list.append((dl_url, out_path))

# Record the local file name of every run in the table.
tsv['local_file'] = local_files

# Write the annotated table once; an existing cleaned file is left untouched.
clean_exists = os.path.isfile(clean_tsv)
if not clean_exists:
    tsv.to_csv(clean_tsv, sep='\t', header=True, index=False)

# Fetch the pending files one after another and report each finished one.
for url, target in dl_list:
    download_file(url, target)
    print('Success ', os.path.basename(target))
        
        
    



