In [8]:
from tqdm.autonotebook import tqdm
import pandas as pd 
import os
import matplotlib.pyplot as plt
import subprocess
import time
# Root directory on the cluster file system; per-dataset download folders
# (e.g. "crc_16s") are created underneath it.
basepath = "/dartfs-hpc/rc/lab/H/HoenA/Lab/QNguyen/ResultsFiles/data"
# Key file handed to ascp through its -i flag (see asp_cmd below).
asp_key = "/dartfs-hpc/rc/home/k/f00345k/.aspera/connect/etc/asperaweb_id_dsa.openssh"
# Aspera command prefix; the key path, source URL and destination are appended to it.
asp_cmd = "ascp -k 1 -QT -l 300m -P33001 -i" 
# Name of the wget executable.
# NOTE(review): not referenced by the cells visible here -- confirm it is still needed.
wget_cmd = "wget"

## Getting CRC data 

First, we process and filter the manifest data from ENA: we keep only amplicon runs and, for each sample, the run with the highest read count.

In [9]:
# Load the ENA run manifest (tab-separated).
manifest = pd.read_csv("crc_16s.tsv", sep="\t")

# Restrict to amplicon runs, then keep, per sample title, the run(s) whose
# read count equals that sample's maximum (ties are all retained).
manifest = manifest.loc[manifest["library_strategy"] == "AMPLICON"]
manifest = manifest.loc[
    manifest["read_count"] == manifest.groupby("sample_title")["read_count"].transform("max")
]
# Renumber rows 0..n-1 so positional lookups line up with the filtered table.
manifest = manifest.reset_index(drop=True)

# Destination folder for the CRC FASTQ files.
crc_path = "{}/crc_16s".format(basepath)

Then we go through each FTP (or Aspera) link in the manifest and download the corresponding FASTQ files.

In [10]:
# For Aspera the command would be:
#     cmd = "{} {} {} {}".format(asp_cmd, asp_key, url, crc_path)
# However, as of the current version Aspera does not work because of
# permission issues, so the files are fetched with wget instead.

def _wget_to_file(url, dest):
    """Fetch ``url`` into ``dest`` with a quiet wget call.

    Returns True when ``dest`` exists and is non-empty afterwards, False otherwise.
    """
    subprocess.run(args=["wget", url, "-O", dest, "--quiet"], stdout=subprocess.DEVNULL)
    return os.path.exists(dest) and os.path.getsize(dest) > 0


def download_sample(idx):
    """Download the R1/R2 FASTQ files for one row of ``manifest``.

    The row's ``fastq_ftp`` field is split on ";" and the first two URLs are
    saved in ``crc_path`` as ``<sample>_R1_001.fastq.gz`` and
    ``<sample>_R2_001.fastq.gz``, where ``<sample>`` is the last
    space-separated token of ``sample_title``.  Files that already exist with
    a non-zero size are skipped; empty leftovers from an earlier failed
    attempt are downloaded again.  Each download is retried once.

    Parameters
    ----------
    idx : int
        Positional row index into the module-level ``manifest`` DataFrame.

    Returns
    -------
    list of str
        URLs that still produced no data after the retry.
    """
    row = manifest.iloc[idx]
    sname = row.sample_title.split(" ")[-1]
    # Slicing (rather than indexing [0] and [1]) tolerates rows with one URL.
    urls = row.fastq_ftp.split(";")[:2]

    failed = []
    for j, url in enumerate(urls):
        fname = "{}_R{}_001.fastq.gz".format(sname, j + 1)
        fullname = crc_path + "/" + fname
        if os.path.exists(fullname) and os.path.getsize(fullname) > 0:
            continue

        print("Downloading {}".format(fname))
        # Pause on every tenth sample (after the first ten).  The logical
        # `and` matters: with `&` the test was parsed as
        # `idx % 10 == (0 & idx) >= 10`, which is never true.
        if idx % 10 == 0 and idx >= 10:
            time.sleep(10)

        if _wget_to_file(url, fullname):
            continue
        print("For some reason this is not downloading, retrying...")
        if not _wget_to_file(url, fullname):
            print("This file is dud")
            failed.append(url)

    # Returned after the loop so that both reads are attempted.
    return failed

In [None]:
# Download every sample in the manifest, keeping the URLs that could not be
# fetched so they can be inspected or retried afterwards.
failed_urls = []
for idx in tqdm(range(manifest.shape[0])):
    failed_urls.extend(download_sample(idx))

## Getting HMP data 

In [None]:
"""
These functions download the CRC, IBD, or HMP 16S data.

db: an SRAweb() instance from pysradb.
When a subject or sample has several runs, we keep the run(s) with the
largest number of sequenced bases.

For get_crc, we keep only amplicon sequencing runs and include only the FR patients.
For get_ibd, we keep only samples taken from the same site (terminal ileum).
"""
def get_crc(db, basepath="/dartfs-hpc/rc/lab/H/HoenA/Lab/QNguyen/ResultsFiles/data",
            metadata_path="../data/crc_16s_metadata.csv"):
    """Download the CRC amplicon runs of project PRJEB6070 and save their metadata.

    Runs are restricted to ``library_strategy == "AMPLICON"``; for every
    "host subject id" only the run(s) with the largest ``run_total_bases``
    are kept, and rows whose diagnosis is the string "N/A" are dropped.

    Parameters
    ----------
    db : object
        Provides ``sra_metadata(accession, detailed=...)`` returning a
        DataFrame and ``download(df=..., out_dir=..., ...)``.
    basepath : str, optional
        Directory under which the ``crc_16s`` download folder is created.
    metadata_path : str, optional
        Where the selected metadata columns are written as CSV.

    Returns
    -------
    int
        Always 0.
    """
    fullpath = basepath + "/crc_16s"
    # makedirs also creates missing parents and tolerates an existing folder.
    os.makedirs(fullpath, exist_ok=True)

    mt = db.sra_metadata("PRJEB6070", detailed=True)
    mt_filt = mt[mt.library_strategy == "AMPLICON"]
    max_bases = mt_filt.groupby("host subject id")["run_total_bases"].transform("max")
    mt_filt = mt_filt[mt_filt["run_total_bases"] == max_bases]
    mt_filt = mt_filt[mt_filt["diagnosis"] != "N/A"]
    # The result must be assigned back; the bare call left the index untouched.
    mt_filt = mt_filt.reset_index(drop=True)

    final_metadata = mt_filt[["host subject id", "run_accession", "sample_accession",
                              "diagnosis", "sample name", "sex", "age"]]
    metadata_dir = os.path.dirname(metadata_path)
    if metadata_dir:
        os.makedirs(metadata_dir, exist_ok=True)
    final_metadata.to_csv(metadata_path)

    print("Downloading...")
    # download() is invoked once; its return value is not callable, so the
    # former trailing "()" is gone.
    db.download(df=mt_filt, out_dir=fullpath, threads=4, use_ascp=False, skip_confirmation=True)
    return 0

def get_ibd(db, basepath="/dartfs-hpc/rc/lab/H/HoenA/Lab/QNguyen/ResultsFiles/data",
            metadata_path="../data/ibd_16s_metadata.csv"):
    """Download the terminal-ileum runs of project PRJEB13679 and save their metadata.

    Runs are restricted to ``biopsy location == "Terminal ileum"``; for every
    "anonymized name" only the run(s) with the largest ``run_total_bases``
    are kept.

    Parameters
    ----------
    db : object
        Provides ``sra_metadata(accession, detailed=...)`` returning a
        DataFrame and ``download(df=..., out_dir=..., ...)``.
    basepath : str, optional
        Directory under which the ``ibd_16s`` download folder is created.
    metadata_path : str, optional
        Where the selected metadata columns are written as CSV.

    Returns
    -------
    int
        Always 0.
    """
    fullpath = basepath + "/ibd_16s"
    # makedirs also creates missing parents and tolerates an existing folder.
    os.makedirs(fullpath, exist_ok=True)

    mt = db.sra_metadata("PRJEB13679", detailed=True)
    mt_filt = mt[mt["biopsy location"] == "Terminal ileum"]
    max_bases = mt_filt.groupby("anonymized name")["run_total_bases"].transform("max")
    mt_filt = mt_filt[mt_filt["run_total_bases"] == max_bases]

    final_metadata = mt_filt[["run_accession", "sample_accession", "anonymized name",
                              "diagnosis", "sex", "biopsy location", "sample name"]]
    metadata_dir = os.path.dirname(metadata_path)
    if metadata_dir:
        os.makedirs(metadata_dir, exist_ok=True)
    final_metadata.to_csv(metadata_path)

    print("Downloading...")
    # download() is invoked once; its return value is not callable, so the
    # former trailing "()" is gone.
    db.download(df=mt_filt, out_dir=fullpath, threads=4, use_ascp=True, skip_confirmation=True)
    return 0

# pysradb's metadata for this project is missing a lot of information, so the
# run selection is driven by a run table downloaded directly from NCBI.
def get_hmp(db, basepath="/dartfs-hpc/rc/lab/H/HoenA/Lab/QNguyen/ResultsFiles/data",
            metadata_path="../data/hmp_16s_metadata.csv",
            runtable_path="sra_runtable.txt"):
    """Download the HMP runs of project SRP002395 listed in a local run table.

    The run table (CSV) is reduced to, per "Sample Name", the run(s) with the
    largest "Bases" value.  Those run accessions select the rows of the
    pysradb metadata that are passed to the downloader.

    Parameters
    ----------
    db : object
        Provides ``sra_metadata(accession, detailed=...)`` returning a
        DataFrame with a ``run_accession`` column and
        ``download(df=..., out_dir=..., ...)``.
    basepath : str, optional
        Directory under which the ``hmp_16s`` download folder is created.
    metadata_path : str, optional
        Where the selected run-table columns are written as CSV.
    runtable_path : str, optional
        Path of the run table CSV; it must contain the columns "Run",
        "Sample Name", "Bases", "analyte_type" and "submitted_subject_id".

    Returns
    -------
    int
        Always 0.
    """
    fullpath = basepath + "/hmp_16s"

    ref = pd.read_csv(runtable_path)
    max_bases = ref.groupby("Sample Name")["Bases"].transform("max")
    ref_filt = ref[ref["Bases"] == max_bases]
    # rename() needs an old -> new mapping; the former {"Run", "run_accession"}
    # was a set literal, so the column was never renamed.
    ref_filt = ref_filt.rename(columns={"Run": "run_accession"})
    run_list = ref_filt["run_accession"].to_list()

    final_metadata = ref_filt[["Sample Name", "run_accession", "analyte_type", "submitted_subject_id"]]
    metadata_dir = os.path.dirname(metadata_path)
    if metadata_dir:
        os.makedirs(metadata_dir, exist_ok=True)
    final_metadata.to_csv(metadata_path)

    # Filter the pysradb metadata down to the selected runs and download them.
    # makedirs also creates missing parents and tolerates an existing folder.
    os.makedirs(fullpath, exist_ok=True)
    mt = db.sra_metadata("SRP002395", detailed=False)
    mt_filt = mt[mt.run_accession.isin(run_list)]
    print("Downloading...")
    # download() is invoked once; its return value is not callable, so the
    # former trailing "()" is gone.
    db.download(df=mt_filt, out_dir=fullpath, threads=4, use_ascp=True, skip_confirmation=True)
    return 0
    