# Downloading 16S raw data files from different sources

Last updated: 2022-04-05     
Quang Nguyen

In [3]:
from tqdm.autonotebook import tqdm
import pandas as pd 
import os
import matplotlib.pyplot as plt
import subprocess
import time
import sys
from multiprocessing import Pool
# Root directory for downloaded data; dataset-specific subdirectories
# (e.g. "/crc_16s") are appended to it further down in the notebook.
basepath = "/dartfs-hpc/rc/lab/H/HoenA/Lab/QNguyen/ResultsFiles/data"
# Key file supplied as the argument of the trailing `-i` flag of `asp_cmd`
# when the Aspera command line is assembled.
asp_key = "/dartfs-hpc/rc/home/k/f00345k/.aspera/connect/etc/asperaweb_id_dsa.openssh"
# Aspera (`ascp`) command prefix. It deliberately ends with `-i` so that the key
# path, source URL and destination directory can be appended after it.
# NOTE(review): the download cell below says Aspera currently fails with
# permission issues, so these two values appear to be unused for now — confirm.
asp_cmd = "ascp -k 1 -QT -l 300m -P33001 -i" 

  from tqdm.autonotebook import tqdm


In [2]:
!pwd

/dartfs-hpc/rc/home/k/f00345k/research/microbe_set_trait/analysis


## Getting CRC data 

First, we're processing and filtering the manifest data from ENA

In [18]:
# Build the CRC download manifest from the ENA report in one pass:
#   1. read the tab-separated report,
#   2. keep only AMPLICON (16S) runs,
#   3. for each sample_title keep the run(s) whose read_count equals that
#      sample's maximum (ties are all retained),
#   4. renumber the rows 0..n-1 so they can be addressed positionally.
manifest = (
    pd.read_csv("crc_16s.tsv", sep="\t")
    .loc[lambda runs: runs["library_strategy"] == "AMPLICON"]
    .loc[
        lambda runs: runs["read_count"]
        == runs.groupby("sample_title")["read_count"].transform("max")
    ]
    .reset_index(drop=True)
)

# Destination directory for the CRC FASTQ files.
crc_path = "{}/crc_16s".format(basepath)

Then we go through each FTP (or Aspera) link in the manifest and download the corresponding file.

In [19]:
# cmd = "{} {} {} {}".format(asp_cmd, asp_key, url, crc_path) for aspera - however 
# as of this current version aspera doesn't work due to permission issues 

def _has_content(path):
    """Return True when `path` is an existing file with a non-zero size."""
    return os.path.isfile(path) and os.path.getsize(path) > 0


def _wget(url, dest):
    """Fetch `url` into `dest` with a quiet wget call; return True if data was written."""
    subprocess.run(args=["wget", url, "-O", dest, "--quiet"], stdout=subprocess.DEVNULL)
    return _has_content(dest)


def download_sample(idx):
    """Download the FASTQ files (R1 and R2) for one row of the CRC manifest.

    Reads the module-level ``manifest`` DataFrame and the ``crc_path``
    destination directory. The first two ``;``-separated entries of the row's
    ``fastq_ftp`` field are fetched with ``wget`` and stored as
    ``<sample>_R1_001.fastq.gz`` and ``<sample>_R2_001.fastq.gz``, where
    ``<sample>`` is the last space-separated token of ``sample_title``.
    Files that already exist with a non-zero size are skipped, so the function
    can be re-run to resume an interrupted batch.

    Parameters
    ----------
    idx : int
        Positional (``iloc``) index of the manifest row to download.

    Returns
    -------
    list of str
        URLs that still produced an empty or missing file after one retry.
    """
    row = manifest.iloc[idx]
    sname = row.sample_title.split(" ")[-1]
    # Only the first two links are used (forward, reverse); slicing instead of
    # indexing avoids an IndexError when a row lists a single file.
    urls = row.fastq_ftp.split(";")[:2]

    failed = []
    for url, read_tag in zip(urls, ("R1", "R2")):
        fname = "{}_{}_001.fastq.gz".format(sname, read_tag)
        fullname = crc_path + "/" + fname
        if _has_content(fullname):
            continue

        print("Downloading {}".format(fname))
        # Pause on every tenth sample to throttle requests. The original
        # `idx % 10 == 0 & idx >= 10` never fired: `&` binds tighter than the
        # comparisons, turning it into `idx % 10 == 0 >= 10`.
        if idx % 10 == 0 and idx >= 10:
            time.sleep(10)

        if _wget(url, fullname):
            continue
        print("For some reason this is not downloading, retrying...")
        if _wget(url, fullname):
            continue

        print("This file is dud")
        failed.append(url)
        # `wget -O` leaves an empty file behind on failure; remove it so that a
        # later run does not mistake it for a finished download.
        if os.path.exists(fullname):
            os.remove(fullname)

    # Return only after both reads were attempted (the original returned from
    # inside the loop, so the R2 file was never downloaded).
    return failed

This code is run in parallel using the `multiprocessing` package across 5 cores using `nohup` on the Dartmouth Polaris server

## Getting HMP data 

HMP data can't be downloaded directly from ENA due to unspecified restrictions. The raw 16S rRNA gene sequencing data files are available on their [website](https://www.hmpdacc.org/hmp/HMR16S/). As such, we hand-curate the manifest file for the data download. An example link is "http://downloads.ihmpdcc.org/data/HMR16S/SRR040.tar.bz2".  

In [6]:
# One archive per accession prefix SRR040 .. SRR050 on the HMP download server,
# each paired with a label describing the accession range it holds.
# NOTE(review): the labels for prefixes 40-49 read "SR0..." in their second half
# while the last label reads "SRR0..."; this looks like a typo but is kept as-is
# because the labels are written to the manifest CSV — confirm before changing.
urls = []
samples = []
for prefix in range(40, 51):
    urls.append("http://downloads.ihmpdcc.org/data/HMR16S/SRR0" + str(prefix) + ".tar.bz2")
    if prefix < 50:
        samples.append("SRR0{}000_SR0{}999".format(prefix, prefix))
    else:
        # The final archive is labelled by hand.
        samples.append("SRR050000_SRR059999")

In [7]:
# Assemble the hand-curated HMP manifest: one row per archive, pairing the
# accession-range label with its download URL.
hmp_columns = dict(
    samples=pd.Series(samples),
    urls=pd.Series(urls),
)
hmp_manifest = pd.DataFrame(hmp_columns)

# Persist the manifest (row index included) for the stand-alone download script.
hmp_manifest.to_csv("../python/hmp_urls.csv")

Similar to the files above, this is run asynchronously across 5 cores using `multiprocessing` and `nohup` through the Python script `download_hmp.py`. The files are then extracted using `tar` and converted to `fastq` files using the `sff2fastq` utility installed via `bioconda`.