In [62]:
import urllib
import os 
import concurrent.futures

import pandas as pd
from bs4 import BeautifulSoup

# Download assemblies/contigs from HMP2 study

Got a HMTL of the contigs download links from HMP2 from https://ibdmdb.org/downloads/html/products_MGX_2017-08-12.html

Data was processed by the following pipeline https://github.com/biobakery/hmp2_workflows/tree/master/hmp2_workflows

TLDR: 
1. Reads were processed using kneaddata. Trimmomatic and then removal of contaminants with bowtie2 https://huttenhower.sph.harvard.edu/kneaddata/
2. Assembled with MEGAHIT

In [13]:
soup = BeautifulSoup(open("products_MGX_2017-08-12.html"), 'html.parser')
a_results = soup.find_all("a")

In [31]:
download_links = []
download_paths = []
directory_path = "/orcd/data/braakman/001/data/assemblies/hmp2"

for element in a_results:
    download_link = element.get('href')
    if "contigs" in download_link:
        name = os.path.basename(download_link)
        file_path = os.path.join(directory_path, name)
        download_links.append(download_link)
        download_paths.append(file_path)

In [64]:
def download_file(link: str, path: str):
    try:
        urllib.request.urlretrieve(link, path)
        return (True, link)
    except:
        return (False, link)

with concurrent.futures.ThreadPoolExecutor() as executor:
    downloading = executor.map(download_file, download_links, download_paths)
    
    download_df = pd.DataFrame(downloading, columns = ["Downloaded", "Link"])

download_df

Unnamed: 0,Downloaded,Link
0,True,https://g-227ca.190ebd.75bc.data.globus.org/ib...
1,True,https://g-227ca.190ebd.75bc.data.globus.org/ib...
2,True,https://g-227ca.190ebd.75bc.data.globus.org/ib...
3,True,https://g-227ca.190ebd.75bc.data.globus.org/ib...
4,True,https://g-227ca.190ebd.75bc.data.globus.org/ib...
...,...,...
1333,True,https://g-227ca.190ebd.75bc.data.globus.org/ib...
1334,True,https://g-227ca.190ebd.75bc.data.globus.org/ib...
1335,True,https://g-227ca.190ebd.75bc.data.globus.org/ib...
1336,True,https://g-227ca.190ebd.75bc.data.globus.org/ib...


In [70]:
print(len(download_links))
download_df["Downloaded"].sum() == len(download_links)

1338


np.True_

All assemblies/contigs have been downloaded

# Look at Participant/Sample Metadata

In [72]:
sample_df = pd.read_csv("hmp2_metadata_2018-08-20.csv")

  sample_df = pd.read_csv("hmp2_metadata_2018-08-20.csv")


In [121]:
assembly_df = sample_df.query("data_type == 'metagenomics'").\
              assign(AssemblyName = sample_df["External ID"] + "_contigs.fna.gz")

assembly_df = assembly_df[assembly_df["AssemblyName"].isin([os.path.basename(path) for path in download_paths])]
assembly_df[["External ID", "diagnosis"]]

Unnamed: 0,External ID,diagnosis
1077,CSM5MCXD,CD
1078,CSM5MCYS,CD
1079,CSM67U9J,CD
1080,CSM67UA2,CD
1081,CSM67UGC,CD
...,...,...
2701,PSMA26A1,UC
2702,PSMA26A3,UC
2703,PSMB4MC1,UC
2704,PSMB4MC3,UC


In [139]:
assembly_df.drop_duplicates("Participant ID")[["Participant ID", "Age at diagnosis", "sex", "site_name", "diagnosis", "Antibiotics", "Chemotherapy"]]

Unnamed: 0,Participant ID,Age at diagnosis,sex,site_name,diagnosis,Antibiotics,Chemotherapy
1077,C3001,28.0,Female,Cedars-Sinai,CD,No,No
1091,C3002,47.0,Female,Cedars-Sinai,CD,No,Yes
1102,C3003,29.0,Female,Cedars-Sinai,UC,No,No
1114,C3004,33.0,Female,Cedars-Sinai,UC,No,No
1137,C3005,58.0,Female,Cedars-Sinai,UC,No,No
...,...,...,...,...,...,...,...
2654,P6028,9.0,Male,MGH Pediatrics,CD,Yes,No
2664,P6033,15.0,Male,MGH Pediatrics,CD,No,No
2675,P6035,16.0,Male,MGH Pediatrics,UC,No,No
2686,P6037,15.0,Male,MGH Pediatrics,CD,No,No
