# Additional datasets from other studies
This notebook describes how to access the genomes from other studies that are used in the analysis.

In [122]:
import pandas as pd
import json
from pathlib import Path
import tarfile
import gzip
import shutil
import requests, json
import ast

In [2]:
# Locations shared by the cells of this notebook
## Root of the BGCflow external data directory
external_data_path = Path("/datadrive/bgcflow/data/external")

## Cleaned per-study tables are written here (the directory is created if missing)
external_table_path = Path("../tables/other_studies")
external_table_path.mkdir(parents=True, exist_ok=True)

## Empty config template for BGCflow; every study table is concatenated onto it
df_bgcflow_template = pd.DataFrame(
    columns=[
        "source",
        "organism",
        "genus",
        "species",
        "strain",
        "closest_placement_reference",
    ]
)

In [178]:
# External files required to run the notebook
# TODO: make these files available in the GitHub repository

## GTDB-Tk outputs: derived from external_data_path so the BGCflow root is spelled out only once
gtdbtk_path = external_data_path / "gtdbtk"
file_gtdb_archaea = gtdbtk_path / "gtdbtk.ar122.summary.tsv"
file_gtdb_bacteria = gtdbtk_path / "gtdbtk.bac120.summary.tsv"
file_gtdb_warning = Path("../tables/gtdbtk.warnings.log")

## Per-study inputs kept inside the repository
file_bickhart_checkm = Path("../tables/Bickhart_et_al_checkm_out.tsv")
file_christoph_ncbi = Path("../data/PRJNA449266_AssemblyDetails.txt")
file_chen_metadata = Path("../data/Table_1_Discovery of an Abundance of Biosynthetic Gene Clusters in Shark Bay Microbial Mats.XLSX")

In [154]:
# GTDB-Tk summaries: archaeal and bacterial tables stacked into one frame
df_gtdb_archaea = pd.read_csv(file_gtdb_archaea, sep="\t")
df_gtdb_bacteria = pd.read_csv(file_gtdb_bacteria, sep="\t")
df_gtdb = (
    pd.concat([df_gtdb_archaea, df_gtdb_bacteria])
    # index by genome name, but keep user_genome available as a regular column too
    .set_index("user_genome", drop=False)
)

## Dataset from Bickhart et al (464 genomes)
The dataset can be accessed from https://zenodo.org/record/5138306/files/hifi_das.bin3c.bins.tar.gz. The dataset is then downloaded using wget:

In [None]:
# MAG set for the Bickhart paper (464)
! wget -P ../data https://zenodo.org/record/5138306/files/hifi_das.bin3c.bins.tar.gz?download=1 -nc
! mv ../data/hifi_das.bin3c.bins.tar.gz\?download\=1 ../data/hifi_das.bin3c.bins.tar.gz
! tar -xvzf ../data/hifi_das.bin3c.bins.tar.gz -C /target/directory
tar = tarfile.open("sample.tar.gz")
tar.extractall()
tar.close()

The metadata for the genomes are then cleaned to create a list of genomes that will be used in the analysis:

In [179]:
# Download supplementary materials from the paper
! wget -P ../data https://static-content.springer.com/esm/art%3A10.1038%2Fs41587-021-01130-z/MediaObjects/41587_2021_1130_MOESM3_ESM.xlsx -nc

# EDA - How many HQ MAGs based on DAStool definition?
df_bickhart = pd.read_excel("../data/41587_2021_1130_MOESM3_ESM.xlsx", 'SupplementaryTable2')
df_bickhart.loc[:, "genome_id"] = [i.replace("_", ".") for i in df_bickhart.loc[:, 'HiFiBin']]
df_bickhart = df_bickhart.set_index("genome_id", drop=False)
complete = df_bickhart.loc[:, "HiFiBin Completeness"] > 90
contam = df_bickhart.loc[:, "HifiBin Contamination"] < 5
df_bickhart_HQ = df_bickhart.loc[complete & contam]
print(f"Bickhart et al has a total of {len(df_bickhart)} bins. Based on their methods, {len(df_bickhart_HQ)} are HQ.")

# Caitlin ran CheckM on all bins. How many bins are HQ by the CheckM estimates?
df_bickhart_checkm = pd.read_csv(file_bickhart_checkm, sep="\t")
# genome ids are the CheckM bin ids with the ".contigs" part removed
df_bickhart_checkm["genome_id"] = df_bickhart_checkm["Bin Id"].map(lambda bin_id: bin_id.replace(".contigs", ""))
df_bickhart_checkm = df_bickhart_checkm.set_index("genome_id", drop=False)
# same HQ thresholds as for the published metrics: completeness above 90, contamination below 5
complete = df_bickhart_checkm["Completeness"].gt(90)
contam = df_bickhart_checkm["Contamination"].lt(5)
df_bickhart_checkm_HQ = df_bickhart_checkm[complete & contam]
print(f"After running CheckM on the {len(df_bickhart_checkm)} bins, we get {len(df_bickhart_checkm_HQ)} HQ MAGs.")

# Where do the tools disagree?
# Compare the two HQ id sets directly rather than provoking a KeyError and parsing
# its message, which only works for one particular wording of the pandas error.
checkm_hq_ids = df_bickhart_checkm_HQ.index
# ids that are HQ by CheckM but absent from the DAS Tool HQ set, in CheckM table order
checkm_only_ids = checkm_hq_ids[~checkm_hq_ids.isin(df_bickhart_HQ.index)].unique().tolist()
if checkm_only_ids:
    print("These bins are classified as HQ in CheckM, but not in DAStools:")
    print(checkm_only_ids)

File ‘../data/41587_2021_1130_MOESM3_ESM.xlsx’ already there; not retrieving.

Bickhart et al has a total of 695 bins. Based on their methods, 353 are HQ.
After running CheckM on the 695 bins, we get 286 HQ MAGs.
These bins are classified as HQ in CheckM, but not in DAStools:
['bin3c.119', 'bin3c.136', 'bin3c.169', 'bin3c.172', 'bin3c.181', 'bin3c.182', 'bin3c.183', 'bin3c.191', 'bin3c.212', 'bin3c.214', 'bin3c.219', 'bin3c.232', 'bin3c.239', 'bin3c.252', 'bin3c.254', 'bin3c.26', 'bin3c.266', 'bin3c.274', 'bin3c.276', 'bin3c.292', 'bin3c.345', 'bin3c.347', 'bin3c.396', 'bin3c.400', 'bin3c.42', 'bin3c.426', 'bin3c.460', 'bin3c.465', 'bin3c.484', 'bin3c.490', 'bin3c.497', 'bin3c.543', 'bin3c.623', 'bin3c.74', 'bin3c.90', 'bin3c.94']


In [161]:
# Build the BGCflow config table from the CheckM-HQ bins:
# template columns first, then the CheckM columns
df_bickhart = pd.concat([df_bgcflow_template, df_bickhart_checkm_HQ])
# move genome_id to the front; every other column keeps its position
df_bickhart.insert(0, "genome_id", df_bickhart.pop("genome_id"))
df_bickhart["source"] = "custom"

# TODO: assign GTDB taxonomy; the matching GTDB-Tk rows can be listed with
# [i for i in df_gtdb.user_genome if i.startswith("bin")]

In [None]:
# Write the Bickhart config table without the index column
bickhart_table_file = external_table_path / "df_bickhart_checkm_HQ.csv"
df_bickhart.to_csv(bickhart_table_file, index=False)

The selected genomes are then copied to the BGCflow directory for downstream analyses:

In [6]:
# Set up paths
bickhart_raw_path = Path("../data/b3c_hifi_dastool/")
bickhart_data_path = external_data_path / "Bickhart_et_al"
bickhart_data_path.mkdir(parents=True, exist_ok=True)

# Copy the selected bins to the target dir as uncompressed <genome_id>.fna files
bickhart_bins_path = bickhart_raw_path / "flye4.das_DASTool_bins"
for genome_id in df_bickhart.loc[:, "genome_id"]:
    dest = bickhart_data_path / f"{genome_id}.fna"
    if dest.is_file():
        continue  # already copied on a previous run

    gzipped = bickhart_bins_path / f"{genome_id}.contigs.fa.gz"
    plain = bickhart_bins_path / f"{genome_id}.contigs.fa"
    if gzipped.is_file():
        # decompress while copying
        with gzip.open(gzipped, "rb") as f_in, open(dest, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
    elif plain.is_file():
        shutil.copy(plain, dest)
    else:
        # name the missing genome instead of failing on a bare assert
        raise FileNotFoundError(
            f"No contigs file found for {genome_id}: tried {gzipped} and {plain}"
        )

# Dataset from Liu et al (557 genomes)
The dataset was kindly provided by the authors here: https://www.dropbox.com/sh/qj4aginbflqmxhq/AAB5INvLNKrGiBtjpRbteYJ7a. The dataset is then downloaded using wget:

In [None]:
# MAG set for the Liu paper (557)
! wget -P ../data https://www.dropbox.com/sh/qj4aginbflqmxhq/AAB5INvLNKrGiBtjpRbteYJ7a?dl=0 -nc
! mv ../data/AAB5INvLNKrGiBtjpRbteYJ7a\?dl\=0 ../data/Liu_et_al.zip
! unzip ../data/Liu_et_al.zip

The metadata for the genomes are then cleaned to create a list of genomes that will be used in the analysis:

In [176]:
# Download supplementary materials from the paper
! wget -P ../data https://static-content.springer.com/esm/art%3A10.1186%2Fs40168-021-01155-1/MediaObjects/40168_2021_1155_MOESM2_ESM.xlsx -nc

# Clean Up
df_liu = pd.read_excel("../data/40168_2021_1155_MOESM2_ESM.xlsx", sheet_name=2, skiprows=0)
print(f"Liu et al has {len(df_liu)} MAGs.")

# Filter for HQ only
complete = df_liu.loc[:, "Completeness (%)"] > 90
contam = df_liu.loc[:, "Contamination (%)"] < 5
df_liu_HQ = df_liu.loc[complete & contam]
print(f"Out of these, {len(df_liu_HQ)} is HQ")

df_liu_HQ = pd.concat([df_bgcflow_template, df_liu_HQ])
df_liu_HQ.insert(0, 'genome_id', df_liu_HQ.loc[:, "MAGs"])
df_liu_HQ = df_liu_HQ.drop(columns=["MAGs"])
df_liu_HQ.loc[:, "source"] = "custom"
df_liu_HQ.to_csv(external_table_path / "df_liu_HQ.csv", index=None)
# ! for f in *.fasta; do mv -- "$f" "${f%.fasta}.fna"; done

File ‘../data/40168_2021_1155_MOESM2_ESM.xlsx’ already there; not retrieving.

Liu et al has 557 MAGs.
Out of these, 153 is HQ


# Dataset from Christoph et al (73 genomes)
The dataset from Christoph et al is publicly available from NCBI: https://www.ncbi.nlm.nih.gov/bioproject/?term=PRJNA449266. The assembly details file `PRJNA449266_AssemblyDetails.txt`, which contains the assembly accessions of the study, was downloaded; the genomes can then be downloaded using ncbi-genome-download.

HQ-MAGs are selected from the dataset with this definition:
> 'High-quality draft' will indicate that a SAG or MAG is >90% complete with less than 5% contamination

The metadata for the genomes are then cleaned to create a list of genomes that will be used in the analysis:

In [180]:
# Download supplementary materials from the paper
! wget -P ../data https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-018-0207-y/MediaObjects/41586_2018_207_MOESM3_ESM.xlsx -nc
df_christoph = pd.read_excel("../data/41586_2018_207_MOESM3_ESM.xlsx", sheet_name=0, skiprows=0)

# filter for HQ
a = df_christoph.loc[:, "CheckM Completeness %"] > 90
b = df_christoph.loc[:, "CheckM Contamination %"] < 5
df_christoph = df_christoph[a & b]

# Get NCBI accession
df_christoph.loc[:, "ncbi_mapping"] = [i.split("_", 1)[-1].replace("_", " ") for i in df_christoph.loc[:, "Genome"]]
need_cleaning = df_christoph.loc[:, "ncbi_mapping"].str.startswith("unk")
df_christoph.loc[df_christoph[need_cleaning].index, "ncbi_mapping"] = [i.split(" ")[-1] for i in df_christoph[need_cleaning].loc[:, "ncbi_mapping"]]
df_christoph_ncbi = pd.read_csv(file_christoph_ncbi, sep="\t", skiprows=1, index_col=False)
df_christoph_ncbi_subset = df_christoph_ncbi[df_christoph_ncbi.loc[:, "Isolate"].isin(df_christoph.loc[:, "ncbi_mapping"])]
df_christoph = df_christoph.merge(df_christoph_ncbi_subset, how="left", left_on="ncbi_mapping", right_on="Isolate")
df_christoph = pd.concat([df_bgcflow_template, df_christoph])
df_christoph.insert(0, 'genome_id', df_christoph.loc[:, "# Assembly"])
df_christoph = df_christoph.drop(columns=["# Assembly"])
df_christoph.loc[:, "source"] = "ncbi"
df_christoph.to_csv("../tables/other_studies/df_christoph.csv", index=None)

File ‘../data/41586_2018_207_MOESM3_ESM.xlsx’ already there; not retrieving.



Some of these genomes are already used as GTDB references. These genomes are excluded from the GTDB-Tk classify workflow, because they would otherwise cause an error:

In [3]:
# Parse the GTDB-Tk warnings log as space-separated fields (first line skipped) and
# write the field at position 3 of every line to a plain list, one entry per line.
# NOTE(review): presumably this field is the genome name -- verify against the log format.
gtdb_warnings = pd.read_csv(file_gtdb_warning, skiprows=1, header=None, sep=" ")
gtdb_warnings.loc[:, 3].to_csv("../tables/filter_gtdb.txt", header=False, index=False)

# Dataset from Sharrar et al (374 genomes)

The dataset was kindly provided by the authors here: https://figshare.com/ndownloader/files/18105260. The dataset is then downloaded using wget:

In [None]:
! wget -P ../data https://figshare.com/ndownloader/files/18105260 -nc
! mv ../data/18105260 ../data/1334_genomes.tar.gz # then untar

The metadata for the genomes are then cleaned to create a list of genomes that will be used in the analysis:

In [53]:
# Download supplementary materials from the paper
! wget -P ../data https://journals.asm.org/doi/suppl/10.1128/mBio.00416-20/suppl_file/mbio.00416-20-st002.xlsx -nc

# Clean Up
df_sharrar = pd.read_excel("../data/mbio.00416-20-st002.xlsx", sheet_name=0, skiprows=0)
df_sharrar = df_sharrar[df_sharrar.loc[:, "MAG draft quality"] == "High"]
df_sharrar = pd.concat([df_bgcflow_template, df_sharrar])
df_sharrar.insert(0, 'genome_id', df_sharrar.loc[:, "Genome name"])
df_sharrar = df_sharrar.drop(columns=["Genome name"])
df_sharrar.loc[:, "source"] = "custom"
df_sharrar.to_csv("../tables/other_studies/df_sharrar.csv", index=None)

The selected genomes are then copied to the BGCflow directory for downstream analyses:

In [85]:
# Copy the selected genomes into the BGCflow directory, renaming <name>.contigs.fa to <name>.fna
sharrar_raw_path = Path("../data/1334_genomes")
sharrar_data_path = external_data_path / "Sharrar_et_al"
sharrar_data_path.mkdir(parents=True, exist_ok=True)
for genome_name in df_sharrar["genome_id"]:
    shutil.copy(
        sharrar_raw_path / f"{genome_name}.contigs.fa",
        sharrar_data_path / f"{genome_name}.fna",
    )

# Downloading dataset from Chen et al
The dataset for Chen et al is available from: https://api.mg-rast.org/project/mgp81948?verbosity=full. The genomes can then be fetched using the MG-RAST API:

In [None]:
def get_api(api_url, output, timeout=60):
    """Fetch a JSON document from `api_url` and save it to `output`.

    Parameters
    ----------
    api_url : str
        URL whose response body is JSON.
    output : str or Path
        File the decoded JSON is written to; its parent directory is created if missing.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status; nothing is written in that case.
    """
    response = requests.get(api_url, timeout=timeout)
    # Fail loudly on an error status instead of saving the error payload as metadata
    response.raise_for_status()
    project_metadata = response.json()
    Path(output).parent.mkdir(parents=True, exist_ok=True)
    with open(output, "w") as outfile:
        json.dump(project_metadata, outfile)

# Save the full MG-RAST record of project mgp81948 for the following cells
get_api(
    api_url='https://api.mg-rast.org/project/mgp81948?verbosity=full',
    output='../data/mgp81948.json',
)

In [None]:
with open('../data/mgp81948.json', "r") as f:
    study_metadata = json.load(f)
    ids = [i["metagenome_id"] for i in study_metadata["metagenomes"]]
    for mg_id in ids:
        url = f"https://api.mg-rast.org/download/{mg_id}?file=299.1"
        ! wget -O ../data/chen/{mg_id}.fna {url}

The metadata for the genomes can be accessed from https://www.frontiersin.org/articles/10.3389/fmicb.2020.01950/full#supplementary-material. The metadata are then cleaned to create a list of genomes that will be used in the analysis:

In [181]:
# Load the supplementary table of the paper from its local copy (the first five rows are skipped)
df_chen_metadata = pd.read_excel(file_chen_metadata, sheet_name=0, skiprows=5)

# One row per entry in the "metagenomes" list of the saved MG-RAST project record
with open("../data/mgp81948.json", "r") as file:
    chen_metadata = json.load(file)
df_chen = pd.DataFrame(list(chen_metadata["metagenomes"]))

# Format to BGCflow config: the metagenome id becomes the leading genome_id column
df_chen = pd.concat([df_bgcflow_template, df_chen])
df_chen.insert(0, "genome_id", df_chen.pop("metagenome_id"))
df_chen["source"] = "custom"
df_chen.to_csv("../tables/other_studies/df_chen.csv", index=False)