# Additional datasets from other studies
This notebook describes the methods to access the genomes from other studies used in the analysis

## Table of Contents
* [Dataset from Bickhart et al (695 MAGs, 286 HQ, 284 Bacteria)](#bickhart)
* [Dataset from Liu et al (557 MAGs, 153 HQ, 153 Bacteria)](#liu)
* [Dataset from Christoph et al (376 MAGs, 73 HQ, 73 Bacteria)](#christoph)
* [Dataset from Sharrar et al (1334 MAGs, 374 HQ, 350 Bacteria)](#sharrar)
    * Sanitize Contig Names
* [Dataset from Chen et al (96 MAGs, 24 HQ, 21 Bacteria)](#chen)
    * Sanitize Contig Names

In [1]:
# Please install these libraries first
import pandas as pd
import json
from pathlib import Path
import tarfile
import gzip
import shutil
import requests, json
import ast

from Bio import SeqIO

In [2]:
# Set up paths
## BGCflow directory that receives the genome files of the external studies
external_data_path = Path("../..") / "bgcflow" / "data" / "external"

## Folder for the per-study tables; created on first use
external_table_path = Path("..") / "tables" / "other_studies_HQ"
external_table_path.mkdir(parents=True, exist_ok=True)

## Empty frame that carries the column layout of a BGCflow config
bgcflow_config_columns = [
    "source",
    "organism",
    "genus",
    "species",
    "strain",
    "closest_placement_reference",
]
df_bgcflow_template = pd.DataFrame(columns=bgcflow_config_columns)

In [3]:
# External files required to run the notebook
# TODO: make these files available on GitHub
supplementary_dir = Path("../data/supplementary_tables")
pre_analysis_dir = Path("../data/pre_analysis_tables")

# Supplementary files that are not obvious to fetch with wget
file_christoph_ncbi = supplementary_dir / "PRJNA449266_AssemblyDetails.txt"
file_chen_metadata = supplementary_dir / "Table_1_Discovery of an Abundance of Biosynthetic Gene Clusters in Shark Bay Microbial Mats.XLSX"

# Manual run - GTDB runs v202
file_gtdb = pre_analysis_dir / "gtdb.all_hqmags_studies.tsv"
file_gtdb_warning = pre_analysis_dir / "gtdbtk.warnings.log"

# Manual run - CheckM
file_bickhart_checkm = pre_analysis_dir / "Bickhart_et_al_checkm_out.tsv"  # CheckM runs of all bins
file_chen_checkm = pre_analysis_dir / "Chen_et_al_df_checkm_stats.csv"

In [4]:
# GTDB table, indexed by genome id; the id is also kept as a regular column
df_gtdb = pd.read_csv(file_gtdb, sep="\t").set_index("user_genome", drop=False)

In [5]:
# Preview the GTDB table (indexed by `user_genome`)
df_gtdb

Unnamed: 0_level_0,user_genome,original_user_genome,classification,fastani_reference,fastani_reference_radius,fastani_taxonomy,fastani_ani,fastani_af,closest_placement_reference,closest_placement_radius,...,closest_placement_ani,closest_placement_af,pplacer_taxonomy,classification_method,note,"other_related_references(genome_id,species_name,radius,ANI,AF)",msa_percent,translation_table,red_value,warnings
user_genome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54_67_14,54_67_14,14_0903_02_20cm_Proteobacteria_54_67_14,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,GCA_005878835.1,95.0,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,99.99,1.0,GCA_005878835.1,95.0,...,99.99,1.0,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,taxonomic classification defined by topology a...,topological placement and ANI have congruent s...,"GCA_005878095.1, s__13-2-20CM-66-19 sp00587809...",95.79,11,,
159_36_9,159_36_9,14_0903_02_30cm_Rickettsiella_grylli_159_36_9,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,GCA_005877695.1,95.0,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,99.98,1.0,GCA_005877695.1,95.0,...,99.98,1.0,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,taxonomic classification defined by topology a...,topological placement and ANI have congruent s...,"GCF_001881495.1, s__Rickettsiella_A isopodorum...",95.59,11,,
157_38_13,157_38_13,14_0903_02_30cm_Sphingobacteriales_157_38_13,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,GCA_005882315.1,95.0,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,100.00,1.0,GCA_005882315.1,95.0,...,100.00,1.0,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,taxonomic classification defined by topology a...,topological placement and ANI have congruent s...,"GCA_005882975.1, s__VBAS01 sp005882975, 95.0, ...",96.72,11,,
165_43_8,165_43_8,14_0903_02_30cm_Sphingobacteriales_165_43_8,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,GCA_005884665.1,95.0,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,100.00,1.0,GCA_005884665.1,95.0,...,100.00,1.0,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,taxonomic classification defined by topology a...,topological placement and ANI have congruent s...,,88.70,11,,
156_68_15,156_68_15,14_0903_02_30cm_Sphingomonadales_156_68_15,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,GCA_005883305.1,95.0,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,100.00,1.0,GCA_005883305.1,95.0,...,100.00,1.0,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,taxonomic classification defined by topology a...,topological placement and ANI have congruent s...,"GCF_007995055.1, s__Allosphingosinicella ginse...",97.00,11,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
bin3c.78,bin3c.78,,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,,,,,,GCA_002298075.1,95.0,...,78.08,0.1,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,taxonomic classification defined by topology a...,,,93.65,11,0.957236,
bin3c.79,bin3c.79,,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o_...,,,,,,,,...,,,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o_...,ANI,,"GCA_900540045.1, s__Firm-11 sp900540045, 95.0,...",92.30,11,0.942301,
bin3c.91,bin3c.91,,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o_...,,,,,,,,...,,,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o_...,taxonomic classification fully defined by topo...,,,93.33,11,0.864885,
bin3c.92,bin3c.92,,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o_...,,,,,,,,...,,,d__Bacteria;p__Firmicutes_A;c__Clostridia_A;o_...,taxonomic novelty determined using RED,,,94.92,11,0.894587,


## Dataset from Bickhart et al (695 MAGs, 286 HQ, 284 Bacteria) <a class="anchor" id="bickhart"></a>
The dataset can be accessed from https://zenodo.org/record/5138306/files/hifi_das.bin3c.bins.tar.gz. The dataset is then downloaded using wget:

In [6]:
# MAG set for the Bickhart paper (464)
Path("../data/raw_sequences").mkdir(parents=True, exist_ok=True)
! wget -O ../data/raw_sequences/hifi_das.bin3c.bins.tar.gz https://zenodo.org/record/5138306/files/hifi_das.bin3c.bins.tar.gz?download=1 -nc
tar = tarfile.open("../data/raw_sequences/hifi_das.bin3c.bins.tar.gz")
tar.extractall(path="../data/raw_sequences/")
tar.close()
#! rm ../data/raw_sequences/hifi_das.bin3c.bins.tar.gz

# rename folder
bickhart_raw_path = Path("../data/raw_sequences/b3c_hifi_dastool/")
try:
    bickhart_raw_path.rename("../data/raw_sequences/Bickhart_et_al/")
except FileExistsError:
    pass

File ‘../data/raw_sequences/hifi_das.bin3c.bins.tar.gz’ already there; not retrieving.


The metadata for the genomes are then cleaned to create a list of genomes that will be used in the analysis:

In [7]:
# Download supplementary materials from the paper
! wget -P ../data/supplementary_tables/ https://static-content.springer.com/esm/art%3A10.1038%2Fs41587-021-01130-z/MediaObjects/41587_2021_1130_MOESM3_ESM.xlsx -nc

# EDA - How many HQ MAGs based on DAStool definition?
df_bickhart = pd.read_excel("../data/supplementary_tables/41587_2021_1130_MOESM3_ESM.xlsx", 'SupplementaryTable2')
df_bickhart.loc[:, "genome_id"] = [i.replace("_", ".") for i in df_bickhart.loc[:, 'HiFiBin']]
df_bickhart = df_bickhart.set_index("genome_id", drop=False)
complete = df_bickhart.loc[:, "HiFiBin Completeness"] > 90
contam = df_bickhart.loc[:, "HifiBin Contamination"] < 5
df_bickhart_HQ = df_bickhart.loc[complete & contam]
print(f"Bickhart et al has a total of {len(df_bickhart)} bins. Based on their methods, {len(df_bickhart_HQ)} are HQ.")

# CheckM was run on all bins as well (manual run). Number of HQ MAGs based on the CheckM definition:
df_bickhart_checkm = (
    pd.read_csv(file_bickhart_checkm, sep="\t")
    .assign(genome_id=lambda frame: [bin_id.replace(".contigs", "") for bin_id in frame.loc[:, "Bin Id"]])
    .set_index("genome_id", drop=False)
)
# HQ: completeness above 90 and contamination below 5
complete = df_bickhart_checkm.loc[:, "Completeness"].gt(90)
contam = df_bickhart_checkm.loc[:, "Contamination"].lt(5)
df_bickhart_checkm_HQ = df_bickhart_checkm.loc[complete & contam]
print(f"After running CheckM on the {len(df_bickhart_checkm)} bins, we get {len(df_bickhart_checkm_HQ)} HQ MAGs.")

# Where do the tools disagree?
# The ids are compared directly instead of parsing the text of a pandas KeyError:
# that message format is an implementation detail and changes when no key matches.
# Order of the CheckM table is kept and repeated ids are reported once.
checkm_hq_ids = df_bickhart_checkm_HQ.index
checkm_only_ids = checkm_hq_ids[~checkm_hq_ids.isin(df_bickhart_HQ.index)].unique()
if len(checkm_only_ids) > 0:
    print("These bins are classified as HQ in CheckM, but not in DAStools:")
    print(list(checkm_only_ids))
    
# Keep bacterial genomes only, judged by the GTDB classification string
print("\nFiltering for bacteria only genomes...")
bacterial_ids = []
for genome_id in df_bickhart_checkm_HQ.index:
    try:
        taxonomy = df_gtdb.loc[genome_id, "classification"]
    except KeyError as missing:
        # the genome has no row in the GTDB table
        print(f"Genome: {missing} is not in GTDB file")
        continue
    if taxonomy.startswith("d__Bacteria"):
        bacterial_ids.append(genome_id)
        continue
    print(f"Genome: {genome_id} is not bacteria.")
    print(taxonomy)
print(f"Found {len(bacterial_ids)}/{len(df_bickhart_checkm_HQ)} Bacterial Genomes")
df_bickhart_checkm_HQ = df_bickhart_checkm_HQ.loc[bacterial_ids]

File ‘../data/supplementary_tables/41587_2021_1130_MOESM3_ESM.xlsx’ already there; not retrieving.

Bickhart et al has a total of 695 bins. Based on their methods, 353 are HQ.
After running CheckM on the 695 bins, we get 286 HQ MAGs.
These bins are classified as HQ in CheckM, but not in DAStools:
['bin3c.119', 'bin3c.136', 'bin3c.169', 'bin3c.172', 'bin3c.181', 'bin3c.182', 'bin3c.183', 'bin3c.191', 'bin3c.212', 'bin3c.214', 'bin3c.219', 'bin3c.232', 'bin3c.239', 'bin3c.252', 'bin3c.254', 'bin3c.26', 'bin3c.266', 'bin3c.274', 'bin3c.276', 'bin3c.292', 'bin3c.345', 'bin3c.347', 'bin3c.396', 'bin3c.400', 'bin3c.42', 'bin3c.426', 'bin3c.460', 'bin3c.465', 'bin3c.484', 'bin3c.490', 'bin3c.497', 'bin3c.543', 'bin3c.623', 'bin3c.74', 'bin3c.90', 'bin3c.94']

Filtering for bacteria only genomes...
Genome: 'bin3c.221' is not in GTDB file
Genome: 'bin3c.602' is not in GTDB file
Found 284/286 Bacterial Genomes


In [8]:
# Format to BGCflow config: template columns followed by the CheckM columns
df_bickhart = pd.concat([df_bgcflow_template, df_bickhart_checkm_HQ])
# genome_id becomes the leading column
df_bickhart.insert(0, "genome_id", df_bickhart.pop("genome_id"))
df_bickhart.loc[:, "source"] = "custom"

In [9]:
# Write the Bickhart et al config table without the row index
bickhart_table_file = external_table_path / "df_bickhart_checkm_HQ.csv"
df_bickhart.to_csv(bickhart_table_file, index=None)

The selected genomes are then copied to the BGCflow directory for downstream analyses:

In [10]:
# Set Up Paths
bickhart_raw_path = Path("../data/raw_sequences/Bickhart_et_al/")
bickhart_data_path = external_data_path / "Bickhart_et_al"
bickhart_data_path.mkdir(parents=True, exist_ok=True)

# Copy the selected genomes to the target dir as plain <genome_id>.fna files
for genome_id in df_bickhart.loc[:, "genome_id"]:
    dest = bickhart_data_path / f"{genome_id}.fna"
    if dest.is_file():
        # already copied by an earlier run
        continue

    item = bickhart_raw_path / f"flye4.das_DASTool_bins/{genome_id}.contigs.fa.gz"
    if item.is_file():
        # gzipped bin: decompress while copying
        with gzip.open(item, 'rb') as f_in, open(dest, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        continue

    # otherwise the bin must be present uncompressed
    item = bickhart_raw_path / f"flye4.das_DASTool_bins/{genome_id}.contigs.fa"
    assert item.is_file()
    shutil.copy(item, dest)

## Dataset from Liu et al (557 MAGs, 153 HQ, 153 Bacteria) <a class="anchor" id="liu"></a>
The dataset was kindly provided by the authors here: https://www.dropbox.com/sh/qj4aginbflqmxhq/AAB5INvLNKrGiBtjpRbteYJ7a. The dataset is then downloaded using wget:

In [11]:
# MAG set for the Liu paper (557)
! wget -O ../data/raw_sequences/Liu_et_al.zip https://www.dropbox.com/sh/qj4aginbflqmxhq/AAB5INvLNKrGiBtjpRbteYJ7a?dl=0 -nc
! mkdir ../data/raw_sequences/Liu_et_al
! unzip -nq ../data/raw_sequences/Liu_et_al.zip  -d ../data/raw_sequences/Liu_et_al

File ‘../data/raw_sequences/Liu_et_al.zip’ already there; not retrieving.
mkdir: cannot create directory ‘../data/raw_sequences/Liu_et_al’: File exists
mapname:  conversion of  failed


The metadata for the genomes are then cleaned to create a list of genomes that will be used in the analysis:

In [12]:
# Download supplementary materials from the paper
! wget -P ../data/supplementary_tables/ https://static-content.springer.com/esm/art%3A10.1186%2Fs40168-021-01155-1/MediaObjects/40168_2021_1155_MOESM2_ESM.xlsx -nc

# Clean Up
df_liu = pd.read_excel("../data/supplementary_tables/40168_2021_1155_MOESM2_ESM.xlsx", sheet_name=2, skiprows=0)
print(f"Liu et al has {len(df_liu)} MAGs.")

# Filter for HQ only
complete = df_liu.loc[:, "Completeness (%)"] > 90
contam = df_liu.loc[:, "Contamination (%)"] < 5
df_liu_HQ = df_liu.loc[complete & contam]
print(f"Out of these, {len(df_liu_HQ)} is HQ")

df_liu_HQ = pd.concat([df_bgcflow_template, df_liu_HQ])
df_liu_HQ.insert(0, 'genome_id', df_liu_HQ.loc[:, "MAGs"])
df_liu_HQ = df_liu_HQ.drop(columns=["MAGs"])
df_liu_HQ.loc[:, "source"] = "custom"
df_liu_HQ.to_csv(external_table_path / "df_liu_checkm_HQ.csv", index=None)

File ‘../data/supplementary_tables/40168_2021_1155_MOESM2_ESM.xlsx’ already there; not retrieving.

Liu et al has 557 MAGs.
Out of these, 153 is HQ


In [13]:
# Set Up Paths
liu_raw_path = Path("../data/raw_sequences/Liu_et_al/")
liu_data_path = external_data_path / "Liu_et_al"
liu_data_path.mkdir(parents=True, exist_ok=True)

# Copy the selected genomes to the target dir as <genome_id>.fna.
# Genomes without a source FASTA are collected and reported instead of being
# skipped silently, so an incomplete download does not go unnoticed.
missing_liu_genomes = []
for genome_id in df_liu_HQ.loc[:, "genome_id"]:
    item = liu_raw_path / f"{genome_id}.fasta"
    dest = liu_data_path / f"{genome_id}.fna"
    if dest.is_file():
        # already copied by an earlier run
        continue
    if not item.is_file():
        missing_liu_genomes.append(genome_id)
        continue
    with open(item, 'rb') as f_in, open(dest, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

if missing_liu_genomes:
    print(f"WARNING: no FASTA file found in {liu_raw_path} for {len(missing_liu_genomes)} genome(s): {missing_liu_genomes}")

## Dataset from Christoph et al (376 MAGs, 73 HQ, 73 Bacteria) <a class="anchor" id="christoph"></a>
The dataset from Christoph et al is publicly available from NCBI: https://www.ncbi.nlm.nih.gov/bioproject/?term=PRJNA449266. The assembly details file `PRJNA449266_AssemblyDetails.txt`, which contains the assembly accessions of the study, was downloaded; the genomes can then be downloaded using ncbi-genome-download.

HQ-MAGs are selected from the dataset with this definition:
> 'High-quality draft' will indicate that a SAG or MAG is >90% complete with less than 5% contamination

The metadata for the genomes are then cleaned to create a list of genomes that will be used in the analysis:

In [14]:
# Download supplementary materials from the paper
! wget -P ../data/supplementary_tables/ https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-018-0207-y/MediaObjects/41586_2018_207_MOESM3_ESM.xlsx -nc
df_christoph = pd.read_excel("../data/supplementary_tables/41586_2018_207_MOESM3_ESM.xlsx", sheet_name=0, skiprows=0)
print(f"Christoph et al has: {len(df_christoph)} MAGs")

# filter for HQ
a = df_christoph.loc[:, "CheckM Completeness %"] > 90
b = df_christoph.loc[:, "CheckM Contamination %"] < 5
df_christoph = df_christoph[a & b]
print(f"Out of these, {len(df_christoph)} are HQ MAGs")

# Get NCBI accession
df_christoph.loc[:, "ncbi_mapping"] = [i.split("_", 1)[-1].replace("_", " ") for i in df_christoph.loc[:, "Genome"]]
need_cleaning = df_christoph.loc[:, "ncbi_mapping"].str.startswith("unk")
df_christoph.loc[df_christoph[need_cleaning].index, "ncbi_mapping"] = [i.split(" ")[-1] for i in df_christoph[need_cleaning].loc[:, "ncbi_mapping"]]
df_christoph_ncbi = pd.read_csv(file_christoph_ncbi, sep="\t", skiprows=1, index_col=False)
df_christoph_ncbi_subset = df_christoph_ncbi[df_christoph_ncbi.loc[:, "Isolate"].isin(df_christoph.loc[:, "ncbi_mapping"])]
df_christoph = df_christoph.merge(df_christoph_ncbi_subset, how="left", left_on="ncbi_mapping", right_on="Isolate")
df_christoph = df_christoph.set_index("# Assembly", drop=False)

# Format to BGCflow config
df_christoph = pd.concat([df_bgcflow_template, df_christoph])
df_christoph.insert(0, 'genome_id', df_christoph.loc[:, "# Assembly"])
df_christoph = df_christoph.drop(columns=["# Assembly"])
df_christoph.loc[:, "source"] = "ncbi"
df_christoph.to_csv(external_table_path / "df_christoph_checkm_HQ.csv", index=None)

File ‘../data/supplementary_tables/41586_2018_207_MOESM3_ESM.xlsx’ already there; not retrieving.

Christoph et al has: 376 MAGs
Out of these, 73 are HQ MAGs


PS: Some of these genomes are already used as GTDB references. These genomes are excluded from the GTDB-Tk classify workflow because they would cause an error:

In [15]:
# Read the space-separated GTDB-Tk warnings log (first line skipped) and write
# its column 3 to a plain list file without header or index.
# NOTE(review): column 3 presumably holds the genome id - confirm against the log.
gtdb_warnings = pd.read_csv(file_gtdb_warning, skiprows=1, header=None, sep=" ")
gtdb_warning_column = gtdb_warnings.loc[:, 3]
gtdb_warning_column.to_csv("../tables/filter_gtdb.txt", header=None, index=None)

## Dataset from Sharrar et al (1334 MAGs, 374 HQ, 350 Bacteria) <a class="anchor" id="sharrar"></a>

The dataset was kindly provided by the authors here: https://figshare.com/ndownloader/files/18105260. The dataset is then downloaded using wget:

In [16]:
! wget -O ../data/raw_sequences/1334_genomes.tar.gz https://figshare.com/ndownloader/files/18105260 -nc
tar = tarfile.open("../data/raw_sequences/1334_genomes.tar.gz")
tar.extractall(path="../data/raw_sequences/")
tar.close()

# rename folder
sharrar_raw_path = Path("../data/raw_sequences/1334_genomes/")
try:
    sharrar_raw_path.rename("../data/raw_sequences/Sharrar_et_al/")
except FileExistsError:
    pass

File ‘../data/raw_sequences/1334_genomes.tar.gz’ already there; not retrieving.


The metadata for the genomes are then cleaned to create a list of genomes that will be used in the analysis:

In [17]:
# Download supplementary materials from the paper
! wget -P ../data/supplementary_tables/ https://journals.asm.org/doi/suppl/10.1128/mBio.00416-20/suppl_file/mbio.00416-20-st002.xlsx -nc

# Clean Up
df_sharrar = pd.read_excel("../data/supplementary_tables/mbio.00416-20-st002.xlsx", sheet_name=0, skiprows=0)
print(f"Sharrar et al has {len(df_sharrar)} MAGs.")
df_sharrar = df_sharrar[df_sharrar.loc[:, "MAG draft quality"] == "High"]
print(f"Out of these, {len(df_sharrar)} are HQ.")

# Format to BGCflow
df_sharrar = pd.concat([df_bgcflow_template, df_sharrar])
df_sharrar.insert(0, 'genome_id', df_sharrar.loc[:, "Genome name"])
df_sharrar = df_sharrar.drop(columns=["Genome name"])
df_sharrar.loc[:, "source"] = "custom"
df_sharrar.to_csv(external_table_path / "df_sharrar.csv", index=None)

File ‘../data/supplementary_tables/mbio.00416-20-st002.xlsx’ already there; not retrieving.

Sharrar et al has 1334 MAGs.
Out of these, 374 are HQ.


The selected genomes are then copied to the BGCflow directory for downstream analyses:

In [18]:
# Copy every selected genome to the target dir as <genome_id>.fna
# (existing copies are overwritten)
sharrar_raw_path = Path("../data/raw_sequences/Sharrar_et_al")
sharrar_data_path = external_data_path / "Sharrar_et_al"
sharrar_data_path.mkdir(parents=True, exist_ok=True)
for genome_id in df_sharrar.loc[:, "genome_id"]:
    shutil.copy(
        sharrar_raw_path / f"{genome_id}.contigs.fa",
        sharrar_data_path / f"{genome_id}.fna",
    )

### Sanitize Genome and Contig Names
Apparently, the genome and contig ids from Sharrar et al need to be sanitized because of their length.

Here, we will attempt to shorten the ids by extracting unique names from the original ids.

In [19]:
# Prepare path for correction
sharrar_path = external_data_path / "Sharrar_et_al/"
sharrar_corrected_path = external_data_path / "Sharrar_et_al_sanitized/"
sharrar_corrected_path.mkdir(parents=True, exist_ok=True)

print("Grabbing all ids from .fna files with glob...")
sample_ids = [i.stem for i in sharrar_path.glob("*.fna")]

# how many id variables are there?
id_variables = {len(i.split("_")) for i in sample_ids}
print(f"There are {len(id_variables)} different variation of ids, each with N number of keywords:")
print(id_variables)

print("Manually curating each id types...")
# Manually curated rule per id type:
#   number of "_"-separated keywords -> maxsplit used to cut off the leading part.
# The last element of i.split("_", maxsplit) is kept as the short id.
curated_maxsplit = {2: 1, 3: 1, 4: 2, 8: 5, 9: 6, 10: 7, 14: 11}

# Fail early with a clear message: an id type without a curated rule would
# otherwise only surface later as a bare KeyError during the renaming.
uncurated_ids = sorted(i for i in sample_ids if len(i.split("_")) not in curated_maxsplit)
if uncurated_ids:
    raise ValueError(f"No curated shortening rule for these ids: {uncurated_ids}")

print("Recording id changes as dict...")
# one dictionary: original id -> short id (filled id type by id type)
sample_id_dicts = {}
for n_keywords, maxsplit in curated_maxsplit.items():
    sample_id_dicts.update(
        {i: i.split("_", maxsplit)[-1] for i in sample_ids if len(i.split("_")) == n_keywords}
    )

# Two genomes that collapse onto the same short id would overwrite each
# other's sanitized file, so such clashes are reported.
seen_short_ids = set()
clashing_short_ids = set()
for short_id in sample_id_dicts.values():
    if short_id in seen_short_ids:
        clashing_short_ids.add(short_id)
    seen_short_ids.add(short_id)
if clashing_short_ids:
    print(f"WARNING: these short ids are shared by several genomes: {sorted(clashing_short_ids)}")

print("Correcting contig names based on dictionary..")
print("Changes are recorded as json files..")
# Rewrite every genome under its short id, numbering the contigs <short id>_1, <short id>_2, ...
output_dir = sharrar_corrected_path
input_files = list(sharrar_path.glob("*.fna"))

for original_file in input_files:
    new_genome_id = sample_id_dicts[original_file.stem]
    corrected_file = Path(output_dir) / f"{new_genome_id}.fna"

    # old contig id -> new contig id, saved next to the sanitized genome
    log_dict = {}

    # SeqIO.parse is given the input path, so only the output needs a handle here
    # NOTE(review): only record.id is replaced; record.description is left as
    # parsed, so the original name may still follow the new id in the FASTA
    # header - confirm this is acceptable downstream.
    with open(corrected_file, 'w') as corrected:
        for ctr, record in enumerate(SeqIO.parse(original_file, 'fasta'), start=1):
            old_id = record.id
            new_id = f"{new_genome_id}_{ctr}"
            record.id = new_id
            SeqIO.write(record, corrected, 'fasta')

            log_dict[old_id] = new_id

    with open(Path(output_dir) / f"{new_genome_id}.json", "w") as out_file:
        json.dump(log_dict, out_file, indent = 4)

print(f"Sanitized genomes are saved in {sharrar_corrected_path}")

Grabbing all ids from .fna files with glob...
There are 7 different variation of ids, each with N number of keywords:
{2, 3, 4, 8, 9, 10, 14}
Manually curating each id types...
Recording id changes as dict...
Correcting contig names based on dictionary..
Changes are recorded as json files..
Sanitized genomes are saved in ../../bgcflow/data/external/Sharrar_et_al_sanitized


In [20]:
# Update config files to the new ids
df_sharrar = pd.read_csv(external_table_path / "df_sharrar.csv")
df_sharrar = df_sharrar.rename(columns={"genome_id": "original_genome_id"})

# Look up all sanitized ids in one pass instead of assigning cell by cell;
# an original id without a sanitized counterpart still raises KeyError.
sanitized_ids = [sample_id_dicts[original_id] for original_id in df_sharrar.loc[:, "original_genome_id"]]

# the sanitized id becomes the leading column and the index
df_sharrar.insert(0, 'genome_id', sanitized_ids)
df_sharrar = df_sharrar.set_index("genome_id", drop=False)

In [21]:
# Keep the genomes that have a row in the GTDB table and attach their classification.
# The membership test replaces parsing the text of a pandas KeyError, which
# depends on the message format and left `classification` unassigned whenever
# every genome was present in the GTDB table.
# NOTE(review): presence in the GTDB table is what counts as "bacteria" here -
# confirm that the table only contains bacterial genomes for this study.
in_gtdb = df_sharrar.index.isin(df_gtdb.index)
df_sharrar = df_sharrar.loc[in_gtdb].copy()
df_sharrar.loc[:, "classification"] = df_gtdb.loc[df_sharrar.index, "classification"].to_numpy()
print(f"Out of the HQ MAGs, {len(df_sharrar)} are bacteria")

Out of the HQ MAGs, 350 are bacteria


In [22]:
# Write the filtered Sharrar et al config table without the row index
sharrar_table_file = external_table_path / "df_sharrar_checkm_HQ.csv"
df_sharrar.to_csv(sharrar_table_file, index=None)

## Dataset from Chen et al (96 MAGs, 24 HQ, 21 Bacteria) <a class="anchor" id="chen"></a>
The dataset for Chen et al is available from: https://api.mg-rast.org/project/mgp81948?verbosity=full. The genomes can then be fetched using the MG-RAST API:

In [23]:
def get_api(api_url, output, timeout=60):
    """
    Fetch a JSON document from an API endpoint and save it to a file.

    Parameters
    ----------
    api_url : str
        URL that is requested with HTTP GET.
    output : str or path-like
        File the decoded JSON is written to (overwritten if it exists).
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot block the notebook indefinitely.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx).
    """
    response = requests.get(api_url, timeout=timeout)
    # stop on HTTP errors instead of storing an error payload as metadata
    response.raise_for_status()
    # decode before opening the output so a bad response leaves any existing file untouched
    project_metadata = response.json()
    with open(output, 'w') as outfile:
        json.dump(project_metadata, outfile)
    return None

# Save the full MG-RAST project record as JSON next to the raw sequences
get_api(
    api_url='https://api.mg-rast.org/project/mgp81948?verbosity=full',
    output='../data/raw_sequences/mgp81948.json',
)

In [24]:
Path("../data/raw_sequences/Chen_et_al").mkdir(parents=True, exist_ok=True)
with open('../data/raw_sequences/mgp81948.json', "r") as f:
    study_metadata = json.load(f)
    ids = [i["metagenome_id"] for i in study_metadata["metagenomes"]]
    for mg_id in ids:
        file = Path(f"../data/raw_sequences/Chen_et_al/{mg_id}.fna")
        if file.is_file():
            pass
        else:
            url = f"https://api.mg-rast.org/download/{mg_id}?file=299.1"
            ! wget -O ../data/raw_sequences/Chen_et_al/{mg_id}.fna {url} -nc #2>> ../data/raw_sequences/Chen_et_al/download.log

The metadata for the genomes can be accessed from https://www.frontiersin.org/articles/10.3389/fmicb.2020.01950/full#supplementary-material. The metadata are then cleaned to create a list of genomes that will be used in the analysis:

In [25]:
# Supplementary table of the paper (read from the local copy)
df_chen_metadata = pd.read_excel(file_chen_metadata, sheet_name=0, skiprows=5)

# One row per metagenome of the MG-RAST project record
with open("../data/raw_sequences/mgp81948.json", "r") as file:
    chen_metadata = json.load(file)
df_chen = pd.DataFrame(chen_metadata["metagenomes"]).set_index("metagenome_id", drop=False)
print(f"Chen et al has {len(df_chen)} MAGs")

# Attach the CheckM statistics (manual run), matched on the genome id
df_chen_checkm = pd.read_csv(file_chen_checkm).set_index("genome_id")
df_chen = pd.merge(df_chen, df_chen_checkm, left_index=True, right_index=True)

# filter for HQ: completeness above 90 and contamination below 5
is_hq = (df_chen.loc[:, "Completeness"] > 90) & (df_chen.loc[:, "Contamination"] < 5)
df_chen = df_chen[is_hq]
print(f"Out of these, {len(df_chen)} are HQ MAGs")

# filter archaea: keep the genomes that have a row in the GTDB table and attach
# their classification. The membership test replaces parsing the text of a
# pandas KeyError, which depends on the message format; copy() makes the
# filtered frame independent before the new column is assigned.
# NOTE(review): presence in the GTDB table is what counts as "bacteria" here -
# confirm that the table only contains bacterial genomes for this study.
in_gtdb = df_chen.index.isin(df_gtdb.index)
df_chen = df_chen.loc[in_gtdb].copy()
df_chen.loc[:, "classification"] = df_gtdb.loc[df_chen.index, "classification"].to_numpy()
print(f"Out of the HQ MAGs, {len(df_chen)} are bacteria")

Chen et al has 96 MAGs
Out of these, 24 are HQ MAGs
Out of the HQ MAGs, 21 are bacteria


In [26]:
# Format to BGCflow: the metagenome id becomes the leading genome_id column
df_chen = pd.concat([df_bgcflow_template, df_chen])
df_chen.insert(0, 'genome_id', df_chen.pop("metagenome_id"))
df_chen.loc[:, "source"] = "custom"
chen_table_file = external_table_path / "df_chen_checkm_HQ.csv"
df_chen.to_csv(chen_table_file, index=None)

## Sanitize Contig Names
Apparently, the contig names from Chen et al need to be sanitized because of their length.

In [27]:
def sanitize_chen_et_al(input_file, output_dir):
    """
    Shorten the contig names of one FASTA file.

    Every record is renamed to ``<file stem>_<n>``, with ``n`` counting the
    records from 1 in file order. Two files are written to ``output_dir``:
    ``<file stem>.fna`` with the renamed records and ``<file stem>.json``
    mapping each original contig id to its new id.

    Parameters
    ----------
    input_file : str or path-like
        FASTA file to sanitize.
    output_dir : str or path-like
        Existing directory that receives the two output files.

    Returns
    -------
    None
    """
    original_file = Path(input_file)
    genome_id = original_file.stem
    corrected_file = Path(output_dir) / f"{genome_id}.fna"

    # original contig id -> shortened contig id
    log_dict = {}

    # SeqIO.parse is given the input path, so only the output needs a handle here
    with open(corrected_file, 'w') as corrected:
        for ctr, record in enumerate(SeqIO.parse(original_file, 'fasta'), start=1):
            old_id = record.id
            new_id = f"{genome_id}_{ctr}"
            record.id = new_id
            SeqIO.write(record, corrected, 'fasta')

            log_dict[old_id] = new_id

    with open(Path(output_dir) / f"{genome_id}.json", "w") as out_file:
        json.dump(log_dict, out_file, indent = 4)
    return

# Sanitize every downloaded Chen et al FASTA into the BGCflow data directory
chen_path = Path("../data/raw_sequences/Chen_et_al")
chen_corrected_path = external_data_path / "Chen_et_al_sanitized"
chen_corrected_path.mkdir(parents=True, exist_ok=True)

for chen_fasta in chen_path.glob("*.fna"):
    sanitize_chen_et_al(input_file=chen_fasta, output_dir=chen_corrected_path)

# Merge all tables

In [28]:
# List the per-study tables in the output folder
! ls ../tables/other_studies_HQ/

df_bickhart_checkm_HQ.csv  df_christoph_checkm_HQ.csv  df_sharrar.csv
df_chen_checkm_HQ.csv	   df_liu_checkm_HQ.csv        df_sharrar_checkm_HQ.csv


In [29]:
# Load every per-study table whose file name contains "HQ".
# Path.glob yields files in arbitrary order, so the paths are sorted to give
# the merged table the same row order on every run.
dfs_HQ = [pd.read_csv(i) for i in sorted(Path("../tables/other_studies_HQ/").glob("*HQ*"))]

In [30]:
# Bacterial genomes of Singleton et al, taken from the BGCflow config folder
singleton_table_file = Path("../../bgcflow/config/Singleton_et_al/df_singleton_bacteria.csv")
df_singleton = pd.read_csv(singleton_table_file)

In [31]:
# Add the Singleton et al table to the list exactly once: the identity check
# keeps a re-run of this cell from appending the same frame a second time.
if not any(table is df_singleton for table in dfs_HQ):
    dfs_HQ.append(df_singleton)

In [32]:
# Stack all study tables row-wise into one table
df_all_study = pd.concat(objs=dfs_HQ, axis=0)

In [33]:
# Write the merged table to the BGCflow config folder
# NOTE(review): unlike the per-study tables, this file is written with the row
# index (the concatenated per-table indices) - confirm this is intended.
all_studies_config_dir = Path("../../bgcflow/config/all_hqmags_studies")
all_studies_config_dir.mkdir(parents=True, exist_ok=True)
df_all_study.to_csv(all_studies_config_dir / "all_hqmags_studies_checkm_HQ.csv")