In [None]:
from Bio import SeqIO
from pathlib import Path
import json

def sanitize_chen_et_al(input_file, output_dir):
    """
    Rewrite a FASTA file with short, sequential contig ids.

    Every record read from `input_file` gets the id `<stem>_<n>`, where
    `<stem>` is the input file name without its suffix and `n` counts the
    records starting at 1. Only `record.id` is replaced; nothing else on the
    record is modified before it is written.

    Two files are written into `output_dir`:
      * `<stem>.fna`  - the records with their new ids, in FASTA format
      * `<stem>.json` - a mapping of old contig id -> new contig id
        (if two records share an old id, the later one wins in the mapping)

    Parameters
    ----------
    input_file : str or Path
        FASTA file to sanitize.
    output_dir : str or Path
        Directory that receives the two output files. It is not created here.

    Raises
    ------
    ValueError
        If the output FASTA would be the same file as `input_file`; opening
        it for writing would truncate the input before it is read.
    """
    original_file = Path(input_file)
    corrected_file = Path(output_dir) / f"{original_file.stem}.fna"

    # Refuse to overwrite the input: open(..., 'w') below truncates its target.
    if corrected_file.resolve() == original_file.resolve():
        raise ValueError(
            f"Output file {corrected_file} is the same as the input file; "
            "choose a different output_dir."
        )

    log_dict = {}

    with open(original_file) as original, open(corrected_file, 'w') as corrected:
        # Parse from the handle that is already open rather than re-opening the path.
        for ctr, record in enumerate(SeqIO.parse(original, 'fasta'), start=1):
            old_id = record.id
            new_id = f"{original_file.stem}_{ctr}"
            record.id = new_id
            SeqIO.write(record, corrected, 'fasta')

            log_dict[old_id] = new_id

    with open(Path(output_dir) / f"{original_file.stem}.json", "w") as out_file:
        json.dump(log_dict, out_file, indent = 4)
    return

Apparently, the contig names from Sharrar and Chen need to be sanitized.

In [None]:
# Input directory with the Chen et al. FASTA files and the directory that
# receives the sanitized copies; the latter is created if it is missing.
chen_path = Path("/datadrive/bgcflow/data/external/Chen_et_al/")
chen_corrected_path = Path("/datadrive/bgcflow/data/external/Chen_et_al_sanitized/")
chen_corrected_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Sanitize every *.fna file found directly inside the Chen et al. directory.
for fasta_file in chen_path.glob("*.fna"):
    sanitize_chen_et_al(fasta_file, chen_corrected_path)

In [104]:
# Input directory with the Sharrar et al. FASTA files and the directory that
# receives the sanitized copies; the latter is created if it is missing.
sharrar_path = Path("/datadrive/bgcflow/data/external/Sharrar_et_al/")
sharrar_corrected_path = Path("/datadrive/bgcflow/data/external/Sharrar_et_al_sanitized/")
sharrar_corrected_path.mkdir(parents=True, exist_ok=True)

In [105]:
# Sample id = file name without the .fna suffix.
sample_ids = [fasta_file.stem for fasta_file in sharrar_path.glob("*.fna")]

# Which numbers of "_"-separated fields occur among the sample ids?
print({len(sample_id.split("_")) for sample_id in sample_ids})

{2, 3, 4, 8, 9, 10, 14}


In [106]:
# Build, per field count, a mapping of sample id -> shortened id.
# The shortened id is the sample id without its leading fields; how many
# leading fields are dropped depends on the number of "_"-separated fields.
def _drop_leading_fields(ids, n_fields, n_dropped):
    """Map each id with exactly `n_fields` fields to the id minus its first `n_dropped` fields."""
    shortened = {}
    for sample_id in ids:
        fields = sample_id.split("_")
        if len(fields) == n_fields:
            shortened[sample_id] = "_".join(fields[n_dropped:])
    return shortened


type_2 = _drop_leading_fields(sample_ids, 2, 1)
type_3 = _drop_leading_fields(sample_ids, 3, 1)
type_4 = _drop_leading_fields(sample_ids, 4, 2)
type_8 = _drop_leading_fields(sample_ids, 8, 5)
type_9 = _drop_leading_fields(sample_ids, 9, 6)
type_10 = _drop_leading_fields(sample_ids, 10, 7)
type_14 = _drop_leading_fields(sample_ids, 14, 11)

In [107]:
# Merge the per-type mappings into one lookup: sample id -> shortened id.
unique_ids = [type_2, type_3, type_4, type_8, type_9, type_10, type_14]
sample_id_dicts = {
    sample_id: short_id
    for mapping in unique_ids
    for sample_id, short_id in mapping.items()
}

In [108]:
# True when no two sample ids were shortened to the same value.
len(set(sample_id_dicts.values())) == len(sample_id_dicts)

True

In [109]:
# Rewrite every Sharrar et al. FASTA file under its shortened genome id.
# For each input file this writes <new_genome_id>.fna, whose record ids are
# <new_genome_id>_<n> (n counts records from 1), and <new_genome_id>.json,
# which maps each old contig id to its new id.
output_dir = sharrar_corrected_path
input_files = list(sharrar_path.glob("*.fna"))
for original_file in input_files:
    new_genome_id = sample_id_dicts[original_file.stem]
    corrected_file = Path(output_dir) / f"{new_genome_id}.fna"

    log_dict = {}

    with open(original_file) as original, open(corrected_file, 'w') as corrected:
        # Parse from the handle that is already open rather than re-opening the path.
        for ctr, record in enumerate(SeqIO.parse(original, 'fasta'), start=1):
            old_id = record.id
            new_id = f"{new_genome_id}_{ctr}"
            record.id = new_id
            SeqIO.write(record, corrected, 'fasta')

            log_dict[old_id] = new_id

    with open(Path(output_dir) / f"{new_genome_id}.json", "w") as out_file:
        json.dump(log_dict, out_file, indent = 4)

In [110]:
import pandas as pd

In [112]:
# Load the Sharrar et al. table; later cells expect a "genome_id" column in it.
df_sharrar = pd.read_csv("/datadrive/bgcflow/config/Sharrar_et_al/df_sharrar_bacteria.csv")

In [117]:
# Keep the original ids under a new column name so "genome_id" can hold the
# shortened ids.
df_sharrar = df_sharrar.rename(columns={"genome_id":"original_genome_id"})

In [120]:
# Fill "genome_id" with the shortened id of every row in a single column
# assignment instead of enlarging the frame cell by cell with .loc.
# Indexing the dictionary directly still raises KeyError when an original id
# has no entry in sample_id_dicts.
df_sharrar["genome_id"] = [
    sample_id_dicts[original_id] for original_id in df_sharrar["original_genome_id"]
]

In [122]:
# Move 'genome_id' to the first position: pop() removes the column and
# returns it, insert() puts it back at position 0.
df_sharrar.insert(0, 'genome_id', df_sharrar.pop('genome_id'))

In [143]:
# Write the table without the row index (index=False is the documented way to omit it).
df_sharrar.to_csv("/datadrive/bgcflow/config/Sharrar_et_al/df_sharrar_bacteria_sanitized.csv", index=False)

In [132]:
# Load the tab-separated summary table and keep the original genome names
# under a new column name so "user_genome" can hold the shortened ids.
df_gtdb = (
    pd.read_csv("/datadrive/bgcflow/config/gtdbtk.bac120.summary_christoph_filtered.tsv", sep="\t")
    .rename(columns={"user_genome":"original_user_genome"})
)

In [134]:
# Show the original genome names.
df_gtdb["original_user_genome"]

0             14_0903_02_20cm_Proteobacteria_54_67_14
1       14_0903_02_30cm_Rickettsiella_grylli_159_36_9
2        14_0903_02_30cm_Sphingobacteriales_157_38_13
3         14_0903_02_30cm_Sphingobacteriales_165_43_8
4          14_0903_02_30cm_Sphingomonadales_156_68_15
                            ...                      
1260                                     mgm4762960.3
1261                                     mgm4762961.3
1262                                     mgm4762962.3
1263                                     mgm4762963.3
1264                                     mgm4762965.3
Name: original_user_genome, Length: 1265, dtype: object

In [135]:
# Fill "user_genome" in a single column assignment: use the shortened id when
# the original name has an entry in sample_id_dicts, otherwise keep the
# original name unchanged.
df_gtdb["user_genome"] = [
    sample_id_dicts.get(original_name, original_name)
    for original_name in df_gtdb["original_user_genome"]
]

In [137]:
# Move 'user_genome' to the first position: pop() removes the column and
# returns it, insert() puts it back at position 0.
df_gtdb.insert(0, 'user_genome', df_gtdb.pop('user_genome'))

In [141]:
# Write the table tab-separated and without the row index (index=False is the
# documented way to omit it).
df_gtdb.to_csv("/datadrive/bgcflow/config/gtdbtk.bac120.summary_christoph_filtered_sharrar_sanitized.tsv", sep="\t", index=False)