In [5]:
import pandas as pd
import os

In [6]:
# Get the coverage information
# The metrics table is tab-separated; this cell uses its 'name',
# 'input_read_bases' and 'consensus_assembly_bases' columns.
metrics = pd.read_csv("../../../../opa_diversity_snakemake/input_data/complete_genome_assemblies/grad_lab/metrics.tsv", sep = '\t')

# The assembly name is the second '/'-separated component of 'name'
metrics['ASSEMBLYNAME'] = metrics['name'].str.split('/', expand = True)[1]

# Rename isolates that had different names in manuscript
# (name in metrics.tsv -> name used in the submission manifest)
renamed_isolates = {
    'NZ2015-73': 'SRR5827236',
    'NZ2015-300': 'SRR5827322',
    'JJJ025': 'EEE024',
}
for old_name, new_name in renamed_isolates.items():
    # Boolean-mask assignment must go through .loc; .at only accepts a single
    # scalar row/column label pair and rejects a boolean Series as the row key.
    metrics.loc[metrics['ASSEMBLYNAME'] == old_name, 'ASSEMBLYNAME'] = new_name

# Calculate approximate coverage: bases read / bases in the consensus assembly,
# rounded to one decimal place
metrics['COVERAGE'] = (metrics['input_read_bases'] / metrics['consensus_assembly_bases']).round(1)

# Keep only the columns needed to join coverage onto the manifest
metrics = metrics[['ASSEMBLYNAME', 'COVERAGE']]

In [7]:
# Load the combined ENA submission manifest (tab-separated)
combined_manifest_tsv = '../data/ena_assembly_submission_manifest_combined.tsv'
df = pd.read_csv(combined_manifest_tsv, sep = '\t')

# Attach the coverage column by joining on the assembly name.
# This is pandas' default inner join, so only assembly names present in
# both tables end up in the result.
merged = pd.merge(df, metrics, on = 'ASSEMBLYNAME')

# Save the manifest-plus-coverage table as TSV, without the index column
combined_with_coverage_tsv = '../data/ena_assembly_submission_manifest_combined_with_coverage.tsv'
merged.to_csv(combined_with_coverage_tsv, sep = '\t', index = False)

In [8]:
# Split the combined manifest into one manifest file per isolate

# Make sure the output folder exists (no error if it is already there)
os.makedirs('../data/assembly_manifest_files', exist_ok=True)

# Reload the combined manifest (with coverage) written earlier
df = pd.read_csv('../data/ena_assembly_submission_manifest_combined_with_coverage.tsv', sep = '\t')

# Field names, in the order they appear in the table
columns = df.columns

# One output file per row, named after the row's ASSEMBLYNAME.
# Each line of the file is: field name + \t + field value + \n
for _, record in df.iterrows():
    isolate = record['ASSEMBLYNAME']
    isolate_manifest = "../data/assembly_manifest_files/" + isolate + ".txt"
    with open(isolate_manifest, "w") as handle:
        handle.writelines(
            '\t'.join((field, str(record[field]))) + '\n'
            for field in columns
        )