# Extract accessions from Table S1

In [1]:
import pandas as pd

# Strain labels for the genomes that carry no public accession. Keys are the
# first "|"-delimited field of the Table S1 genome name; values are the labels
# substituted for them on rows classified as "custom".
custom_strain_renames = dict(
    [
        ("A20_iVar_mindepth15", "Huanan_Market_A20"),
        ("F13_SRR23971580_viralconsensus_min15", "Huanan_Market_F13"),
        ("F54_SRR23971582_viralconsensus", "Huanan_Market_F54"),
        ("B5_SRR23971484_viralconsensus", "Huanan_Market_B5"),
    ]
)

# Build one metadata row per genome listed in Table S1, then attach the joint
# WHO-China market annotations by accession.
#
# The "Genome name" column is pipe-delimited. Going by the rows shown in this
# notebook's output, two layouts occur:
#   <strain>|<accession>|<date>   e.g. hCoV-19/Wuhan/Hu-1/2019|EPI_ISL_402125|2019-12-26
#   <assembly name>|<date>        e.g. A20_iVar_mindepth15|2020-01-01
# Rows of the second kind have no accession field and are labelled "custom".
df = (
    pd.read_excel("Table_S1.xlsx", sheet_name="Early SARS-CoV-2 genomes")
    .rename(columns={"Genome name": "name"})
    .assign(
        # First field of the name.
        strain=lambda x: x["name"].str.split("|").str[0],
        # Everything after the first "|"; scratch column, dropped further down.
        rest=lambda x: x["name"].str.split("|", n=1).str[1],
        # No further "|" means there is no accession field -> "custom".
        # Otherwise the source is picked from the accession prefix, with
        # "genbank" as the catch-all for anything that is not EPI*/NMDC*.
        source=lambda x: x["rest"].map(
            lambda s: (
                "custom" if "|" not in s else
                ("gisaid" if s.startswith("EPI") else ("ngdc" if s.startswith("NMDC") else "genbank"))
            )
        ),
    )
    .assign(
        # Custom rows take their strain label from custom_strain_renames. A
        # name missing from that table keeps its original value rather than
        # turning into NaN (a plain .map(dict) yields NaN for unknown keys,
        # which would then also become the row's accession / merge key).
        strain=lambda x: x["strain"].where(
            x["source"] != "custom",
            x["strain"].map(lambda s: custom_strain_renames.get(s, s)),
        ),
        # assign() evaluates its keywords in order, so x["strain"] below is
        # already the renamed label: custom rows use it as their accession.
        accession=lambda x: x["rest"].str.split("|").str[0].where(
            x["source"] != "custom", x["strain"],
        ),
        # The date is always the last "|"-delimited field.
        date=lambda x: pd.to_datetime(x["rest"].str.split("|").str[-1]),
    )
    .drop(columns="rest")
    # Left join keeps every genome; validate= makes pandas raise if an
    # accession is duplicated on either side instead of silently fanning out.
    .merge(
        pd.read_csv("jointWHO_market_annotations.csv")[
            ["accession", "joint_WHO_China_sample_id", "addtl_annotations"]
        ],
        validate="one_to_one",
        on="accession",
        how="left",
    )
)

# Render the assembled table in the notebook output.
display(df)

# Save the complete metadata table; the positional index is not written.
df.to_csv("seq_metadata.csv", index=False)

# Write one CSV per value of the "source" column, named after that value.
by_source = df.groupby("source")
for source, source_df in by_source:
    accession_file = f"{source}_accessions.csv"
    source_df.to_csv(accession_file, index=False)

Unnamed: 0,name,strain,source,accession,date,joint_WHO_China_sample_id,addtl_annotations
0,A20_iVar_mindepth15|2020-01-01,Huanan_Market_A20,custom,Huanan_Market_A20,2020-01-01,,2020 market environment
1,F13_SRR23971580_viralconsensus_min15|2020-01-01,Huanan_Market_F13,custom,Huanan_Market_F13,2020-01-01,,2020 market environment
2,F54_SRR23971582_viralconsensus|2020-01-01,Huanan_Market_F54,custom,Huanan_Market_F54,2020-01-01,,2020 market environment
3,B5_SRR23971484_viralconsensus|2020-01-01,Huanan_Market_B5,custom,Huanan_Market_B5,2020-01-01,,2020 market environment
4,hCoV-19/Wuhan/Hu-1/2019|EPI_ISL_402125|2019-12-26,hCoV-19/Wuhan/Hu-1/2019,gisaid,EPI_ISL_402125,2019-12-26,S06,2019 market sequence
...,...,...,...,...,...,...,...
858,hCoV-19/Shanghai/SH-P261-2-Wuhan/2020|OR240411...,hCoV-19/Shanghai/SH-P261-2-Wuhan/2020,genbank,OR240411,2020-02-15,,
859,hCoV-19/Shanghai/SH-P49-2-Shanghai/2020|OR2405...,hCoV-19/Shanghai/SH-P49-2-Shanghai/2020,genbank,OR240509,2020-02-15,,
860,hCoV-19/Shanghai/SH-P50-A-2-Shanghai/2020|OR24...,hCoV-19/Shanghai/SH-P50-A-2-Shanghai/2020,genbank,OR240510,2020-02-15,,
861,hCoV-19/Shanghai/SH-P56-A-2-Shanghai/2020|OR24...,hCoV-19/Shanghai/SH-P56-A-2-Shanghai/2020,genbank,OR240520,2020-02-15,,
