In [2]:
import pandas as pd
# Root of the private data repository. Later cells build file paths with
# '{}/data/...'.format(prefix), so this must be the repository directory,
# not the path of an individual file inside it.
prefix = '../huanan-env-paper-private'

## Table S1: Sample metadata

In [94]:
# Load the per-sample metadata, keep only the columns that go into Table S1,
# and rename them to their published headers.
metadata_path = '{}/data/sample_metadata/Liu_etal_2023_market_samples_acc_Apr16.tsv'.format(prefix)

# Source column -> Table S1 header, listed in output order.
table_s1_columns = {
    'Lab code': 'Lab code',
    'Sample ID': 'Sample ID',
    'Sampling date': 'Sampling date',
    'Sample_location': 'Sample location',
    'Street No.': 'Street',
    'Vendor No.': 'Vendor',
    'edge_case': 'Edge case',
    'Stall_corrected': 'Stall ID',
    'Stall_corrected_merged': 'Stall ID (merged)',
    'Sample type': 'Sample type',
    'Sample information': 'Sample information',
    'SARS-CoV-2 qPCR result': 'SARS-CoV-2_qPCR_result',
    'Latitude': 'Latitude',
    'Longitude': 'Longitude',
}

raw_metadata = pd.read_csv(metadata_path, sep="\t")
sample_metadata = raw_metadata[list(table_s1_columns)].rename(columns=table_s1_columns)

## Fix mislabeled samples in Liu: Q37, Q61, and Q70
# These three lab codes are overwritten as qPCR negative.
mislabeled = sample_metadata['Lab code'].isin(['Q37', 'Q61', 'Q70'])
sample_metadata.loc[mislabeled, 'SARS-CoV-2_qPCR_result'] = 'Negative'

In [95]:
## Add sequencing information
# Attach the SRA run accession to each sample, matching the sample's
# 'Sample ID' to the run table's 'LibraryNameFixed'.
run_info = pd.read_csv('{}/data/sample_metadata/SRA_run_info.tsv'.format(prefix), sep="\t")

# Drop single-end duplicate runs and SARS-CoV-2 amplicon libraries before matching.
is_usable_run = (
    (run_info.Sample_category != 'single_end_duplicate')
    & (run_info.Sample_Type != 'sars2_amplicon')
)
sequencing_metadata = run_info.loc[is_usable_run, ['LibraryNameFixed', 'Run']]

# Left join keeps every sample, including those with no matching run.
# NOTE(review): a sample whose library maps to more than one remaining run
# would be duplicated by this join - confirm the key is unique.
sample_metadata = sample_metadata.merge(
    sequencing_metadata,
    left_on='Sample ID',
    right_on='LibraryNameFixed',
    how='left',
)

# Samples without a run get the literal string 'None' as their accession.
sample_metadata['Sequencing run'] = sample_metadata.pop('Run').fillna('None')
sample_metadata = sample_metadata.drop(columns='LibraryNameFixed')

In [109]:
# Add SARS2 information
# A sample is NGS 'Positive' when its sequencing run has a non-zero read count
# in the post-trimming SARS-CoV-2 read table; every other sample is 'Negative'.
sars2 = pd.read_csv('../huanan-env-paper-private/data/sarscov2/sars2_reads_post_trimming.tsv', sep="\t")
sars2_ngs_positives = sars2.loc[sars2['Read_count'] > 0, 'Sample']

has_sars2_reads = sample_metadata['Sequencing run'].isin(sars2_ngs_positives)
sample_metadata['SARS-CoV-2_NGS_result'] = has_sars2_reads.map({True: 'Positive', False: 'Negative'})

In [111]:
# Cross-tabulate the qPCR and NGS calls to see how often they agree.
result_columns = ['SARS-CoV-2_qPCR_result', 'SARS-CoV-2_NGS_result']
sample_metadata[result_columns].value_counts()

SARS-CoV-2_qPCR_result  SARS-CoV-2_NGS_result
Negative                Negative                 764
Positive                Negative                  35
                        Positive                  31
Negative                Positive                   3
dtype: int64

In [112]:
# Write Table S1 as a tab-separated file without the row index.
sample_metadata.to_csv("TableS1.tsv", sep="\t", index=False)

## Table S2: Sequencing metadata

In [113]:
# Reload the full SRA run table (all runs, all columns) for Table S2.
sra_run_info_path = '{}/data/sample_metadata/SRA_run_info.tsv'.format(prefix)
sequencing_metadata = pd.read_csv(sra_run_info_path, sep="\t")

In [114]:
# Keep only the run-level columns reported in Table S2, in output order.
table_s2_source_columns = [
    'Run',
    'Lab code',
    'Sample_Type',
    'LibraryNameFixed',
    'Sample_category',
    'Stall_corrected',
    'Read_pairs_after_trimming',
    'spots',
    'avgLength',
    'LibraryStrategy',
    'LibrarySource',
]
sequencing_metadata = sequencing_metadata.loc[:, table_s2_source_columns]

In [115]:
# Published headers for Table S2. They are assigned by position, so the order
# must match the column selection made in the previous cell.
table_s2_headers = [
    'Sequencing run',
    'Lab code',
    'Sample type',
    'Library name',
    'Sample category',
    'Stall ID',
    'Read pairs after trimming',
    'Total reads before trimming',
    'Average Read Length',
    'Library strategy',
    'Library source',
]
sequencing_metadata.columns = table_s2_headers

In [116]:
# Write Table S2 as a tab-separated file without the row index.
sequencing_metadata.to_csv("TableS2.tsv", sep="\t", index=False)

## Table S3: Summary numbers

## Table S4: SARS2 read counts

In [132]:
# Annotate the per-run SARS-CoV-2 read counts with run-level metadata.
sars2_counts = pd.read_csv('../huanan-env-paper-private/data/sarscov2/sars2_reads_post_trimming.tsv', sep="\t")
run_annotations = sequencing_metadata[
    ['Sequencing run', 'Lab code', 'Sample type', 'Sample category', 'Library name', 'Stall ID']
]

# Default (inner) join: only runs present in both tables are kept.
sars2 = sars2_counts.merge(run_annotations, left_on='Sample', right_on='Sequencing run')

# Headers are assigned by position: the first three come from the read-count
# file, the remaining six from the run annotations. The assignment only
# succeeds if the read-count file has exactly three columns.
sars2.columns = [
    'Sample', 'SARS2 paired read count', 'SARS2 covered bases',
    'Sequencing run', 'Lab code', 'Sample type', 'Sample category',
    'Library name', 'Stall ID',
]

# Final column order for Table S4; the redundant 'Sample' key is dropped.
table_s4_columns = [
    'Lab code', 'Sequencing run', 'SARS2 paired read count', 'SARS2 covered bases',
    'Sample type', 'Sample category', 'Library name', 'Stall ID',
]
sars2 = sars2[table_s4_columns]

In [133]:
# Write Table S4 with the runs carrying the most SARS-CoV-2 read pairs first.
sars2_by_read_count = sars2.sort_values("SARS2 paired read count", ascending=False)
sars2_by_read_count.to_csv("TableS4.tsv", sep="\t", index=False)

## Table S5: Species names

from:
`data/mitochondrial_mapping/species_descriptions_with_common_name.csv`

The file was modified slightly: the category was changed to indicate animals that Xiao reported at the genus, but not the species, level.

## Table S5: Mammalian DNA in SARS2 samples

In [165]:
# Covered-bases matrix: one row per run, one column per reference contig.
animals = pd.read_csv(
    '../huanan-env-paper-private/data/mitochondrial_mapping/mitochondrial_metazoa_coveredbases_93.tsv',
    sep="\t",
).set_index("Run")

# Matrix columns are matched on "<Contig> <Species>", so build that label
# for every species whose Class is Mammalia.
species = pd.read_csv('../huanan-env-paper-private/data/mitochondrial_mapping/species_descriptions_with_common_name.csv')
mammal = species[species['Class'] == 'Mammalia'].copy()
mammal['Contig_Species'] = mammal['Contig'] + " " + mammal['Species']

# Reshape the mammalian columns to long form (Run, variable, value) and keep
# only the run/species pairs with a value above zero.
mammal_columns = animals.columns[animals.columns.isin(mammal['Contig_Species'])]
mammals_long = animals[mammal_columns].melt(ignore_index=False).reset_index()
mammals = mammals_long[mammals_long['value'] > 0]

In [181]:
# Samples called SARS-CoV-2 positive by qPCR, by NGS, or by both.
qpcr_positive = sample_metadata['SARS-CoV-2_qPCR_result'] == 'Positive'
ngs_positive = sample_metadata['SARS-CoV-2_NGS_result'] == 'Positive'
positive_samples = sample_metadata[qpcr_positive | ngs_positive]

In [191]:
# Restrict the long-form mammal table to runs from SARS-CoV-2 positive samples.
positive_runs = positive_samples['Sequencing run']
mammals = mammals.loc[mammals['Run'].isin(positive_runs)]

In [195]:
# Attach the sample-level details of each positive sample to its mammal rows,
# then order the table by run accession.
positive_sample_columns = [
    'Sequencing run', 'Stall ID', 'Lab code', 'Sampling date', 'Sample type',
    'Sample information', 'SARS-CoV-2_qPCR_result', 'SARS-CoV-2_NGS_result',
]
mammals_sars2 = (
    mammals
    .merge(positive_samples[positive_sample_columns], left_on='Run', right_on='Sequencing run', how='left')
    .sort_values("Run")
)

In [196]:
# Write Table S5 as a tab-separated file without the row index.
mammals_sars2.to_csv("TableS5.tsv", sep="\t", index=False)

## Table S8-S11: Mammalian and animal read counts and covered bases

In [217]:
# Table S8: mammalian mitochondrial read counts per run, with run metadata.
animals = pd.read_csv(
    '../huanan-env-paper-private/data/mitochondrial_mapping/mitochondrial_metazoa_counts_93.tsv',
    sep="\t",
).set_index("Run")

# Matrix columns are matched on "<Contig> <Species>" for Mammalia entries.
species = pd.read_csv('../huanan-env-paper-private/data/mitochondrial_mapping/species_descriptions_with_common_name.csv')
mammal = species[species['Class'] == 'Mammalia'].copy()
mammal['Contig_Species'] = mammal['Contig'] + " " + mammal['Species']
mammals = animals.loc[:, animals.columns.isin(mammal['Contig_Species'])]

# Run-level identifier columns that lead each output row.
run_id_columns = ['Sequencing run', 'Lab code', 'Sample type', 'Sample category', 'Stall ID']
run_info = sequencing_metadata[run_id_columns]

# The matrix is indexed by run accession; join that index to 'Sequencing run'.
merged = mammals.merge(run_info, left_index=True, right_on='Sequencing run', how='left')
mammals_final = merged[run_id_columns + list(mammals.columns)]
mammals_final.to_csv("TableS8.tsv", index=False, sep="\t")

In [209]:
# Table S9: mammalian mitochondrial covered bases per run, with run metadata.
covered_bases_path = '../huanan-env-paper-private/data/mitochondrial_mapping/mitochondrial_metazoa_coveredbases_93.tsv'
animals = pd.read_csv(covered_bases_path, sep="\t").set_index("Run")

# Matrix columns are matched on "<Contig> <Species>" for Mammalia entries.
species = pd.read_csv('../huanan-env-paper-private/data/mitochondrial_mapping/species_descriptions_with_common_name.csv')
mammal = species[species['Class'] == 'Mammalia'].copy()
mammal['Contig_Species'] = mammal['Contig'] + " " + mammal['Species']
is_mammal_column = animals.columns.isin(mammal['Contig_Species'])
mammals = animals.loc[:, is_mammal_column]

# Run-level identifier columns that lead each output row.
run_id_columns = ['Sequencing run', 'Lab code', 'Sample type', 'Sample category', 'Stall ID']
run_info = sequencing_metadata[run_id_columns]

# The matrix is indexed by run accession; join that index to 'Sequencing run'.
merged = mammals.merge(run_info, left_index=True, right_on='Sequencing run', how='left')
mammals_final = merged[run_id_columns + list(mammals.columns)]
mammals_final.to_csv("TableS9.tsv", index=False, sep="\t")

In [216]:
# Tables S10 and S11: the all-animal read-count and covered-bases matrices,
# each prefixed with run-level metadata.
#
# The previous version of this cell wrote `animals_final` without ever
# defining it, so it failed on a fresh kernel, and both files were written
# from the same object. `animals_final` is now rebuilt from each input matrix.
# NOTE(review): the layout (metadata columns followed by every matrix column)
# mirrors the mammal-only Tables S8/S9 - confirm this is the intended format.
run_id_columns = ['Sequencing run', 'Lab code', 'Sample type', 'Sample category', 'Stall ID']
run_info = sequencing_metadata[run_id_columns]

animal_tables = {
    "TableS10.tsv": '../huanan-env-paper-private/data/mitochondrial_mapping/mitochondrial_metazoa_counts_93.tsv',
    "TableS11.tsv": '../huanan-env-paper-private/data/mitochondrial_mapping/mitochondrial_metazoa_coveredbases_93.tsv',
}

for output_name, matrix_path in animal_tables.items():
    # One row per run, one column per reference contig.
    animals = pd.read_csv(matrix_path, sep="\t").set_index("Run")
    animals_final = pd.merge(animals, run_info, left_index=True, right_on='Sequencing run', how='left')
    animals_final = animals_final[run_id_columns + animals.columns.to_list()]
    animals_final.to_csv(output_name, index=None, sep="\t")

## Table S12: Comparison to Xiao 21 and Worobey 22

In [54]:
# Per-run mitochondrial read counts, restricted to mammalian reference contigs.
animals = pd.read_csv('../mtDNA/mitochondrial_metazoa_counts_93.tsv', sep="\t").set_index("Run")

species = pd.read_csv('../mtDNA/species_descriptions_with_common_name.csv')
mammal = species[species['Class'] == 'Mammalia'].copy()
mammal['Contig_Species'] = mammal['Contig'] + " " + mammal['Species']
mammals = animals.loc[:, animals.columns.isin(mammal['Contig_Species'])]


def _species_totals(runs, column_name, presence_only=True):
    """Summarise the mammal matrix over the given runs, one row per species.

    Args:
        runs: run accessions to include (matched against the matrix index).
        column_name: header of the single column in the returned frame.
        presence_only: if True, count the runs with a non-zero value for each
            species; if False, sum the values themselves.

    Returns:
        A one-column DataFrame indexed by species, sorted in descending order.
    """
    selected = mammals[mammals.index.isin(runs)]
    if presence_only:
        selected = selected.astype(bool)
    return selected.sum().sort_values(ascending=False).to_frame(name=column_name)


## Get market samples
# Environmental swabs that have a sequencing run.
metadata = pd.read_csv('../metadata/Liu_etal_2023_with_sequencing.csv')
sequenced_env_swabs = metadata[~metadata.Sequencing_Run.isna()].query("`Sample.type` == 'Environmental swab'")
env_samples = sequenced_env_swabs

## For total counts
n1 = _species_totals(env_samples['Sequencing_Run'], 'Total samples')
n2 = _species_totals(env_samples['Sequencing_Run'], 'Total reads', presence_only=False)
n2 = n2.merge(n1, left_index=True, right_index=True)

## For SARS2 positive counts
env_samples = sequenced_env_swabs.query("`SARS.CoV.2.qPCR.result` == 'Positive'")
n3 = _species_totals(env_samples['Sequencing_Run'], 'Total SARS2 samples')

# qPCR-positive swabs collected on 2020-01-12 only.
env_samples = env_samples[env_samples['Sampling.date'].isin(['2020-01-12'])]
n4 = _species_totals(env_samples['Sequencing_Run'], 'Total SARS2 Jan12 samples')

# Combine the four summaries on species; row order follows the Jan 12 counts.
n4 = n4.merge(n3, left_index=True, right_index=True)
n4 = n4.merge(n2, left_index=True, right_index=True)

In [55]:
# Display the per-species summary built in the previous cell: Jan 12 SARS2
# samples, all SARS2 samples, total reads, and total samples.
n4

Unnamed: 0,Total SARS2 Jan12 samples,Total SARS2 samples,Total reads,Total samples
NC_012920.1 Homo sapiens,6,35,32637,93
NC_021478.1 Rhizomys pruinosus,5,6,10538,51
NC_002008.4 Canis lupus,5,15,23759,62
NC_001913.1 Oryctolagus cuniculus,5,13,18435,53
NC_013700.1 Nyctereutes procyonoides,5,7,41118,57
KX964606.1 Erinaceus amurensis,5,9,8320,34
KR816507.1 Hystrix brachyura,4,4,5419,10
NC_011638.1 Rattus tanezumi,3,7,6080,30
NC_025316.1 Lepus sinensis,3,4,2274,27
NC_004069.1 Muntiacus reevesi,3,3,1326,20


## Table S12: rRNA

In [227]:
# rRNA count table: keep the sample name plus one column per mammalian contig.
rrna = pd.read_csv('../huanan-env-paper-private/mtDNA/rRNA/mt93_counts_rrna.tsv', sep="\t")

# NOTE(review): this species table is read from an 'old/' directory, unlike
# the other cells - confirm it is the intended copy.
species = pd.read_csv('../huanan-env-paper-private/old/data/mitochondrial_mapping/species_descriptions_with_common_name.csv')
mammal = species[species['Class'] == 'Mammalia'].copy()

rrna_columns = ['Sample_Name'] + mammal['Contig'].to_list()
rrna[rrna_columns].to_csv("TableS12.tsv", index=False, sep="\t")

## Table S13: Correlations

In [243]:
# Stack the two correlation result files into a single table.
correlation_files = [
    '../huanan-env-paper-private/correlations/correlation_results.tsv',
    '../huanan-env-paper-private/correlations/correlation_results_nomammal.tsv',
]
corr, corr2 = (pd.read_csv(path, sep="\t") for path in correlation_files)
corr = pd.concat([corr, corr2])

In [244]:
# Published headers for Table S13, assigned by position. The input column
# labelled 'drop' is discarded before writing.
table_s13_headers = [
    'Species',
    'Common name',
    'Class',
    'Date',
    'Spearman correlation (total reads)',
    'P-value (total reads)',
    'Spearman correlation (mapped reads)',
    'P-value (mapped reads)',
    'Positive samples',
    'Read count',
    'Category',
    'drop',
    'boot_95_min',
    'boot_mean',
    'boot_95_max',
    'Adjusted P-value (total reads)',
    'Adjusted P-value (mapped reads)',
]
corr.columns = table_s13_headers
corr = corr.drop(columns='drop')

corr.to_csv("TableS13.tsv", index=False, sep="\t")

## Table S14: Mammalian viruses

In [230]:
# Filtered viral read counts, relabelled by position with the Table S14 headers.
virus = pd.read_csv('../huanan-env-paper-private/viruses/filtered_viral_counts_97_95_20_200.tsv', sep="\t")
table_s14_headers = [
    'Sequencing Run',
    'Viral genome',
    'Viral name',
    'Virus genome length',
    'Paired read count',
    'Covered bases',
]
virus.columns = table_s14_headers
virus.to_csv("TableS14.tsv", index=False, sep="\t")

## Table S16 and S17: Mitochondrial genomes

In [245]:
# Load the mitochondrial genome summary table.
mt_genomes_path = '../huanan-env-paper-private/MT_phylogenetics/Mitochondrial_genomes.tsv'
mt = pd.read_csv(mt_genomes_path, sep="\t")

In [247]:
# Relabel the columns by position and discard the one labelled 'drop'.
mt_headers = [
    'Sequencing Run',
    'Species',
    'Bases coverage',
    'Breadth of coverage',
    'drop',
    'SNPs from reference',
    'ANI to reference (%)',
    'Sample Lab code',
    'Stall',
]
mt.columns = mt_headers
mt = mt.drop(columns='drop')

# NOTE(review): the section heading names Tables S16 and S17, but the file
# written here is TableS15.tsv - confirm which numbering is correct.
mt.to_csv('TableS15.tsv', sep="\t", index=False)