In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

from scipy.stats.stats import pearsonr

# pd.set_option('display.max_columns', None)

# real snipen 2021 datasets
# current_file = './snipen_16.csv'
# current_file = './snipen_24.csv'
# current_file = './snipen_25.csv'

# simulated liu even dataset
# current_file = './liu_sim_even.csv'

# simulated helius datasets
# current_file = './helius_ecori_agei_100K.csv'
# current_file = './helius_ecori_msei_100K.csv'
# current_file = './helius_hhai_agei_100K.csv'
# current_file = './helius_hhai_msei_100K.csv'

# current_file = './helius_ecori_agei.csv'
# current_file = './helius_ecori_msei.csv'
# current_file = './helius_hhai_agei.csv'
# current_file = './helius_hhai_msei.csv'

# current_file = './helius_ecori_agei_10M.csv'
# current_file = './helius_ecori_msei_10M.csv'
# current_file = './helius_hhai_agei_10M.csv'
# current_file = './helius_hhai_msei_10M.csv'

# current_file = './helius_ecori_agei_100M.csv'
# current_file = './helius_ecori_msei_100M.csv'
# current_file = './helius_hhai_agei_100M.csv'
# current_file = './helius_hhai_msei_100M.csv'

# current_file = './helius_ecori_msei_10M_lower.csv'
# current_file = './helius_ecori_msei_10M_higher.csv'

current_file = './liu_sim_10M_post_bracken.csv'

# Load the fragment table; the first CSV column is used as the row index.
df = pd.read_csv(current_file, index_col=0)

# Candidate k-mer columns: everything from column position 15 up to (but not
# including) the last column, minus any column whose name contains 'N'.
# NOTE(review): presumably 'N' marks k-mers with ambiguous bases -- confirm.
kmers_ls = [col for col in df.columns.to_list()[15:-1] if 'N' not in col]
print(kmers_ls)

This represents the number of reads recovered after recreating fragments

Remove all fragments with internal cut sites. This makes the ratio comparisons much simpler, as complete digest fragments will *always* occur at a higher ratio than the longer fragments that may contain them.

In [None]:
# Keep only fragments without internal cut sites.
df = df[df['internal'] == 0]

p_tot_frag = int(df['observed'].sum())
print(f'{p_tot_frag} total fragments observed')

# Order rows by ground-truth relative abundance; gen_ls below inherits this
# order because unique() reports genomes in order of first appearance.
df = df.sort_values('rel_abund')

p_uniq_frag = len(df)
print(f'{p_uniq_frag} unique fragments')

gen_ls = list(df['genome'].unique())
p_taxa_no = len(gen_ls)
print(f'{p_taxa_no} taxa observed')

Build the ground-truth list: the expected relative abundance of each genome, in the same order as `gen_ls`.

In [None]:
# Ground-truth relative abundance for every genome, aligned with gen_ls.
# The value is taken from the first row of each genome (same as the former
# `unique()[0]` lookup), but collected in a single pass over df instead of
# one full boolean scan per genome.
first_rows = df.drop_duplicates('genome')
rel_by_genome = dict(zip(first_rows['genome'], first_rows['rel_abund']))
e_ls = [float(rel_by_genome[gen]) for gen in gen_ls]

print(len(e_ls))

## get the ratio of taxa-to-taxa fragment counts for each fragment length

Calculate all relative abundance comparisons by capturing the inter-taxa ratios for each fragment length. The average of these ratios will be used to determine the overall relative abundance of the taxa, because the ratios should hold regardless of the fragment size being taken into consideration.

In [None]:
def process_ratios(tmp_df, gen_ls):
    '''
    Add one ratio column per genome to tmp_df (modified in place and returned).

    tmp_df is expected to hold the fragments of a single length. For each
    genome x in gen_ls, the mean 'observed' count of x's rows is computed and
    every row's 'observed' count is divided by that mean; the result is stored
    in a column named x. A genome with no rows in tmp_df yields an all-NaN
    column.
    '''
    observed = tmp_df['observed']
    genome = tmp_df['genome']
    for gen in gen_ls:
        gen_mean = observed[genome == gen].mean()
        tmp_df[gen] = observed / gen_mean
    return tmp_df


def scale_ratios(np_arr):
    '''
    Rescale every column of np_arr (in place) so that its first-row entry
    equals np_arr[0, 0], and return the same array.

    This approach assumes the first column, first row is a reliable
    representation of the real count data.
    '''
    # Snapshot the first row: it is overwritten by the assignment below.
    first_row = np_arr[0, :].copy()
    col_factors = first_row[0] / first_row
    np_arr[:, :] = np_arr * col_factors
    return np_arr


def get_max_idx(ratios_df):
    '''
    Return the positional index of the column of ratios_df that has the most
    non-NaN entries.

    When several columns tie for the maximum, the last of them wins (this
    matches the historical behaviour of this helper).

    Raises
    ------
    ValueError
        If ratios_df has no columns, so no index can be chosen.
    '''
    counts = ratios_df.count()  # non-NaN entries per column, in column order
    if counts.empty:
        raise ValueError('ratios_df has no columns; cannot pick a reference column')
    best = counts.max()
    return max(idx for idx, n in enumerate(counts.to_numpy()) if n == best)


def scale_ratios_to_max(np_ratios, max_idx):
    '''
    Rescale every column of np_ratios towards the reference column max_idx.

    For each column c, the scale factor is the NaN-ignoring mean of
    (reference column / c); the column is then multiplied by that factor.
    The input is left untouched and a new float array is returned.

    Parameters
    ----------
    np_ratios : 2-D array-like of ratios (rows x columns).
    max_idx : positional index of the reference column.
    '''
    # Always work on a float copy: assigning scaled values back into an
    # integer array would silently truncate them.
    np_arr = np.array(np_ratios, dtype=float)
    # Independent copy of the reference column, so rewriting the columns of
    # np_arr below can never alter the reference mid-loop.
    ref_col = np_arr[:, max_idx].copy()

    for idx in range(np_arr.shape[1]):
        col = np_arr[:, idx]
        np_arr[:, idx] = np.nanmean(ref_col / col) * col
    return np_arr


def average_over_columns(np_arr):
    '''
    Return, as a plain list, the NaN-ignoring mean of each row of np_arr
    (i.e. the average taken across its columns).
    '''
    row_means = np.nanmean(np_arr, axis=1)
    return row_means.tolist()


def return_rel_abund(o_ls):
    '''
    Normalise the values in o_ls so they sum to 1.

    Returns a new list with each value divided by the total of o_ls.
    An empty input yields an empty list.
    '''
    # Compute the total once rather than once per element.
    total = sum(o_ls)
    return [value / total for value in o_ls]

In [None]:
# Start from the filtered fragment table and give it one (empty) column per
# genome. errors='ignore' removes whichever genome columns already exist,
# rather than skipping the whole drop as soon as a single label is missing.
test_df = df.drop(columns=gen_ls, errors='ignore')
test_df = test_df.reindex(columns=test_df.columns.tolist() + gen_ls)

# For every fragment length, express each observed count relative to each
# genome's mean observed count at that length. Grouping visits each row once
# (lengths in ascending order) and the pieces are concatenated a single time.
ratio_frames = []
for frag_len, len_df in test_df.groupby('length'):
    if frag_len % 100 == 0:
        print(f'processing fragments of {frag_len}bp')
    ratio_frames.append(process_ratios(len_df.copy(), gen_ls))
final_df = pd.concat(ratio_frames)

# ratios_df.loc[a, b] = mean, over all fragments of genome a, of that
# fragment's count divided by genome b's mean count at the same length.
# Built as a float table in one grouped pass (NaN where no data exists).
ratios_df = (
    final_df.groupby('genome')[gen_ls]
    .mean()
    .reindex(index=gen_ls, columns=gen_ls)
)
ratios_df.index.name = None  # keep the index unnamed, as the CSV export expects

np_ratios = np.array(ratios_df)
# scaled_arr = scale_ratios(np_ratios)
# Use the column with the most non-NaN ratios as the scaling reference.
max_idx = get_max_idx(ratios_df)
scaled_arr = scale_ratios_to_max(np_ratios, max_idx)
o_ls = average_over_columns(scaled_arr)
o_ls = return_rel_abund(o_ls)

# Expected vs. ratio-estimated relative abundance, with the identity line.
plt.figure(figsize=(10,10))
plt.scatter(e_ls, o_ls)
plt.plot([0,.12],[0,.12])
for i, gen_name in enumerate(gen_ls):
    plt.annotate(gen_name, (e_ls[i]+.003, o_ls[i]), fontsize=11)
plt.show()
print(f'\n{pearsonr(e_ls,o_ls)}')
print('\n')

In [None]:
# Write the genome-by-genome ratio table next to the notebook, creating the
# output directory on first use so the export does not fail on a fresh clone.
ratios_dir = './ratios'
os.makedirs(ratios_dir, exist_ok=True)
# splitext removes the extension whatever its length (previously a fixed [:-4]).
dataset_name = os.path.splitext(os.path.basename(current_file))[0]
ratios_df.to_csv(os.path.join(ratios_dir, f'{dataset_name}_ratios.csv'))

In [None]:
# Pearson correlation between the ground-truth list (e_ls) and the
# ratio-estimated relative abundances (o_ls); shown as the cell output.
pearsonr(e_ls,o_ls)

In [None]:
# Same expected-vs-estimated scatter as above, with a much shorter identity
# line to inspect the low-abundance end.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(e_ls, o_ls)
# ax.plot([0, .12], [0, .12])
ax.plot([0, .0025], [0, .0025])
plt.show()
print(f'\n{pearsonr(e_ls, o_ls)}')
print('\n')

In [None]:
def count_to_rel(count_ls):
    '''
    Convert a list of counts into relative abundances that sum to 1.

    Returns a new list with each count divided by the total of count_ls
    (the same normalisation that return_rel_abund performs). An empty input
    yields an empty list.
    '''
    # Compute the total once rather than once per element.
    total = sum(count_ls)
    return [count / total for count in count_ls]

# Baseline estimators: per-genome mean and median observed count, converted to
# relative abundances. One grouped aggregation replaces a full scan of df per
# genome; reindex puts the rows in gen_ls order (NaN for a genome with no rows).
depth_stats = (
    df.groupby('genome')['observed']
    .agg(['mean', 'median'])
    .reindex(gen_ls)
)

mean_ls = count_to_rel(depth_stats['mean'].tolist())
median_ls = count_to_rel(depth_stats['median'].tolist())

In [None]:
# One row per genome: ground truth next to the three estimators.
abundances_df = pd.DataFrame({
    'genome': gen_ls,
    'expected': e_ls,
    'ratio': o_ls,
    'mean': mean_ls,
    'median': median_ls,
})

# Create the output directory on first use so the export does not fail on a
# fresh clone; splitext removes the extension whatever its length.
abundances_dir = './ratios'
os.makedirs(abundances_dir, exist_ok=True)
abundances_name = os.path.splitext(os.path.basename(current_file))[0]
abundances_df.to_csv(os.path.join(abundances_dir, f'{abundances_name}_abundances.csv'))

In [None]:
# Per-taxon comparison of ground truth against the three estimators
# (linear y-axis). Taxa are plotted by position in gen_ls.
ticks = list(range(len(o_ls)))
labels = list(gen_ls)

# Pearson r of each estimator against the ground truth, shown in the legend.
a = "{:.3f}".format(pearsonr(e_ls,o_ls)[0])
b = "{:.3f}".format(pearsonr(e_ls,mean_ls)[0])
c = "{:.3f}".format(pearsonr(e_ls,median_ls)[0])

plt.figure(figsize=(20,10))
# plt.xticks(ticks, labels, rotation = 90)
plt.scatter(ticks, e_ls, c='black', marker='_', s=40, alpha=0.45,label='ground truth               pearson r')
plt.scatter(ticks, o_ls, c='green', s=20, alpha=0.25,     label=f'ratio estimation          {a}')
plt.scatter(ticks, mean_ls, c='orange', s=20, alpha=0.25, label=f'mean depth                {b}')
plt.scatter(ticks, median_ls, c='red', s=20, alpha=0.25,  label=f'median depth             {c}')
plt.legend()
plt.ylabel('relative abundance')
plt.xlabel('taxa')

# Create the figures directory on first use so savefig does not fail on a
# fresh clone; splitext removes the extension whatever its length.
figures_dir = './figures'
os.makedirs(figures_dir, exist_ok=True)
figure_stem = os.path.splitext(os.path.basename(current_file))[0]
plt.savefig(os.path.join(figures_dir, f'{figure_stem}_abundance.tif'), dpi=350)
plt.savefig(os.path.join(figures_dir, f'{figure_stem}_abundance.png'), dpi=350)
plt.show()

In [None]:
# Same per-taxon comparison as the linear figure, drawn on a log y-axis so
# low-abundance taxa are distinguishable.
ticks = list(range(len(o_ls)))
labels = list(gen_ls)

# Pearson r of each estimator against the ground truth, shown in the legend.
a = "{:.3f}".format(pearsonr(e_ls,o_ls)[0])
b = "{:.3f}".format(pearsonr(e_ls,mean_ls)[0])
c = "{:.3f}".format(pearsonr(e_ls,median_ls)[0])

plt.figure(figsize=(20,10))
# plt.xticks(ticks, labels, rotation = 90)
plt.scatter(ticks, e_ls, c='black', marker='_', s=40, alpha=0.45,label='ground truth               pearson r')
plt.scatter(ticks, o_ls, c='green', s=20, alpha=0.25,     label=f'ratio estimation          {a}')
plt.scatter(ticks, mean_ls, c='orange', s=20, alpha=0.25, label=f'mean depth                {b}')
plt.scatter(ticks, median_ls, c='red', s=20, alpha=0.25,  label=f'median depth             {c}')
plt.legend()
plt.yscale('log')
plt.ylabel('relative abundance (log10)')
plt.xlabel('taxa')

# Create the figures directory on first use so savefig does not fail on a
# fresh clone; splitext removes the extension whatever its length.
figures_dir = './figures'
os.makedirs(figures_dir, exist_ok=True)
figure_stem = os.path.splitext(os.path.basename(current_file))[0]
plt.savefig(os.path.join(figures_dir, f'log_{figure_stem}_abundance.tif'), dpi=350)
plt.savefig(os.path.join(figures_dir, f'log_{figure_stem}_abundance.png'), dpi=350)
plt.show()

In [None]:
# NOTE(review): `stop` is not defined in any cell above, so executing this
# cell presumably raises NameError on purpose, halting "Run All" before the
# dataset-specific cells below -- confirm this is intentional.
stop

The code below is specific to the Snipen et al. 2021 datasets.

In [None]:
# Lookup table from accession number to species name.
gen_dt = {
    '013372085.1': 'Acinetobacter baumannii',
    '000154225.1': 'Actinomyces odontolyticus',
    '000008005.1': 'Bacillus cereus',
    '000012825.1': 'Bacteroides vulgatus',
    '000016965.1': 'Clostridium beijerinckii',
    '000008565.1': 'Deinococcus radiodurans',
    '000172575.2': 'Enterococcus faecalis',
    '000005845.2': 'Escherichia coli',
    '000008525.1': 'Helicobacter pylori',
    '000014425.1': 'Lactobacillus gasseri',
    '000196035.1': 'Listeria monocytogenes',
    '000008805.1': 'Neisseria meningitidis',
    '000008345.1': 'Propionibacterium acnes',
    '000006765.1': 'Pseudomonas aeruginosa',
    '000012905.2': 'Rhodobacter sphaeroides',
    '000017085.1': 'Staphylococcus aureus',
    '000007645.1': 'Staphylococcus epidermidis',
    '000007265.1': 'Streptococcus agalactiae',
    '000007465.2': 'Streptococcus mutans',
    '000006885.1': 'Streptococcus pneumoniae',
}

In [None]:
# Per-taxon comparison for the Snipen datasets, with a labelled x-axis.
ticks = list(range(len(o_ls)))
# Label each taxon with its species name, looked up in gen_dt by the second
# '_'-separated field of the genome id. Previously these labels were built
# and then immediately overwritten with the raw genome ids. Ids that have no
# such field, or are missing from gen_dt, keep the raw id as their label.
labels = [
    gen_dt.get(gen.split('_')[1], gen) if '_' in gen else gen
    for gen in gen_ls
]

# Pearson r of each estimator against the ground truth, shown in the legend.
a = "{:.3f}".format(pearsonr(e_ls,o_ls)[0])
b = "{:.3f}".format(pearsonr(e_ls,mean_ls)[0])
c = "{:.3f}".format(pearsonr(e_ls,median_ls)[0])

# Figure width grows with the number of taxa so tick labels stay legible.
plt.figure(figsize=(len(o_ls),10))
plt.xticks(ticks, labels, rotation = 90)
plt.scatter(ticks, e_ls, c='black', marker='_', s=300, label='ground truth               pearson r')
plt.scatter(ticks, o_ls, c='green', s=100, alpha=0.5,     label=f'ratio estimation          {a}')
plt.scatter(ticks, mean_ls, c='orange', s=100, alpha=0.5, label=f'mean depth                {b}')
plt.scatter(ticks, median_ls, c='red', s=100, alpha=0.5,  label=f'median depth             {c}')
plt.legend()
plt.ylabel('relative abundance')

# Create the figures directory on first use so savefig does not fail on a
# fresh clone; splitext removes the extension whatever its length.
figures_dir = './figures'
os.makedirs(figures_dir, exist_ok=True)
figure_stem = os.path.splitext(os.path.basename(current_file))[0]
plt.savefig(os.path.join(figures_dir, f'{figure_stem}_abundance.tif'), dpi=350)
plt.show()