In [None]:
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import scipy.interpolate

from mpl_toolkits.mplot3d import Axes3D
from scipy.stats.stats import pearsonr
from scipy.interpolate import interp1d
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
from statsmodels.nonparametric.smoothers_lowess import lowess as  sm_lowess

# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

Open the file containing fragment features, including count, sequence, kmer content, etc.

Create a list of the kmers that don't include 'N'.

In [None]:
# Alternative input tables used previously (swap one in to analyse a different sample):
# df = pd.read_csv('key_real_adj_features_kmers.csv', index_col=0)
# df = pd.read_csv('key_adj_features_kmers.csv', index_col=0)
# df = pd.read_csv('SRR10199724_no_dupes.csv', index_col=0)
# df = pd.read_csv('SRR10199716_sim_no_dupes.csv', index_col=0)
# df = pd.read_csv('SRR10199716_no_dupes.csv', index_col=0)
# df = pd.read_csv('ecori_agei_no_dupes.csv', index_col=0)
# df = pd.read_csv('./SRR5298272.csv', index_col=0)
df = pd.read_csv('./SRR5298272_no_dupes.csv', index_col=0)

# Candidate k-mer columns are everything after the first 15 columns and before
# the last one; keep only the names that do not contain an 'N'.
candidate_cols = df.columns[15:-1]
kmers_ls = [col for col in candidate_cols if 'N' not in col]
print(kmers_ls)

The sum of the `observed` column is the total number of reads recovered after recreating the fragments.

In [None]:
# Total of the 'observed' column across all fragments.
total_observed = df['observed'].sum()
print(total_observed)

Remove all fragments with internal cut sites. This makes the ratio comparisons much simpler, as complete digest fragments will *always* occur at a higher ratio than the longer fragments that may contain them.

In [None]:
# Row count before filtering.
print(len(df))

# Keep only the rows whose 'internal' value is 0.
without_internal_sites = df['internal'] == 0
df = df.loc[without_internal_sites]

# Row count after filtering.
print(len(df))

# Genome labels in order of first appearance.
gen_ls = list(df['genome'].unique())

Use a lowess curve within the fragment distribution of each genome to find possible outliers

following https://james-brennan.github.io/posts/lowess_conf/

In [None]:
def smooth(x, y, xgrid):
    '''
    creates one bootstrapped lowess curve for input x y arrays

    A random 5% of the points is drawn with replacement (NumPy global RNG),
    a lowess curve is fitted to that sample, and the fitted values are
    linearly interpolated / extrapolated onto xgrid.

    x, y  : numpy arrays of equal length (indexed with an integer array)
    xgrid : x positions at which the curve is evaluated

    returns an array shaped like xgrid; it is all-NaN when the sample holds
    fewer than two distinct x values, so that nanmean/nanstd over a stack of
    replicates simply ignore the uninformative replicate.
    '''
    sample_no = round(len(y)*.05)
    if sample_no < 2:
        # too few points to fit or interpolate anything
        return np.full(np.shape(xgrid), np.nan)

    samples = np.random.choice(len(x), sample_no, replace=True)
    y_s = y[samples]
    x_s = x[samples]
    y_sm = sm_lowess(y_s, x_s, frac=1./5., it=5,
                     return_sorted=False)

    # Sampling with replacement (and repeated x values in the data) produce
    # duplicate x values. interp1d divides by the x spacing, so a duplicate at
    # either end of the sample turns the whole extrapolated region into NaN.
    # Keep one fitted value per distinct x; this relies on the lowess fit
    # being a function of x, so duplicates should carry the same fitted value.
    x_unique, first_idx = np.unique(x_s, return_index=True)
    if x_unique.size < 2:
        return np.full(np.shape(xgrid), np.nan)
    y_unique = np.asarray(y_sm)[first_idx]

    # regularly sample it onto the grid
    y_grid = scipy.interpolate.interp1d(x_unique, y_unique,
                                        fill_value='extrapolate')(xgrid)
    return y_grid

# --- settings for the bootstrapped outlier scan ---
BOOTSTRAP_SEED = 0     # fixed seed: smooth() draws from NumPy's global RNG
N_BOOTSTRAPS = 1000    # number of resampled lowess curves per genome
MAX_STD_LEVEL = 5      # largest number of standard deviations recorded in low/high
N_SHADED_BANDS = 4     # +/- k standard deviation bands drawn on the summary plot

np.random.seed(BOOTSTRAP_SEED)

# plot styling does not depend on the genome, so set it once
sns.set_style("white")
plt.rc("axes.spines", top=False, right=False)
sns.set_context("paper")

# low / high hold, per fragment, the largest k (1..MAX_STD_LEVEL) for which
# observed < mean - k*std  /  observed > mean + k*std; 0 means neither holds.
df = df.assign(low=0, high=0)

flagged_frames = []

for gen in gen_ls:
    tmp_df = df[df['genome'] == gen].copy()
    x = tmp_df['length'].to_numpy()
    y = tmp_df['observed'].to_numpy()

    # use bootstrapping to get a stack of lowess models,
    # evaluated at every integer length between the shortest and longest fragment
    xgrid = np.arange(x.min(), x.max() + 1)
    smooths = np.stack([smooth(x, y, xgrid) for _ in range(N_BOOTSTRAPS)]).T
    mean = np.nanmean(smooths, axis=1)
    std_dev = np.nanstd(smooths, axis=1)

    plt.plot(xgrid, smooths, color='tomato', alpha=0.25)
    plt.plot(x, y, 'k.', alpha=0.5)
    plt.show()

    # Levels are visited in increasing order, so the value left in each column
    # is the highest level whose bound the fragment exceeds.
    for level in range(1, MAX_STD_LEVEL + 1):
        lower_bound = pd.Series(mean - level * std_dev, index=xgrid)
        upper_bound = pd.Series(mean + level * std_dev, index=xgrid)
        tmp_df['low'] = np.where(tmp_df['observed'] < tmp_df['length'].map(lower_bound),
                                 level, tmp_df['low'])
        tmp_df['high'] = np.where(tmp_df['observed'] > tmp_df['length'].map(upper_bound),
                                  level, tmp_df['high'])

    flagged_frames.append(tmp_df)

    plt.figure(figsize=(10, 10))
    # overlapping translucent bands: darker green = closer to the mean curve
    for band in range(1, N_SHADED_BANDS + 1):
        plt.fill_between(xgrid, mean - band * std_dev,
                         mean + band * std_dev, alpha=0.2, color='green')
    plt.plot(xgrid, mean, color='tomato')
    plt.plot(x, y, 'k.', alpha=0.5)
    plt.xlabel('length')
    plt.ylabel('observed')
    plt.title(f'Stdevs from bootstrapped Lowess model\n{gen}')
    plt.show()

# concatenate once instead of growing a frame inside the loop
outliers_df = pd.concat(flagged_frames) if flagged_frames else df.iloc[0:0].copy()

## get the ratio of taxa-to-taxa fragment counts for each fragment length

Calculate all relative abundance comparisons by capturing the inter-taxa ratios for each fragment length. The average of these ratios will be used to determine the overall relative abundance of the taxa, because the ratios should hold regardless of the fragment size being taken into consideration.

ground truth list:

In [None]:
# One value per genome, in gen_ls order: the first 'rel_abund' entry found
# among that genome's rows, converted to float.
e_ls = [
    float(df.loc[df['genome'] == gen, 'rel_abund'].unique()[0])
    for gen in gen_ls
]

# print(e_ls)
print(len(e_ls))

In [None]:
def process_ratios(tmp_df, gen_ls):
    '''
    adds one column per genome holding observed / (mean observed of that genome)

    tmp_df is modified in place and also returned. A genome with no rows in
    tmp_df has a NaN mean, so its column is all NaN.
    '''
    observed = tmp_df['observed']
    genome = tmp_df['genome']
    for name in gen_ls:
        genome_mean = observed[genome == name].mean()
        tmp_df[name] = observed / genome_mean
    return tmp_df


def scale_ratios(np_arr):
    '''
    rescales every column so that its first-row entry equals np_arr[0, 0]

    The array is modified in place and also returned.
    '''
    # copy the first row so the scale factors are fixed before anything is overwritten
    first_row = np_arr[0, :].copy()
    col_scales = np_arr[0, 0] / first_row
    # slice assignment keeps the in-place update (and dtype) of the input array
    np_arr[:, :] = np_arr * col_scales
    return np_arr


def average_over_columns(np_arr):
    '''
    returns the NaN-ignoring mean of each row (averaging across the columns)
    as a plain python list
    '''
    row_means = np.nanmean(np_arr, axis=1)
    return row_means.tolist()


def return_rel_abund(o_ls):
    '''
    normalises a list of values so that they sum to 1

    o_ls : list of numbers
    returns a list of the same length holding value / sum(o_ls);
    an empty input gives an empty list.
    '''
    # compute the total once instead of once per element
    total = sum(o_ls)
    return [value / total for value in o_ls]


# Highest outlier level recorded in either direction.
max_level = max(outliers_df['low'].max(), outliers_df['high'].max())

# Progressively tighten the cutoff: keep only fragments whose low AND high
# levels are both below sd_cutoff, then re-estimate the relative abundances.
for sd_cutoff in range(max_level + 1, 0, -1):
    print(f'using fragments within {sd_cutoff} standard deviations')
    within_cutoff = (outliers_df['low'] < sd_cutoff) & (outliers_df['high'] < sd_cutoff)

    # Start every pass with fresh, empty per-genome ratio columns.
    test_df = outliers_df[within_cutoff].drop(columns=gen_ls, errors='ignore')
    test_df = test_df.reindex(columns=test_df.columns.tolist() + gen_ls)
    print(test_df.shape[0])

    if test_df.empty:
        # nothing to compare at this cutoff; max() of an empty column would be NaN
        print('no fragments left at this cutoff, skipping')
        print('\n')
        continue

    # Ratios are only meaningful between fragments of the same length, so
    # process one length at a time (groupby visits the lengths in ascending
    # order) and concatenate once at the end.
    per_length = [
        process_ratios(length_df.copy(), gen_ls)
        for _, length_df in test_df.groupby('length')
    ]
    final_df = pd.concat(per_length)

    # ratios_df.loc[gena, genb] = mean, over gena's fragments, of column genb.
    # Building it from a groupby keeps the frame float-typed from the start.
    ratios_df = (
        final_df.groupby('genome')[gen_ls]
        .mean()
        .reindex(index=gen_ls, columns=gen_ls)
    )

    # np.array copies, so the in-place scaling below leaves ratios_df untouched
    np_ratios = np.array(ratios_df, dtype=float)

    scaled_arr = scale_ratios(np_ratios)
    o_ls = average_over_columns(scaled_arr)
    o_ls = return_rel_abund(o_ls)

    plt.figure(figsize=(10, 10))
    plt.scatter(e_ls, o_ls)
    plt.plot([0, .12], [0, .12])  # y = x reference line
    for gen_idx, gen_name in enumerate(gen_ls):
        plt.annotate(gen_name, (e_ls[gen_idx] + .003, o_ls[gen_idx]), fontsize=11)
    plt.xlabel('ground-truth relative abundance')
    plt.ylabel('estimated relative abundance')
    plt.show()
    print(f'{pearsonr(e_ls,o_ls)}')
    print('\n')

Calculate the mean observed number of reads at each length, n. Then, create an n x n matrix of every mean ratio across all lengths where present.

In [None]:
# Baseline: share of each k-mer column in the column totals over all fragments.
kmers_sums = outliers_df[kmers_ls].sum().to_list()
kmers_total = sum(kmers_sums)
kmers_bunds_start = [count / kmers_total for count in kmers_sums]

# Cutoff levels from the loosest (highest recorded level) down to 1, and one
# opacity per cutoff: 1/max for the loosest cutoff up to 1.0 for the strictest.
max_level = max(outliers_df['low'].max(), outliers_df['high'].max())
sd_cutoffs = list(range(max_level, 0, -1))
transp_ls = [level / max_level for level in reversed(sd_cutoffs)]


def _plot_kmer_shift(flag_col, message):
    '''
    plots, for every cutoff, how each k-mer share moves away from the baseline
    when only rows with outliers_df[flag_col] < cutoff are kept

    flag_col : 'high' or 'low' column of outliers_df
    message  : text printed just before the figure is shown
    '''
    plt.figure(figsize=(20, 5))
    plt.cool()

    # guide lines are identical for every cutoff, so draw them once
    plt.axhline(y=0, color='black')
    for position in range(len(kmers_ls)):
        plt.axvline(x=position, color='whitesmoke', zorder=0)

    for cutoff_idx, cutoff in enumerate(sd_cutoffs):
        kept = outliers_df[outliers_df[flag_col] < cutoff]
        kept_sums = kept[kmers_ls].sum().to_list()
        kept_total = sum(kept_sums)
        shifts = [count / kept_total - start
                  for count, start in zip(kept_sums, kmers_bunds_start)]
        plt.scatter(kmers_ls, shifts, s=100, c=shifts, zorder=1,
                    alpha=transp_ls[cutoff_idx])

    # rotate once the k-mer tick labels exist
    plt.xticks(rotation=90)
    print(message)
    plt.show()


_plot_kmer_shift('high', 'removing the overabundant fragments...')
_plot_kmer_shift('low', 'removing the underabundant fragments...')

## multiple linear regression

In [None]:
# kmer_df = adj_final_df.copy()

# mlr_df = kmer_df[kmers_ls+['length','observed']]

# train, test = train_test_split(mlr_df, test_size=0.3)

# y_train = np.array(train['observed'])
# x_train = np.array(train.drop(['observed'], axis=1))
# y_test = np.array(test['observed'])
# x_test = np.array(test.drop(['observed'], axis=1))

# model = LinearRegression().fit(x_train, y_train)
# preds = [model.predict(np.array([i]))[0] for i in x_test]

# plt.figure(figsize=[10,10])
# plt.scatter(preds,list(y_test),alpha=0.05)
# plt.xlabel('predicted')
# plt.ylabel('observed')
# plt.plot([i for i in np.linspace(0,200,10)],[i for i in np.linspace(0,200,10)],c='pink')
# plt.xlim([0,200])
# plt.ylim([0,200])
# plt.show()
# print(pearsonr(preds,list(y_test)))

# reg = MLPRegressor(hidden_layer_sizes=(x_train.shape[1], x_train.shape[1]), solver='adam', activation="relu", random_state=1, max_iter=1000).fit(x_train, y_train)
# y_pred=reg.predict(x_test)
# print(r2_score(y_pred, y_test))

# plt.figure(figsize=[10,10])
# plt.scatter(y_pred,list(y_test),alpha=0.05)
# plt.xlabel('predicted')
# plt.ylabel('observed')
# plt.plot([i for i in np.linspace(0,200,10)],[i for i in np.linspace(0,200,10)],c='pink')
# plt.xlim([0,200])
# plt.ylim([0,200])
# plt.show()
# print(pearsonr(list(y_pred),list(y_test)))