## Setup

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
import itertools
from collections import Counter
from networkx.drawing.nx_agraph import graphviz_layout
from skbio.stats.composition import ilr
from skbio.stats.composition import clr
from skbio.stats.composition import multiplicative_replacement
import seaborn as sns
from matplotlib import rcParams
import os
sns.set()
sns.set(font_scale=1.5)

## Bootstrap Sampling

In [2]:
df = pd.read_csv('data/genus-table-final.csv', header=0)

In [3]:
df0 = df[df['diagnosis']==0]
df1 = df[df['diagnosis']==1]
df0.drop(["diagnosis"], axis=1, inplace=True)
df1.drop(["diagnosis"], axis=1, inplace=True)

bootstrap_n = 100
df0s = [df0.sample(frac=1, replace=True) for i in range(bootstrap_n)]
df1s = [df1.sample(frac=1, replace=True) for i in range(bootstrap_n)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df0.drop(["diagnosis"], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop(["diagnosis"], axis=1, inplace=True)


## Filter the Sampled Tables

In [4]:
def filterNoise(df):
    df3 = df.copy()
    
    # Quickly check if the entire column is bad
    lowMax = []
    for column in df3.columns:
        if df3[column].max() < 0.00001:
            lowMax.append(column)
    #Drop bad columns
    df4 = df3.drop(columns=lowMax).copy()
    
    #Create new dataframe and check noise levels
    thresh=0.00001
    df5 = df4.apply(lambda x: x.where(x > thresh, 0), axis=0).copy()
                
    return(df5)

In [5]:
def prevalenceFilter(df):
    dfo = df.copy()
    dfo.replace(0.0, np.nan, inplace=True)
    dfo.dropna(axis=1, how='any', thresh=round(len(dfo.index)*.1), inplace=True)
    dfo.fillna(0.0, inplace=True)
    dfo2 = dfo.div(dfo.sum(axis=1), axis=0).copy()            
    return(dfo2)

In [6]:
for i in range(bootstrap_n):
    df0s[i] = filterNoise(df0s[i])
    df0s[i] = prevalenceFilter(df0s[i])
    df1s[i] = filterNoise(df1s[i])
    df1s[i] = prevalenceFilter(df1s[i])

In [8]:
os.mkdir("df0tables")
os.mkdir("df1tables")
for i in range(bootstrap_n):
    df0s[i].to_csv("df0tables/df0_table"+str(i)+".csv", index=False)
    df1s[i].to_csv("df1tables/df1_table"+str(i)+".csv", index=False)

## Apply clr Transformation to the Tables

In [None]:
for i in range(bootstrap_n):
    n = df0s[i].shape[0]
    for row in range(n):
        df0s[i].iloc[row] = clr(np.array(df0s[i].iloc[row].replace(0.0, 1e-10)))
        df1s[i].iloc[row] = clr(np.array(df1s[i].iloc[row].replace(0.0, 1e-10)))

In [None]:
os.mkdir("df0clrs")
os.mkdir("df1clrs")
for i in range(bootstrap_n):
    df0s[i].to_csv("df0clrs/df0_clr"+str(i)+".csv", index=False)
    df1s[i].to_csv("df1clrs/df1_clr"+str(i)+".csv", index=False)

# Create PCors with R Script and Proceed to Notebook 2