## Setup

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
import itertools
from collections import Counter
from networkx.drawing.nx_agraph import graphviz_layout
from skbio.stats.composition import ilr
from skbio.stats.composition import clr
from skbio.stats.composition import multiplicative_replacement
import seaborn as sns
from matplotlib import rcParams
# A single call is enough: every sns.set(...) call re-applies the complete
# theme, so a bare sns.set() placed before this one would simply be overridden.
sns.set(font_scale=1.5)

## Filter Incompletely Assigned Taxa

In [2]:
# Load the table and remember the 'diagnosis' column so it can be re-attached
# after the taxon columns have been filtered.
df = pd.read_csv('data/genus-normalized-table.csv', header=0)
print(len(df.columns))
diag = df['diagnosis']

# Keep only columns whose name contains a non-empty genus after the last
# "g__" marker; a bare trailing "g__" means no genus was assigned.
taxtokeep = [c for c in df.columns if 'g__' in c and c.split('g__')[-1] != '']

# .copy() makes the selection an independent frame, so adding 'diagnosis'
# below is not a write into a slice of the raw table (SettingWithCopyWarning).
df = df[taxtokeep].copy()
df['diagnosis'] = diag

# Strip square brackets from the column names; columns that end up with the
# same name after stripping are merged by summing them.
names = {x: x.replace("[", "").replace("]", "") for x in df.columns}
df = df.rename(columns=names)
# groupby(..., axis=1) is deprecated in recent pandas; grouping the transposed
# frame by its labels and transposing back is the documented replacement.
df = df.T.groupby(level=0).sum().T
print(len(df.columns))
df.to_csv('data/genus-table-final.csv', index=False)

# rename = {}
# for c in df.columns:
#     rename[c] = c.split('g__')[-1]
# df = df.rename(rename, axis=1)
# df['diagnosis'] = diag
# df.to_csv('temp.csv', index=False)

341
221


## Split Groups, Filter Bad Columns and Filter by Prevalence

In [3]:
def filterNoise(df, thresh=0.00001):
    """Remove values that are too small to be distinguished from noise.

    Two steps are applied to a copy of the data (the input is not modified):

    1. Columns whose maximum is strictly below ``thresh`` are dropped.
    2. In the remaining columns every entry that is not strictly greater
       than ``thresh`` (including NaN) is replaced by 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with numeric columns.
    thresh : float, optional
        Noise threshold; defaults to 0.00001.

    Returns
    -------
    pandas.DataFrame
        A new frame with the low-signal columns removed and sub-threshold
        entries zeroed.
    """
    # Boolean mask of columns that never reach the threshold. It is applied
    # positionally so the selection does not depend on label lookups.
    below_threshold = (df.max() < thresh).to_numpy()
    kept = df.loc[:, ~below_threshold]

    # DataFrame.where keeps entries where the condition holds and writes 0
    # elsewhere; it returns a new frame, so no explicit copy is needed.
    return kept.where(kept > thresh, 0)

In [4]:
# Split the samples by their 'diagnosis' value (0 / 1), discard the label
# column, and noise-filter each group independently.
noise_filtered = {}
for label in (0, 1):
    members = df.loc[df['diagnosis'] == label]
    noise_filtered[label] = filterNoise(members.drop(columns=['diagnosis']))

df0, df1 = noise_filtered[0], noise_filtered[1]
print(df0.shape, df1.shape, sep='\n')

(48, 154)
(48, 189)


In [5]:
def prevalenceFilter(df, min_prevalence=.1):
    """Drop rarely observed columns and renormalise each row to sum to 1.

    A column is kept only if it is non-zero (and non-NaN) in at least
    ``round(n_rows * min_prevalence)`` rows. The surviving values are then
    divided by their row total. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with numeric columns; zeros are treated as "not observed".
    min_prevalence : float, optional
        Fraction of rows in which a column must be non-zero to be kept;
        defaults to 0.1.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with every row rescaled by its own sum. A row
        whose kept values sum to 0 comes back as NaN (0 / 0).
    """
    min_samples = round(len(df.index) * min_prevalence)

    # Mark zeros as missing so dropna can count observed entries per column.
    observed = df.replace(0.0, np.nan)
    # Only `thresh` is passed: recent pandas raises TypeError when `how` and
    # `thresh` are given together.
    kept = observed.dropna(axis=1, thresh=min_samples).fillna(0.0)

    return kept.div(kept.sum(axis=1), axis=0)

In [6]:
# Run the prevalence filter on each diagnosis group separately.
df0clean, df1clean = (prevalenceFilter(group) for group in (df0, df1))
print(df0clean.shape, df1clean.shape, sep='\n')

(48, 97)
(48, 100)


## Get Union of Genera

In [7]:
# Taxa excluded by hand from the union.
# NOTE(review): the reason for excluding these three is not recorded here.
excluded_taxa = (
    'k__Bacteria;p__Fusobacteria;c__Fusobacteriia;o__Fusobacteriales;f__Fusobacteriaceae;g__Fusobacterium',
    'k__Bacteria;p__Lentisphaerae;c__Lentisphaeria;o__Victivallales;f__Victivallaceae;g__Victivallis',
    'k__Bacteria;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Verrucomicrobiaceae;g__Akkermansia',
)

# Every column that survived the prevalence filter in at least one group.
uniquespecies = set(df0clean.columns) | set(df1clean.columns)
for taxon in excluded_taxa:
    # .remove raises KeyError if the taxon is not in the union.
    uniquespecies.remove(taxon)
len(uniquespecies)

100

In [8]:
# Sort the union before using it as a column selector: iterating a set of
# strings can yield a different order on each interpreter run, which would
# shuffle the column order of the exported tables between runs.
selected_columns = sorted(uniquespecies)

# NOTE(review): the values are taken from `df` again, so the thresholding and
# row renormalisation done by the filter functions are not carried over; only
# the column selection is. Confirm this is intended.
df0clean = df.loc[df['diagnosis'] == 0, selected_columns]
df1clean = df.loc[df['diagnosis'] == 1, selected_columns]
print(df0clean.shape)
print(df1clean.shape)

(48, 100)
(48, 100)


## CLR Transform

In [9]:
def _clr_by_row(frame):
    """Return a copy of ``frame`` with each row replaced by its clr transform.

    Exact zeros are swapped for 1e-10 before the row is passed to ``clr``.
    """
    transformed = frame.copy()
    for sample in frame.index:
        values = np.array(frame.loc[sample].replace(0.0, 1e-10))
        transformed.loc[sample] = clr(values)
    return transformed


df0final = _clr_by_row(df0clean)
df1final = _clr_by_row(df1clean)

print(df0final.shape, df1final.shape)

(48, 100) (48, 100)


In [10]:
# Write both clr-transformed tables to disk without the row index.
for out_path, table in (
    ('data/healthy_clr.csv', df0final),
    ('data/schizo_clr.csv', df1final),
):
    table.to_csv(out_path, index=False)

# Now Construct Partial Correlation Matrix with R Scripts