In [1]:
'''Import packages'''
'''Requires numpy, pandas, scipy, scikit-bio, scikit-learn, and matplotlib/seaborn'''
import numpy as np
import pandas as pd
from skbio.stats import composition
from sklearn.model_selection import LeaveOneGroupOut
#from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import LabelEncoder
from scipy.stats import kendalltau
#from scipy.stats import pearsonr

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("ticks")

#If we want to time the implementation: 
#import time
#start_time = time.time()

Import the dataframes: 

In [2]:
'''Import data'''
# OTU table (file name suggests relative abundances, 5-in-10 filtered).
# NOTE(review): the files carry a .tsv extension but are parsed as
# space-separated -- confirm the delimiter against the actual files.
data_rel = pd.read_csv(
    'data/Chloroplasts_removed/ByLake_Filtering/5in10/muskegon/muskegon_relative_otu_5in10.tsv',
    sep=' ',
    index_col=None,
    header=0,
    float_precision='high',
)
# Per-sample metadata; the first column is used as the row index.
target = pd.read_csv(
    'data/Chloroplasts_removed/ByLake_Filtering/5in10/muskegon/muskegon_sampledata_5in10.tsv',
    sep=' ',
    index_col=0,
    header=0,
)

Set `index` to the row labels of the samples whose `Lake` is `Muskegon`: 

In [3]:
# Row labels of every sample whose 'Lake' column equals 'Muskegon'.
index = target.loc[target['Lake'] == 'Muskegon'].index

Create target variables and store as Pandas `Series` object: 

In [4]:
# Restrict both tables to the rows selected by `index` (label-based lookup).
# No shuffling is performed in this cell.
target = target.loc[index, :]
data_rel = data_rel.loc[index, :]

# HNA targets (disabled; this notebook works on the LNA columns):
#hna = target.loc[index,'HNA.cells']
#hna_rel = hna/target.loc[index,'Total.cells']
#hna = pd.Series(hna, index=hna.index)
#hna_rel = pd.Series(hna_rel, index=hna.index)

# LNA targets: `lna` is the 'LNA.cells' column, `lna_rel` is its ratio to
# 'Total.cells'. Both results are already pandas Series carrying the sample
# index, so no re-wrapping in pd.Series is needed.
lna = target.loc[index, 'LNA.cells']
lna_rel = lna / target.loc[index, 'Total.cells']

Perform Centered Log-Ratio (CLR) transformation to abundance data: 

In [5]:
# Every column of the abundance table is treated as one OTU (one feature).
otus = data_rel.columns.tolist()

# Zero replacement first, then the centered log-ratio transform; the result
# is wrapped back into a DataFrame with the same sample index / OTU columns.
data_rel_repl = composition.multiplicative_replacement(
    data_rel.loc[index, otus]
)
data_rel_clr = pd.DataFrame(
    composition.clr(data_rel_repl),
    index=index,
    columns=otus,
)

If we want to save CLR-transformed file: 

In [6]:
#data_rel_clr.to_csv('muskegon_otu_5in10_clr.csv')

Standardize data with mean 0 and standard deviation of 1: 

In [7]:
# Project-local helper; its implementation is not part of this notebook.
from analysis_functions import standardize_df
# Returns two values: the standardized frame and a scaler object.
# NOTE(review): presumably scales each OTU column to zero mean / unit
# variance, as the surrounding markdown states -- confirm in analysis_functions.
data_stand, scaler = standardize_df(data_rel_clr,otus)



Create dummy variable to encode samples according to `Year` and `Site`; `cv` denotes cross-validation object. 

In [8]:
# Group label per sample: Year (cast to text) concatenated with Site.
target.loc[index,'spatiotemporal'] = target.loc[index,'Year'].astype(str) + target.loc[index,'Site']
# Map each distinct Year+Site label to an integer group id.
le = LabelEncoder()
le_values = le.fit_transform(target.loc[index,'spatiotemporal'].values)
# Leave-one-group-out splitter over the Year+Site groups.
# NOTE(review): split() appears to return a generator, so `cv` can be
# iterated only once; re-run this cell before reusing it -- confirm.
cv = LeaveOneGroupOut().split(data_stand, groups=le_values)

First, calculate the Kendall tau correlation coefficient between each OTU and LNAcc: 

In [9]:
# Kendall tau (and its p-value) between every CLR-transformed OTU column
# and the LNA target, stored position-by-position in OTU order.
kendall = np.zeros(len(otus))
p_kendall = np.zeros(len(otus))
for i, otu in enumerate(otus):
    kendall[i], p_kendall[i] = kendalltau(data_rel_clr[otu], lna)

# Collect the per-OTU statistics in one feature-selection table.
fs = pd.DataFrame(
    {'kendall_LNA': kendall, 'p_kendall_LNA': p_kendall},
    index=otus,
    columns=['kendall_LNA', 'p_kendall_LNA'],
)
# Flag OTUs whose p-value is below 0.05.
fs['kendall significant LNA'] = np.where(fs['p_kendall_LNA'] < 0.05, 'yes', 'no')

Determine the number of OTUs that are significantly correlated with LNAcc at $P < 0.05$ or $P < 0.01$: 

In [10]:
# Report the total number of OTUs and how many pass each p-value threshold.
print('Total OTUs: ' + str(len(otus)))

fs_selected_f = fs.loc[fs['p_kendall_LNA'] < 0.05]
print('Number of selected LNA OTUs using Kendall tau with p < 0.05: ' + str(fs_selected_f.shape[0]))

# `fs_selected_f` is deliberately rebound: after this cell it holds the
# stricter (p < 0.01) subset.
fs_selected_f = fs.loc[fs['p_kendall_LNA'] < 0.01]
print('Number of selected LNA OTUs using Kendall tau with p < 0.01: ' + str(fs_selected_f.shape[0]))

Total OTUs: 482
Number of selected LNA OTUs using Kendall tau with p < 0.05: 210
Number of selected LNA OTUs using Kendall tau with p < 0.01: 121


Show the 10 OTUs with the highest Kendall tau correlation: 

In [11]:
# Order the table from the largest to the smallest Kendall tau and show
# the first ten rows.
fs = fs.sort_values('kendall_LNA', ascending=False)
display(fs.head(10))

Unnamed: 0,kendall_LNA,p_kendall_LNA,kendall significant LNA
Otu000005,0.601269,4.976994e-12,yes
Otu000048,0.514543,3.419384e-09,yes
Otu000108,0.505024,6.601971e-09,yes
Otu000029,0.494447,1.352736e-08,yes
Otu000113,0.482813,2.928865e-08,yes
Otu000038,0.482813,2.928865e-08,yes
Otu000123,0.454257,1.812209e-07,yes
Otu000016,0.447911,2.678812e-07,yes
Otu000277,0.446854,2.857682e-07,yes
Otu000042,0.434162,6.138084e-07,yes


Perform Randomized Lasso (RL) with LNAcc as output: 

In [12]:
# Project-local helper; its implementation is not part of this notebook.
from analysis_functions import perform_randomizedLasso
# The features are passed in `otus` column order and the result is written
# back through .loc[otus, ...]; `fs` itself was re-sorted earlier, so the
# label-based assignment is what keeps scores attached to the right OTU.
# NOTE(review): assumes the helper returns one score per column, in the
# column order it was given -- confirm in analysis_functions.
fs.loc[otus,'RL score'] = perform_randomizedLasso(data_stand.loc[index,otus], lna)
# Rank 1 = highest score; tied scores share the lowest rank (method='min').
fs.loc[otus,'RL ranking'] = fs.loc[otus,'RL score'].rank(method='min', ascending=False)
# Re-order the table so the highest RL scores come first.
fs.sort_values('RL score', inplace=True, ascending=False)



Show RL scores: 

In [13]:
# Show the first ten rows of `fs` in its current row order.
display(fs.head(10))

Unnamed: 0,kendall_LNA,p_kendall_LNA,kendall significant LNA,RL score,RL ranking
Otu000029,0.494447,1.352736e-08,yes,0.568,1.0
Otu000244,-0.402433,3.793692e-06,yes,0.394,2.0
Otu000242,0.140137,0.1074783,no,0.372,3.0
Otu000030,0.429931,7.883544e-07,yes,0.366,4.0
Otu000038,0.482813,2.928865e-08,yes,0.306,5.0
Otu000136,-0.373876,1.751855e-05,yes,0.306,5.0
Otu000267,-0.372819,1.850315e-05,yes,0.294,7.0
Otu000905,0.294553,0.0007162986,yes,0.256,8.0
Otu000210,-0.450026,2.35295e-07,yes,0.254,9.0
Otu000412,-0.186674,0.0320208,yes,0.252,10.0


In [15]:
# Project-local helper; its implementation is not part of this notebook.
from analysis_functions import perform_Boruta

# NOTE(review): the meaning of the three positional values (200, 1/3, 5) is
# defined by perform_Boruta's signature, which is not visible here -- confirm.
# `1/3` relies on true division (Python 3 semantics).
fs_boruta = perform_Boruta(200, 1/3, 5, data_stand, lna, otus)

Concat Boruta selection results with previous ones: 

In [16]:
# Append the Boruta columns side by side; rows are matched on the index
# labels, and the existing index is kept.
fs = pd.concat([fs, fs_boruta], axis=1)

In [17]:
# Order the table by Boruta ranking (smallest rank first) and show the
# first ten rows.
fs = fs.sort_values('Boruta ranking', ascending=True)
display(fs.head(10))

Unnamed: 0,kendall_LNA,p_kendall_LNA,kendall significant LNA,RL score,RL ranking,Boruta ranking,Boruta score
Otu000048,0.514543,3.419384e-09,yes,0.008,430.0,1,0.040885
Otu000005,0.601269,4.976994e-12,yes,0.172,44.0,1,0.185705
Otu000083,-0.40349,3.57759e-06,yes,0.008,430.0,1,0.039996
Otu000113,0.482813,2.928865e-08,yes,0.224,14.0,1,0.051869
Otu000027,-0.548387,2.999146e-10,yes,0.124,68.0,1,0.230685
Otu000060,0.240613,0.005714879,yes,0.25,11.0,1,0.018703
Otu000016,0.447911,2.678812e-07,yes,0.07,139.0,1,0.010861
Otu000017,-0.369646,2.178312e-05,yes,0.052,184.0,1,0.004633
Otu000058,-0.29138,0.0008174455,yes,0.208,19.0,1,0.027559
Otu000042,0.434162,6.138084e-07,yes,0.016,383.0,1,0.021497


Calculate kendall tau correlation coefficient between RL ranking and Boruta ranking: 

In [18]:
# NOTE(review): spearmanr is imported here but not used in this cell; the
# agreement below is measured with kendalltau (imported at the top).
from scipy.stats import spearmanr
# Kendall tau between the two per-OTU rankings, both taken in `otus` order.
r, p = kendalltau(fs.loc[otus,'RL ranking'], fs.loc[otus,'Boruta ranking'])
print('Kendall tau: ' + str(r))
print('P-value: ' + str(p))

Kendall tau: 0.170948758023
P-value: 4.59722856685e-08


Save results if wanted: 

In [None]:
#fs.to_csv('FS_new/Muskegon_fs_scores_LNA_5in10.csv')