In [1]:
'''Import packages'''
'''Requires numpy, pandas, scikit-learn, and matplotlib/seaborn'''
import numpy as np
import pandas as pd
from skbio.stats import composition
from sklearn.model_selection import LeaveOneGroupOut
#from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import LabelEncoder
from scipy.stats import kendalltau
#from scipy.stats import pearsonr

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("ticks")

#If we want to time the implementation: 
#import time
#start_time = time.time()

Import the dataframes: 

In [2]:
'''Import data'''
data_rel = pd.read_csv('data/Chloroplasts_removed/ByLake_Filtering/5in10/muskegon/muskegon_relative_otu_5in10.tsv', sep=' ', index_col=None, header=0, float_precision='high')
target = pd.read_csv('data/Chloroplasts_removed/ByLake_Filtering/5in10/muskegon/muskegon_sampledata_5in10.tsv', sep= ' ', index_col=0, header=0)

Set `index`: 

In [3]:
index = target.Lake[target.Lake == 'Muskegon'].index

Create target variables and store as Pandas `Series` object: 

In [4]:
'''Set sample names as index and shuffle data'''

#Remove outlier samples:
data_rel = data_rel.loc[index,:]
target = target.loc[index,:]

#Create target columns of HNA-values: 
hna = target.loc[index,'HNA.cells']
hna_rel = hna/target.loc[index,'Total.cells']
hna = pd.Series(hna, index=hna.index)
hna_rel = pd.Series(hna_rel, index=hna.index)

#Create target columns of LNA-values: 
#lna = target.loc[index,'LNA.cells']
#lna_rel = lna/target.loc[index,'Total.cells']
#lna = pd.Series(lna, index=lna.index)
#lna_rel = pd.Series(lna_rel, index=lna.index)

Perform Centered Log-Ratio (CLR) transformation to abundance data: 

In [5]:
otus = list(data_rel.columns) #otus are the variables

data_rel_repl = composition.multiplicative_replacement(data_rel.loc[index,otus])
data_rel_clr = pd.DataFrame(composition.clr(data_rel_repl), index=index, columns=otus)
data_rel_clr.to_csv('muskegon_otu_5in10_clr.csv')

Standardize data with mean 0 and standard deviation of 1: 

In [6]:
from analysis_functions import standardize_df
data_stand, scaler = standardize_df(data_rel_clr,otus)



Create dummy variable to encode samples according to `Year` and `Site`; `cv` denotes cross-validation object. 

In [7]:
target.loc[index,'spatiotemporal'] = target.loc[index,'Year'].astype(str) + target.loc[index,'Site']
le = LabelEncoder()
le_values = le.fit_transform(target.loc[index,'spatiotemporal'].values)
cv = LeaveOneGroupOut().split(data_stand, groups=le_values)

First, calculate kendall tau correlation coefficient between OTU's and HNAcc: 

In [8]:
kendall = np.zeros(len(otus))
p_kendall = np.zeros(len(otus))
for i in np.arange(0,len(otus)):
    kendall[i], p_kendall[i] = kendalltau(data_rel_clr.loc[:,otus[i]],hna)
fs = pd.DataFrame(kendall, index=otus, columns = ['kendall_HNA'])
fs['p_kendall_HNA'] = p_kendall
fs['kendall significant HNA'] = np.where(fs.loc[otus,'p_kendall_HNA']<0.05,'yes','no')

Determine amount of OTU's that are significantly correlated with $P < 0.05$ or $P < 0.01$: 

In [9]:
print('Total OTUs: ' + str(len(otus)))
fs_selected_f = fs[fs.loc[:,'p_kendall_HNA'].values < 0.05]
print('Number of selected HNA OTUs using Kendall tau with p < 0.05: ' + str(len(fs_selected_f)))
fs_selected_f = fs[fs.loc[:,'p_kendall_HNA'].values < 0.01]
print('Number of selected HNA OTUs using Kendall tau with p < 0.01: ' + str(len(fs_selected_f)))

Total OTUs: 482
Number of selected HNA OTUs using Kendall tau with p < 0.05: 103
Number of selected HNA OTUs using Kendall tau with p < 0.01: 54


Show OTU's with highest correlation: 

In [10]:
fs.sort_values('kendall_HNA', inplace=True, ascending=False)
display(fs.head(10))

Unnamed: 0,kendall_HNA,p_kendall_HNA,kendall significant HNA
Otu000173,0.501851,8.199569e-09,yes
Otu000187,0.351666,5.36191e-05,yes
Otu000317,0.343205,8.077833e-05,yes
Otu000060,0.342147,8.497027e-05,yes
Otu000175,0.290323,0.0008540041,yes
Otu001267,0.286092,0.001015928,yes
Otu000279,0.283977,0.001107143,yes
Otu000038,0.279746,0.001312698,yes
Otu000073,0.279746,0.001312698,yes
Otu000614,0.278689,0.00136932,yes


Perform Randomized Lasso (RL) with HNAcc as output: 

In [11]:
from analysis_functions import perform_randomizedLasso
fs.loc[otus,'RL score'] = perform_randomizedLasso(data_stand.loc[index,otus], hna)
fs.loc[otus,'RL ranking'] = fs.loc[otus,'RL score'].rank(method='min', ascending=False)
fs.sort_values('RL score', inplace=True, ascending=False)



Show RL scores: 

In [12]:
display(fs.head(10))

Unnamed: 0,kendall_HNA,p_kendall_HNA,kendall significant HNA,RL score,RL ranking
Otu000173,0.501851,8.199569e-09,yes,0.462,1.0
Otu000038,0.279746,0.001312698,yes,0.378,2.0
Otu000029,0.259651,0.002860222,yes,0.346,3.0
Otu000614,0.278689,0.00136932,yes,0.336,4.0
Otu000264,0.278689,0.00136932,yes,0.332,5.0
Otu000412,-0.296668,0.0006554652,yes,0.312,6.0
Otu000088,-0.240613,0.005714879,yes,0.31,7.0
Otu000487,-0.207827,0.01698122,yes,0.31,7.0
Otu000244,-0.455315,1.697078e-07,yes,0.306,9.0
Otu000242,0.195135,0.02500431,yes,0.304,10.0


In [13]:
from analysis_functions import perform_Boruta

fs_boruta = perform_Boruta(200, 1/3, 5, data_stand, hna, otus)

Concat Boruta selection results with previous ones: 

In [14]:
fs = pd.concat([fs,fs_boruta], axis=1, ignore_index=False)

In [15]:
fs.sort_values('Boruta ranking', ascending=True, inplace=True)
display(fs.head(10))

Unnamed: 0,kendall_HNA,p_kendall_HNA,kendall significant HNA,RL score,RL ranking,Boruta ranking,Boruta score
Otu000005,0.248017,0.004389213,yes,0.03,306.0,1,0.075054
Otu000244,-0.455315,1.697078e-07,yes,0.306,9.0,1,0.171064
Otu000173,0.501851,8.199569e-09,yes,0.462,1.0,1,0.258015
Otu000012,-0.292438,0.0007823435,yes,0.076,129.0,1,0.061089
Otu000187,0.351666,5.36191e-05,yes,0.122,66.0,2,0.074209
Otu000016,0.153887,0.07713486,no,0.0,461.0,2,0.079004
Otu000313,0.245902,0.004736213,yes,0.27,11.0,4,0.033098
Otu000517,-0.358012,3.919857e-05,yes,0.076,129.0,4,0.032553
Otu000060,0.342147,8.497027e-05,yes,0.184,24.0,5,0.029043
Otu000521,-0.332628,0.0001331288,yes,0.122,66.0,6,0.022277


Calculate kendall tau correlation coefficient between RL ranking and Boruta ranking: 

In [16]:
from scipy.stats import spearmanr
r, p = kendalltau(fs.loc[otus,'RL ranking'], fs.loc[otus,'Boruta ranking'])
print('Kendall tau: ' + str(r))
print('P-value: ' + str(p))

Kendall tau: 0.14942423522
P-value: 1.8564792777e-06


Save results if wanted: 

In [17]:
#fs.to_csv('FS_new/Muskegon_fs_scores_HNA_5in10.csv')