In [41]:
'''Import packages'''
'''Requires numpy, pandas, scikit-learn, and matplotlib/seaborn'''
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import Lasso
from scipy.stats import linregress

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("ticks")

'''Import script which contains functions'''
import analysis_functions
from analysis_functions import get_r2
from analysis_functions import get_lassoCV
from analysis_functions import perform_randomizedLasso

from IPython.display import display, HTML

#If we want to time the implementation: 
#import time
#start_time = time.time()

Import the dataframes: 

In [42]:
# Load the OTU abundance tables (absolute and relative), the HNA/LNA target
# table and the productivity table. All files are read with a single space as
# the field separator and the first row as header.
data_abs = pd.read_csv(
    'data/Chloroplasts_removed/nochloro_absolute_otu.tsv',
    sep=' ', header=0, index_col=None,
)
data_rel = pd.read_csv(
    'data/Chloroplasts_removed/nochloro_relative_otu.tsv',
    sep=' ', header=0, index_col=None,
)
target = pd.read_csv(
    'data/Chloroplasts_removed/nochloro_HNA_LNA.tsv',
    sep=' ', header=0, index_col=0,
)
productivity = pd.read_csv(
    'data/Chloroplasts_removed/productivity_data.tsv',
    sep=' ', header=0, index_col=0,
)
# Index the productivity table by its 'samples' column (the column is kept).
productivity.index = productivity['samples']

**2)**: On 01-05-'17 @marschmi found that a few samples can be considered outliers, as they represent the bottom waters of productive inland lakes, resulting in a big difference in the HNA percentage. These samples are the following: Z14055F, Z14003F, Z14007F, Z14023F, Z14011F. A line of code is added in order to be able to run the pipeline without these samples: 

In [43]:
# Labels of the samples to exclude from the analysis.
samples_to_drop = [
    'Z14055F',
    'Z14003F',
    'Z14007F',
    'Z14023F',
    'Z14011F',
]
# Index labels of `target` that remain after removing the samples above.
index = target.index.drop(labels=samples_to_drop)

In [44]:
# Set sample names as index and shuffle data.

# Select the productivity rows by the sample names listed in target.samples,
# then give the result the same index labels as `target`.
productivity = productivity.loc[target['samples'].values, :]
productivity.index = target.index

# Remove outlier samples: keep only the labels in `index` in every table.
data_abs, data_rel, target, productivity = [
    frame.loc[index, :]
    for frame in (data_abs, data_rel, target, productivity)
]

# Shuffle every table with the same seed (random_state=3).
data_abs, data_rel, target, productivity = [
    frame.sample(frac=1, random_state=3)
    for frame in (data_abs, data_rel, target, productivity)
]

# Target columns: absolute HNA values and HNA as a fraction of total cells.
hna = pd.Series(target['HNA.cells'], index=target.index)
hna_rel = pd.Series(hna / target['Total.cells'], index=hna.index)

**-- PREPROCESSING OF DATA --**

**1)**: filter out those OTUs which have very low abundances and so give rise to (almost) zero-columns. Therefore an OTU has to reach a minimal relative abundance at least once, defined by the parameter $abun$. 

However, depending on the problem set-up, I use a **second constraint** which states that an OTU must have a relative abundance > $abun$ in one of the **productivity** samples. In this way we're going to bias the OTU-selection towards the ones which are considerably present in the productivity samples. 

In [45]:
# This cell is intentionally inert: every statement below is commented out, so
# the only thing evaluated is the string literal, which the notebook echoes as
# the cell output. The disabled code would restrict `productivity` to samples
# with a measured tot_bacprod below 90 and build the `prod` series.
'''Filtering based on productivity samples, not needed for first part of analysis'''
#retain only productivity samples 
#productivity = productivity.dropna(subset=['tot_bacprod'])
#remove high productivity samples (>90)
#productivity = productivity[productivity.tot_bacprod < 90]

#idx_prod = productivity.samples.values
#display(idx_prod)
#prod = pd.Series(productivity.tot_bacprod.values, index=idx_prod)
#prod_error = pd.Series(productivity.SD_tot_bacprod.values, index=idx_prod)
#prod_rel_error = prod_error/prod

'Filtering based on productivity samples, not needed for first part of analysis'

In [46]:
# Parameter abun for initial filtering of OTUs: the abundance threshold that is
# handed to preprocess_df.
abun = 1e-3

In [47]:
from analysis_functions import preprocess_df

# Filter the absolute-abundance table with preprocess_df using the `abun`
# threshold. NOTE(review): the meaning of the third positional argument (True)
# is defined in analysis_functions — confirm there.
data_abs = preprocess_df(data_abs, abun, True)

# Names of the OTU columns that remain after filtering.
otus = data_abs.columns.tolist()

print('Number of OTUs: ' + str(len(otus)))

Number of OTUs: 1245


(Note that this number is the same whether we use absolute or relative abundances, as the filtering is based on a minimal _relative_ abundance.)

In [48]:
# Settings shared by the analyses below.

# Number of folds handed to get_lassoCV and cross_val_predict.
cv = 5

# Scaffolding for an outer cross-validation loop; currently disabled.
#cv_out = 10
#outer_cv = KFold(n_splits=cv_out, shuffle=False)
#otu_scores_cv = pd.DataFrame(columns=otus)
#r2_cv = np.zeros(cv_out)
#thresholds_cv = np.zeros(cv_out)
#pred = pd.Series(index=data_abs.index)
#final_scores = pd.DataFrame(columns=otus)

# Candidate score cut-offs: 0.00, 0.01, ..., 0.99.
thresholds = np.arange(0, 1, 0.01)
t = 0

Let's first check the performance without using the randomized Lasso: 

Next, we use the **_Randomized Lasso_**: this method makes use of two kinds of randomization in order to select variables (i.e., OTUs) with a certain _stability_: (1) it fits a Lasso to various bootstrap subsamples and (2) it perturbs the initial weighting of certain variables. 

This results in a $score \in [0,1]$ that is assigned to variables, with 0 denoting the case where a variable is never chosen by the Lasso, and 1 denoting the case where a variable always is chosen. In other words, the higher the score, the more important a variable can be considered to be. 

Let's use a **10x5 nested cross-validation** scheme to evaluate our total pipeline: this means that the preprocessing and the randomized Lasso are now included in this pipeline (in order to be sure not to overfit, motivated by the paragraph "7.10.2 The Wrong and Right Way to Do Cross-validation" in ESLII): 

**First goal:** try to pinpoint those OTUs for which we are sure they are present in the '_HNA-cloud_'. 

In [49]:
# Pipeline for the HNA target using all OTUs in `otus`:
#   1. standardize the OTU columns and fit a cross-validated Lasso to get alpha;
#   2. score every OTU with the randomized Lasso at that alpha;
#   3. scan the score thresholds and keep the one with the lowest CV error;
#   4. refit on the selected OTUs and produce cross-validated predictions.
# NOTE(review): the attribute names used below (mse_path_, alpha_) suggest that
# get_lassoCV returns an sklearn LassoCV — confirm in analysis_functions.

# --- 1. Standardize and fit the baseline Lasso ---------------------------------
lassoCV = get_lassoCV(cv)
scaler = StandardScaler()
scaler.fit(data_abs.loc[:, otus])
# data_abs is replaced by its standardized version from here on.
data_abs = pd.DataFrame(scaler.transform(data_abs[otus]), index=data_abs.index, columns=otus)
lassoCV.fit(data_abs.loc[:, otus], hna)
mse = np.sum(lassoCV.mse_path_, axis=1)
mse_min = np.min(mse)
alpha = lassoCV.alpha_

# --- 2. Randomized-Lasso score per OTU, highest first --------------------------
otu_scores = pd.Series(perform_randomizedLasso(data_abs.loc[:, otus], hna, alpha), index=otus)
otu_scores = otu_scores.sort_values(ascending=False)

# --- 3. Threshold scan ---------------------------------------------------------
# Untested thresholds keep +inf, so they can never win the argmin and the
# position in mse_scores always matches the position in `thresholds`. (A zero
# sentinel filtered with np.nonzero would shift the positions if a genuine
# error of exactly 0 occurred.)
mse_scores = np.full(len(thresholds), np.inf)
for i, thr in enumerate(thresholds):
    scores = otu_scores[otu_scores.values > thr]
    features_new = scores.index
    if len(features_new) == 0:
        # The set {score > thr} only shrinks as thr grows: nothing left to test.
        break
    lassoCV = get_lassoCV(cv)
    lassoCV.fit(data_abs.loc[:, features_new], hna)
    #alphas, preds = perform_nested_ridge_cv(data_abs[features_new],hna) #We could use this if we want a different evaluation model
    mse = np.sum(lassoCV.mse_path_, axis=1)
    mse_scores[i] = np.min(mse)

if not np.isfinite(mse_scores).any():
    raise ValueError('No OTU has a randomized-Lasso score above 0; cannot select a threshold.')

mse_min_idx = int(np.argmin(mse_scores))
thresh_max = thresholds[mse_min_idx]
# Keep only the errors of thresholds that were actually evaluated.
mse_scores = mse_scores[np.isfinite(mse_scores)]
optimal_scores = otu_scores[otu_scores.values > thresh_max]
selected_otus = optimal_scores.index

# --- 4. Final model on the selected OTUs ---------------------------------------
lassoCV = get_lassoCV(cv)
lassoCV.fit(data_abs.loc[:, selected_otus], hna)
alpha = lassoCV.alpha_
# `normalize` is not passed: False was its default and the argument no longer
# exists in scikit-learn >= 1.2.
lasso = Lasso(alpha=alpha, max_iter=20000)
# NOTE(review): selected_otus and alpha were chosen using all samples, so these
# cross-validated predictions are not independent of the selection step and the
# resulting R² may be optimistic.
pred = cross_val_predict(lasso, data_abs.loc[:, selected_otus], hna, cv=cv)

In [50]:
# Compare the cross-validated predictions `pred` with the HNA target `hna`
# via get_r2 (imported from analysis_functions) and print the result.
r2_final = get_r2(pred,hna)   
print('R²_cv: ' + str(r2_final) )

R²_cv: 0.969335928734


In [51]:
# Show the randomized-Lasso scores of the OTUs that passed the chosen threshold
# (sorted from highest to lowest score).
display(optimal_scores)

Otu000009    0.693333
Otu000382    0.580000
Otu000344    0.576667
Otu000123    0.570000
Otu000176    0.560000
Otu000227    0.550000
Otu000067    0.546667
Otu007625    0.540000
Otu000011    0.533333
Otu000025    0.530000
Otu000098    0.523333
Otu000128    0.506667
Otu001312    0.500000
Otu000047    0.496667
Otu002102    0.496667
Otu000487    0.476667
Otu000173    0.463333
Otu000004    0.456667
Otu000765    0.453333
Otu000474    0.453333
Otu000203    0.453333
Otu000050    0.446667
Otu001929    0.446667
Otu000106    0.433333
Otu000016    0.433333
Otu001749    0.433333
Otu000109    0.430000
Otu000057    0.426667
Otu000224    0.423333
Otu000319    0.423333
Otu000007    0.423333
Otu000981    0.420000
Otu000112    0.420000
Otu000469    0.413333
Otu000124    0.406667
Otu000204    0.403333
Otu000175    0.396667
Otu000563    0.383333
Otu000005    0.380000
Otu000030    0.380000
Otu000222    0.376667
Otu000139    0.370000
Otu000781    0.363333
Otu000084    0.353333
Otu000354    0.346667
Otu000219 

In [52]:
# Write the selected OTU scores to a CSV in the working directory; the file
# name embeds the `abun` threshold and the R² value computed above.
optimal_scores.to_csv('HNA_selectedOTUs_stand_abun_remov' + str(abun)+'_R2'+str(r2_final)+'.csv')

**-- WHAT IF WE ONLY CONSIDER THOSE OTU's WHICH ARE SIGNIFICANTLY PRESENT IN THE PRODUCTIVITY SAMPLES? --**

In [53]:
'''Import data'''
data_abs = pd.read_csv('data/Chloroplasts_removed/nochloro_absolute_otu.tsv', sep=' ', index_col=None, header=0)
data_rel = pd.read_csv('data/Chloroplasts_removed/nochloro_relative_otu.tsv', sep=' ', index_col=None, header=0)
target = pd.read_csv('data/Chloroplasts_removed/nochloro_HNA_LNA.tsv', sep=' ', index_col=0, header=0)
productivity = pd.read_csv('data/Chloroplasts_removed/productivity_data.tsv', sep=' ', index_col=0, header=0)

In [54]:
'''Set sample names as index and shuffle data'''
data_abs.set_index(target.samples,inplace=True)
data_rel.set_index(target.samples,inplace=True)
data_abs = data_abs.sample(frac=1, random_state=3)
data_rel = data_rel.sample(frac=1, random_state=3)
target = target.sample(frac=1, random_state=3)
productivity = productivity.sample(frac=1, random_state=3)

#Create target columns of HNA-values: 
hna = target.loc[:,'HNA.cells']
hna_rel = hna/target.loc[:,'Total.cells']
hna = pd.Series(hna, index=hna.index)
hna_rel = pd.Series(hna_rel, index=hna_rel.index)

**Preprocessing:** First filter productivity outliers (productivity > 90). 

In [55]:
# Keep only the rows that have a tot_bacprod measurement ...
productivity = productivity.dropna(subset=['tot_bacprod'])
# ... and, of those, only the ones with tot_bacprod below 90.
productivity = productivity.loc[productivity['tot_bacprod'] < 90]

# Sample names of the remaining productivity samples.
idx_prod = productivity['samples'].values
#display(idx_prod)

# Productivity values, labelled by sample name.
prod = pd.Series(productivity['tot_bacprod'].values, index=idx_prod)
#prod_error = pd.Series(productivity.SD_tot_bacprod.values, index=idx_prod)
#prod_rel_error = prod_error/prod

In [56]:
from analysis_functions import preprocess_df

# Run the OTU filter on the productivity samples only, so that the retained
# OTUs are determined by those samples.
data_abs_prod = preprocess_df(data_abs.loc[idx_prod, :], abun, True)

# Names of the OTU columns that remain after filtering.
otus_prod = data_abs_prod.columns.tolist()

print('Number of OTUs: ' + str(len(otus_prod)))

Number of OTUs: 374


In [57]:
# Pipeline for the HNA target restricted to the OTUs in `otus_prod` (those that
# passed the filter on the productivity samples):
#   1. standardize the OTU columns and fit a cross-validated Lasso to get alpha;
#   2. score every OTU with the randomized Lasso at that alpha;
#   3. scan the score thresholds and keep the one with the lowest CV error;
#   4. refit on the selected OTUs and produce cross-validated predictions.
# NOTE(review): the attribute names used below (mse_path_, alpha_) suggest that
# get_lassoCV returns an sklearn LassoCV — confirm in analysis_functions.

# --- 1. Standardize and fit the baseline Lasso ---------------------------------
lassoCV = get_lassoCV(cv)
scaler = StandardScaler()
scaler.fit(data_abs.loc[:, otus_prod])
# data_abs is replaced by its standardized version, restricted to otus_prod.
data_abs = pd.DataFrame(scaler.transform(data_abs[otus_prod]), index=data_abs.index, columns=otus_prod)
lassoCV.fit(data_abs.loc[:, otus_prod], hna)
mse = np.sum(lassoCV.mse_path_, axis=1)
mse_min = np.min(mse)
alpha = lassoCV.alpha_

# --- 2. Randomized-Lasso score per OTU, highest first --------------------------
otu_scores_prod = pd.Series(perform_randomizedLasso(data_abs.loc[:, otus_prod], hna, alpha), index=otus_prod)
otu_scores_prod = otu_scores_prod.sort_values(ascending=False)

# --- 3. Threshold scan ---------------------------------------------------------
# This cell no longer reads `otu_scores` from the first analysis, so it also
# runs when only this second part is executed on a fresh kernel.
# Untested thresholds keep +inf, so they can never win the argmin and the
# position in mse_scores always matches the position in `thresholds`.
mse_scores = np.full(len(thresholds), np.inf)
for i, thr in enumerate(thresholds):
    scores = otu_scores_prod[otu_scores_prod.values > thr]
    features_new = scores.index
    if len(features_new) == 0:
        # The set {score > thr} only shrinks as thr grows: nothing left to test.
        break
    lassoCV = get_lassoCV(cv)
    lassoCV.fit(data_abs.loc[:, features_new], hna)
    #alphas, preds = perform_nested_ridge_cv(data_abs[features_new],hna) #We could use this if we want a different evaluation model
    mse = np.sum(lassoCV.mse_path_, axis=1)
    mse_scores[i] = np.min(mse)

if not np.isfinite(mse_scores).any():
    raise ValueError('No OTU has a randomized-Lasso score above 0; cannot select a threshold.')

mse_min_idx = int(np.argmin(mse_scores))
thresh_max = thresholds[mse_min_idx]
# Keep only the errors of thresholds that were actually evaluated.
mse_scores = mse_scores[np.isfinite(mse_scores)]
optimal_scores_prod = otu_scores_prod[otu_scores_prod.values > thresh_max]
selected_otus = optimal_scores_prod.index

# --- 4. Final model on the selected OTUs ---------------------------------------
lassoCV = get_lassoCV(cv)
lassoCV.fit(data_abs.loc[:, selected_otus], hna)
alpha = lassoCV.alpha_
# `normalize` is not passed: False was its default and the argument no longer
# exists in scikit-learn >= 1.2.
lasso = Lasso(alpha=alpha, max_iter=20000)
# NOTE(review): selected_otus and alpha were chosen using all samples, so these
# cross-validated predictions are not independent of the selection step and the
# resulting R² may be optimistic.
pred = cross_val_predict(lasso, data_abs.loc[:, selected_otus], hna, cv=cv)

In [58]:
# Compare the cross-validated predictions `pred` of the productivity-restricted
# model with the HNA target `hna` via get_r2 and print the result.
r2_final = get_r2(pred,hna)   
print('R²_cv: ' + str(r2_final) )

R²_cv: 0.928038799058


In [59]:
# Show the scores of the OTUs selected in the productivity-restricted analysis,
# then write them to a CSV in the working directory; the file name embeds the
# `abun` threshold and the R² value computed above.
display(optimal_scores_prod)
optimal_scores_prod.to_csv('HNA_selectedOTUs_prod_stand_abun_remov' + str(abun)+'_R2'+str(r2_final)+'.csv')

Otu000027    0.973333
Otu000057    0.953333
Otu000267    0.880000
Otu000128    0.846667
Otu000203    0.803333
Otu001749    0.743333
Otu000048    0.736667
Otu000043    0.700000
Otu000614    0.693333
Otu000474    0.663333
Otu000005    0.663333
Otu000058    0.636667
Otu000168    0.610000
Otu000025    0.606667
Otu000109    0.603333
Otu000004    0.583333
Otu000123    0.580000
Otu000176    0.560000
Otu000173    0.546667
Otu000098    0.546667
Otu000050    0.523333
Otu000084    0.516667
Otu000029    0.513333
Otu000344    0.500000
Otu000011    0.486667
Otu000292    0.483333
Otu000009    0.473333
Otu000124    0.463333
Otu000017    0.456667
Otu000563    0.446667
Otu000782    0.446667
Otu000047    0.440000
Otu000016    0.430000
Otu000219    0.423333
Otu000067    0.423333
Otu000227    0.413333
Otu000615    0.400000
Otu000985    0.396667
Otu000487    0.393333
Otu000190    0.386667
Otu000112    0.383333
Otu000030    0.383333
Otu000101    0.363333
Otu001267    0.356667
Otu000264    0.343333
Otu000041 