In [11]:
'''Import packages'''
'''Requires numpy, pandas, scikit-learn, and matplotlib/seaborn'''
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import Lasso


import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("ticks")

import analysis_functions
from analysis_functions import get_lassoCV
from analysis_functions import perform_randomizedLasso

from IPython.display import display, HTML

#If we want to time the implementation: 
#import time
#start_time = time.time()

In [12]:
'''Import data'''
data_abs = pd.read_csv('data/Chloroplasts_removed/nochloro_absolute_otu.tsv', sep=' ', index_col=None, header=0)
data_rel = pd.read_csv('data/Chloroplasts_removed/nochloro_relative_otu.tsv', sep=' ', index_col=None, header=0)
target = pd.read_csv('data/Chloroplasts_removed/nochloro_HNA_LNA.tsv', sep=' ', index_col=0, header=0)
productivity = pd.read_csv('data/Chloroplasts_removed/productivity_data.tsv', sep=' ', index_col=0, header=0)

In [13]:
'''Set sample names as index and shuffle data'''
data_abs.set_index(target.samples,inplace=True)
data_rel.set_index(target.samples,inplace=True)
data_abs = data_abs.sample(frac=1, random_state=3)
data_rel = data_rel.sample(frac=1, random_state=3)
target = target.sample(frac=1, random_state=3)
productivity = productivity.sample(frac=1, random_state=3)

#Create target columns of HNA-values: 
hna = target.loc[:,'HNA.cells']
hna_rel = hna/target.loc[:,'Total.cells']
hna = pd.Series(hna, index=hna.index)
hna_rel = pd.Series(hna_rel, index=hna_rel.index)

**REMOVE OUTLIERS IN PRODUCTIVITY SAMPLES**: there seem to be three outlier samples amongst the productivity samples. Therefore these are removed and not considered in the rest of the analysis. This leaves us with **20 samples**. 

In [14]:
from sklearn.linear_model import LinearRegression
from analysis_functions import get_r2
from scipy.stats import linregress

#retain only productivity samples 
productivity = productivity.dropna(subset=['tot_bacprod'])
#remove high productivity samples (>90)
productivity = productivity[productivity.tot_bacprod < 90]

idx_prod = productivity.samples.values
#display(idx_prod)
prod = pd.Series(productivity.tot_bacprod.values, index=idx_prod)
prod_error = pd.Series(productivity.SD_tot_bacprod.values, index=idx_prod)
prod_rel_error = prod_error/prod

**Preprocessing of data**: filter out those OTUs which have very low abundances and so give rise to (almost) zero-columns. Therefore an OTU has to have a minimal relative abundance one, defined by the parameter $abun$. However, I use a **second constraint** which states that an OTU must have a relative abundance > $abun$ in one of the **productivity** samples. In this way we're going to bias the OTU-selection towards the ones present in the productivity samples. 

In [15]:
abun = 0.005

In [16]:
from analysis_functions import preprocess_df
data_abs_prod = data_abs.loc[idx_prod,:] 
data_abs_prod = preprocess_df(data_abs_prod,abun,True)
otus_prod = list(data_abs_prod.columns)

print('Number of OTUs: ' + str(len(otus_prod)))

Number of OTUs: 121


**Standardize data**: 

In [17]:
#scaler = StandardScaler()
#data_abs_prod = pd.DataFrame(scaler.fit_transform(data_abs_prod[otus_prod]),index=data_abs_prod.index,columns=otus_prod)
#'''Let's normalize instead of standardizing: '''
#data_abs_prod = (data_abs_prod - data_abs_prod.mean()) / (data_abs_prod.max() - data_abs_prod.min())

Let's see how well these OTU's relate to the HNA using a 4x5 cross-validated Lasso: 

In [18]:
cv_out = 20
otu_scores_cv = pd.DataFrame(columns=otus_prod)
r2_cv = np.zeros(cv_out)
thresholds_cv = np.zeros(cv_out)

outer_cv = KFold(n_splits=cv_out, shuffle=False)

t = 0

final_scores = pd.DataFrame(columns=otus_prod)

thresholds = np.arange(0,1,0.02)
pred = pd.Series(index=data_abs_prod.loc[idx_prod,:].index)

for idx_train, idx_test in outer_cv.split(data_abs_prod.loc[idx_prod,:],prod):
    lassoCV = get_lassoCV(19)
    scaler = StandardScaler()
    scaler.fit(data_abs_prod.iloc[idx_train,:])
    data_abs_prod = pd.DataFrame(scaler.transform(data_abs_prod[otus_prod]),index=data_abs_prod.index,columns=otus_prod)    
    lassoCV.fit(data_abs_prod.iloc[idx_train,:], prod[idx_train])
    mse = np.sum(lassoCV.mse_path_, axis=1)
    mse_min = np.min(mse)
    alpha = lassoCV.alpha_
    
    otu_scores = pd.Series(perform_randomizedLasso(data_abs_prod.iloc[idx_train,:], prod[idx_train], alpha), index=otus_prod)
    otu_scores_cv.loc[t] = otu_scores    
    otu_scores.sort_values(ascending=False,inplace=True)
        
    mse_scores = np.zeros(len(thresholds))
    dummy=0
        
    scores = otu_scores
    
    for thr in thresholds: 
        scores = otu_scores[otu_scores.values > thr]
        features_new = scores.index
        if(len(features_new) > 0): 
            lassoCV = get_lassoCV(19)
            lassoCV.fit(data_abs_prod.ix[idx_train,features_new],prod[idx_train])
            #alphas, preds = perform_nested_ridge_cv(data_abs[features_new],hna) #We could use this if we want a different evaluation model
            mse = np.sum(lassoCV.mse_path_, axis=1)
            mse_scores[dummy] = np.min(mse)
        dummy+=1
    
    mse_scores = mse_scores[np.nonzero(mse_scores)]
    #mse_min = mse_scores.min()
    mse_min_idx = mse_scores.argmin()
    thresh_max = thresholds[mse_min_idx]
    thresholds_cv[t] = thresh_max
    optimal_scores = otu_scores[otu_scores.values>thresh_max]
    selected_otus = optimal_scores.index

    lassoCV = get_lassoCV(19)
    lassoCV.fit(data_abs_prod.ix[idx_train, selected_otus], prod[idx_train])
    alpha = lassoCV.alpha_
    lasso = Lasso(alpha,max_iter=20000,normalize=False)
    pred_cv_final = cross_val_predict(lasso, data_abs_prod.ix[idx_train, selected_otus], prod[idx_train], cv=19)
    r2_cv[t] = get_r2(pred_cv_final, prod[idx_train])
    print(r2_cv[t])
    print(len(selected_otus))
    pred.iloc[idx_test] = lassoCV.predict(data_abs_prod.ix[idx_test,selected_otus])
    t+=1

0.756831359318
12
0.715053275904
14
0.769486927084
9
0.630902168218
11
0.690764528603
21
0.818374990387
11
0.857785792986
7
0.819489204715
19
0.815390408241
8
0.70929330084
10
0.664378587693
9
0.818572153668
11
0.726989999888
11
0.675289088253
10
0.877879668059
18
0.913636744885
10
0.844615957214
10
0.739531452052
17
0.778037356739
21
0.908257212853
16


In [19]:
r2_final = get_r2(pred,prod)   
print(r2_final)

-0.070736043399


In [20]:
mean_otu_scores = otu_scores_cv.mean()
std_otu_scores = otu_scores_cv.std()
mean_otu_scores.sort_values(ascending=False,inplace=True)
avg_thresh = thresholds_cv.mean()
std_thresh = thresholds_cv.std()
print(std_thresh)
otus_final = mean_otu_scores[mean_otu_scores > avg_thresh]
display(otus_final)

0.0364828726939


Otu000117    0.608667
Otu000071    0.508833
Otu000058    0.378333
Otu000041    0.372500
Otu000060    0.371333
Otu000454    0.364000
Otu000034    0.325000
Otu000082    0.312167
Otu000025    0.276000
Otu000176    0.271667
dtype: float64

Old Table: 

In [15]:
final = pd.DataFrame(optimal_scores, index=selected_otus, columns=['Score_prod'])
final['Score_HNA'] = scores_HNA.loc[selected_otus,:]
final['Score_HNA_prodfilt'] = scores_HNA_prodfilt.loc[selected_otus,:]
display(final)

Unnamed: 0,Score_prod,Score_HNA,Score_HNA_prodfilt
Otu000082,0.748,0.216,0.622
Otu000117,0.736,0.238,0.262
Otu000075,0.686,0.098,0.16
Otu000091,0.616,0.184,0.098
Otu000176,0.564,0.844,0.978
Otu000343,0.518,0.066,0.03
Otu000094,0.436,0.032,0.348
Otu000026,0.376,0.058,0.112
Otu000025,0.374,0.762,0.852
Otu000246,0.336,0.032,0.05


In [16]:
''' Display final OTU's complying to fitted thresholds: '''
final_thr = final[(final['Score_HNA_prodfilt'] > 0.63)]
display(final_thr)

Unnamed: 0,Score_prod,Score_HNA,Score_HNA_prodfilt
Otu000176,0.564,0.844,0.978
Otu000025,0.374,0.762,0.852
Otu000041,0.262,0.292,0.82


Note: Threshold for Score_HNA_prodfilt is sufficient to also filter out Score_HNA. 