## Cross Validation Lasso
***

### What is the $\lambda$ value that maximizes $R^2$?

In [1]:
import itertools
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, make_scorer

# import analysis code
from lassoloaddata import loadlassodata
from lassopath import alphas, get_folds # array with alpha values to test

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [4]:
# ANALYSIS CODE
def cv_in_fold_evaluate_lambdas(X, y, n_folds=3, n_jobs=30):
    """
    Grid-search the Lasso `alpha` on the given data with an inner cross
    validation, scoring every candidate with R2.

    The candidate values are taken from the `alphas` array imported from
    `lassopath`.

    Args:
        X -> predictors (scaled)
        y -> response
        n_folds -> number of inner cross validation folds (default 3)
        n_jobs -> number of parallel workers given to GridSearchCV
            (default 30; pass -1 to use every available core)
    Returns:
        pd.DataFrame with cross validation results (GridSearchCV.cv_results_)
    NOTE: we are doing cross validation inside cross validation data
    NOTE: with the default n_jobs a single call already uses 30 cores, so
    calls to this function should be run sequentially
    """
    # define the grid search
    lasso = Lasso(random_state=42)
    tuned_params = dict(alpha=alphas)
    clf = GridSearchCV(
        lasso,
        tuned_params,
        cv=n_folds,
        n_jobs=n_jobs,
        scoring=make_scorer(r2_score),
    )
    clf.fit(X, y)
    return pd.DataFrame(clf.cv_results_)
    
def runanalysis_cv_params(rna_protocol='wt_ribo', keepmir=True, n_folds=6):
    """
    Evaluates the lasso alpha grid inside every outer fold of the given data

    Args:
        rna_protocol -> sample condition, passed to loadlassodata
        keepmir -> if False, only genes whose 'GCACTT' predictor is 0
            (no 6-mer MiR-430 sites) are kept for the analysis
        n_folds -> number of outer folds requested from get_folds (default 6)
    Returns:
        pd.DataFrame with the concatenated results of
        cv_in_fold_evaluate_lambdas for each fold, plus the columns
        'kfold' (0-based fold number), 'sample_condition' and 'which_genes'
    """
    predictors, response = loadlassodata(rna_protocol)
    mirstatus = "all_genes" if keepmir else "no_mir_genes"
    # remove MiR-430 genes
    if not keepmir:
        no_mir_genes = predictors['GCACTT'] == 0 # 0 6-mer MiR-430 sites
        predictors = predictors[no_mir_genes]
        response = response[no_mir_genes]

    # run the lasso on each outer fold; per the original note the predictors
    # are scaled (presumably by get_folds -- confirm in lassopath)
    print('runing analysis for {} genes'.format(predictors.shape[0]))
    folds = get_folds(predictors, response, k=n_folds)
    fold_frames = []
    for i, fold_data in enumerate(folds):
        print('runing fold {} of {}'.format(i + 1, n_folds))
        # the second element of each fold is not needed for the parameter search
        (X, y), _leave_out = fold_data
        fold_res = cv_in_fold_evaluate_lambdas(X, y)
        fold_res['kfold'] = i
        fold_frames.append(fold_res)
    # add param to frame
    results = pd.concat(fold_frames)
    results['sample_condition'] = rna_protocol
    results['which_genes'] = mirstatus
    return results

In [5]:
# run the analysis for each sample condition, first keeping the MiR-430
# genes (True) and then without them (False)
results = []
for cond in ("wt_ribo", "wt_polya"):
    for mircase in (True, False):
        print('runing {} with mir case {}'.format(cond, mircase))
        results.append(runanalysis_cv_params(cond, mircase))
    

runing wt_ribo with mir case True
runing analysis for 4729 genes
runing fold 0 of 6
runing fold 1 of 6
runing fold 2 of 6
runing fold 3 of 6
runing fold 4 of 6
runing fold 5 of 6
runing wt_ribo with mir case False
runing analysis for 3349 genes
runing fold 0 of 6
runing fold 1 of 6
runing fold 2 of 6
runing fold 3 of 6
runing fold 4 of 6
runing fold 5 of 6
runing wt_polya with mir case True
runing analysis for 4729 genes
runing fold 0 of 6
runing fold 1 of 6
runing fold 2 of 6
runing fold 3 of 6
runing fold 4 of 6
runing fold 5 of 6
runing wt_polya with mir case False
runing analysis for 3349 genes
runing fold 0 of 6
runing fold 1 of 6
runing fold 2 of 6
runing fold 3 of 6
runing fold 4 of 6
runing fold 5 of 6


In [8]:
# create the output directory first: to_csv fails if it is missing, and the
# analysis above is expensive to re-run
output_path = 'results_data/lasso_cross_val_params.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
pd.concat(results).to_csv(output_path)