### In this notebook, we run BPDN/LASSO on random inputs for each species and replicate (small_simulation v3), using an independent pool for each species and replicate. 
### We will reduce the dimensions of the space to $2^{15}$
#### We will make predictions using only the data in which the given species is present at the start. 
#### We will save the time taken, the predictions, and also the wells supplied to make the predictions. 
#### We will need to tune the LASSO parameter -- run simulations for multiple parameter values 

In [1]:
import pandas as pd
import numpy as np
import cvxpy as cvx
import random
import time
from __future__ import print_function
from builtins import input


from sporco.admm import bpdn
### using the ADMM algorithm 
### we can also use the PGM algorithm 
from sporco import util
from sporco import plot
# Call sporco's notebook plotting setup once, before any figures are drawn
# (see the sporco.plot documentation for what exactly it configures).
plot.config_notebook_plotting()

from scipy.linalg import hadamard

import matplotlib.pyplot as plt

from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold

import time 
from scipy import stats
from matplotlib import pyplot
# Plot styling. The second style is applied on top of 'ggplot'.
plt.style.use('ggplot')
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names, where style.use raises OSError.
# Try the original name first so older installations behave as before.
try:
    plt.style.use('seaborn-dark-palette')
except OSError:
    plt.style.use('seaborn-v0_8-dark-palette')

# Pickled inputs shared by the cells below:
#   iHiV - 2-D array; rows are selected with iHiV[ri,:] to form the BPDN
#          dictionary and the full array maps a solution back to abundances.
#   B    - loaded here; not referenced in the visible part of the notebook.
#   A    - table whose transpose is scanned for entries equal to 1.0 to build
#          species_ones.
iHiV = pd.read_pickle("~/compressed_sensingv1/realdatasets/ophelli-ryan/iHiV15.pkl")
B = pd.read_pickle("~/bge-analysis-simv3/B.pkl")
A = pd.read_pickle("~/bge-analysis-simv3/A.pkl")
# Replicate ids 0..9 (the loops below use range(10) directly).
replist = np.arange(0,10,1)
# Timing records; re-initialised at the top of every experiment cell.
timearray = []
import os

#### set up the BPDN model 

In [2]:
# Solver options shared by every BPDN fit in this notebook; the same `opt`
# object is passed to each bpdn.BPDN(...) call below.
bpdn_settings = {
    'Verbose': False,
    'MaxMainIter': 500,
    'RelStopTol': 1e-8,
    'AutoRho': {'RsdlTarget': 1.0},
}
opt = bpdn.BPDN.Options(bpdn_settings)

In [3]:
# For each of the 16 species columns of the transposed table A, collect the
# row labels at which that column equals 1.0. species_ones[k] is later used
# to pick the entries of species k out of the steady-state vector.
tableg = A.T
species_ones = []
for species in range(16):
    is_one = tableg[species] == 1.0
    rows_with_one = tableg.loc[is_one, species].index
    species_ones.append(rows_with_one.to_list())

In [4]:
# Training-set sizes m (number of sampled rows) used in the lambda sweep.
# Relative to the 2**15 rows of the reduced space these are roughly
# 0.1%, 0.2%, 1% and 2%.
samplist = [32, 65, 327, 655]

In [6]:
### generate lambda parameters :
# Five log-spaced values between 10**-5 and 10**0.8, rounded to 5 decimals.
labdl = np.round(np.logspace(-5,0.8,5),5)
lam = list(labdl)
# extend (not append) keeps `lam` a flat list of scalars, so no
# DataFrame/explode round-trip is needed to flatten it afterwards.
lam.extend([0.5,0.8,1.0,2.0,3.0,4.0,5.0,8.0,10.0,12.0])
# lamf is the ascending grid swept below; str(value) of each entry names
# the per-lambda output directory ("pt<value>").
lamf = sorted(lam)
# Alternative grid of larger values, kept for reference:
#lamf = [15.0,20.0,30.0,50.0]

In [7]:
# Create one output directory per lambda value ("rel-abun/pt<lambda>") for
# the lambda sweep below, which chdir's into these directories to write its
# results.
npath = "~/compressed_sensingv1/abun_simv3/sparse-recovery-nonBP/reduced-matrices-uniform"
for lmda in lamf:
    # makedirs(..., exist_ok=True) lets this cell be re-run without raising
    # FileExistsError and also creates the "rel-abun" parent if it is missing.
    os.makedirs("/raid/home/shreyaa5/compressed_sensingv1/abun_simv3/sparse-recovery-nonBP/reduced-matrices-uniform/rel-abun"+"/pt"+str(lmda), exist_ok=True)

In [None]:
# Lambda sweep on relative abundances.
# For every species (0-15), replicate (0-9), training-set size m in samplist
# and lambda in lamf: fit BPDN on the sampled rows of iHiV, reconstruct the
# abundance over all rows of iHiV, and pickle the sampled indices, the
# reconstruction and the coefficient vector into rel-abun/pt<lambda>/.
# Each timearray row is [replica, species, endtime, entime]; rows do not
# record m or lmda, so they can only be told apart by their order.
timearray  = []
listt = list(np.arange(0,2**15,1))  # not used in this cell
for species in range(16):
    starttt = time.time()
    # NOTE(review): entime is measured immediately after starttt, so it is
    # ~0 for every species; confirm what this column was meant to capture.
    entime = time.time() - starttt
    for replica in range(10):
        steadystate = pd.read_pickle("~/compressed_sensingv1/fullsearch/15_2.3_"+str(replica)+"/steady-state.pkl")
        n = 2**16  # not used in this cell
        sst = steadystate.T
        # Convert to relative abundance: divide each of the 16 species
        # columns by the per-row total across columns.
        sstv2 = sst.copy()
        sstv2["total"] = sstv2.T.sum()
        for i in range(16):
            sstv2[i] = sstv2[i]/sstv2["total"]
        # Undefined ratios (NaN, e.g. from a zero total) are set to 0.
        sstv3 = sstv2.fillna(0)
        # Keep only the entries listed in species_ones for this species and
        # renumber them 0..2**15-1.
        abunvec = pd.DataFrame(sstv3[species].values[species_ones[species]])
        abunvec.index = np.arange(0,2**15,1)
        for m in samplist:
            # ri: previously saved sample indices for this
            # (species, replicate, m) combination.
            ri = pd.read_pickle("~/compressed_sensingv1/abun_simv3/training-data/sp"+str(species)+"_rep"+str(replica)+"-in-sample-"+str(m)+".pkl")    
            y1 = abunvec[0][ri]
            s2 = np.array([y1.values])
            s3 = s2.T  # observations as an (m, 1) column
            D = iHiV[ri,:]  # rows of iHiV matching the sampled indices
            for lmda in lamf:
                # Restart the timer for every lambda so endtime is the cost
                # of this solve alone. Starting it once per m made each
                # recorded time also include all earlier lambdas' solves and
                # pickle writes.
                startt = time.time()
                # Outputs below use relative file names, so they land in the
                # directory selected here (this changes the process cwd).
                os.chdir("/raid/home/shreyaa5/compressed_sensingv1/abun_simv3/sparse-recovery-nonBP/reduced-matrices-uniform/rel-abun/pt"+str(lmda))
                b = bpdn.BPDN(D, s3, lmda, opt)
                x = b.solve()
                # Reconstruction over all rows of iHiV from the fitted
                # coefficients.
                reconCSybge = np.dot(iHiV,x.ravel())
                endtime = time.time() - startt
                timearray.append([replica,species,endtime,entime])
                pd.to_pickle(ri,str(species)+"_rep"+str(replica)+"-in-sample-"+str(m)+".pkl")
                pd.to_pickle(reconCSybge,"abun_for_sp"+str(species)+"_rep"+str(replica)+"_samp"+str(m)+".pkl")
                pd.to_pickle(x.ravel(),"beta_for_sp"+str(species)+"_rep"+str(replica)+"_samp"+str(m)+".pkl")

In [8]:
#### In this part, we re-run the high-performing (at 1p) species/replicate combinations, each with its optimally chosen lambda, on larger training sets.
# gp is iterated row by row below; each row is read as
# [replica, species, lambda] (positions 0, 1 and 2).
# NOTE(review): "1p" presumably refers to the 1% sampling level - confirm.
gp = pd.read_pickle("~/compressed_sensingv1/abun_simv3/sparse-recovery-nonBP/reduced-matrices-uniform/high-performing-at1p.pkl")

In [18]:
# Larger training-set sizes m for the refits of the gp combinations;
# relative to the 2**15 rows these are roughly 4%, 8%, 10% and 20%.
updsamp = [1310, 2616, 3270, 6540]

In [17]:
# Refit the selected (replica, species, lambda) combinations from gp on the
# larger training sets in updsamp, using the steady-state values as loaded
# (no normalisation is applied in this cell).
# For each fit, the sampled indices, the reconstruction over all rows of
# iHiV and the coefficient vector are pickled into pt<lambda>/, and a row
# [replica, species, elapsed seconds, m] is appended to timearray.
steady_fmt = "~/compressed_sensingv1/fullsearch/15_2.3_{}/steady-state.pkl"
train_fmt = "~/compressed_sensingv1/abun_simv3/training-data/sp{}_rep{}-in-sample-{}.pkl"
out_prefix = "/raid/home/shreyaa5/compressed_sensingv1/abun_simv3/sparse-recovery-nonBP/reduced-matrices-uniform/pt"

timearray = []
listt = list(np.arange(2**15))  # not used in this cell
for param in gp:
    replica, species, lmda = int(param[0]), int(param[1]), param[2]
    steadystate = pd.read_pickle(steady_fmt.format(replica))
    n = 2**16  # not used in this cell
    sst = steadystate.T
    # Entries of this species at the positions listed in species_ones,
    # renumbered 0..2**15-1.
    kept_rows = species_ones[species]
    abunvec = pd.DataFrame(sst[species].values[kept_rows])
    abunvec.index = np.arange(2**15)
    for m in updsamp:
        # Previously saved sample indices for this (species, replicate, m).
        ri = pd.read_pickle(train_fmt.format(species, replica, m))
        y1 = abunvec[0][ri]
        s2 = np.array(y1.values, ndmin=2)
        s3 = s2.T  # observations as an (m, 1) column
        # Timed region: row selection, chdir, solve and reconstruction.
        startt = time.time()
        D = iHiV[ri, :]
        # Relative output names below resolve against this directory.
        os.chdir(out_prefix + str(lmda))
        b = bpdn.BPDN(D, s3, lmda, opt)
        x = b.solve()
        beta = x.ravel()
        reconCSybge = np.dot(iHiV, beta)
        endtime = time.time() - startt
        timearray.append([replica, species, endtime, m])
        pd.to_pickle(ri, "{}_rep{}-in-sample-{}.pkl".format(species, replica, m))
        pd.to_pickle(reconCSybge, "abun_for_sp{}_rep{}_samp{}.pkl".format(species, replica, m))
        pd.to_pickle(beta, "beta_for_sp{}_rep{}_samp{}.pkl".format(species, replica, m))

In [None]:
#### For the samplist sizes (0.1%, 0.2%, 1% and 2% samplings), run CS three more times.
### Open question: should the repeats reuse the same training dataset?

In [71]:
# Repeat run ("run4") for the gp combinations at the small training sizes.
# Unlike the earlier cells, the sample indices are drawn fresh with
# random.sample from 0..2**15-1 instead of being loaded from training-data,
# and every output file name carries the "run4" suffix.
# Each timearray row is [replica, species, elapsed seconds, m].
steady_fmt = "~/compressed_sensingv1/fullsearch/15_2.3_{}/steady-state.pkl"
out_prefix = "/raid/home/shreyaa5/compressed_sensingv1/abun_simv3/sparse-recovery-nonBP/reduced-matrices-uniform/pt"

timearray = []
listt = list(np.arange(2**15))  # population the random samples are drawn from
for param in gp:
    replica, species, lmda = int(param[0]), int(param[1]), param[2]
    steadystate = pd.read_pickle(steady_fmt.format(replica))
    n = 2**16  # not used in this cell
    sst = steadystate.T
    # Entries of this species at the positions listed in species_ones,
    # renumbered 0..2**15-1.
    kept_rows = species_ones[species]
    abunvec = pd.DataFrame(sst[species].values[kept_rows])
    abunvec.index = np.arange(2**15)
    for m in [32, 65, 327, 655]:
        # m distinct indices, drawn without a fixed seed; they are saved
        # below so the run can be traced afterwards.
        ri = random.sample(listt, m)
        y1 = abunvec[0][ri]
        s2 = np.array(y1.values, ndmin=2)
        s3 = s2.T  # observations as an (m, 1) column
        # Timed region: row selection, chdir, solve and reconstruction.
        startt = time.time()
        D = iHiV[ri, :]
        # Relative output names below resolve against this directory.
        os.chdir(out_prefix + str(lmda))
        b = bpdn.BPDN(D, s3, lmda, opt)
        x = b.solve()
        beta = x.ravel()
        reconCSybge = np.dot(iHiV, beta)
        endtime = time.time() - startt
        timearray.append([replica, species, endtime, m])
        pd.to_pickle(ri, "{}_rep{}-in-sample-{}run4.pkl".format(species, replica, m))
        pd.to_pickle(reconCSybge, "abun_for_sp{}_rep{}_samp{}run4.pkl".format(species, replica, m))
        pd.to_pickle(beta, "beta_for_sp{}_rep{}_samp{}run4.pkl".format(species, replica, m))