#### In this notebook, assuming an effective community size of 15 species (one species is always present), we look at compressed sensing (CS; BPDN solved with ADMM) on effectively 187 data points. For each species, we only look at predictions at the points where that species was present to begin with. (This is the first set of species that maximises the number of data points.)

#### import the relevant files and modules

In [12]:
import pandas as pd
import numpy as np
import cvxpy as cvx
import random
import time

from sporco.admm import bpdn
### using the ADMM algorithm 
### we can also use the PGM algorithm 


from scipy.linalg import hadamard


from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold

import time 
from scipy import stats

# Pickled inputs. NOTE(review): their contents are not visible from this file;
# the cross-validation cell indexes `iHiVred` and `X` as 2-D arrays
# (`[rows, :]`) and uses them as BPDN dictionaries -- confirm shapes there.
iHiV = pd.read_pickle("~/bge-analysis-simv3/iHiV.pkl")
iHiVred = pd.read_pickle(
    "~/compressed_sensingv1/realdatasets/misc/ophelli-ryan/iHiV15.pkl"
)
X = pd.read_pickle("~/bge-analysis-simv3/X15.pkl")

# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

# Options object handed to every `bpdn.BPDN` solve below.
bpdn_settings = {
    'Verbose': False,
    'MaxMainIter': 500,
    'RelStopTol': 1e-8,
    'AutoRho': {'RsdlTarget': 1.0},
}
opt = bpdn.BPDN.Options(bpdn_settings)

#### read the effective species presence-absence table

pa_redv1 = pd.read_pickle(
    "~/compressed_sensingv1/realdatasets/misc/ophelli-ryan/eff-16-list1-red-pa.pkl"
)

# Regularisation grid for BPDN: five log-spaced values between 1e-5 and
# 10**0.8 (rounded to 5 decimals) plus three hand-picked values, sorted
# ascending and truncated to the seven smallest. The truncation drops the
# largest log-spaced value (10**0.8, about 6.31).
labdl = np.round(np.logspace(-5, 0.8, 5), 5)
lam = list(labdl)
# `extend` keeps `lam` flat; `append` would nest the extra values as a single
# list element and force a flattening step before sorting.
lam.extend([0.5, 0.6, 1.0])
lamf = sorted(lam)[:7]
lamf

random_state = None
# Name under which the grid is read by the cross-validation loop.
laml = lamf

#pa_redv1

B = pd.read_pickle("~/bge-analysis-simv3/B.pkl")
A = pd.read_pickle("~/bge-analysis-simv3/A.pkl")

# Transposed view of A: the code below reads one column per species id (0..15).
tableg = A.T


def _labels_where(table, column, value):
    """Return the row labels of `table` whose `column` entry equals `value`."""
    column_values = table[column]
    return column_values[column_values == value].index.to_list()


# For every species id: row labels of `tableg` where the species is marked
# present (1.0) and where it is marked absent (0.0).
species_onesall = [_labels_where(tableg, sp, 1.0) for sp in range(16)]
species_zerosall = [_labels_where(tableg, sp, 0.0) for sp in range(16)]

##### For each species, collect the well numbers of the wells in which it is present

# species_ones[k]: "well number" values of the rows of `pa_redv1` whose column
# k equals 1.0.
species_ones = [
    pa_redv1.loc[pa_redv1[sp] == 1.0, "well number"].values
    for sp in range(16)
]

# ones_len[k]: how many such wells species k has.
ones_len = list(map(len, species_ones))

# Steady-state table loaded from disk. NOTE(review): its layout is not visible
# here; the cross-validation cell selects rows of it by label and then reads
# one column per species id -- confirm against the pickle.
steadystate = pd.read_pickle(
    "~/compressed_sensingv1/realdatasets/misc/ophelli-ryan/eff-16-list1-red-sst-REL.pkl"
)
# Transposed copy kept under its own name.
sst = steadystate.transpose()

# Number of cross-validation folds per species, indexed by species id; the
# cross-validation loop reads entry k as KFold's `n_splits` for species k.
bias_var_listm = [
    3, 3, 3, 3, 3, 3, 3, 3,  # species 0-7
    5, 3, 5, 3, 3, 7, 5, 7,  # species 8-15
]

In [None]:
def _bpdn_reconstruct(dictionary, train_rows, signal, lmda):
    """Fit BPDN on selected rows of a dictionary and reconstruct every row.

    Args:
        dictionary: 2-D array indexed as ``dictionary[rows, :]``.
        train_rows: Row positions of ``dictionary`` used for the fit.
        signal: Column vector of training targets, one entry per training row.
        lmda: Regularisation weight passed to ``bpdn.BPDN``.

    Returns:
        ``np.dot(dictionary, coefficients)``, i.e. the reconstruction for all
        rows of ``dictionary`` (training and held-out alike).
    """
    solver = bpdn.BPDN(dictionary[train_rows, :], signal, lmda, opt)
    coefficients = solver.solve()
    return np.dot(dictionary, coefficients.ravel())


# Each record appended below has the layout
#   [repeat, lambda, species, predictions on held-out rows, held-out rows,
#    training rows, n_splits, number of held-out rows, number of training rows]
# `err` holds the fits that use `iHiVred` as dictionary, `errlin` those using `X`.
err = []
errlin = []
samp = 100  # percentage of each species' available wells that is sampled
# NOTE(review): only species 0-7 are processed although the per-species lists
# have 16 entries -- confirm this is intentional.
for species in range(8):
    cv_splits = bias_var_listm[species]

    # Communities in which this species is present, renumbered 0..2**15-1;
    # the original labels are kept in the "indexinwhole" column.
    subsetall = A[species_onesall[species]].T
    subsetall["indexinwhole"] = subsetall.index
    subsetall.index = np.arange(0, 2**15, 1)

    # Keep only the communities that appear among this species' measured wells.
    measured = subsetall["indexinwhole"].isin(species_ones[species])
    givendata = subsetall[measured]

    m = int(samp * ones_len[species] / 100)
    ri = random.sample(list(givendata.index), m)
    ri.sort()

    # Steady-state rows of those wells, relabelled with the renumbered index so
    # that they line up with the rows of the dictionaries.
    sstgiven = steadystate.T[givendata["indexinwhole"].values].T
    sstgiven.index = givendata.index

    sampled = sstgiven.T[ri].T
    y2 = sampled[species]
    data_present = sampled.index

    for iteras in range(5):
        kf = KFold(n_splits=cv_splits, random_state=iteras*10, shuffle=True)
        # With a fixed integer seed the split is the same on every call, so it
        # is computed once and reused for every lambda.
        folds = list(kf.split(data_present))
        for lmda in laml:
            for train_index, test_index in folds:
                rinew = list(data_present[train_index])
                rileft = list(data_present[test_index])
                # Training targets as an (n_train, 1) column vector.
                s3 = np.array([y2[rinew].values]).T
                for dictionary, records in ((iHiVred, err), (X, errlin)):
                    yrecon = _bpdn_reconstruct(dictionary, rinew, s3, lmda)
                    records.append([iteras, lmda, species, yrecon[rileft],
                                    rileft, rinew, cv_splits,
                                    len(rileft), len(rinew)])


In [None]:
# Turn the two lists of result records into tables, one row per record.
errdf, errdflin = (pd.DataFrame(records) for records in (err, errlin))

In [None]:
# NOTE(review): `species` here is whatever value the loop variable of the
# cross-validation cell last held, so the file name carries that single id
# even though the tables contain every processed species -- confirm intended.
out_stem = "16l1-" + str(species)
errdf.to_pickle(out_stem + "-bge.pkl")
errdflin.to_pickle(out_stem + "-lin.pkl")

In [None]:
# Read both result files back. Neither result is assigned to a name; in a
# notebook cell only the value of the last expression is displayed.
in_stem = "16l1-" + str(species)
pd.read_pickle(in_stem + "-bge.pkl")
pd.read_pickle(in_stem + "-lin.pkl")