In [6]:
import numpy as np

from Distributed.selectinf.distributed_lasso import multisplit_lasso as L
from Distributed.selectinf.Tests.instance import gaussian_instance, logistic_instance

### Generate data

In [2]:
# --- Distributed layout: K local machines plus one central machine ---
K, n1, n0 = 3, 1000, 1000  # local machines; samples per local machine; samples on the central machine
n = K * n1 + n0  # total number of samples across all machines

# --- Simulation settings handed to the data generators below ---
p, s = 100, 5  # number of features; sparsity
signal = np.sqrt(np.log(p) * 2)  # signal size, sqrt(2 * log(p))
rho, sigma = 0.3, 1.0  # passed as `rho=` and `sigma=` to the Gaussian data generator
# Seed NumPy's global RNG first so the simulated data set is the same on every run.
np.random.seed(42)
# Only the first three values returned by the generator are kept.
# NOTE(review): presumably design matrix, response and true coefficients — confirm in gaussian_instance.
X, Y, beta = gaussian_instance(
    n=n,
    p=p,
    signal=signal,
    s=s,
    equicorrelated=False,
    rho=rho,
    sigma=sigma,
    random_signs=True,
)[:3]

### Lasso 

In [3]:
# Share of the full sample held by each local machine: n1 / n, repeated K times.
proportion = np.full(K, n1 / n)
# One weight vector of length p per machine (keys 0..K-1); every entry is 1.5 * sqrt(2 * log(p)).
feature_weights = {
    machine: np.full(p, np.sqrt(2 * np.log(p)) * 1.5)
    for machine in range(K)
}
# Create the Gaussian model on the full data.
selector = L.gaussian(
    X,
    Y,
    feature_weights,
    proportion,
    estimate_dispersion=True,
)
# Fit the model; the sum of the returned value is reported as the number of selected variables.
selected = selector.fit()
print("Selected", selected.sum(), "variables")

Selected 8 variables


In [4]:
# Inference for the variables selected by the fit above. The three calls below act on the
# same selector object in sequence, so their order must be kept.
# NOTE(review): dispersion=None presumably makes the selector use its own dispersion estimate
# (the model was created with estimate_dispersion=True) — confirm in multisplit_lasso.
selector.setup_inference(dispersion=None) # setup the inference
# NOTE(review): presumably the target specification for the selected variables — confirm.
target_spec = selector.selected_targets()
result = selector.inference(target_spec, level=0.9)  # level=0.9 -> 90% intervals (lower_confidence / upper_confidence columns)
# Plain-text dump of the result table: one row per selected variable, with columns
# MLE, SE, Zvalue, pvalue, alternative, lower_confidence, upper_confidence, unbiased.
print(result)

        MLE        SE    Zvalue    pvalue alternative  lower_confidence  \
0 -3.909735  1.488610 -2.626433  0.008629    twosided         -6.358281   
1  1.636432  1.971080  0.830221  0.406414    twosided         -1.605706   
2 -1.102830  1.933536 -0.570369  0.568427    twosided         -4.283214   
3 -0.382391  1.964922 -0.194609  0.845699    twosided         -3.614400   
4 -0.317457  2.043601 -0.155342  0.876552    twosided         -3.678882   
5  0.103624  2.048693  0.050580  0.959660    twosided         -3.266176   
6 -2.512710  1.986254 -1.265050  0.205853    twosided         -5.779806   
7  0.023204  2.040705  0.011370  0.990928    twosided         -3.333457   

   upper_confidence  unbiased  
0         -1.461189 -5.136766  
1          4.878569 -1.697327  
2          2.077554 -3.298806  
3          2.849617  2.340845  
4          3.043967 -3.110683  
5          3.473423  3.044991  
6          0.754387 -3.510875  
7          3.379864 -1.998944  


### Logistic regression

In [34]:
# Re-seed NumPy's global RNG so this data set does not depend on the draws made earlier.
np.random.seed(42)
# Same n, p, signal, s and rho as the Gaussian example; only the first three returned values are kept.
X, Y, beta = logistic_instance(
    n=n,
    p=p,
    signal=signal,
    s=s,
    equicorrelated=False,
    rho=rho,
    random_signs=True,
)[:3]

In [35]:
# One weight vector per machine (keys 0..K-1), one entry per column of X;
# every entry is 0.8 * sqrt(2 * log(p)).
feature_weights = {
    machine: np.full(X.shape[1], np.sqrt(2 * np.log(p)) * .8)
    for machine in range(K)
}
# Create the logistic model; `proportion` is the array computed in the Lasso section.
selector = L.logistic(
    X,
    Y,
    feature_weights,
    proportion,
)
# Fit the model and report how many variables were selected.
selected = selector.fit()
print("Selected", selected.sum(), "variables")

# Inference on the selected variables, with the dispersion fixed at 1 and level=0.9.
selector.setup_inference(dispersion=1.)
target_spec = selector.selected_targets()
result = selector.inference(
    target_spec,
    level=0.9,
)
print(result)

Selected 4 variables
        MLE        SE    Zvalue    pvalue alternative  lower_confidence  \
0 -6.463941  3.971717 -1.627493  0.103632    twosided        -12.996835   
1 -2.049404  3.918726 -0.522977  0.600990    twosided         -8.495135   
2  5.007527  3.848403  1.301196  0.193191    twosided         -1.322533   
3 -0.855014  3.950388 -0.216438  0.828646    twosided         -7.352824   

   upper_confidence  unbiased  
0          0.068952  1.776052  
1          4.396328 -6.191762  
2         11.337587  7.751807  
3          5.642796  4.781950  
