In [1]:
from __future__ import print_function

import numpy as np
import random

from matplotlib import pyplot as plt

from selectinf.nbd_lasso import nbd_lasso
from selectinf.Utils.discrete_family import discrete_family
from instance import GGM_instance

  warn('unable to import isotonic regression from sklearn, using a pure python implementation')


In [2]:
# TODO: Add root n to the randomization covariance
prec,cov,X = GGM_instance(n=500,p=10, max_edges=2)
nbd_instance = nbd_lasso.gaussian(X)
active_signs_nonrandom = nbd_instance.fit(perturb=np.zeros((10,9)))
active_signs_random = nbd_instance.fit()
print(active_signs_nonrandom.shape)
print(np.abs(active_signs_nonrandom).sum())
print(np.abs(active_signs_random).sum())
print(np.abs(prec != 0).sum() - 10)

Custom perturbation
Sampled perturbation
(10, 9)
10.0
8.0
12


In [3]:
print(np.diag(cov))
print((prec))

[0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002]
[[ 5.00000000e+02  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  5.89115114e+02  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -2.29126735e+02]
 [ 0.00000000e+00  0.00000000e+00  5.76169501e+02  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  -2.09491153e+02  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  5.54774395e+02
   1.65122517e-15  0.00000000e+00  0.00000000e+00  1.74319912e+02
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 -1.41699142e-14
   6.31956011e+02  0.00000000e+00  0.00000000e+00 -2.88773951e+02
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  5.59906548e+02 -1.

In [4]:
def remove_diag(A):
    p = A.shape[0]
    A_new = np.zeros((p,p-1))
    for i in range(p):
        A_new[i] = np.delete(A[i],i)
    return A_new

def add_diag(A,val):
    p = A.shape[0]
    A_new = np.zeros((p,p))
    for i in range(p):
        A_new[i,0:i] = A[i,0:i]
        A_new[i,i] = val
        A_new[i,i+1:p] = A[i,i:p-1]
    return A_new

def is_sym(A, tol = 1e-8):
    return(np.max(np.abs(A-A.T)) < tol)

def invert_interval(interval):
    interval_new = (interval[1]*-1,interval[0]*-1)
    return interval_new

In [5]:
prec_no_diag = remove_diag(prec)
print(np.abs(active_signs_nonrandom).sum())
print(np.abs(active_signs_random).sum())
print(np.abs(active_signs_nonrandom - np.sign(-prec_no_diag)).sum())
print(np.abs(active_signs_random - np.sign(-prec_no_diag)).sum())

10.0
8.0
2.0
4.0


In [21]:
def bootstrap_variance(X, b_max=500):
    n,p = X.shape
    S_boot = np.zeros((b_max, p, p))
    for b in range(b_max):
        sample_idx = np.random.choice(range(n),replace=True,size=n)
        X_b = X[sample_idx]
        S_boot[b,:,:] = X_b.T @ X_b
    # Upper-triangular!
    inner_vars = np.zeros((p, p))
    for i in range(p):
        for j in range(i+1,p):
            S_ij_sample = S_boot[:,i,j]
            inner_vars[i,j] = np.var(S_ij_sample)
    return inner_vars

def edge_inference(j0, k0, S, n, p, var=None,
                   ngrid=10000):
    inner_prod = S[j0,k0]
    # print("inner_prod", "(", j0, ",", k0, "):" , inner_prod)
    # print("var:", var)

    stat_grid = np.zeros((1, ngrid))
    stat_grid[0,:] = np.linspace(0 - 10*np.sqrt(2/n),#10*np.sqrt(var),
                                 0 + 10*np.sqrt(2/n),#10*np.sqrt(var),
                                 num=ngrid)
    def log_det_S_j_k(s_val):
        S_j_k = S
        S_j_k[j0,k0] = s_val
        S_j_k[k0,j0] = s_val
        return (n-p-1)/2 * np.log(np.abs(np.linalg.det(S_j_k)))

    logWeights = np.zeros((ngrid,))
    for g in range(ngrid):
        logWeights[g] = log_det_S_j_k(stat_grid[0,g])
    logWeights -= logWeights.max() # not needed anymore after root n scaling

    condlWishart = discrete_family(stat_grid[0,:], np.exp(logWeights))

    neg_interval = condlWishart.equal_tailed_interval(observed=inner_prod,
                                                      alpha=0.1)
    return neg_interval, condlWishart

def get_nonzero(active_signs):
    active_sign_sq = add_diag(active_signs, 0)
    nonzero = ((active_sign_sq + active_sign_sq.T) != 0) # OR
    # nonzero = ((active_sign_sq * active_sign_sq.T) != 0) # AND
    return nonzero

def conditional_inference(X,nonzero):
    n,p = X.shape

    # Estimating variances by bootstrap
    # inner_vars = bootstrap_variance(X)
    S_ = X.T @ X
    intervals = np.zeros((p,p,2))
    condlDists = {}
    for i in range(p):
        for j in range(i+1,p):
            if nonzero[i,j]:
                neg_int, condlWishart = edge_inference(j0=i, k0=j, S=S_,
                                                       # var=inner_vars[i, j],
                                                       n=n, p=p, ngrid=100000)
                print(neg_int)
                interval = invert_interval(neg_int)
                intervals[i,j,:] = interval
                condlDists[(i,j)] = condlWishart

    return intervals, condlDists

In [22]:
nonzero = get_nonzero(active_signs_nonrandom)

intervals,condlDists = \
    conditional_inference(X, nonzero)

def get_coverage(nonzero, intervals, prec, p):
    coverage = np.zeros((p,p))
    for i in range(p):
        for j in range(i+1,p):
            if nonzero[i,j]:
                interval = intervals[i,j,:]
                if prec[i,j] < interval[1] and prec[i,j] > interval[0]:
                    coverage[i,j] = 1
                else:
                    coverage[i,j] = 0
    return coverage

coverage = get_coverage(nonzero, intervals, prec, X.shape[1])

(198.96928557426133, 295.5222207137424)
(183.76880468273202, 282.6482160196048)
(-233.13573606288455, -139.43801853094382)
(683.2613072359356, 887.8466664530948)
(209.2491537213587, 309.28009852294474)


In [23]:
print(coverage)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [32]:
def naive_inference(X):
    n,p = X.shape
    nbd_instance = nbd_lasso.gaussian(X)
    active_signs_nonrandom = nbd_instance.fit(perturb=np.zeros((p,p-1)))
    nonzero = get_nonzero(active_signs_nonrandom)

    # Construct intervals
    if nonzero.sum() > 0:
        intervals, condlDists = conditional_inference(X, nonzero)
        # coverage is upper-triangular
        coverage = get_coverage(nonzero, intervals, prec, p)
        interval_len = 0
        nonzero_count = 0
        for i in range(p):
            for j in range(i+1,p):
                if nonzero[i,j]:
                    interval = intervals[i,j,:]
                    interval_len = interval_len + (interval[1] - interval[0])
                    nonzero_count = nonzero_count + 1
        avg_len = interval_len / nonzero_count
        cov_rate = coverage.sum() / nonzero_count
        return intervals, cov_rate, avg_len/n

In [33]:
def print_nonzero_intervals(nonzero, intervals, prec, X):
    S = X.T @ X
    p = prec.shape[0]
    for i in range(p):
            for j in range(i+1,p):
                if nonzero[i,j]:
                    print("(",i,",",j,")", "selected")
                    print("Theta", "(",i,",",j,")", "interval:", intervals[i,j,:])
                    print("Theta", "(",i,",",j,")", prec[i,j])
                    print("S", "(",i,",",j,")", S[i,j])

In [34]:
print_nonzero_intervals(nonzero, intervals, prec, X)

( 1 , 9 ) selected
Theta ( 1 , 9 ) interval: [-295.52222071 -198.96928557]
Theta ( 1 , 9 ) -229.12673471247834
S ( 1 , 9 ) 0.40467213447114153
( 2 , 8 ) selected
Theta ( 2 , 8 ) interval: [-282.64821602 -183.76880468]
Theta ( 2 , 8 ) -209.49115294962394
S ( 2 , 8 ) 0.3447896748512277
( 3 , 7 ) selected
Theta ( 3 , 7 ) interval: [139.43801853 233.13573606]
Theta ( 3 , 7 ) 174.31991248685642
S ( 3 , 7 ) -0.32755364533420894
( 4 , 7 ) selected
Theta ( 4 , 7 ) interval: [-887.84666645 -683.26130724]
Theta ( 4 , 7 ) -288.7739505511683
S ( 4 , 7 ) 0.5075472995995413
( 5 , 6 ) selected
Theta ( 5 , 6 ) interval: [-309.28009852 -209.24915372]
Theta ( 5 , 6 ) -183.14493899726764
S ( 5 , 6 ) 0.3943756713338829


In [35]:
intervals, cov_rate, avg_len = naive_inference(X)

Custom perturbation
(198.96928557426133, 295.5222207137424)
(183.76880468273202, 282.6482160196048)
(-233.13573606288455, -139.43801853094382)
(683.2613072359356, 887.8466664530948)
(209.2491537213587, 309.28009852294474)


In [36]:
cov_rate

0.6

In [37]:
avg_len

0.23749854721081592