In [6]:
import numpy as np
import random
from scipy.stats import norm
from selectinf.nbd_lasso import nbd_lasso
from selectinf.Utils.discrete_family import discrete_family
from selectinf.Tests.instance import GGM_instance
from selectinf.nbd_helpers import is_sym

from selectinf.Tests.nbd_naive_and_ds import *

In [74]:
n=800
p=50
prec,cov,X = GGM_instance(n=n, p=p, max_edges=4)

In [70]:
np.set_printoptions(precision=3, suppress=True)
print(prec)

[[ 1.008  0.    -0.    ...  0.     0.     0.   ]
 [ 0.     1.001  0.    ...  0.     0.     0.   ]
 [-0.    -0.     1.002 ... -0.     0.01   0.   ]
 ...
 [ 0.     0.    -0.    ...  1.017  0.     0.   ]
 [ 0.     0.     0.01  ...  0.     1.016  0.   ]
 [ 0.     0.     0.    ...  0.     0.     1.   ]]


In [71]:
weights_const=0.5
ridge_const=0.5
randomizer_scale=1.

In [49]:
nbd_instance = nbd_lasso.gaussian(X, n_scaled=False, weights_const=weights_const,
                                      ridge_terms=ridge_const, randomizer_scale=randomizer_scale)
active_signs_random = nbd_instance.fit(logic="OR")
nonzero = nbd_instance.nonzero

# Construct intervals
if nonzero.sum() > 0:
    # Intervals returned is in its original (unscaled) order
    intervals = nbd_instance.inference(parallel=False)

Inference for 0 , 47
Inference for 9 , 14
Inference for 16 , 47
Inference for 17 , 40
Inference for 21 , 27
Inference for 22 , 34
Inference for 23 , 30
Inference for 24 , 48
Inference for 29 , 37
Inference for 48 , 49


In [93]:
n=1000
p=50
prec,cov,X = GGM_instance(n=n, p=p, max_edges=4)
nonzero_n, intervals_n, cov_rate_n, avg_len_n = naive_inference(X, prec,
                                                                weights_const=weights_const,
                                                                true_nonzero=None, logic='OR',
                                                                solve_only=False, continued=False)
cov_rate_n

E estimated


0.625

In [73]:
nonzero_n.sum()

AttributeError: 'NoneType' object has no attribute 'sum'

In [None]:
coverage = get_coverage(nonzero, intervals, prec, n=n, p=p, scale=False)
interval_len = 0
nonzero_count = 0  # nonzero_count is essentially upper-triangular
for i in range(p):
    for j in range(i+1,p):
        if nonzero[i,j]:
            interval = intervals[i,j,:]
            interval_len = interval_len + (interval[1] - interval[0])
            nonzero_count = nonzero_count + 1
avg_len = interval_len / nonzero_count
cov_rate = coverage.sum() / nonzero_count
F1_approx = calculate_F1_score_graph(prec, selection=nonzero)

In [None]:
cov_rate

In [None]:
avg_len

In [None]:
F1_approx

In [None]:
def print_nonzero_intervals(nonzero, intervals, prec, X):
    # Intervals, prec, X are all in their original scale
    n, p = X.shape
    S = X.T @ X / n

    for i in range(p):
            for j in range(i+1,p):
                if nonzero[i,j]:
                    print("(",i,",",j,")", "selected")
                    print("Theta", "(",i,",",j,")", "interval:", intervals[i,j,:])
                    print("Theta", "(",i,",",j,")", prec[i,j])
                    print("S/n", "(",i,",",j,")", S[i,j])
print_nonzero_intervals(nonzero, intervals, prec, X)

In [None]:
(np.abs(prec) > 1e-10).sum() / (p**2)

In [None]:
((np.abs(prec) > 1e-10) * nonzero).sum() / nonzero.sum()

In [None]:
nonzero.sum() / (p**2)

## Random Edges Instance

In [None]:
def is_invertible(a):
    return a.shape[0] == a.shape[1] and np.linalg.matrix_rank(a) == a.shape[0]

In [None]:
def GGM_random_instances(n=200, p=50, theta=-0.2):

    # Guarantee same sparsity level as in Friedman et al.:
    # https://www.asc.ohio-state.edu/statistics/statgen/joul_aut2015/2010-Friedman-Hastie-Tibshirani.pdf
    prob = 0.4 / (np.abs(theta)*p)

    invertible = False

    # Generate invertible precision
    while not invertible:
        prec = np.eye(p)

        # Randomly selecting edges
        for i in range(p):
            for j in range(i + 1, p):
                prec[i, j] = theta * np.random.binomial(n=1, p=prob)

        # symmetrize
        prec = prec + prec.T - np.eye(p)

        invertible = is_invertible(prec)

    cov = np.linalg.inv(prec)
    # standardize the covariance
    cov = cov / np.outer(np.sqrt(np.diag(cov)), np.sqrt(np.diag(cov)))
    prec = np.linalg.inv(cov)

    X = np.random.multivariate_normal(mean=np.zeros(p),
                                      cov=cov, size=n)

    return prec, cov, X

In [None]:
n=1000
p=50
prec,cov,X = GGM_random_instances(n=n, p=p, theta=-0.2)

In [None]:
weights_const=1.
ridge_const=1.
randomizer_scale=2#np.sqrt(np.sqrt(p))
print(randomizer_scale)

nbd_instance = nbd_lasso.gaussian(X, n_scaled=False, weights_const=weights_const,
                                      ridge_terms=ridge_const, randomizer_scale=randomizer_scale)
active_signs_random = nbd_instance.fit()
nonzero = nbd_instance.nonzero

# Construct intervals
if nonzero.sum() > 0:
    # Intervals returned is in its original (unscaled) order
    intervals = nbd_instance.inference(parallel=False)

In [None]:
coverage = get_coverage(nonzero, intervals, prec, n=n, p=p, scale=False)
interval_len = 0
nonzero_count = 0  # nonzero_count is essentially upper-triangular
for i in range(p):
    for j in range(i+1,p):
        if nonzero[i,j]:
            interval = intervals[i,j,:]
            interval_len = interval_len + (interval[1] - interval[0])
            nonzero_count = nonzero_count + 1
avg_len = interval_len / nonzero_count
cov_rate = coverage.sum() / nonzero_count
F1_approx = calculate_F1_score_graph(prec, selection=nonzero)

In [None]:
cov_rate

In [None]:
avg_len

In [None]:
F1_approx

In [None]:
(np.abs(prec) > 1e-10).sum() / (p**2)

In [None]:
((np.abs(prec) > 1e-10) * nonzero).sum() / nonzero.sum()

In [None]:
nonzero.sum() / (p**2)

In [None]:
def print_nonzero_intervals(nonzero, intervals, prec, X):
    # Intervals, prec, X are all in their original scale
    n, p = X.shape
    S = X.T @ X / n

    for i in range(p):
            for j in range(i+1,p):
                if nonzero[i,j]:
                    print("(",i,",",j,")", "selected")
                    print("Theta", "(",i,",",j,")", "interval:", intervals[i,j,:])
                    print("Theta", "(",i,",",j,")", prec[i,j])
                    print("S/n", "(",i,",",j,")", S[i,j])
print_nonzero_intervals(nonzero, intervals, prec, X)

In [None]:
sum(sum(np.abs(prec) > 1e-10))/2

In [None]:
nonzero_count

## Hub Instance

In [None]:
def GGM_hub_instances(n=200, p=50, K=10, theta=-0.175):
    group_size = int(p / K)

    invertible = False
    while not invertible:
        prec = np.eye(p)
        for k in range(K):
            group_k = range(k * group_size, (k + 1) * group_size)
            hub = random.sample(list(group_k), 1)[0]
            for i in group_k:
                # fix column at hub, iterate over all rows in the group
                if i != hub:
                    prec[i, hub] = theta

        # symmetrize
        prec = prec + prec.T - np.eye(p)

        invertible = is_invertible(prec)

    cov = np.linalg.inv(prec)
    # standardize the covariance
    cov = cov / np.outer(np.sqrt(np.diag(cov)), np.sqrt(np.diag(cov)))
    prec = np.linalg.inv(cov)

    X = np.random.multivariate_normal(mean=np.zeros(p),
                                      cov=cov, size=n)

    return prec, cov, X

In [None]:
n=1000
p=50
prec, cov, X = GGM_hub_instances(n=n, p=p, K=int(p/5), theta=-0.2)

In [None]:
weights_const=1
ridge_const=1.
randomizer_scale=2

nbd_instance = nbd_lasso.gaussian(X, n_scaled=False, weights_const=weights_const,
                                      ridge_terms=ridge_const, randomizer_scale=randomizer_scale)
active_signs_random = nbd_instance.fit()
nonzero = nbd_instance.nonzero

# Construct intervals
if nonzero.sum() > 0:
    # Intervals returned is in its original (unscaled) order
    intervals = nbd_instance.inference(parallel=False)

In [None]:
coverage = get_coverage(nonzero, intervals, prec, n=n, p=p, scale=False)
interval_len = 0
nonzero_count = 0  # nonzero_count is essentially upper-triangular
for i in range(p):
    for j in range(i+1,p):
        if nonzero[i,j]:
            interval = intervals[i,j,:]
            interval_len = interval_len + (interval[1] - interval[0])
            nonzero_count = nonzero_count + 1
avg_len = interval_len / nonzero_count
cov_rate = coverage.sum() / nonzero_count
F1_approx = calculate_F1_score_graph(prec, selection=nonzero)

In [None]:
cov_rate

In [None]:
avg_len

In [None]:
F1_approx

In [None]:
def print_nonzero_intervals(nonzero, intervals, prec, X):
    # Intervals, prec, X are all in their original scale
    n, p = X.shape
    S = X.T @ X / n

    for i in range(p):
            for j in range(i+1,p):
                if nonzero[i,j]:
                    print("(",i,",",j,")", "selected")
                    print("Theta", "(",i,",",j,")", "interval:", intervals[i,j,:])
                    print("Theta", "(",i,",",j,")", prec[i,j])
                    print("S/n", "(",i,",",j,")", S[i,j])
print_nonzero_intervals(nonzero, intervals, prec, X)

In [None]:
(np.abs(prec) > 1e-10).sum() / (p**2)

In [None]:
((np.abs(prec) > 1e-10) * nonzero).sum() / nonzero.sum()

In [None]:
nonzero.sum() / (p**2)

## Clique Instance

In [None]:
def GGM_clique_instances(n=200, p=400, K=20, group_size=7, theta=-0.175):
    # Partition [p] into p/K (big_group_size) disjoint sets,
    # then choose a fixed-size subset of each disjoint set

    assert K * group_size < p
    big_group_size = int(p/K)

    invertible = False
    while not invertible:
        prec = np.eye(p)
        for k in range(K):
            group_k = range(k * big_group_size, (k + 1) * big_group_size)
            variables_k = np.random.choice(group_k,
                                           size=group_size, replace=False)
            for i in variables_k:
                for j in variables_k:
                    # Set theta_ij = theta
                    if i != j:
                        prec[i, j] = theta

        invertible = is_invertible(prec)

    cov = np.linalg.inv(prec)
    # standardize the covariance
    cov = cov / np.outer(np.sqrt(np.diag(cov)), np.sqrt(np.diag(cov)))
    prec = np.linalg.inv(cov)

    X = np.random.multivariate_normal(mean=np.zeros(p),
                                      cov=cov, size=n)

    return prec, cov, X

In [None]:
prec, cov, X = GGM_clique_instances(n=1000, p=50, K=10, group_size=4, theta=-0.2)

In [None]:
weights_const=1.
ridge_const=1.
randomizer_scale=2.

nbd_instance = nbd_lasso.gaussian(X, n_scaled=False, weights_const=weights_const,
                                      ridge_terms=ridge_const, randomizer_scale=randomizer_scale)
active_signs_random = nbd_instance.fit()
nonzero = nbd_instance.nonzero

# Construct intervals
if nonzero.sum() > 0:
    # Intervals returned is in its original (unscaled) order
    intervals = nbd_instance.inference(parallel=False)

In [None]:
n=1000
p=50
coverage = get_coverage(nonzero, intervals, prec, n=3000, p=50, scale=False)
interval_len = 0
nonzero_count = 0  # nonzero_count is essentially upper-triangular
for i in range(p):
    for j in range(i+1,p):
        if nonzero[i,j]:
            interval = intervals[i,j,:]
            interval_len = interval_len + (interval[1] - interval[0])
            nonzero_count = nonzero_count + 1
avg_len = interval_len / nonzero_count
cov_rate = coverage.sum() / nonzero_count
F1_approx = calculate_F1_score_graph(prec, selection=nonzero)

In [None]:
cov_rate

In [None]:
avg_len

In [None]:
F1_approx

In [None]:
def print_nonzero_intervals(nonzero, intervals, prec, X):
    # Intervals, prec, X are all in their original scale
    n, p = X.shape
    S = X.T @ X / n

    for i in range(p):
            for j in range(i+1,p):
                if nonzero[i,j]:
                    print("(",i,",",j,")", "selected")
                    print("Theta", "(",i,",",j,")", "interval:", intervals[i,j,:])
                    print("Theta", "(",i,",",j,")", prec[i,j])
                    print("S/n", "(",i,",",j,")", S[i,j])
print_nonzero_intervals(nonzero, intervals, prec, X)

In [None]:
(np.abs(prec) > 1e-10).sum() / (p**2)

In [None]:
((np.abs(prec) > 1e-10) * nonzero).sum() / nonzero.sum()

In [None]:
nonzero.sum() / (p**2)