In [1]:
using LinearAlgebra, CSV, Random, Tables, JLD, Base.Iterators, Printf, DataFrames

In [2]:
include("../Algorithm/utilities.jl")

refreshFile (generic function with 1 method)

In [3]:
# Load the matrix stored under the "normCommunities" key.
data = load("../Data/communities.jld")["normCommunities"]
n = size(data, 1);
N = n * n;
k = 7;
# Transposed matrix square root of `data` (for a symmetric input Julia computes it
# through the eigendecomposition).
raw_data = sqrt(data)';
# Diagonal entries of `data`.
norms = diag(data)
# Sanity check: largest entrywise gap between raw_data * raw_data and data.
maximum(abs, raw_data * raw_data - data)

6.661338147750939e-15

In [16]:
# Read the node trace and the bound trace as plain matrices.
nodes = Tables.matrix(CSV.File("../NC-K7-Trace-Nodes.csv"));
bounds = Tables.matrix(CSV.File("../NC-K7-Trace-Bounds.csv"));

In [18]:
# Node under inspection (50 and 300 were tried as alternatives).
node_id = 41
# Column of `nodes` for this node; its entries are compared against 1, 0 and -1
# throughout. NOTE(review): presumably 1 = selected, 0 = excluded, -1 = undecided.
y = nodes[:, node_id]
# Indices of the entries equal to 1.
selected_data = findall(==(1), y)
# How many entries equal 1, and how many more are needed to reach k.
i = count(==(1), y)
stillneed = k - i

5

In [20]:
# Show the column of `bounds` recorded for the current node.
bounds[:, node_id]

3-element Vector{Float64}:
 4.6440665883730805
 6.620524735335411
 7.0

In [21]:
# Sum of `norms` over the y == 1 entries plus `selectsorted` applied, with count
# `stillneed`, to `norms` with every non-(y == -1) entry zeroed.
# NOTE(review): `selectsorted` is not defined in this notebook; presumably it returns
# the `stillneed` largest values — confirm.
sum(norms[y .== 1]) + sum(selectsorted(norms .* (y .== -1), stillneed))

7.0

In [22]:
# Factor the selected columns once and reuse the factorization for the leading
# singular value and both leading singular vectors (previously the same SVD was
# recomputed three times).
selected_svd = svd(raw_data[:, selected_data])
s = maximum(selected_svd.S)
u = selected_svd.U[:, 1];
v = selected_svd.V[:, 1];
# Columns of the n-by-n identity picked out by the selected indices; sized from `n`
# rather than a hard-coded dimension.
D = Matrix(1.0I, n, n)[:, selected_data];
# Squared leading singular value.
s^2

1.9885850360066886

In [23]:
# Sanity check: squared norm of the selected columns applied to v, for comparison
# with s^2.
norm(raw_data[:, selected_data] * v)^2

1.9885850360066886

In [24]:
# Sanity check: squared norm of u' applied to the selected columns, for comparison
# with s^2.
norm(u' * raw_data[:, selected_data])^2

1.9885850360066892

In [25]:
# Sanity check: norm of the selected-by-selected block of `data` applied to v, for
# comparison with s^2.
norm(data[selected_data, selected_data] * v)

1.9885850360066917

In [26]:
# Squared entries of raw_data * u, with the y == 0 entries forced to zero.
rank_one_term = (raw_data * u) .^ 2
rank_one_term[y .== 0] .= 0
rank_one_term

101-element Vector{Float64}:
 0.0002994451594882808
 0.03066298611731982
 0.08577110013652099
 0.016049961181145968
 0.28098579234613125
 0.0017981504413838068
 0.0924737433896238
 0.04386307427228924
 0.049012359018644036
 0.05126719064815483
 0.0004268326165033283
 0.15506212547906217
 0.0
 ⋮
 0.001486141465211946
 6.0664179699725966e-5
 0.23179122454587167
 0.1694991155659717
 0.008684430324264608
 0.008617640897232319
 0.01895029707671571
 0.002639305133772915
 0.044335169463675404
 0.1254640250809044
 0.0010975082146965829
 0.0580169663416154

In [27]:
# Upper bound on the contribution to ||E||^2 from adding one variable.
# The bound comes from ||E||_F^2, which is itself upper bounded by
# ||(I-uu')MD||_F^2. Also, normCommunities has column norm 1 for every variable.
# TODO: add another upper bound based on taking k columns from this row
# into ||E||_F.
# Computed as norms minus rank_one_term, with the y == 0 entries forced to zero.
residual_term = norms .- rank_one_term
residual_term[y .== 0] .= 0
residual_term

101-element Vector{Float64}:
 0.9997005548405117
 0.9693370138826802
 0.914228899863479
 0.983950038818854
 0.7190142076538688
 0.9982018495586162
 0.9075262566103762
 0.9561369257277108
 0.950987640981356
 0.9487328093518451
 0.9995731673834967
 0.8449378745209378
 0.0
 ⋮
 0.998513858534788
 0.9999393358203003
 0.7682087754541284
 0.8305008844340283
 0.9913155696757354
 0.9913823591027677
 0.9810497029232843
 0.9973606948662271
 0.9556648305363246
 0.8745359749190956
 0.9989024917853034
 0.9419830336583846

In [28]:
"""
    frobenius_rows_k(M, y; target_size=k)

Return, for every row of `M`, an upper bound on the sum of squared entries that the
row can contribute once the support is completed to `target_size` variables.

For a row whose `y` entry is nonzero the bound is made of:
- the squared entries in all columns with `y == 1`;
- the row's own squared diagonal entry when the row itself has `y == -1` (taking that
  row uses up one of the remaining slots);
- the largest remaining squared entries among the other columns with `y == -1`, as
  many as there are slots left.

Rows with `y == 0` get `0`. Columns with `y == 0` never contribute.

# Arguments
- `M`: square matrix; the number of rows is taken from `M` itself.
- `y`: vector with one entry per row, compared against `1`, `0` and `-1`.

# Keywords
- `target_size`: total support size; defaults to the global `k`.

# Returns
A `Vector{Float64}` with one bound per row of `M`.
"""
function frobenius_rows_k(M, y; target_size=k)
    squared = M .^ 2
    selected = y .== 1
    candidate = y .== -1
    stillneed = target_size - count(selected)
    row_quantities = zeros(size(squared, 1))
    for row_index in axes(squared, 1)
        y[row_index] == 0 && continue
        row = squared[row_index, :]
        row_sum = sum(row[selected])
        # Only columns with y == -1 may still enter the support.
        pool = copy(candidate)
        row_stillneed = stillneed
        if y[row_index] == -1
            # Selecting this row also selects its diagonal entry.
            row_sum += row[row_index]
            pool[row_index] = false
            row_stillneed -= 1
        end
        # Never request more entries than are available, nor a negative number.
        take = clamp(row_stillneed, 0, count(pool))
        if take > 0
            row_sum += sum(partialsort(row[pool], 1:take, rev=true))
        end
        row_quantities[row_index] = row_sum
    end
    return row_quantities
end

frobenius_rows_k (generic function with 1 method)

In [29]:
# frobenius_rows_k(data - raw_data * u * u' * raw_data, y)

In [30]:
# Count the y == -1 variables for which the row bound from frobenius_rows_k, applied to
# data - raw_data * u * u' * raw_data, is strictly smaller than residual_term.
sum((y .== -1) .& (frobenius_rows_k(data - raw_data * u * u' * raw_data, y) .< residual_term))

8

In [31]:
# Threshold: the `stillneed`-th largest rank_one_term among the y == -1 entries (all
# other entries are zeroed before the ascending sort). Mark entries at or above it.
# NOTE(review): ties at the threshold would mark more than `stillneed` entries, and
# stillneed == 0 indexes past the end — confirm neither can occur here.
lb_proj_nodes_extra = rank_one_term .>= sort(rank_one_term .* Array{Float64}(y .== -1))[end-stillneed+1]
# Keep only marks that fall on y == -1 entries.
lb_proj_nodes_extra = lb_proj_nodes_extra .& (y .== -1)
# Chosen support: the y == 1 entries plus the extra picks.
lb_proj_nodes = lb_proj_nodes_extra .| (y .== 1)
# Sum of rank_one_term over the chosen support.
lb_proj = sum(rank_one_term[lb_proj_nodes])
# Too slow to compute!
# Squared largest singular value of the chosen columns of raw_data.
lb_diag = maximum(svd(raw_data[:, lb_proj_nodes]).S) ^ 2
lb_proj, lb_diag

(6.230223767217365, 6.511153616891437)

In [32]:
# Show the column of `bounds` for the current node again, next to the lower bounds.
bounds[:, node_id]

3-element Vector{Float64}:
 4.6440665883730805
 6.620524735335411
 7.0

In [33]:
# lambda_1: second entry of this node's bounds column.
lambda_1 = bounds[2, node_id]
# lambda_2: third bounds entry minus lb_diag.
lambda_2 = bounds[3, node_id] - lb_diag
# lambda_2_lb: third bounds entry minus lambda_1.
lambda_2_lb = bounds[3, node_id] - lambda_1
lambda_2_lb, lambda_2

(0.37947526466458914, 0.4888463831085632)

In [34]:
# lambda_2 = svd(raw_data[:, selected_data]).S[2] + sum(selectsorted(residual_term, stillneed))

In [35]:
# Initialization. Loop until convergence.
# Bad idea - lambda_2 bounds ||E|| from below, not above.
residual_ub = lambda_2
# Because of that, filtering variables by residual_term .<= residual_ub is not
# applied: every variable stays selected.
resid_var_sel = trues(size(y))
# Displayed: the bound and the number of y == -1 variables that were filtered out.
residual_ub, sum((y .== -1) .& .!resid_var_sel)

(0.4888463831085632, 0)

In [36]:
# ||Sigma||_F >= sqrt(||A||_F^2 + ||E||_F^2)???
# Compared on the leading k-by-k block: Sigma is the block of `data`, A the matching
# block of the rank-one term, and E their difference.
let sigma_block = data[1:k, 1:k],
    rank_one_block = raw_data[1:k, :] * u * u' * raw_data[:, 1:k]

    norm(sigma_block), sqrt(norm(rank_one_block)^2 + norm(sigma_block - rank_one_block)^2)
end

(3.2734747142193443, 3.1674211559981784)

In [37]:
# For each row of `data`: sum of squared entries over the y == 1 columns, plus
# `selectsorted` (count `stillneed`) over the squared entries with every
# non-(y == -1) column zeroed.
a_squared_program = mapslices(
    (row) -> sum(row[y .== 1].^2) + sum(selectsorted(row.^2 .* (y .== -1), stillneed)),
    data,
    dims=2)[:, 1]
# lambda_1^2 >= sum_i rank_one_term_i^2 >= sum_i rank_one_term_i^2 + sum_(i!=j) 2 rank_one_term_i rank_one_term_j
# NOTE(review): rank_one_term is nonnegative, so the second inequality above looks
# reversed — confirm the intended statement.
e_squared_program = a_squared_program - rank_one_term.^2
maximum(e_squared_program)

5.469445648769899

In [38]:
# Contribution to ||E||^2 from the row sums (squared Frobenius norm)
# E is `data` minus the rank-one term built from u.
E = data - (raw_data * u * u' * raw_data)
# Same per-row program as for `data`, but the extra columns are restricted to y == -1
# variables that are still flagged in resid_var_sel.
e_squared_program = mapslices(
    (row) -> sum(row[y .== 1].^2) + sum(selectsorted(row.^2 .* (resid_var_sel .& (y .== -1)), stillneed)),
    E,
    dims=2)[:, 1]
maximum(e_squared_program)

4.4609160327705375

In [39]:
# Upper bound on ||E||_F using ||A||_F
# a_squared_program summed over the y == 1 rows plus `selectsorted` (count
# `stillneed`) over the y == -1 rows, minus lb_diag^2, then square-rooted.
sqrt(sum(a_squared_program[y .== 1]) + sum(selectsorted(a_squared_program .* (y .== -1), stillneed)) - lb_diag^2)

0.38238387256342726

In [40]:
# Upper bound on ||E||_F
# e_squared_program summed over the y == 1 rows plus `selectsorted` (count
# `stillneed`) over the y == -1 rows, then square-rooted. Overwrites lambda_2.
lambda_2 = sqrt(sum(e_squared_program[y .== 1]) + sum(selectsorted(e_squared_program .* (y .== -1), stillneed)))

4.622779319886431

In [41]:
# Upper bound performed poorly
# Reset lambda_2 to the third bounds entry minus lb_diag.
lambda_2 = bounds[3, node_id] - lb_diag

0.4888463831085632

In [42]:
# residual_term summed over the y == 1 entries plus `selectsorted` (count `stillneed`)
# over the y == -1 entries.
sum(residual_term[y .== 1]) + sum(selectsorted(residual_term[y .== -1], stillneed))

5.009906394681007

In [43]:
# Multipliers applied to e_squared_program in the linearized program below.
# mult_parallel is the reciprocal of lambda_1.
mult_parallel = 1. / lambda_1
# mult_perp scales lambda_2 by the squared gap between lb_proj and lambda_2.
mult_perp = lambda_2 / (lb_proj - lambda_2)^2
mult_linearize = mult_parallel + mult_perp
# Per-variable objective: rank-one term plus the weighted residual term.
linear_program = rank_one_term + mult_linearize * e_squared_program

101-element Vector{Float64}:
 0.7402556644415275
 0.6092168011964461
 0.4188537600563196
 0.4464603644313368
 0.5084851222819133
 0.6139812621846495
 0.5461652732115176
 0.5727035809021997
 0.5847931532364803
 0.5345380867652912
 0.7354799333590984
 0.32922500559393647
 0.06581892739181085
 ⋮
 0.6111544845758357
 0.4639871124942503
 0.7206220817410615
 0.46599589946560355
 0.4503847328062839
 0.45637789055229383
 0.44689139917136106
 0.4807723579061648
 0.38724499424874453
 0.3686715541035246
 0.308404669455784
 0.3995239730634894

In [44]:
# Linear program solution
# linear_program summed over the y == 1 entries plus `selectsorted` (count
# `stillneed`) over the y == -1 entries.
sum(linear_program[y .== 1]) + sum(selectsorted(linear_program[y .== -1], stillneed))

6.318477149077138

In [45]:
# Conservative program in case we cannot bound sin theta_perp.
# Uses only mult_parallel as the weight on e_squared_program and adds lambda_2 to the
# resulting program value.
linear_program_lam_2 = rank_one_term + mult_parallel .* e_squared_program
lambda_2 + sum(linear_program_lam_2[y .== 1]) + sum(selectsorted(linear_program_lam_2[y .== -1], stillneed))

6.799433303364484

In [55]:
# C11 = a, C22 = b
# tan theta = -sqrt(b/a)
# max(a+b + c sin^2 theta, d cos^2 theta)
# sin^2(arctan(sqrt(b/a))) = b/(a + b) :D
# Variant of the linear program in which rank_one_term is rescaled by a factor built
# from mult_parallel and mult_perp; the e_squared_program weight is unchanged.
linear_program = rank_one_term * sqrt(mult_perp / (mult_parallel + mult_perp)) * (sqrt(mult_parallel) + sqrt(mult_perp)) / (mult_parallel + mult_perp) + mult_linearize * e_squared_program
# Program value: y == 1 entries plus `selectsorted` (count `stillneed`) over y == -1.
sum(linear_program[y .== 1]) + sum(selectsorted(linear_program[y .== -1], stillneed))

5.8376507339192605

In [116]:
# Row vector: [number of y == -1 variables whose linear_program value is strictly below
# the k-th largest rank_one_term, total number of y == -1 variables].
[sum((sort(rank_one_term, rev=true)[k] .> linear_program) .& (y .== -1)) sum(y .== -1)]

1×2 Matrix{Int64}:
 13  81

In [117]:
# Iterative pruning of the y == -1 variables: recompute the per-row program on E using
# only the surviving candidates, rebuild linear_program, and keep a candidate only if
# its linear_program value is at least the k-th largest rank_one_term. Stop once an
# iteration removes nothing.
# NOTE(review): the loop assigns e_squared_program and linear_program; presumably this
# relies on notebook scoping to update the globals — confirm before moving to a script.
resid_var_sel = y .== -1
for iter in countfrom()
    @printf("rank_one_term iter %d\n", iter)
    # Row program on E restricted to the candidates still flagged in resid_var_sel.
    e_squared_program = mapslices(
        (row) -> sum(row[y .== 1].^2) + sum(selectsorted(row.^2 .* (resid_var_sel .& (y .== -1)), stillneed)),
        E,
        dims=2)[:, 1]
    linear_program = rank_one_term + mult_linearize * e_squared_program
    # Remember the candidate count to detect convergence.
    num_selected = sum(resid_var_sel)
    resid_var_sel .&= sort(rank_one_term, rev=true)[k] .<= linear_program
    if sum(resid_var_sel) == num_selected
        break
    end
end

rank_one_term iter 1
rank_one_term iter 2


In [118]:
# Number of variables still flagged in resid_var_sel after pruning.
sum(resid_var_sel)

68

In [119]:
# Linear program solution
# Evaluated with the linear_program left by the pruning loop: y == 1 entries plus
# `selectsorted` (count `stillneed`) over the y == -1 entries.
sum(linear_program[y .== 1]) + sum(selectsorted(linear_program[y .== -1], stillneed))

1.6338898195663933