In [53]:
using LinearAlgebra, CSV, Random, Tables, JLD, Base.Iterators, Printf

In [3]:
include("../Algorithm/utilities.jl")

refreshFile (generic function with 1 method)

In [4]:
# Load the matrix stored under the "communities" key of the JLD file.
data = load("../Data/communities.jld")["communities"]
# Factor used by the later cells: adjoint of the matrix square root of `data`.
raw_data = sqrt(data)';
n = size(data, 1);
N = n * n;
# Diagonal entries of `data`.
norms = diag(data)
# Target count; later cells use `k - (number of entries equal to 1 in y)`.
k = 5;
# Reconstruction check: largest entrywise gap between raw_data^2 and `data`
# (expected to be at round-off level).
maximum(abs, raw_data * raw_data - data)

6.140921104957897e-16

In [5]:
# Read the headerless CSV dumps into plain matrices; later cells index both of
# them by column with `node_id`.
nodes = Tables.matrix(CSV.File("../Co-K5-Trace-Nodes.csv"; header=false));
bounds = Tables.matrix(CSV.File("../Co-K5-Trace-Bounds.csv"; header=false));

In [6]:
# Node under inspection (other ids tried earlier: 50, 300).
node_id = 40
# Column of `nodes` for this node; the cells below treat entries equal to
# 1, -1 and 0 differently.
y = nodes[:, node_id]
# Indices of the entries equal to 1, and how many there are.
selected_data = findall(==(1), y)
i = count(==(1), y)
# Number of additional picks needed to reach k.
stillneed = k - i

2

In [7]:
# Show the bound values stored for this node (column `node_id` of `bounds`).
bounds[:, node_id]

3-element Array{Float64,1}:
 0.19840666121229678
 0.3629025164807017
 0.4316969652057027

In [8]:
# Diagonal entries of the y == 1 variables plus the `selectsorted` pick of
# `stillneed` entries from the diagonal masked to y == -1.
# NOTE(review): `selectsorted` lives in utilities.jl; presumably it returns the
# largest entries — confirm. The printed value matches bounds[3, node_id].
sum(norms[y .== 1]) + sum(selectsorted(norms .* (y .== -1), stillneed))

0.4316969652057027

In [9]:
# Factor the selected columns once and reuse the result (the SVD was
# previously recomputed for each of s, u and v).
selected_svd = svd(raw_data[:, selected_data])
# Largest singular value with its first left / right singular vectors.
s = maximum(selected_svd.S)
u = selected_svd.U[:, 1];
v = selected_svd.V[:, 1];
# Columns of the n-by-n identity picked out by `selected_data`
# (sized from `n` instead of a hard-coded 101).
D = Matrix(1.0I, n, n)[:, selected_data];
s^2

0.14185918642419382

In [10]:
# Sanity check: squared norm of (selected columns) * v, with v the first right
# singular vector; the printed value agrees with s^2 up to round-off.
norm(raw_data[:, selected_data] * v)^2

0.14185918642419373

In [11]:
# Sanity check: squared norm of u' * (selected columns), with u the first left
# singular vector; the printed value agrees with s^2 up to round-off.
norm(u' * raw_data[:, selected_data])^2

0.1418591864241937

In [12]:
# Norm of the selected principal submatrix of `data` applied to v; the printed
# value coincides with s^2.
norm(data[selected_data, selected_data] * v)

0.14185918642419382

In [13]:
# Squared entries of raw_data * u, one per variable.
rank_one_term = (raw_data * u) .^ 2
# Variables whose entry in this node's column is 0 contribute nothing.
rank_one_term[findall(iszero, view(nodes, :, node_id))] .= 0
rank_one_term

101-element Array{Float64,1}:
 7.926078166037157e-6
 0.00042107519671213234
 0.010091876290498288
 0.0
 0.011482437882389994
 0.00013178020948110015
 0.0023632781250717723
 0.0009055802857870314
 0.0012990605554894848
 0.0016750788941365209
 ⋮
 0.0063246338140331565
 0.00029115092252889475
 0.0005201627269077583
 0.0008469155188951991
 3.40177036420949e-5
 0.0016327144857154394
 0.005953773016861563
 0.00018478862022754878
 0.0066479897694745995

In [14]:
# Author's notes: upper bound on what adding one variable contributes to
# ||E||^2. It comes from bounding ||E||_F^2 by ||(I - uu')MD||_F^2. The notes
# also state that `normCommunities` (not defined in this view) has unit column
# norm for every variable.
# TODO (from the notes): add another upper bound based on taking k columns
# from this row into ||E||_F.
residual_term = norms - rank_one_term
# Entries with y == 0 are zeroed out.
residual_term[findall(iszero, y)] .= 0
residual_term

101-element Array{Float64,1}:
 0.016097164406534425
 0.02638209318953039
 0.054141001469726655
 0.0
 0.03214735697131307
 0.05392057454670922
 0.02172243664593466
 0.01970490065528001
 0.02642470524769024
 0.030432363176178794
 ⋮
 0.03542551701477036
 0.032597381433193566
 0.03968867072276389
 0.038444722591622994
 0.011947288809978047
 0.03961357916191083
 0.04651232744206663
 0.05757289255173794
 0.04763397466371458

In [15]:
"""
    frobenius_rows_k(M, y, budget=k)

Return one value per row of `M`, built from the squared entries of that row.

For every row `r` with `y[r] != 0` the value is the sum of

- the squared entries in the columns where `y == 1`,
- the squared diagonal entry `M[r, r]^2` when `y[r] == -1` (this uses up one
  unit of the remaining budget), and
- the largest remaining squared entries of the row, as many as the remaining
  budget allows (never more than the row holds).

Rows with `y[r] == 0` get `0.0`. The remaining budget is `budget` minus the
number of entries of `y` equal to 1; `budget` defaults to the global `k`, so
the two-argument call behaves as before.

NOTE(review): the "largest remaining" pick also considers columns with
`y == 0`; other cells mask candidates with `y .== -1`. Confirm whether that
difference is intended.

# Arguments
- `M`: matrix; the diagonal access `row[r]` assumes it has at least as many
  columns as rows.
- `y`: vector with entries 1, -1 or 0, indexed like the columns of `M`.
- `budget`: total number of entries to account for per row.

# Returns
A `Vector{Float64}` with one entry per row of `M`.
"""
function frobenius_rows_k(M, y, budget=k)
    squared = M .^ 2
    fixed = y .== 1
    stillneed = budget - count(fixed)
    # Size the result from M itself rather than from a global dimension.
    row_quantities = zeros(size(squared, 1))
    for row_index in axes(squared, 1)
        y[row_index] == 0 && continue
        row = squared[row_index, :]  # indexing copies, so `row` may be mutated
        row_sum = sum(row[fixed])
        row[fixed] .= 0
        if y[row_index] == -1
            # An undecided row contributes its own diagonal entry first.
            row_sum += row[row_index]
            row[row_index] = 0
            row_stillneed = stillneed - 1
        else
            row_stillneed = stillneed
        end
        # Clamp so an exhausted or oversized budget cannot index out of bounds.
        take = clamp(row_stillneed, 0, length(row))
        if take > 0
            row_sum += sum(partialsort!(row, 1:take; rev=true))
        end
        row_quantities[row_index] = row_sum
    end
    return row_quantities
end

frobenius_rows_k (generic function with 1 method)

In [16]:
# frobenius_rows_k(data - raw_data * u * u' * raw_data, y)

In [17]:
# Number of y == -1 variables whose row-wise value from `frobenius_rows_k`
# is strictly below `residual_term`.
let row_values = frobenius_rows_k(data - raw_data * u * u' * raw_data, y)
    count((y .== -1) .& (row_values .< residual_term))
end

81

In [18]:
# Pick the y == -1 variables whose rank_one_term reaches the `stillneed`-th
# largest masked value, then add the y == 1 variables.
# NOTE(review): `end-stillneed+1` indexes past the end when stillneed == 0, and
# ties at the threshold can select more than `stillneed` variables.
lb_proj_nodes_extra = rank_one_term .>= sort(rank_one_term .* Array{Float64}(y .== -1))[end-stillneed+1]
lb_proj_nodes_extra = lb_proj_nodes_extra .& (y .== -1)
lb_proj_nodes = lb_proj_nodes_extra .| (y .== 1)
# Sum of the rank-one terms over the chosen variables.
lb_proj = sum(rank_one_term[lb_proj_nodes])
# Too slow to compute!
# Squared largest singular value of the chosen columns (needs a full SVD).
lb_diag = maximum(svd(raw_data[:, lb_proj_nodes]).S) ^ 2
lb_proj, lb_diag

(0.2202679971348654, 0.2732082054995253)

In [19]:
# Stored bound values for this node again, for comparison with lb_proj and
# lb_diag.
bounds[:, node_id]

3-element Array{Float64,1}:
 0.19840666121229678
 0.3629025164807017
 0.4316969652057027

In [20]:
# lambda_1: second stored bound for this node.
lambda_1 = bounds[2, node_id]
# Third stored bound minus lb_diag (lambda_2) and minus lambda_1 (lambda_2_lb).
lambda_2, lambda_2_lb = bounds[3, node_id] .- (lb_diag, lambda_1)
(lambda_2_lb, lambda_2)

(0.06879444872500101, 0.15848875970617743)

In [21]:
# lambda_2 = svd(raw_data[:, selected_data]).S[2] + sum(selectsorted(residual_term, stillneed))

In [22]:
# Initialization for the iterative screening (loop until convergence).
# Author's caveat: bad idea - lambda_2 bounds ||E|| from below, not above.
residual_ub = lambda_2
# The threshold test `residual_term .<= residual_ub` (contribution to Tr E
# below residual_ub) was computed and immediately overwritten; that dead store
# is dropped and every variable starts out eligible.
resid_var_sel = trues(size(y))
# Second entry: number of y == -1 variables excluded by the selection.
residual_ub, count((y .== -1) .& .!resid_var_sel)

(0.15848875970617743, 0)

In [45]:
# Open question from the author: is ||Sigma||_F >= sqrt(||A||_F^2 + ||E||_F^2)?
# Checked numerically on the leading k-by-k block, where A is the rank-one part
# built from u and E = Sigma - A.
let Sigma = data[1:k, 1:k], A = raw_data[1:k, :] * u * u' * raw_data[:, 1:k]
    norm(Sigma), sqrt(norm(A)^2 + norm(Sigma - A)^2)
end

(0.1282440427818185, 0.1176001177218695)

In [23]:
# Per-row quantity over `data`: squared entries in the y == 1 columns plus the
# `selectsorted` pick of `stillneed` squared entries masked to y == -1.
# NOTE(review): `selectsorted` is defined in utilities.jl; presumably it
# returns the largest entries — confirm.
a_squared_program = mapslices(
    (row) -> sum(row[y .== 1].^2) + sum(selectsorted(row.^2 .* (y .== -1), stillneed)),
    data,
    dims=2)[:, 1]
# lambda_1^2 >= sum_i rank_one_term_i^2 >= sum_i rank_one_term_i^2 + sum_(i!=j) 2 rank_one_term_i rank_one_term_j
# NOTE(review): for nonnegative terms the second inequality above looks
# reversed — verify before relying on it.
e_squared_program = a_squared_program - rank_one_term.^2
maximum(e_squared_program)

0.04451876283731441

In [64]:
# Contribution to ||E||^2 from the row sums (squared Frobenius norm)
# E is `data` minus the rank-one part built from u.
E = data - (raw_data * u * u' * raw_data)
# Same per-row quantity as for `data`, but over E and with the undecided
# columns further restricted by `resid_var_sel`. This overwrites the earlier
# `e_squared_program`.
e_squared_program = mapslices(
    (row) -> sum(row[y .== 1].^2) + sum(selectsorted(row.^2 .* (resid_var_sel .& (y .== -1)), stillneed)),
    E,
    dims=2)[:, 1]
maximum(e_squared_program)

0.026386179743713507

In [25]:
# Upper bound on ||E||_F using ||A||_F
# Adds the `a_squared_program` values of the y == 1 rows and the `selectsorted`
# pick of `stillneed` values masked to y == -1, subtracts lb_diag^2, and takes
# the square root.
sqrt(sum(a_squared_program[y .== 1]) + sum(selectsorted(a_squared_program .* (y .== -1), stillneed)) - lb_diag^2)

0.12443300672604643

In [26]:
# Upper bound on ||E||_F
# Built from `e_squared_program` the same way as the ||A||_F-based bound, but
# without the lb_diag^2 correction. This overwrites `lambda_2`.
lambda_2 = sqrt(sum(e_squared_program[y .== 1]) + sum(selectsorted(e_squared_program .* (y .== -1), stillneed)))

0.18117842668099743

In [27]:
# Upper bound performed poorly
# Reset lambda_2 to the third stored bound minus lb_diag.
lambda_2 = bounds[3, node_id] - lb_diag

0.15848875970617743

In [28]:
# Sum of `residual_term` over the y == 1 variables plus the `selectsorted` pick
# of `stillneed` values among the y == -1 variables (selected by indexing here
# rather than by masking).
sum(residual_term[y .== 1]) + sum(selectsorted(residual_term[y .== -1], stillneed))

0.24795709056878384

In [65]:
# Multipliers applied to `e_squared_program` in the linearized objective.
# NOTE(review): mult_perp divides by (lb_proj - lambda_2)^2, which is
# undefined when lb_proj == lambda_2.
mult_parallel = 1. / lambda_1
mult_perp = lambda_2 / (lb_proj - lambda_2)^2
mult_linearize = mult_parallel + mult_perp
# Per-variable objective: rank-one term plus the scaled residual term.
linear_program = rank_one_term + mult_linearize * e_squared_program

101-element Array{Float64,1}:
 0.01779264355433725
 0.057995320390979814
 0.18262281400917196
 0.1460602976499621
 0.08803754640128536
 0.2392564293228971
 0.04348631750007592
 0.03736412754088799
 0.05480160623763157
 0.07411834186327422
 ⋮
 0.10085579242512224
 0.08710367055087934
 0.10924901820611159
 0.10349046799096251
 0.003882057589549799
 0.10449036260751611
 0.1293754007046165
 0.18853743075691745
 0.15193791934247894

In [66]:
# Linear program solution
# `linear_program` over the y == 1 variables plus the `selectsorted` pick of
# `stillneed` values among the y == -1 variables.
sum(linear_program[y .== 1]) + sum(selectsorted(linear_program[y .== -1], stillneed))

1.6338898195663925

In [67]:
# Conservative program in case we cannot bound sin theta_perp.
# Uses only `mult_parallel` as the multiplier and adds lambda_2 to the total.
linear_program_lam_2 = rank_one_term + mult_parallel .* e_squared_program
lambda_2 + sum(linear_program_lam_2[y .== 1]) + sum(selectsorted(linear_program_lam_2[y .== -1], stillneed))

0.4546649113544162

In [32]:
# Row vector: how many y == -1 variables have a `linear_program` value strictly
# below the k-th largest rank-one term, next to the number of y == -1
# variables.
let kth_largest = sort(rank_one_term, rev=true)[k], undecided = y .== -1
    [count((kth_largest .> linear_program) .& undecided) count(undecided)]
end

1×2 Array{Int64,2}:
 13  81

In [57]:
# Iterative screening: start from all y == -1 variables and repeatedly drop
# those whose `linear_program` value falls below the k-th largest rank-one
# term, recomputing the residual program over the survivors, until a pass
# removes nothing.
resid_var_sel = y .== -1
# k-th largest rank-one term; it does not change across iterations.
kth_rank_one = partialsort(rank_one_term, k; rev=true)
for iter in countfrom()
    # Assign to the notebook-level variables explicitly so the updated values
    # are visible after the loop whether or not soft scope is in effect.
    global e_squared_program, linear_program
    @printf("rank_one_term iter %d\n", iter)
    e_squared_program = mapslices(
        (row) -> sum(row[y .== 1].^2) + sum(selectsorted(row.^2 .* (resid_var_sel .& (y .== -1)), stillneed)),
        E,
        dims=2)[:, 1]
    linear_program = rank_one_term + mult_linearize * e_squared_program
    num_selected = count(resid_var_sel)
    resid_var_sel .&= kth_rank_one .<= linear_program
    # Converged: this pass eliminated no further variable.
    if count(resid_var_sel) == num_selected
        break
    end
end

rank_one_term iter 1
rank_one_term iter 2


In [58]:
# Number of variables still selected after the screening loop.
count(resid_var_sel)

68

In [60]:
# Linear program solution
# Same total as before the screening loop: y == 1 values plus the
# `selectsorted` pick of `stillneed` values among the y == -1 variables.
sum(linear_program[y .== 1]) + sum(selectsorted(linear_program[y .== -1], stillneed))

1.6338898195663925