In [38]:
using Combinatorics, LinearAlgebra, CSV, Tables

In [39]:
data = CSV.File("../NC-Data.csv") |> Tables.matrix;
# Matrix sqrt: via diagonalization and sqrt eigenvalues
raw_data = sqrt(data);
n = size(data, 1);
# All close.
maximum(abs.(raw_data * raw_data .- data))

6.661338147750939e-15

In [40]:
size(raw_data)

(101, 101)

In [54]:
# Say we can use the overall data's principal component.
U = svd(raw_data).U[:, 1]

101-element Vector{Float64}:
 -0.017151166317369033
  0.041436252327551444
 -0.060581505510835916
  0.027828562038693706
  0.11679206787063857
 -0.0018055395671212153
 -0.060090501982123914
 -0.04085328055047178
 -0.038076512388390986
 -0.03445210651101385
 -0.009637582314310613
  0.0677846593389534
  0.18400222069949823
  ⋮
 -0.026041055496287043
 -0.005489179487791176
  0.09494904903761282
 -0.07110007369500787
  0.014372695684782145
 -0.014627111067605114
 -0.00747315762177969
 -0.023892133184153022
  0.03586314503268791
  0.04945410248357551
 -0.016432604943053007
 -0.05019485002081419

In [42]:
# Now pick a reasonable subset of data, and k value.
data = data[70:90, 70:90]
raw_data = raw_data[:, 70:90]
n = size(data, 1)
k = 5

5

In [43]:
raw_data_E = sqrt(Hermitian(raw_data' * (I - U * U') * raw_data) + 1e-12 * I)

21×21 Symmetric{Float64, Matrix{Float64}}:
  0.628297     -0.255353    0.0214677   …   0.0025562   -0.000136602
 -0.255353      0.791214   -0.0290504       0.0207853   -0.0186878
  0.0214677    -0.0290504   0.834704       -0.00858496   0.368667
 -0.0924483     0.0402467  -0.155094       -0.0573986    0.0382871
 -0.202821      0.127652   -0.00549642     -0.00197622  -0.026747
 -0.0871262     0.0352271   0.093584    …   0.0616893    0.0746238
 -0.0986586     0.0772708  -0.0193633       0.167492    -0.0296724
 -0.0311041     0.0221856   0.028737       -0.177724    -0.0424524
 -0.0743755     0.0384544  -0.0715118       0.0212823   -0.0694625
 -0.0289611     0.0126837  -0.0433388       0.0553444   -0.00062023
  0.0808028    -0.0556273   0.0243537   …  -0.0021505    0.0534051
  0.0847395    -0.0577619   0.0327161      -0.00948647   0.0615863
  0.0873286    -0.060943    0.051096       -0.016972     0.0774627
  0.0891786    -0.0694027   0.0497387      -0.00478771   0.046824
  0.0704872    -0.0

In [44]:
actual_sparse_pca = []
A_op_norm = []
E_op_norm = []
for lookup_D_key in combinations(1:n, k)
    push!(
        actual_sparse_pca,
        maximum(svd(raw_data[:, lookup_D_key]).S) ^ 2)
    push!(
        A_op_norm,
        # Eigenvalue comes from the trace.
        sum((U' * raw_data[:, lookup_D_key]) .^ 2))
    push!(
        E_op_norm,
        maximum(svd(raw_data_E[:, lookup_D_key]).S) ^ 2)
end

In [45]:
[actual_sparse_pca (A_op_norm+E_op_norm) A_op_norm E_op_norm]

20349×4 Matrix{Any}:
 2.76907  2.80181  1.36041   1.4414
 2.36656  2.51827  1.22022   1.29804
 2.20451  2.36651  0.969577  1.39694
 2.21674  2.29518  0.969976  1.3252
 2.50036  2.91444  1.60242   1.31202
 2.37554  2.54682  1.2445    1.30232
 2.30962  2.75629  1.3758    1.3805
 2.29311  2.75161  1.35757   1.39405
 2.27446  2.75437  1.34033   1.41404
 2.30508  2.8367   1.43239   1.40431
 2.32686  2.85374  1.46334   1.3904
 2.32227  2.85039  1.45627   1.39412
 2.35031  2.84833  1.47539   1.37295
 ⋮                           
 2.52774  2.95695  1.19319   1.76376
 2.57205  3.2405   1.23689   2.00361
 2.55754  3.31499  1.31423   2.00076
 2.00263  2.73117  1.26924   1.46193
 2.52299  2.95307  1.2123    1.74078
 1.80684  2.50577  0.778088  1.72769
 2.57792  3.25105  1.22982   2.02123
 2.56149  3.32598  1.30716   2.01882
 1.9961   2.74692  1.26218   1.48475
 2.52237  2.95959  1.20523   1.75435
 1.83307  2.52441  0.771021  1.75339
 1.80331  2.51304  0.790132  1.7229

In [46]:
# Now what we would actually like to do is find optimal 3-sparse PCA
# (hope that the heuristic that we used produced good 3-sparse PCA).
# Diagonalize that system.
k_3_sparse_pca = []
k_3_sparse_U = zeros(size(raw_data, 1), binomial(n, 3))
index = 1
for lookup_D_key in combinations(1:n, 3)
    push!(
        k_3_sparse_pca,
        maximum(svd(raw_data[:, lookup_D_key]).S) ^ 2)
    k_3_sparse_U[:, index] = (
        svd(raw_data[:, lookup_D_key]).U[:, 1])
    index += 1
end
size(k_3_sparse_U)

(101, 1330)

In [55]:
U = k_3_sparse_U[:, findmax(k_3_sparse_pca)[2]];

In [56]:
raw_data_E = sqrt(Hermitian(raw_data' * (I - U * U') * raw_data) + 1e-12 * I)

21×21 Symmetric{Float64, Matrix{Float64}}:
  0.778177    -0.356971      0.0904329    …   0.0105525    0.0702265
 -0.356971     0.863772     -0.077093         0.0135079   -0.0668081
  0.0904329   -0.077093      0.866867        -0.00445876   0.401299
 -0.151063     0.0843231    -0.18343         -0.0613927    0.0058828
 -0.359384     0.233406     -0.0774474       -0.0105799   -0.0998231
  0.0293892   -0.0436009     0.14693      …   0.066768     0.132436
 -0.0861393    0.0663386    -0.0126184        0.168701    -0.0188357
 -0.0747341    0.0522152     0.00790144      -0.182382    -0.0600338
  0.103044    -0.0792123     0.00887512       0.0272303    0.0213968
  0.0936149   -0.0713208     0.0133438        0.0621433    0.0586463
 -0.00377799   0.0037544    -0.0123314    …   0.00618196  -0.0105847
  0.0016033   -0.000162482  -0.00287106       2.33203e-5  -0.00216584
  0.00216891  -0.00359499    0.0152335       -0.00621213   0.0127753
 -0.0154196    0.0045869     0.00128136      -0.00708331  -0.

In [49]:
actual_sparse_pca = []
A_op_norm = []
E_op_norm = []
for lookup_D_key in combinations(1:n, k)
    push!(
        actual_sparse_pca,
        maximum(svd(raw_data[:, lookup_D_key]).S) ^ 2)
    push!(
        A_op_norm,
        # Eigenvalue comes from the trace.
        sum((U' * raw_data[:, lookup_D_key]) .^ 2))
    push!(
        E_op_norm,
        maximum(svd(raw_data_E[:, lookup_D_key]).S) ^ 2)
end

In [50]:
[actual_sparse_pca (A_op_norm+E_op_norm) A_op_norm E_op_norm]

20349×4 Matrix{Any}:
 2.76907  2.80222  0.170855  2.63137
 2.36656  2.42178  0.219294  2.20249
 2.20451  2.31778  0.214636  2.10315
 2.21674  2.24519  0.135882  2.10931
 2.50036  2.70598  0.47497   2.23101
 2.37554  2.4163   0.196539  2.21976
 2.30962  3.21912  1.12478   2.09434
 2.29311  3.2286   1.13483   2.09377
 2.27446  3.21701  1.12264   2.09438
 2.30508  2.93557  0.838269  2.09731
 2.32686  3.04325  0.944165  2.09908
 2.32227  3.06578  0.967999  2.09778
 2.35031  3.00793  0.901956  2.10597
 ⋮                           
 2.52774  3.11161  1.99429   1.11732
 2.57205  3.24842  1.93492   1.3135
 2.55754  3.18806  1.93452   1.25355
 2.00263  2.75621  1.5815    1.17471
 2.52299  3.0465   1.92825   1.11825
 1.80684  2.51977  1.16881   1.35096
 2.57792  3.27648  1.95875   1.31773
 2.56149  3.21435  1.95835   1.256
 1.9961   2.78141  1.60533   1.17608
 2.52237  3.07378  1.95208   1.1217
 1.83307  2.54726  1.19264   1.35462
 1.80331  2.47833  1.1266    1.35173

In [51]:
maximum(actual_sparse_pca)

4.74708500443918

In [52]:
function top_k(arr, k)
    sort!(arr)
    arr[1 + size(arr, 1) - k:end]
end

top_k([1; 10; 5; 7; 15], 3)

3-element Vector{Int64}:
  7
 10
 15

In [None]:
sum(top_k((mapslices(norm, raw_data; dims=1)' - abs.(raw_data' * U))[:, 1] .^ 2, k))

4.631493623120979

In [None]:
sum(top_k(mapslices(norm, raw_data' - raw_data' * U * U'; dims=2)[:, 1], k))

4.992993670764804