In [30]:
using Random, Distributions
using LinearAlgebra
using Gurobi, JuMP
using DataFrames
using CSV
using StatsBase
using Plots
using ProgressBars
using Optim

In [31]:
# Create a gurobi model without the annoying academic license message.
# One environment is created up front and handed to every optimizer built below.
gurobi_env = Gurobi.Env()

"""
    create_gurobi_model(; TimeLimit=-1, LogFile=nothing)

Build a JuMP `Model` whose optimizer is a `Gurobi.Optimizer` created from the shared
`gurobi_env`.

# Keywords
- `TimeLimit`: forwarded to the `"TimeLimit"` attribute when it is `>= 0`; the default
  `-1` leaves the attribute untouched.
- `LogFile`: forwarded to the `"LogFile"` attribute when it is not `nothing`; otherwise
  `"OutputFlag"` is set to `0`.

`"NumericFocus"` is always set to `3`.

# Returns
The configured JuMP model.
"""
function create_gurobi_model(; TimeLimit=-1, LogFile=nothing)
    model = Model(optimizer_with_attributes(() -> Gurobi.Optimizer(gurobi_env)));
    if TimeLimit >= 0
        println("Set Gurobi TimeLimit.")
        set_optimizer_attribute(model, "TimeLimit", TimeLimit)
    end
    # Identity comparison: `!=` would dispatch on whatever type LogFile happens to be.
    if LogFile !== nothing
        println("LogFile: $(LogFile).")
        set_optimizer_attribute(model, "LogFile", LogFile)
    else
        set_optimizer_attribute(model, "OutputFlag", 0)
    end
    set_optimizer_attribute(model, "NumericFocus", 3)
    #set_optimizer_attribute(model, "Threads", 4)
    return model
end;


--------------------------------------------
--------------------------------------------

Academic license - for non-commercial use only


_____
# Dual Holistic Regression | Benchmark

## Primal

In [32]:
"""
    compute_primal(X, y, k, γ, t_α, σ_X, log_path; M1=1000, M2=1000)

Solve the primal mixed-integer formulation with Gurobi and return the optimal objective
value together with the coefficient vector.

The model minimizes `0.5*Σ(y[i] - X[i,:]'β)^2 + (0.5/γ)*Σβ[j]^2` subject to:
- at most `k` indicators `s[i]` equal to one;
- `-M1*s[i] <= β[i] <= M1*s[i]`, so `β[i]` is zero whenever `s[i]` is zero;
- two big-M constraints driven by the binary `b[i]` that require either
  `β[i]/σ_X[i] >= t_α*s[i]` (when `b[i] == 0`) or `-β[i]/σ_X[i] >= t_α*s[i]`
  (when `b[i] == 1`).

# Arguments
- `X`, `y`: design matrix (`n × p`, as read by `size(X)`) and response.
- `k`: bound on `sum(s)`.
- `γ`: regularization parameter; the penalty weight is `0.5/γ`.
- `t_α`, `σ_X`: threshold and per-coefficient scale used in the big-M constraints.
- `log_path`: passed to `create_gurobi_model` as `LogFile`.

# Keywords
- `M1`: big-M bound linking `β` to `s` (default `1000`, the former hard-coded value).
- `M2`: big-M constant of the two sign-selection constraints (default `1000`).

# Returns
`(objective_value(model), value.(β))`.
"""
function compute_primal(X, y, k, γ, t_α, σ_X, log_path; M1=1000, M2=1000)

    n, p = size(X)

    model = create_gurobi_model(LogFile=log_path)

    @variable(model, β[i=1:p])
    @variable(model, s[i=1:p], Bin)
    @variable(model, b[i=1:p], Bin)

    # Cardinality bound on the support indicators
    @constraint(model, sum(s) <= k)

    # β[i] can only be non-zero when s[i] = 1
    @constraint(model, [i=1:p], β[i] <= M1*s[i])
    @constraint(model, [i=1:p], β[i] >= -M1*s[i])

    # b[i] selects which of the two one-sided threshold constraints is active
    @constraint(model, [i=1:p], β[i]/σ_X[i] + M2*b[i] >= t_α*s[i])
    @constraint(model, [i=1:p], -β[i]/σ_X[i] + M2*(1-b[i]) >= t_α*s[i])

    @objective(model, Min, 0.5*sum((y[i] - X[i,:]'β)^2 for i=1:n) + (0.5/γ)* sum(β[j]^2 for j=1:p))
    JuMP.optimize!(model)

    return objective_value(model), value.(β)
end

compute_primal (generic function with 1 method)

In [33]:
"""
    g(s, X, y, D, Z, γ, t_α, σ_X, model; compute_β=false)

Set the objective of `model` for the given `s`, solve it, and return the multipliers and
the resulting value.

`model` must already contain a variable container registered as `:λ`. The objective
installed on it is `t_α*λ'*(s.*σ_X) - 0.5*(X'y + λ)'*(D*Z)*(X'y + λ)`, maximized.

# Arguments
- `s`: vector whose strictly positive entries define the support used for `β`.
- `X`, `y`: design matrix and response.
- `D`, `Z`: matrices whose product `D*Z` defines the quadratic form.
- `γ`: regularization parameter, used only when `compute_β` is true.
- `t_α`, `σ_X`: threshold and scale vector entering the linear term.
- `model`: JuMP model holding `:λ`.

# Keywords
- `compute_β`: when true, also recover the coefficients on the support by solving
  `(I/γ + X_s'X_s) β_s = X_s'y + λ_s`.

# Returns
`(β_pred, λ_value, obj)`, where `β_pred` is `nothing` unless `compute_β` is true and
`obj` is `0.5*y'y` plus the solved objective value.
"""
function g(s, X, y, D, Z, γ, t_α, σ_X, model; compute_β=false)

    # Create DZ once
    DZ = D*Z

    λ = model[:λ]
    μ = X'y + λ

    obj_1 = 0.5*y'y
    obj_2 = t_α*λ'*(s.*σ_X)
    obj_3 = - 0.5 * (μ' * DZ * μ)
    # obj_1 is constant in λ, so it is left out of the model and added back on return
    @objective(model, Max, obj_2 + obj_3)

    JuMP.optimize!(model)
    λ_value = value.(λ)
    obj_value = objective_value(model)

    β_pred = nothing

    if compute_β

        # Restrict to the support
        sparsity_indexes = findall(x->x>0, s)
        X_s = X[:, sparsity_indexes]
        λ_s = λ_value[sparsity_indexes]

        # Solve the linear system directly instead of forming the explicit inverse
        β_s = (I/γ + X_s'X_s) \ (X_s'y + λ_s)

        # Embed β_s back into a full-length vector
        β_pred = zeros(size(X, 2))
        β_pred[sparsity_indexes] = β_s

    end

    return β_pred, λ_value, obj_1 + obj_value

end

g (generic function with 1 method)

_____
## Dual Optimized

In [70]:
"""
    g_s(supp_ids, obj1, Xty_s, D_s, σ_X_s, M_s, γ, t_α)

Evaluate the inner problem restricted to a support and return `(λ, value)`.

The function minimizes
`f(λ) = -t_α*λ'σ_X_s + 0.5*(Xty_s + λ)'*D_s*(Xty_s + λ)`
over the box `-Inf <= λ <= 0` with `Fminbox(GradientDescent())`, then returns the
negated minimizer and `obj1 - minimum(f)`.

# Arguments
- `supp_ids`: indices of the support; only its length is used here.
- `obj1`: constant term added to the optimal value.
- `Xty_s`, `D_s`, `σ_X_s`, `M_s`: quantities restricted to the support (see `project`).
- `γ`, `t_α`: regularization parameter and threshold.

# Returns
`(-minimizer, obj1 - minimum)`. For an empty support the result is `(zeros(0), obj1)`.

NOTE(review): the multiplier is returned with its sign flipped relative to the variable
optimized here, while callers add it to `Xty_s` — confirm the intended sign convention.
"""
function g_s(supp_ids, obj1, Xty_s, D_s, σ_X_s, M_s, γ, t_α)

    l = length(supp_ids)

    # Nothing to optimize on an empty support: f is identically 0, so the general
    # formula `obj1 - minimum` reduces to obj1 (returning 0.0 would under-estimate it).
    if l == 0
        return zeros(0), obj1
    end

    lower = fill(-Inf, l)
    upper = zeros(l)

    # Starting point capped at -1 so that it never sits on the upper bound of the box
    initial_λ = min.(-1.0, -t_α*(I/γ + M_s)*σ_X_s + Xty_s)

    # Hoisted: this product does not depend on λ
    D_sXty_s = D_s*Xty_s

    f(λ) = -t_α*λ'σ_X_s + 0.5(Xty_s+λ)'*(D_sXty_s+D_s*λ)

    ∇f = function(λ)
        return -t_α*σ_X_s + D_sXty_s + D_s*λ
    end

    inner_optimizer = GradientDescent()
    res = Optim.optimize(f, ∇f, lower, upper, initial_λ, Fminbox(inner_optimizer), inplace=false)

    output =  - res.minimizer, obj1 - res.minimum

    return output

end

g_s (generic function with 2 methods)

In [71]:
"""
    ∇g_s(supp_ids, supp_c_ids, Xty_s, Xty_s_c, D_s, σ_X_s, M_s_c, λ_s, γ, p)

Return the length-`2p` vector used as the slope of the cut built around the current
support.

The signature matches the call sites in `compute_dual`, which pass `D_s` and `σ_X_s`
explicitly; the body previously read those two names without receiving them.

# Arguments
- `supp_ids`, `supp_c_ids`: indices inside / outside the support; they select which
  entries of the result are written.
- `Xty_s`, `Xty_s_c`: entries of `X'y` on and off the support.
- `D_s`: matrix applied (transposed) to `Xty_s + λ_s`.
- `σ_X_s`: scale vector on the support.
- `M_s_c`: cross matrix between support and complement columns.
- `λ_s`: multipliers on the support.
- `γ`: regularization parameter.
- `p`: half the length of the returned vector.

# Returns
A `Vector{Float64}` of length `2p` with
`λ_s .* σ_X_s - 0.5*(D_s'μ_s).^2` on `supp_ids` and
`-0.5*γ^2*(M_s_c'*(D_s'μ_s) + Xty_s_c).^2` on `supp_c_ids`, where `μ_s = Xty_s + λ_s`.

NOTE(review): the support entries carry no `t_α` factor although `g_s` weights `λ'σ` by
`t_α` — confirm this is intended.
"""
function ∇g_s(supp_ids, supp_c_ids, Xty_s, Xty_s_c, D_s, σ_X_s, M_s_c, λ_s, γ, p)

    μ_s = Xty_s + λ_s
    μ_s_c = Xty_s_c

    D_stμ_s = D_s'μ_s

    grad = zeros(2p)
    grad[supp_ids] = λ_s .* σ_X_s - 0.5(D_stμ_s).^2
    grad[supp_c_ids] = - 0.5*γ^2*(M_s_c'D_stμ_s + μ_s_c).^2

    return grad

end

∇g_s (generic function with 2 methods)

In [72]:
"""
    get_support_ids(s)

Split the indices of `s` into two increasing integer vectors: the positions whose entry
differs from `zero(eltype(s))` and the positions whose entry equals it.

# Returns
`(nonzero_ids, zero_ids)`.
"""
function get_support_ids(s)
    nonzero_ids = Int[]
    zero_ids = Int[]

    for idx in eachindex(s)
        # Exact comparison against the additive identity of the element type
        if s[idx] == zero(eltype(s))
            push!(zero_ids, idx)
        else
            push!(nonzero_ids, idx)
        end
    end

    return nonzero_ids, zero_ids
end

get_support_ids (generic function with 1 method)

In [73]:
# Quick check: entries 3, 4 and 6 are non-zero, entries 1, 2 and 5 are zero.
get_support_ids([0, 0, 4, 4, 0, 4])

([3, 4, 6], [1, 2, 5])

In [74]:
"""
    project(s, X, Xty, σ_X, γ)

Restrict the problem data to the support of `s` (as given by `get_support_ids`).

# Returns
The tuple `(supp_ids, supp_c_ids, D_s, Xty_s, Xty_s_c, σ_X_s, M_s, M_s_c)` where
- `supp_ids`, `supp_c_ids` are the support and its complement,
- `M_s = X_s'X_s` and `M_s_c = X_s'X_s_c` are built from the selected columns of `X`,
- `D_s = inv(I/γ + M_s)`,
- `Xty_s`, `Xty_s_c`, `σ_X_s` are the matching entries of `Xty` and `σ_X`.
"""
function project(s, X, Xty, σ_X, γ)
    active, inactive = get_support_ids(s)

    # Column blocks of X on and off the support
    X_active = X[:, active]
    X_inactive = X[:, inactive]

    gram = X_active'X_active
    cross = X_active'X_inactive
    regularized_inverse = inv(I/γ + gram)

    return (active, inactive, regularized_inverse,
            Xty[active], Xty[inactive], σ_X[active],
            gram, cross)
end

project (generic function with 1 method)

_____
## Dual

In [75]:
"""
    compute_dual(X_p, y, k, γ, t_α, σ_X_p, log_path)

Solve the dual formulation by outer approximation: a Gurobi master problem over binary
`s` and an epigraph variable `t`, with cuts `t >= g + ∇g'(s - s̄)` added lazily from
`g_s` / `∇g_s`.

The columns are duplicated as `[X_p, -X_p]` (and `σ_X_p` likewise), giving `2p`
indicators; `s[i] + s[p+i] <= 1` prevents a column and its negated copy from being
selected together. The returned coefficients are the first half minus the second half.

# Arguments
- `X_p`, `y`: design matrix and response.
- `k`: bound on `sum(s)`; also the number of leading indicators set in the initial cut.
- `γ`, `t_α`, `σ_X_p`: forwarded to `project`, `g_s` and `∇g_s`.
- `log_path`: passed to `create_gurobi_model` as `LogFile`.

# Returns
`(objective_value(miop), β_pred)` with `β_pred` of length `p`.
"""
function compute_dual(X_p, y, k, γ, t_α, σ_X_p, log_path)
    p = size(X_p, 2)

    # Extended Matrices
    X = hcat(X_p, -X_p)
    σ_X = [σ_X_p; σ_X_p]
    Xty = X'y
    obj1 = 0.5*y'y

    # Value and slope of the cut at a 0/1 point. Uses its own local names so the
    # closure does not capture and reassign variables of the enclosing function.
    function evaluate_cut(s_point)
        ids, ids_c, D_in, Xty_in, Xty_out, σ_in, M_in, M_cross = project(s_point, X, Xty, σ_X, γ)
        λ_in, g_val = g_s(ids, obj1, Xty_in, D_in, σ_in, M_in, γ, t_α)
        ∇g_val = ∇g_s(ids, ids_c, Xty_in, Xty_out, D_in, σ_in, M_cross, λ_in, γ, p)
        return g_val, ∇g_val
    end

    # Outer problem
    miop = create_gurobi_model(LogFile=log_path)

    @variable(miop, s[1:2*p], Bin)
    @variable(miop, t >= 0)

    @constraint(miop, sum(s) <= k)
    @constraint(miop, [i=1:p], s[i]+s[p+i]<=1)

    # --- Cutting plane --- #

    # Initial solution
    s_init = zeros(2*p) #TODO: change this
    s_init[1:k] .= 1

    g_init, ∇g_init = evaluate_cut(s_init)

    @constraint(miop, t >= g_init + dot(∇g_init, s - s_init))
    @objective(miop, Min, t)

    function outer_approximation(cb_data)

        # Snap to {0, 1}: the solver reports binaries only up to a tolerance, and
        # get_support_ids tests for exact zero, so e.g. 1e-9 would count as support.
        s_val = round.([callback_value(cb_data, s[i]) for i=1:2p])

        g_val, ∇g_val = evaluate_cut(s_val)

        con = @build_constraint(t >= g_val + dot(∇g_val, s - s_val))
        MOI.submit(miop, MOI.LazyConstraint(cb_data), con)

    end

    MOI.set(miop, MOI.LazyConstraintCallback(), outer_approximation)
    JuMP.optimize!(miop)

    # Same rounding for the final incumbent before extracting its support
    s_opt = round.(JuMP.value.(s))

    supp_ids, supp_c_ids, D_s, Xty_s, Xty_s_c, σ_X_s, M_s, M_s_c = project(s_opt, X, Xty, σ_X, γ)
    λ_s_opt, _ = g_s(supp_ids, obj1, Xty_s, D_s, σ_X_s, M_s, γ, t_α)

    # Coefficients on the extended columns, then folded back to the original p columns
    β_opt = zeros(2p)
    β_opt[supp_ids] = D_s*(Xty_s + λ_s_opt)
    β_pred = β_opt[1:p] .- β_opt[p+1:end]

    return objective_value(miop), β_pred
end

compute_dual (generic function with 1 method)

____
## Synthetic Data

In [76]:
"""
    generate_synthetic_data(n, p, k, NR)

Draw a synthetic regression data set and split it 50% / 25% / 25% into train,
validation and test parts.

Steps, in order:
1. build a random symmetric covariance `Σ` from a `p × p` Gaussian matrix;
2. sample `n` rows from `MvNormal(Σ)`;
3. center every column with the training mean and divide it by the Euclidean norm of
   the centered training column (the same statistics are applied to all three parts);
4. create a coefficient vector with `k` entries equal to `±1` at positions
   `floor(Int, j*p/k)`, `j = 1:k`;
5. add Gaussian noise with standard deviation `std(X*β)*NR`.

NOTE(review): the noise scale in step 5 is computed from the raw sample matrix `X`,
whereas the targets use the centered and scaled parts — confirm this is intended.

# Returns
`(X_train, y_train), (X_val, y_val), (X_test, y_test), β`.
"""
function generate_synthetic_data(n, p, k, NR)

    # Random covariance matrix, symmetrized explicitly
    G = randn(p, p)
    G = G'*G
    Σ = (G' + G)/2

    # Samples as rows; multiplying the adjoint by I turns it into a plain matrix
    X = rand(MvNormal(Σ), n)'I

    # Split boundaries
    last_train = floor(Int, 0.5*n)
    last_val = floor(Int, 0.75*n)
    index_train = 1:last_train
    index_val = last_train+1:last_val
    index_test = last_val+1:n

    # Indexing copies, so X itself stays untouched by the normalization below
    X_train, X_val, X_test = X[index_train,:], X[index_val,:], X[index_test,:]
    parts = (X_train, X_val, X_test)

    # Center with the training means
    μ_train = [mean(X_train[:, j]) for j=1:p]
    for part in parts, j in 1:p
        part[:, j] .-= μ_train[j]
    end

    # Scale with the norms of the centered training columns
    σ_train = [norm(X_train[:, j]) for j=1:p]
    for part in parts, j in 1:p
        part[:, j] ./= σ_train[j]
    end

    # Sparse ground-truth coefficients with random signs
    β = zeros(p)
    for j in 1:k
        β[floor(Int, j*p/k)] = 1.0*rand([-1, 1])
    end

    # One noise draw per sample
    ϵ = rand(Normal(0, std(X*β)*NR), n)

    train = (X_train, X_train*β + ϵ[index_train])
    val = (X_val, X_val*β + ϵ[index_val])
    test = (X_test, X_test*β + ϵ[index_test])

    return train, val, test, β
end

generate_synthetic_data (generic function with 1 method)

In [77]:
"""
    get_t_α_and_σ_X(X, y, α, γ)

Return the two-sided Student-t quantile and the per-coefficient scale vector.

With `n, p = size(X)` and `M_inv = inv(I/γ + X'X)`:
- `t_α = quantile(TDist(n-p), 1 - α/2)`;
- `σ_tilde = sqrt(y'(I - X*M_inv*X')y / (n-p))`;
- `σ_X = σ_tilde * sqrt.(diag(M_inv))`.

# Returns
`(t_α, σ_X)`.
"""
function get_t_α_and_σ_X(X, y, α, γ)
    n, p = size(X)

    # Student law
    t_α = quantile(TDist(n-p), 1 - α/2) # Beware: n-p-1 if we add intercept

    # The explicit inverse is kept because its diagonal is needed below
    M_inv = inv(1/γ*I + X'X)

    # y'(I - X*M_inv*X')y written as y'(y - X*(M_inv*(X'y))): same quantity, but it
    # never materializes the n×n matrix X*M_inv*X'.
    residual = y - X*(M_inv*(X'y))
    σ_tilde = sqrt(dot(y, residual)/(n-p))
    σ_X = σ_tilde * sqrt.(diag(M_inv))

    return t_α, σ_X
end

get_t_α_and_σ_X (generic function with 1 method)

In [78]:
"""
    get_insample_R2(y_pred, y_true, p)

Return the coefficient of determination `1 - SSE/SST` of `y_pred` against `y_true`,
using the mean of `y_true` as the baseline predictor.

Both sums run over every observation. The third argument is accepted for call
compatibility but no longer bounds the sums: callers pass the number of features
there, which used to restrict the score to the first `p` observations.
"""
function get_insample_R2(y_pred, y_true, p)
    TSE = sum(abs2, y_pred .- y_true)
    y_mean = sum(y_true)/length(y_true)
    baseline_E = sum(abs2, y_mean .- y_true)
    return 1 - TSE/baseline_E
end

"""
    get_OR2(y_pred, y_true, y_train, p)

Return the out-of-sample R², `1 - SSE/SST`, where the baseline predictor is the mean of
`y_train` and both sums run over every entry of `y_true`.

The last argument is accepted for call compatibility but no longer bounds the sums:
callers pass the number of features there, which used to restrict the score to the
first `p` observations.
"""
function get_OR2(y_pred, y_true, y_train, p)
    TSE = sum(abs2, y_pred .- y_true)
    train_mean = sum(y_train)/length(y_train)
    baseline_E = sum(abs2, train_mean .- y_true)
    return 1 - TSE/baseline_E
end

get_OR2 (generic function with 1 method)

____
## Experiments

In [79]:
# Shorthand for rounding `x` to `dig` decimal digits.
r(x, dig) = round(x; digits=dig)

# Concatenate every element of `list` (in order) and terminate the result with a newline.
function list2string(list)
    joined = foldl(*, list; init="")
    return joined * "\n"
end

list2string (generic function with 1 method)

In [80]:
# Append `str` to the file at `file_path` (opened in "a+" mode) and return what `write`
# returns. The handle is closed even when the write throws.
function write_to_file(file_path, str)
    io = open(file_path, "a+")
    try
        return write(io, str)
    finally
        close(io)
    end
end

write_to_file (generic function with 1 method)

In [None]:
# Benchmark driver: for every (seed, γ, p) combination, generate a data set, solve the
# dual and the primal formulations, and append one CSV row per algorithm.
NR = 0.001
α = 0.05
n_train = 10000
n = 2*n_train   # generate_synthetic_data keeps the first half for training

file_path = "results/results_arie_8.csv"

# The file is opened in append mode, so the directory must exist beforehand.
mkpath(dirname(file_path))

# Write the header only when starting a fresh file; re-appending it on every run would
# leave header lines in the middle of the data.
if !isfile(file_path) || filesize(file_path) == 0
    write_to_file(file_path, "algo,seed,n,p,k,γ,NR,α,t_algo,R2,OR2,t_synthetic,t_thresh_var\n")
end

for seed in [2021, 42, 25, 1998]
    for γ in [1.0, 5.0, 10.0, 100.0]
        for p in [10, 50, 70, 100, 150, 200, 250, 500]

            # Set k
            k = Int(p/10)

            # Generate data
            Random.seed!(seed)
            t_synthetic = @elapsed (X_train, y_train), (X_val, y_val), (X_test, y_test), β_true = generate_synthetic_data(n, p, k, NR);
            t_thresh_var = @elapsed t_α, σ_X = get_t_α_and_σ_X(X_train, y_train, α, γ)

            # Solving Dual
            #log_path_dual = "logs/dual_n=$(n)_p=$(p)_k=$(k)_gamma=$(γ)_seed=$(seed).txt"
            log_path_dual = "debug.txt"
            t_dual = @elapsed obj_value, β_dual = compute_dual(X_train, y_train, k, γ, t_α, σ_X, log_path_dual);
            R2_dual = get_insample_R2(X_train*β_dual, y_train, p)
            OR2_dual = get_OR2(X_test*β_dual, y_test, y_train, p)
            # A tuple (not an array literal) keeps the integers from being promoted to
            # Float64, and `join` puts separators only between fields, so each row has
            # exactly the 13 columns announced by the header.
            fields_dual = string.(("dual", seed, n_train, p, k, γ, NR, α,
                                   r(t_dual,3), r(R2_dual,3), r(OR2_dual,3),
                                   r(t_synthetic,3), r(t_thresh_var,3)))
            write_to_file(file_path, join(fields_dual, ",") * "\n")

            # Solving Primal
            #log_path_primal = "logs/primal_n=$(n)_p=$(p)_k=$(k)_gamma=$(γ)_seed=$(seed).txt"
            log_path_primal = "debug.txt"
            t_primal = @elapsed obj_value, β_primal = compute_primal(X_train, y_train, k, γ, t_α, σ_X, log_path_primal)
            R2_primal = get_insample_R2(X_train*β_primal, y_train, p)
            OR2_primal = get_OR2(X_test*β_primal, y_test, y_train, p)
            fields_primal = string.(("primal", seed, n_train, p, k, γ, NR, α,
                                     r(t_primal,3), r(R2_primal,3), r(OR2_primal,3),
                                     r(t_synthetic,3), r(t_thresh_var,3)))
            write_to_file(file_path, join(fields_primal, ",") * "\n")
        end
    end
end

LogFile: debug.txt.
Gurobi Optimizer version 9.0.3 build v9.0.3rc0 (mac64)
Optimize a model with 12 rows, 21 columns and 61 nonzeros
Model fingerprint: 0x6f13dfe6
Variable types: 1 continuous, 20 integer (20 binary)
Coefficient statistics:
  Matrix range     [4e-03, 1e+00]
  Objective range  [1e+00, 1e+00]
  Bounds range     [0e+00, 0e+00]
  RHS range        [6e-01, 1e+00]
Presolve removed 10 rows and 0 columns
Presolve time: 0.00s
Presolved: 2 rows, 21 columns, 41 nonzeros
Variable types: 1 continuous, 20 integer (20 binary)

Root relaxation: objective 0.000000e+00, 1 iterations, 0.00 seconds

    Nodes    |    Current Node    |     Objective Bounds      |     Work
 Expl Unexpl |  Obj  Depth IntInf | Incumbent    BestBd   Gap | It/Node Time

     0     0    0.17976    0    2          -    0.17976      -     -    0s
H    0     0                       0.2764431    0.17976  35.0%     -    0s

Explored 1 nodes (5 simplex iterations) in 0.16 seconds
Thread count was 4 (of 4 available proce