In [1]:
using Random, Distributions
using LinearAlgebra
using Gurobi, JuMP
using DataFrames
using CSV
using StatsBase
using Plots

In [2]:
# Create a gurobi model without the annoying academic license message
gurobi_env = Gurobi.Env()
function create_gurobi_model(; TimeLimit=-1, LogFile=nothing)
    model = Model(optimizer_with_attributes(() -> Gurobi.Optimizer(gurobi_env)));
    if TimeLimit >= 0
        println("Set Gurobi TimeLimit.")
        set_optimizer_attribute(model, "TimeLimit", TimeLimit)
    end
    if LogFile != nothing
        println("LogFile: $(LogFile).")
        set_optimizer_attribute(model, "LogFile", LogFile)
    else
        set_optimizer_attribute(model, "OutputFlag", 0)
    end
    set_optimizer_attribute(model, "NumericFocus", 3)
    set_optimizer_attribute(model, "Threads", 4)
    return model
end;

Academic license - for non-commercial use only


_____
# Dual Holistic Regression | Benchmark

## Primal

In [72]:
function compute_primal(X, y, k, γ, t_α, σ_X)
    
    n, p = size(X)
    
    model = create_gurobi_model(LogFile="primal.txt")

    # TODO: change big-M values
    M1 = 1000
    M2 = 1000

    @variable(model, β[i=1:p])
    @variable(model, s[i=1:p], Bin)
    @variable(model, b[i=1:p], Bin)

    @constraint(model, sum(s) <= k)
    
    @constraint(model, [i=1:p], β[i] <= M1*s[i])
    @constraint(model, [i=1:p], β[i] >= -M1*s[i])

    @constraint(model, [i=1:p], β[i]/σ_X[i] + M2*b[i] >= t_α*s[i])
    @constraint(model, [i=1:p], -β[i]/σ_X[i] + M2*(1-b[i]) >= t_α*s[i])

    @objective(model, Min, 0.5*sum((y[i] - X[i,:]'β)^2 for i=1:n) + (0.5/γ)* sum(β[j]^2 for j=1:p))
    optimize!(model)
    
    return objective_value(model), value.(β)
end

compute_primal (generic function with 1 method)

_____
## Dual

In [73]:
function g(s, X, y, D, Z, γ, t_α, σ_X)
    
    # Get size
    n, p = size(X)
    
    # Compute norm
    function compute_DZ_square_norm(in_norm)
        return in_norm' * D*Z * in_norm
    end
    
    # Compute max
    model = create_gurobi_model()
    @variable(model, λ[1:p] >= 0)
    
    obj_1 = 0.5*y'y
    obj_2 = t_α*λ'*(s.*σ_X)
    obj_3 = - 0.5 * compute_DZ_square_norm(X'y + λ)
    @objective(model, Max, obj_1 + obj_2 + obj_3)

    optimize!(model)

    # Compute β
    sparsity_indexes = findall(x->x>0, s)
    X_s = X[:, sparsity_indexes]
    λ_s = value.(λ)[sparsity_indexes]
    β_s = (I/γ + X_s'X_s)^(-1)*(X_s'y + λ_s)
    
    β_pred = zeros(p)
    β_pred[sparsity_indexes] = β_s
    return β_pred, value.(λ), objective_value(model)
    
end

g (generic function with 1 method)

In [74]:
function ∇g(s, λ, X, y, D, t_α, σ_X)
    n,p = size(X)
    grad = zeros(p)
    
    function compute_DED_square_norm(E,in_norm)
        return in_norm' * D*E*D' * in_norm
    end
    for i in 1:p
        E_ii = Diagonal([(j == i)*1 for j in 1:p])
        grad[i] = t_α*λ'E_ii*σ_X - 0.5*compute_DED_square_norm(E_ii, X'y+ λ)
    end
    return grad
end

∇g (generic function with 1 method)

In [75]:
function compute_dual(X_p, y, k, γ, t_α, σ_X_p)
    n, p = size(X_p)
    
    # Extended Matrices
    X = hcat(X_p, -X_p)
    M = X'X
    σ_X = [σ_X_p; σ_X_p]
    
    # Outer problem
    miop = create_gurobi_model(LogFile="Dual.txt")
    
    @variable(miop, s[1:2*p], Bin)
    @variable(miop, t >= 0)
    
    @constraint(miop, sum(s) <= k)
    @constraint(miop, [i=1:p], s[i]+s[p+i]<=1)
    
    # Cutting plane
    
    s_init = zeros(2*p)
    s_init[1:k] .= 1
    Z_init = Diagonal(s_init)
    D_init = (I/γ + Z_init*M)^(-1)
    
    β_init, λ_init, g_init = g(s_init, X, y, D_init, Z_init, γ, t_α, σ_X)
    ∇g_init = ∇g(s_init, λ_init, X, y, D_init, t_α, σ_X)
    
    @constraint(miop, t >= g_init + dot(∇g_init, s - s_init))
    @objective(miop, Min, t)
    
    function outer_approximation(cb_data)
        s_val = [callback_value(cb_data, s[i]) for i=1:2*p]
        Z_val = Diagonal(s_val)
        D_val = (I/γ + Z_val*M)^(-1)

        β_val, λ_val, g_val = g(s_val, X, y, D_val, Z_val, γ, t_α, σ_X)
        ∇g_val = ∇g(s_val, λ_val, X, y, D_val, t_α, σ_X)
        
        offset = sum(∇g_val .* s_val)
        con = @build_constraint(t >= g_val + ∇g_val'*s - offset)
        MOI.submit(miop, MOI.LazyConstraint(cb_data), con)
    end
    
    MOI.set(miop, MOI.LazyConstraintCallback(), outer_approximation)
    optimize!(miop)
    
    s_val = JuMP.value.(s)
    Z_val = Diagonal(s_val)
    D_val = (I/γ + Z_val*M)^(-1)
    β_val, λ_val, g_val = g(s_val, X, y, D_val, Z_val, γ, t_α, σ_X)

    β_pred = β_val[1:p] .- β_val[p+1:end]
    return objective_value(miop), β_pred 
end

compute_dual (generic function with 1 method)

____
## Synthetic Data

In [76]:
function generate_synthetic_data(n, p, k, NR)
    
    # Generate PD matrix
    A = randn(p, p)
    A = A'*A
    Σ = (A' + A)/2
    
    # Generate data X
    d = MvNormal(Σ)
    X = rand(d, n)'I
    
    # Split data
    index_train = 1:floor(Int, 0.5*n)
    index_val = floor(Int, 0.5*n)+1:floor(Int, 0.75*n)
    index_test = floor(Int, 0.75*n)+1:n
    
    X_train = X[index_train,:]
    X_val = X[index_val,:]
    X_test = X[index_test,:]
    
    # Center
    μ_train = [mean(X_train[:, j]) for j=1:p]
    for j=1:p
         X_train[:,j] = X_train[:,j] .- μ_train[j]
         X_val[:,j] = X_val[:,j] .- μ_train[j]
         X_test[:,j] = X_test[:,j] .- μ_train[j]
    end
    
    # Scale
    σ_train = [norm(X_train[:, j]) for j=1:p]
    for j=1:p
         X_train[:,j] = X_train[:,j]/σ_train[j]
         X_val[:,j] = X_val[:,j] ./ σ_train[j]
         X_test[:,j] = X_test[:,j] ./ σ_train[j]
    end
    
    # Generate β
    β = zeros(p)
    for j=1:k
        β[floor(Int, j*p/k)] = 1.0*rand([-1, 1])
    end
    
    # Noise
    ϵ = rand(Normal(0, std(X*β)*NR), n)
    
    # Target
    y_train = X_train*β + ϵ[index_train]
    y_val = X_val*β + ϵ[index_val]
    y_test = X_test*β + ϵ[index_test]
            
    return  (X_train, y_train), (X_val, y_val), (X_test, y_test), β
end

generate_synthetic_data (generic function with 1 method)

In [77]:
function get_t_α_and_σ_X(X, y, α, γ)
    n, p = size(X)
    
    # Student law
    t_α = quantile(TDist(n-p), 1 - α/2) # Beware: n-p-1 if we add intercept
    
    # Estimator σ
    M = 1/γ*I + X'X
    M_inv = M^-1
    σ_tilde = sqrt((y'*(I - X*M_inv*X')*y)/(n-p))
    σ_X = σ_tilde * sqrt.(diag(M_inv))
    
    return t_α, σ_X
end

get_t_α_and_σ_X (generic function with 1 method)

In [78]:
function get_insample_R2(y_pred, y_true)
    TSE = sum((y_pred[i]-y_true[i])^2 for i=1:p)
    baseline_E = sum((sum(y_true)/length(y_true)-y_true[i])^2 for i=1:p)
    return 1 - TSE/baseline_E
end

function get_OR2(y_pred, y_true, y_train)
    TSE = sum((y_pred[i]-y_true[i])^2 for i=1:p)
    baseline_E = sum((sum(y_train)/length(y_train)-y_true[i])^2 for i=1:p)
    return 1 - TSE/baseline_E
end

get_OR2 (generic function with 1 method)

____
## Experiences

In [130]:
n = 2*10000
p = 100
k = 10
NR = 0.01
α = 0.05
γ = 1;

In [131]:
(X_train, y_train), (X_val, y_val), (X_test, y_test), β_true = generate_synthetic_data(n, p, k, NR);
#plot(X_train * β_true)
#plot!(y_train)

In [132]:
t_α, σ_X = get_t_α_and_σ_X(X_train, y_train, α, γ)

(1.9602036366973514, [0.253312, 0.249033, 0.247697, 0.249637, 0.253438, 0.254417, 0.255556, 0.256276, 0.24951, 0.250361  …  0.250722, 0.254481, 0.254193, 0.25591, 0.253394, 0.249383, 0.254676, 0.250558, 0.250814, 0.250729])

In [133]:
@time obj_value, β_primal = compute_primal(X_train, y_train, k, γ, t_α, σ_X)

LogFile: primal.txt.
Gurobi Optimizer version 9.0.3 build v9.0.3rc0 (mac64)
Optimize a model with 401 rows, 300 columns and 1100 nonzeros
Model fingerprint: 0x531e0ae2
Model has 5050 quadratic objective terms
Variable types: 100 continuous, 200 integer (200 binary)
Coefficient statistics:
  Matrix range     [1e+00, 1e+03]
  Objective range  [6e-04, 2e+00]
  QObjective range [1e-04, 2e+00]
  Bounds range     [0e+00, 0e+00]
  RHS range        [1e+01, 1e+03]
Found heuristic solution: objective 514.1270556
Presolve time: 0.00s
Presolved: 401 rows, 300 columns, 1100 nonzeros
Presolved model has 5050 quadratic objective terms
Variable types: 100 continuous, 200 integer (200 binary)

Root relaxation: objective 5.085777e+02, 1393 iterations, 0.17 seconds

    Nodes    |    Current Node    |     Objective Bounds      |     Work
 Expl Unexpl |  Obj  Depth IntInf | Incumbent    BestBd   Gap | It/Node Time

     0     0  508.57772    0  137  514.12706  508.57772  1.08%     -    0s
H    0     0    

(510.6344031013072, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.530197  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.781944])

In [134]:
@time obj_value, β_dual = compute_dual(X_train, y_train, k, γ, t_α, σ_X)

LogFile: Dual.txt.
Gurobi Optimizer version 9.0.3 build v9.0.3rc0 (mac64)
Optimize a model with 102 rows, 201 columns and 601 nonzeros
Model fingerprint: 0x6124a2b8
Variable types: 1 continuous, 200 integer (200 binary)
Coefficient statistics:
  Matrix range     [8e-06, 2e+00]
  Objective range  [1e+00, 1e+00]
  Bounds range     [0e+00, 0e+00]
  RHS range        [1e+00, 5e+02]
Presolve time: 0.00s
Presolved: 102 rows, 201 columns, 601 nonzeros
Variable types: 1 continuous, 200 integer (200 binary)

Root relaxation: objective 5.034034e+02, 21 iterations, 0.00 seconds

    Nodes    |    Current Node    |     Objective Bounds      |     Work
 Expl Unexpl |  Obj  Depth IntInf | Incumbent    BestBd   Gap | It/Node Time

     0     0  505.52990    0    2          -  505.52990      -     -    1s
     0     2  505.52990    0    2          -  505.52990      -     -    2s
   446   553  511.52237   11    2          -  509.54137      -   2.9    7s
*  515   567              38     514.6474232  509.

(510.6344030993802, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.530197  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.781944])

In [141]:
hcat(β_true, β_primal)

100×2 Array{Float64,2}:
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 1.0  0.530197
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 ⋮            
 0.0  0.0     
 1.0  0.49755 
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 1.0  0.781944

In [142]:
hcat(β_true, β_dual)

100×2 Array{Float64,2}:
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 1.0  0.530197
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 ⋮            
 0.0  0.0     
 1.0  0.497551
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 0.0  0.0     
 1.0  0.781944

In [143]:
get_insample_R2(X_train*β_primal, y_train)

0.0050648193214879456

In [144]:
get_insample_R2(X_train*β_dual, y_train)

0.005064827479825951

In [145]:
get_OR2(X_test*β_primal, y_test, y_train)

0.02045648728559335

In [146]:
get_OR2(X_test*β_dual, y_test, y_train)

0.02045648572155312