# A Dual Approach to Holistic Regression

## 2021.02.14 — Compare primal

In [1]:
using Random, Distributions
using LinearAlgebra
using Gurobi, JuMP
using DataFrames

┌ Info: Recompiling stale cache file /home/skand/.julia/compiled/v1.0/DataFrames/AR9oZ.ji for DataFrames [a93c6f00-e57d-5684-b7b6-d8193f3e46c0]
└ @ Base loading.jl:1190


In [2]:
# Create a gurobi model without the annoying academic license message
gurobi_env = Gurobi.Env()
function create_gurobi_model(; TimeLimit=-1, LogFile="logs.txt")
    model = Model(optimizer_with_attributes(() -> Gurobi.Optimizer(gurobi_env)));
    if TimeLimit >= 0
        println("Set Gurobi TimeLimit.")
        set_optimizer_attribute(model, "TimeLimit", TimeLimit)
    end
    set_optimizer_attribute(model, "LogFile", LogFile)
    set_optimizer_attribute(model, "OutputFlag", 0)
    return model
end;

Academic license - for non-commercial use only


## 1. Data and parameters

In [91]:
Random.seed!(2021)

# Params
ϵ = 10^(-15)
n, p = 100, 10

# Robustness
γ = 1

# Significance
t_α = 1 - quantile(TDist(n-p), 0.01/2) # Beware: n-p-1 if we add intercept



# Data
X = rand(n, p)
β_true = [rand([0,1])*randn()*10 for i in 1:p]
σ_noise = 0.01

#y = rand(n)
y = X*β_true + [randn() for i in 1:n] * σ_noise

# Test
X_test = rand(n, p)
y_test = X_test*β_true + [randn() for i in 1:n] * σ_noise

# Variance estimator
M = X'X
M_inv = M^-1
σ_tilde = sqrt((y'*(I - X*M_inv*X')*y)/(n-p))
σ_X = σ_tilde * sqrt.(diag(M_inv))

# Sparsity
k = length(findall(x->x!=0, β_true))

5

In [92]:
t_α, σ_X

(3.6315651655871584, [0.00312396, 0.00322355, 0.00338964, 0.00306998, 0.00346651, 0.00323054, 0.00293714, 0.00296826, 0.00354955, 0.00318882])

## 1. Primal formulation

In [93]:
function get_primal(γ)
    model = create_gurobi_model()

    big_M = 10000
    big_M_sig = 10000

    @variable(model, β[i=1:p])
    @variable(model, s[i=1:p], Bin)
    @variable(model, b[i=1:p], Bin)

    @constraint(model, sum(s) <= k)
    @constraint(model, [i=1:p], β[i] <= big_M*s[i])
    @constraint(model, [i=1:p], β[i] >= -big_M*s[i])

    @constraint(model, [i=1:p], β[i]/σ_X[i] + big_M_sig*b[i] >= t_α*s[i])
    @constraint(model, [i=1:p], -β[i]/σ_X[i] + big_M_sig*(1-b[i]) >= t_α*s[i])

    @objective(model, Min, 0.5*sum((y[i] - X[i,:]'β)^2 for i=1:p) + 1/(2*γ) * sum(β[i]^2 for i=1:p))
#    @objective(model, Min, 0.5*sum((y[i] - X[i,:]'β)^2 for i=1:p))

    optimize!(model)
    
    cat = DataFrame(b = value.(b), β=value.(β), s=value.(s))
    
    return objective_value(model) - 0.5*sum(y[i]^2 for i=1:p), value.(β), cat
end

get_primal (generic function with 1 method)

In [94]:
function get_insample_R2(y_pred, y_true)
    TSE = sum((y_pred[i]-y_true[i])^2 for i=1:p)
    baseline_E = sum((sum(y_true)/length(y_true)-y_true[i])^2 for i=1:p)
    return 1 - TSE/baseline_E
end

get_insample_R2 (generic function with 1 method)

In [95]:
function get_OR2(y_pred, y_true, y_train)
    TSE = sum((y_pred[i]-y_true[i])^2 for i=1:p)
    baseline_E = sum((sum(y_train)/length(y_train)-y_true[i])^2 for i=1:p)
    return 1 - TSE/baseline_E
end

get_OR2 (generic function with 1 method)

## 2. Gradient Descent and Projection

In [96]:
function get_max_inner(s_plus, s_minus, γ)
    
    # Get s
    s = s_plus + s_minus
    
    # Compute matrices
    Z = Diagonal(s)
    Z_plus = Diagonal(s_plus)
    Z_minus = Diagonal(s_minus)
    
    # Compute D
    D = (I/γ + Z*M)^-1
    
    # Compute u_Z
    function compute_u_Z(λ_plus, λ_minus, μ)
        Z_minus*(λ_minus + μ) - Z_plus*(λ_plus + μ)
    end

    # Compute norm
    function compute_DZ_square_norm(in_norm)
        return in_norm' * D*Z * in_norm
    end
    
    # Compute max
    model = create_gurobi_model()

    @variable(model, λ_plus[1:p] >= 0)
    @variable(model, λ_minus[1:p] >= 0)
    @variable(model, μ[1:p] >= 0)

    u_Z = compute_u_Z(λ_plus, λ_minus, μ)
    
    obj_1 = ϵ*sum(s_plus.*λ_plus + s_minus.*λ_minus)
    obj_2 = t_α*μ'*(s.*σ_X)
    obj_3 = - 0.5 * compute_DZ_square_norm(X'y - u_Z)

    @objective(model, Max, obj_1 + obj_2 + obj_3)

    optimize!(model)

    # Compute β
    u_Z = compute_u_Z(value.(λ_plus), value.(λ_minus), value.(μ))
    sparsity_indexes = findall(x->x==1, s)

    X_s = X[:, sparsity_indexes]
    u_Z_s = u_Z[sparsity_indexes]
    
    β_s = ((I / γ) + X_s'X_s)^(-1)*(X_s'y - u_Z_s)
    
    β_pred = zeros(p)
    β_pred[sparsity_indexes] = β_s
    
    return β_pred, objective_value(model)
    
end

get_max_inner (generic function with 1 method)

In [97]:
function compute_gradient(s, γ)

    ∇f = []
    β_pred, max_obj_value = get_max_inner(s[1:p], s[p+1:end], γ)
    
    for i in 1:2p
        h = 10^-10
        s_bis = [s[j] - h*(i == j) for j=1:2p]
        ∂f_i = (max_obj_value - get_max_inner(s_bis[1:p], s_bis[p+1:end], γ)[2]) / h
        push!(∇f, ∂f_i)
    end
    
    return β_pred, max_obj_value, ∇f
end

compute_gradient (generic function with 1 method)

In [98]:
function step_gradient_descent(s, α, γ)
    β_pred, max_obj_value, ∇f = compute_gradient(s, γ)
    return β_pred, max_obj_value, s - α * (∇f / sqrt(sum(e^2 for e in ∇f)))
end

step_gradient_descent (generic function with 1 method)

In [99]:
function step_gradient_descent_and_projection(s, α, γ)

    # Gradient Descent step
    β_pred, max_obj_value, s_gd = step_gradient_descent(s, α, γ)

    # Projection
    model = create_gurobi_model()
    @variable(model, s_proj[1:2p], Bin)
    for i in 1:p
        @constraint(model, s_proj[[i,p+i]] in MOI.SOS1([i,p+i]))
    end
    @constraint(model, sum(s_proj) <= k) # Sparsity
    @objective(model, Min, sum((s_proj[i] - s_gd[i])^2 for i in 1:2p))
    optimize!(model)
    
    return max_obj_value, β_pred, value.(s_proj)
    
end

step_gradient_descent_and_projection (generic function with 1 method)

In [100]:
function step_gradient_descent_and_projection_relaxed(s, α)

    # Gradient Descent step
    β_pred, max_obj_value, s_gd = step_gradient_descent(s, α)

    # Projection
    model = create_gurobi_model()
    @variable(model, 0 <= s_proj[1:2p] <= 1)
    @variable(model, b[1:2p], Bin)   
    @constraint(model, [i=1:2p], s_proj[i] <= b[i])
    @constraint(model, [i=1:2p], s_proj[i] >= ϵ*b[i])
    for i in 1:p
        @constraint(model, b[[i,p+i]] in MOI.SOS1([i,p+i]))
    end
    @constraint(model, sum(b) <= k) # Sparsity
    @objective(model, Min, sum((s_proj[i] - s_gd[i])^2 for i in 1:2p))
    optimize!(model)
    
    return max_obj_value, β_pred, value.(s_proj)
    
end

step_gradient_descent_and_projection_relaxed (generic function with 1 method)

### 4. Get initial feasible solution

In [101]:
function get_initial_solution()
        
    s_plus = zeros(p)
    s_minus = zeros(p)

    indexes_plus = sample(1:p, div(k,2), replace = false)
    indexes_minus = sample([i for i=1:p if !(i in indexes_plus)], k - div(k,2), replace = false)

    s_plus[indexes_plus] .= 1
    s_minus[indexes_minus] .= 1;
    
    return [s_plus; s_minus]
end

get_initial_solution (generic function with 1 method)

In [102]:
function find_solution_gd_and_proj(s_init, α, γ)
    last_s = nothing
    new_s = s_init
    iter = 0

    while(last_s == nothing || new_s != last_s)
        iter += 1
        last_s = new_s
        max_obj_value, β_pred, new_s = step_gradient_descent_and_projection(last_s, α, γ)

        if iter > 100
            break
        end
    end
    
    max_obj, β_pred, _ = step_gradient_descent_and_projection(new_s, α, γ);
    
    return max_obj, β_pred
end

find_solution_gd_and_proj (generic function with 1 method)

In [121]:
for γ in [0.001, 0.1, 0.2, 0.5, 1, 100]
    best_max_obj = Inf
    best_β_pred = nothing
    for i in 1:100
        s_init = get_initial_solution()
        max_obj, β_pred = find_solution_gd_and_proj(s_init, p, γ)
        if max_obj < best_max_obj
            best_β_pred = β_pred
            best_max_obj = max_obj
        end
#        println(max_obj)
    end
    println(γ, " ", best_max_obj)
    println("\tIn Sample: ", get_insample_R2(X*best_β_pred, y))
    println("\tOut of Sample: ", get_OR2(X_test*best_β_pred, y_test, y))
    #println(abs.(best_β_pred) ./ σ_X .>= t_α*(best_β_pred .!= 0))
    #println(best_β_pred)
end    
#hcat(best_β_pred, β_true)

0.001 -3403.832361220223
	In Sample: -1.943599176464573
	Out of Sample: -2.4377176212331766
0.1 -26393.974915951036
	In Sample: 0.8766963253502406
	Out of Sample: 0.877158979630224
0.2 -27837.622203206036
	In Sample: 0.9385843040300765
	Out of Sample: 0.9425460689049362
0.5 -28909.619426104724
	In Sample: 0.9804601682595673
	Out of Sample: 0.981981660653384
1.0 -29344.246031738014
	In Sample: 0.9896869675390619
	Out of Sample: 0.9910293736294489
100.0 -29815.171460514608
	In Sample: 0.996670721448507
	Out of Sample: 0.9970969666366202


In [118]:
for γ in [0.001, 0.1, 0.2, 0.5, 1, 100]
    obj_value, β_pred_primal, a = get_primal(γ)
    println(γ, " ", obj_value)
    println("\tIn Sample: ", get_insample_R2(X*β_pred_primal, y))
    println("\tOut of Sample: ", get_OR2(X_test*β_pred_primal, y_test, y))
end
obj_value, β_pred_primal, a = get_primal(γ)
a

0.001 -22.42341978590639
	In Sample: -2.97338909755844
	Out of Sample: -3.3373704442122545
0.1 -879.3644124293118
	In Sample: 0.10515172305187093
	Out of Sample: -0.4574483763810766
0.2 -1103.3757343230707
	In Sample: 0.5534921895338293
	Out of Sample: 0.03618051383474785
0.5 -1312.45897152329
	In Sample: 0.8255839322427999
	Out of Sample: 0.2559230040976147
1.0 -1411.6904365352045
	In Sample: 0.8900796706175342
	Out of Sample: 0.388740862659238
100.0 -1557.623809715483
	In Sample: 0.9984224907908025
	Out of Sample: 0.9730829766876243


Unnamed: 0_level_0,b,β,s
Unnamed: 0_level_1,Float64,Float64,Float64
1,1.0,-6.4802,1.0
2,0.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,-6.23201,1.0
5,1.0,-6.23852,1.0
6,0.0,0.0,0.0
7,1.0,-6.49663,1.0
8,1.0,0.0,0.0
9,1.0,-7.3912,1.0
10,0.0,0.0,0.0


In [127]:
# WITH S+, S- from PRIMAL
γ = 1
obj_value, β_pred_primal, df = get_primal(γ);
s_plus = df.s .* (df.β .>= ϵ)
s_minus = df.s .* (df.β .<= -ϵ)
β_pred_dual, obj_dual = get_max_inner(s_plus, s_minus, γ)
println("γ=", γ)
println("\tR2 primal: ", get_insample_R2(X*β_pred_primal, y))
println("\tOR2 primal: ", get_OR2(X_test*β_pred_primal, y_test, y))
println("\n\tR2 dual: ", get_insample_R2(X*β_pred_dual, y))
println("\tOR2 dual: ", get_OR2(X_test*β_pred_dual, y_test, y))

γ=1
	R2 primal: 0.8900796706175342
	OR2 primal: 0.388740862659238

	R2 dual: 0.9598301818231438
	OR2 dual: 0.9345107211441972


In [106]:
df

Unnamed: 0_level_0,b,β,s
Unnamed: 0_level_1,Float64,Float64,Float64
1,1.0,-6.4802,1.0
2,0.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,-6.23201,1.0
5,1.0,-6.23852,1.0
6,0.0,0.0,0.0
7,1.0,-6.49663,1.0
8,1.0,0.0,0.0
9,1.0,-7.3912,1.0
10,0.0,0.0,0.0


In [107]:
get_max_inner([1,1,1,1,1,0,0,0,0,0], [0,0,0,0,0, 1,1,1,1,0], γ)

([0.0113449, 0.0117065, 0.0123097, 0.0111488, 0.0125888, -3.69252, -29.9698, -6.03724, -4.69128, 0.0], -28173.89410231351)

In [108]:
β_true

10-element Array{Float64,1}:
  -0.0               
   0.0               
   0.0               
  -0.0               
 -16.2360673918447   
  -0.0               
 -26.900916991523903 
  -4.463388856106313 
   2.647980578161075 
  -1.5987715445258355

In [109]:
σ_X

10-element Array{Float64,1}:
 0.0031239624761478   
 0.0032235474540738855
 0.0033896360535556327
 0.0030699817424625223
 0.0034665074837425267
 0.003230544789483457 
 0.0029371369382087224
 0.0029682618566973435
 0.003549554818276939 
 0.0031888228239923913

In [110]:
0.5*sum(y[i]^2 for i=1:p)

1561.9629197301247