# A Dual Approach to Holistic Regression

## 2021.02.01 — Gradient Descent

In [155]:
using Random, Distributions
using LinearAlgebra
using Gurobi, JuMP

In [156]:
# One Gurobi environment shared by every model built below; reusing it avoids
# the annoying academic license message being emitted for each new model.
gurobi_env = Gurobi.Env()

"""
    create_gurobi_model(; TimeLimit=-1, LogFile="logs.txt")

Return a JuMP `Model` backed by a Gurobi optimizer that uses the shared
`gurobi_env` environment.

# Keywords
- `TimeLimit`: forwarded to Gurobi's `TimeLimit` attribute when it is `>= 0`;
  with the default `-1` the attribute is left untouched.
- `LogFile`: forwarded to Gurobi's `LogFile` attribute.

The `OutputFlag` attribute is always set to `0`.
"""
function create_gurobi_model(; TimeLimit=-1, LogFile="logs.txt")
    optimizer_factory = optimizer_with_attributes(() -> Gurobi.Optimizer(gurobi_env))
    gurobi_model = Model(optimizer_factory)

    wants_time_limit = TimeLimit >= 0
    if wants_time_limit
        println("Set Gurobi TimeLimit.")
        set_optimizer_attribute(gurobi_model, "TimeLimit", TimeLimit)
    end

    # Attributes applied to every model, in this order.
    for (attribute, setting) in ("LogFile" => LogFile, "OutputFlag" => 0)
        set_optimizer_attribute(gurobi_model, attribute, setting)
    end

    return gurobi_model
end;

Academic license - for non-commercial use only


## 1. Data and parameters

In [205]:
Random.seed!(2021)

# Params
ϵ = 1e-15            # weight of the λ terms in the inner objective
n, p = 1000, 20      # rows and columns of the design matrix X

# Robustness
γ = 1                # enters the inner problem as I/γ

# Significance
t_α = quantile(TDist(n - p), 1 - 0.05 / 2) # Beware: n-p-1 if we add intercept

# Data
# NOTE: the order of the random draws below is kept as is, so the seeded
# data set stays reproducible.
X = rand(n, p)
β_true = [rand([0, 1]) * randn() * 10 for i in 1:p]
σ_noise = 0.001

# Alternative response that was tried: y = rand(n)
y = X * β_true + [randn() for i in 1:n] * σ_noise

# Variance estimator
M = X'X
M_inv = inv(M)
# Residual sum of squares from the least-squares fit. This equals
# y'(I - X*M_inv*X')y but avoids building the n×n projection matrix and the
# cancellation that comes with subtracting two nearly equal quadratic forms.
β_ols = M \ (X'y)
residual = y - X * β_ols
σ_tilde = sqrt(sum(abs2, residual) / (n - p))
σ_X = σ_tilde * sqrt.(diag(M_inv))

# Sparsity: number of non-zero entries of the true coefficient vector
k = count(!iszero, β_true)

13

## 2. Gradient Descent and Projection

In [206]:
"""
    get_max_inner(s_plus, s_minus)

Solve the inner maximization problem for the support described by `s_plus`
and `s_minus`, and return `(β_pred, objective_value)`.

The function reads the globals `X`, `y`, `M`, `γ`, `ϵ`, `t_α`, `σ_X` and `p`.

# Arguments
- `s_plus`, `s_minus`: length-`p` vectors; their sum `s` is used as the
  diagonal of `Z`. Entries of `s` equal to one define the support on which
  `β_pred` is recovered.

# Returns
- `β_pred`: length-`p` vector, zero outside the entries where `s == 1`.
- the optimal objective value reported by the solver.

# Throws
- `ErrorException` if the solver returns no solution.
"""
function get_max_inner(s_plus, s_minus)

    # Combined indicator
    s = s_plus + s_minus

    # Diagonal matrices built from the indicators
    Z = Diagonal(s)
    Z_plus = Diagonal(s_plus)
    Z_minus = Diagonal(s_minus)

    # Matrix of the quadratic form: D*Z with D = (I/γ + Z*M)^-1
    DZ = inv(I / γ + Z * M) * Z

    # u_Z as a function of (λ_plus, λ_minus, μ); used both on JuMP variables
    # and on their numerical values after the solve.
    compute_u_Z(λ_plus, λ_minus, μ) = Z_minus * (λ_minus + μ) - Z_plus * (λ_plus + μ)

    # Build the maximization problem
    model = create_gurobi_model()

    @variable(model, λ_plus[1:p] >= 0)
    @variable(model, λ_minus[1:p] >= 0)
    @variable(model, μ[1:p] >= 0)

    u_Z_expr = compute_u_Z(λ_plus, λ_minus, μ)
    shifted = X'y - u_Z_expr

    obj_1 = ϵ * sum(s_plus .* λ_plus + s_minus .* λ_minus)
    obj_2 = t_α * μ' * (s .* σ_X)
    obj_3 = -0.5 * (shifted' * DZ * shifted)

    @objective(model, Max, obj_1 + obj_2 + obj_3)

    optimize!(model)

    # Fail with a readable message instead of a low-level error from `value`.
    has_values(model) || error(
        "get_max_inner: the inner problem returned no solution " *
        "(termination status: $(termination_status(model)))",
    )

    # Recover β on the selected support
    u_Z_val = compute_u_Z(value.(λ_plus), value.(λ_minus), value.(μ))
    support = findall(isone, s)

    β_pred = zeros(p)
    if !isempty(support)
        X_s = X[:, support]
        # Solve the regularized normal equations directly rather than
        # forming the inverse and multiplying.
        β_pred[support] = (I / γ + X_s'X_s) \ (X_s'y - u_Z_val[support])
    end

    return β_pred, objective_value(model)

end

get_max_inner (generic function with 1 method)

In [207]:
"""
    compute_gradient(s; h=1e-10)

Evaluate the inner problem at `s` and approximate its gradient by backward
finite differences.

`s` is the stacked vector `[s_plus; s_minus]` of length `2p` (`p` is a global).
Component `i` of the gradient is `(f(s) - f(s - h*e_i)) / h`, where `f` is the
objective value returned by `get_max_inner`. This costs `2p + 1` solves.

# Keywords
- `h`: finite-difference step.
  NOTE(review): with a step this small the difference quotient may be
  dominated by solver tolerance and rounding; confirm the default is intended.

# Returns
`(β_pred, max_obj_value, ∇f)` where `β_pred` and `max_obj_value` come from the
solve at `s` and `∇f` is a `Vector{Float64}` of length `2p`.
"""
function compute_gradient(s; h=1e-10)

    β_pred, max_obj_value = get_max_inner(s[1:p], s[p+1:end])

    # Typed container: an untyped `[]` would make ∇f a Vector{Any}.
    ∇f = Float64[]
    sizehint!(∇f, 2p)

    for i in 1:2p
        # Copy of s with only component i shifted down by h
        s_bis = [s[j] - h * (i == j) for j in 1:2p]
        obj_bis = get_max_inner(s_bis[1:p], s_bis[p+1:end])[2]
        push!(∇f, (max_obj_value - obj_bis) / h)
    end

    return β_pred, max_obj_value, ∇f
end

compute_gradient (generic function with 1 method)

In [208]:
"""
    step_gradient_descent(s, α)

Take one normalized gradient-descent step of length `α` from `s`.

Returns `(β_pred, max_obj_value, s_next)` where `β_pred` and `max_obj_value`
are the values computed at `s` by `compute_gradient`, and
`s_next = s - α * ∇f / ‖∇f‖`. When the gradient is exactly zero there is no
descent direction, so `s_next` is a copy of `s`.
"""
function step_gradient_descent(s, α)
    β_pred, max_obj_value, ∇f = compute_gradient(s)

    ∇f_norm = sqrt(sum(abs2, ∇f))

    # Dividing by a zero norm would fill the step with NaNs and poison the
    # projection problem that consumes it.
    if iszero(∇f_norm)
        return β_pred, max_obj_value, copy(s)
    end

    return β_pred, max_obj_value, s - α * (∇f / ∇f_norm)
end

step_gradient_descent (generic function with 1 method)

In [209]:
"""
    step_gradient_descent_and_projection(s, α)

Take one gradient-descent step from `s` and project the result back onto the
feasible set of indicator vectors.

The projection minimizes the squared Euclidean distance to the gradient step
over binary vectors of length `2p`, subject to an SOS1 constraint on each pair
`(i, p+i)` and to `sum(s_proj) <= k` (`p` and `k` are globals).

# Returns
`(max_obj_value, β_pred, s_proj)`. Note the order: the objective value comes
first here. `max_obj_value` and `β_pred` are the values computed at the input
`s`; `s_proj` holds exact `0.0`/`1.0` entries.

# Throws
- `ErrorException` if the projection problem returns no solution.
"""
function step_gradient_descent_and_projection(s, α)

    # Gradient Descent step
    β_pred, max_obj_value, s_gd = step_gradient_descent(s, α)

    # Projection
    model = create_gurobi_model()
    @variable(model, s_proj[1:2p], Bin)
    for i in 1:p
        @constraint(model, s_proj[[i, p + i]] in MOI.SOS1([i, p + i]))
    end
    @constraint(model, sum(s_proj) <= k) # Sparsity
    @objective(model, Min, sum((s_proj[i] - s_gd[i])^2 for i in 1:2p))
    optimize!(model)

    has_values(model) || error(
        "step_gradient_descent_and_projection: the projection problem returned " *
        "no solution (termination status: $(termination_status(model)))",
    )

    # A MIP solver reports binaries only up to its integrality tolerance
    # (e.g. 0.9999999). Snap to exact 0/1 so that `== 1` support tests and
    # equality-based convergence checks on the result behave as intended.
    s_next = round.(value.(s_proj))

    return max_obj_value, β_pred, s_next

end

step_gradient_descent_and_projection (generic function with 1 method)

## 3. Get initial feasible solution

In [210]:
"""
    get_initial_solution()

Draw a random starting point `[s_plus; s_minus]` of length `2p` with exactly
`k` ones (`p` and `k` are globals).

`div(k, 2)` positions are sampled without replacement for `s_plus`; the
remaining `k - div(k, 2)` ones go to `s_minus`, sampled without replacement
from the positions not used by `s_plus`.
"""
function get_initial_solution()
    n_plus = div(k, 2)

    # Positions of the ones in the "plus" half
    plus_idx = sample(1:p, n_plus, replace = false)

    # Positions still free, in increasing order, for the "minus" half
    free_idx = setdiff(1:p, plus_idx)
    minus_idx = sample(free_idx, k - n_plus, replace = false)

    # Fill the stacked vector directly: first p entries are s_plus,
    # last p entries are s_minus.
    s_start = zeros(2p)
    s_start[plus_idx] .= 1
    s_start[p .+ minus_idx] .= 1

    return s_start
end

get_initial_solution (generic function with 1 method)

In [211]:
"""
    find_solution_gd_and_proj(s_init, α; max_iter=100)

Iterate gradient-descent-and-projection steps of size `α` from `s_init` until
the projected point stops changing, then return `(max_obj, β_pred)` evaluated
at the final point.

# Keywords
- `max_iter`: the loop is abandoned once more than `max_iter` steps have been
  taken without reaching a fixed point.
"""
function find_solution_gd_and_proj(s_init, α; max_iter=100)
    last_s = nothing
    new_s = s_init
    iter = 0

    while last_s === nothing || new_s != last_s
        iter += 1
        last_s = new_s
        _, _, new_s = step_gradient_descent_and_projection(last_s, α)

        if iter > max_iter
            break
        end
    end

    # Evaluate the inner problem once at the final point. This yields the same
    # objective and β as taking a further full step from `new_s`, without the
    # extra finite-difference solves and projection whose result was discarded.
    β_pred, max_obj = get_max_inner(new_s[1:p], new_s[p+1:end])

    return max_obj, β_pred
end

find_solution_gd_and_proj (generic function with 1 method)

In [212]:
"""
    run_random_restarts(n_restarts, α)

Run `find_solution_gd_and_proj` from `n_restarts` random starting points with
step size `α`, print the objective reached by each run, and return
`(best_max_obj, best_β_pred)` for the run with the smallest objective.

`best_β_pred` is `nothing` if no run produced an objective below `Inf`.
"""
function run_random_restarts(n_restarts, α)
    best_max_obj = Inf
    best_β_pred = nothing

    for _ in 1:n_restarts
        s_init = get_initial_solution()
        max_obj, β_pred = find_solution_gd_and_proj(s_init, α)
        if max_obj < best_max_obj
            best_β_pred = β_pred
            best_max_obj = max_obj
        end
        println(max_obj)
    end

    return best_max_obj, best_β_pred
end

# The search lives in a function so that the running best is an ordinary local:
# a top-level `for` loop that reassigns globals only works under the
# notebook/REPL soft-scope rule and breaks when this code is run as a script.
best_max_obj, best_β_pred = run_random_restarts(5, p)
hcat(best_β_pred, β_true)

-76812.20512034735
-76247.09677367916
-77026.464207153
-71674.10587870472
-77405.10701412671


20×2 Array{Float64,2}:
  -0.127416      -0.0    
   0.0            2.12786
   0.0            0.0    
 -10.5664       -10.6497 
   0.0           -2.09176
   0.0           -0.0    
  -0.208998       0.0    
  -2.96653       -2.96798
   5.66644        5.79304
   0.0           -0.0    
   0.0           -1.33145
 -11.5003       -11.5181 
  17.0218        17.3192 
   0.0           -0.0    
   9.64674        9.83614
  -5.80372       -5.71342
   7.6499         7.68876
  -0.000211793    0.0    
   5.19872        5.35873
   4.2078         4.37629