In [1]:
using Dates
print(Dates.today(), " ", Dates.Time(Dates.now()))

2019-12-02 19:58:13.947

In [2]:
#load needed modules
using JuMP
using CSV
using DecisionTree
using StatsBase
using DataFrames
using MLDataUtils

#note - Gurobi is not FOSS - licensing required!
#this model can be solved using any MIO solver compatible with JuMP
#see http://www.juliaopt.org/JuMP.jl/v0.20.0/installation/#Getting-Solvers-1
using Gurobi

#data path declaration
FILEDIR = "/home/sronk/Downloads/Machine_Learning_MSCA_31009/Homework/data/"

"/home/sronk/Downloads/Machine_Learning_MSCA_31009/Homework/data/"

In [3]:
#load wine file
csv = CSV.File(FILEDIR * "wine.data")
df = DataFrame(csv)

#shuffle observationns and split into train/test
df = shuffleobs(df)
train, test = splitobs(df, at = 0.7)

#extract features from input matrix
features = Matrix(select(train, Not(:Label)))

#extract labels from input matrix
labels = Array(train.Label)

#extract features from input matrix
test_features = Matrix(select(test, Not(:Label)))

#extract labels from input matrix
test_labels = Array(test.Label)

53-element Array{Int64,1}:
 3
 3
 2
 2
 1
 1
 2
 2
 2
 2
 2
 2
 2
 ⋮
 2
 1
 2
 3
 2
 2
 2
 3
 1
 1
 2
 3

In [4]:
#perform a column-wise unit range transformation on the feature space
features = transpose(standardize(UnitRangeTransform, transpose(features)))

125×13 LinearAlgebra.Transpose{Float64,Array{Float64,2}}:
 0.813158  0.150713   0.513369  0.319588   …  0.560976   0.558052   0.714693
 0.747368  0.236253   0.770053  0.453608      0.626016   0.775281   0.454351
 0.255263  0.547862   0.342246  0.43299       0.365854   0.644195   0.203994
 0.484211  0.788187   0.59893   0.561856      0.0487805  0.198502   0.247504
 0.671053  0.374745   0.71123   0.716495      0.211382   0.17603    0.336662
 0.394737  0.971487   0.684492  0.742268   …  0.276423   0.134831   0.169044
 0.110526  0.338086   0.566845  0.484536      0.260163   0.771536   0.247504
 0.710526  0.154786   0.716578  0.613402      0.577236   0.516854   0.71826 
 0.597368  0.199593   0.417112  0.329897      0.439024   0.539326   0.71826 
 0.7       0.513238   0.631016  0.484536      0.390244   0.183521   0.286733
 0.378947  0.158859   0.449198  0.43299    …  0.552846   0.486891   0.470043
 0.536842  0.154786   0.395722  0.252577      0.569106   0.509363   0.529244
 0.423684  0.12627

In [5]:
function makeAncestorDict(max_nodes)
    #initialize empty dictionaries
    A_left = Dict{Int64, Vector{Int64}}()
    A_right = Dict{Int64, Vector{Int64}}()
    #A_left[1] = [1]
    #A_right[1] = [1]
    #generate keys with empty array values for each node
    for i in 1:max_nodes
        A_left[i] = []
        A_right[i] = []
    end
    #loop over all nodes, copying the left and right ancestors of the node above it
    for i in 2:max_nodes
        left_ancestors = copy(A_left[i ÷ 2])
        right_ancestors = copy(A_right[i ÷ 2])
        direct_ancestor = i ÷ 2
        A_left[i] = left_ancestors
        A_right[i] = right_ancestors
        #add a left ancestor to even nodes
        if i/2 == i ÷ 2
            append!(left_ancestors, direct_ancestor)
            A_left[i] = left_ancestors
        #add a right ancestor to odd nodes
        else
            append!(right_ancestors, direct_ancestor)
            A_right[i] = right_ancestors
        end
    end
    return A_left, A_right
end

function makeYMatrix(labels)    
    #extract dimensions for Y from label array
    num_labels = length(unique(labels))
    len_df = length(labels)
    
    #initialize empty matrix
    Y = zeros(len_df, num_labels)
    
    #set all values to -1 - this will apply a penalty to incorrect predictions
    Y = Y  .- 1
    
    #iterate n over each column, setting Y[n,k] = 1 when the label for x[i] = k
    for k in 1:num_labels
        for n in 1:len_df
            if labels[n] == k
                Y[n,k] = 1
            end
        end
    end
    return Y
end

makeYMatrix (generic function with 1 method)

In [6]:
#set maximum depth of tree as a constant
max_depth = 2

#minimum number of values for a given leaf node
leaf_n_min = 3

#declare complexity parameter alpha
alpha = 1

#find total number of nodes in tree using max_depth, t
max_nodes = 2^(max_depth+1) - 1

#initialize branch and leaf node arrays - first, find the split point between branch and leaf indices
#split point is by definition the number of nodes integer divided by two
leaf_branch_split = max_nodes ÷ 2

#total number of branches
t_b = collect(1:leaf_branch_split)

#total number of leaves
t_l = collect(leaf_branch_split+1:max_nodes)


4-element Array{Int64,1}:
 4
 5
 6
 7

In [7]:
#pull number of samples in dataset, n
num_samples = size(features, 1)

#find total number of columns in the feature space, p
num_features = size(features, 2)

#find total number of labels, k
num_labels = length(unique(labels))

#create dictionary with prediction labels as key and count of each prediction as value
output_count = countmap(labels)

#extract the count for most common labebl to form l_hat, which is baseline accuracy rate
l_hat = sort(collect(output_count), by = tuple -> last(tuple), rev=true)[1,1][2]/length(labels)

#declare alpha
alpha = 0.1


4

In [8]:
#build ancestor dictionaries using definitions given above
A_left, A_right = makeAncestorDict(max_nodes)

#generate Y matrix
Y = makeYMatrix(labels)

#establish mu constant
mu = 0.005

#establish M constant
M = length(labels)

125

In [9]:
#initialize model
model = Model(with_optimizer(Gurobi.Optimizer, Presolve=0, OutputFlag=1))

#L is the loss for a given leaf node t
@variable(model, L[i = t_l])

#N_t is the total number of values in a leaf node t
@variable(model, N_t[i = t_l])

#c_kt is a matrix that holds the label count of each variable within a given leaf nodes
@variable(model, c_kt[i = 1:num_labels, j = t_l])

#constrain c_kt to binary values {0,1}
for k in 1:num_labels
    for t in t_l
        @constraint(model, c_kt[k,t] in MOI.ZeroOne())
    end
end

#N_kt is the number of points with label k in leaf node t
@variable(model, N_kt[i = 1:num_labels, j = t_l])

#l is a hot-coded array s/t l(t) = 1 when leaf node t contains any values 
@variable(model, l[t = t_l])

#contrain l to binary values {0,1}
for t in t_l
    @constraint(model, l[t] in MOI.ZeroOne())
end

#z is a hot-coded matrix captures which values are assigned to which node
@variable(model, z[i = 1:num_samples, t = t_l])

#constrain z to binary values {0,1}
for i in 1:num_samples
    for t in t_l
        @constraint(model, z[i,t] in MOI.ZeroOne())
    end
end

#b is the decision point for each branch node
#s/t a.T*x < b at a given split 
@variable(model, b[i=t_b])

#a is a hot-coded matrix that captures the variable being used to split at given branch node
@variable(model, a[j = 1:num_features, t = 1:leaf_branch_split])

#a_hat is an auxillary variable to linearize absolute value function
@variable(model, a_hat[j = 1:num_features, t = 1:leaf_branch_split])

#s tracks if variable j is used at branch node t
@variable(model, s[j = 1:num_features, t=1:leaf_branch_split])

#constrain s to binary values
for j in 1:num_features
    for t in t_b
        @constraint(model, s[j,t] in MOI.ZeroOne())
    end
end

#d is an indicator array equal to one when a split is applied at a given node
@variable(model, d[1:leaf_branch_split])

#constrain d to binary values
for t in t_b
    @constraint(model, d[t] in MOI.ZeroOne())
end


Academic license - for non-commercial use only


In [10]:
#7 (modified 13) - establish left split constraints - yes
for i in 1:num_samples  
    for t in t_l
        for m in A_left[t]
            @constraint(model, (transpose(a[:,m]) * features[i,:]) + mu  <= b[m] + (2 + mu)*(1 - z[i,t]))
        end
    end
end

In [11]:
#1 (20) - set loss function lower bound - yes
for k in 1:num_labels
    for t in t_l
        @constraint(model, L[t] >= N_t[t] - N_kt[k,t] - (M * (1 - c_kt[k,t])))
    end
end

#2 (21) - set loss function upper bound - yes
for k in 1:num_labels
    for t in t_l
        @constraint(model, L[t] <= N_t[t] - N_kt[k,t] + (M * c_kt[k,t]))
    end
end

#3 (22) - set all L values to be positive - yes
for t in t_l
   @constraint(model, L[t] >= 0) 
end

#4 (15) - establish values for N_kt[t] - yes
for k in 1:num_labels
    for t in t_l
        @constraint(model, N_kt[k,t] == 0.5 * sum((1 + Y[i,k])*z[i,t] for i = 1:num_samples))
    end
end

#5 (16) - establish values for N_t[t] as sum of z[i,t] for each t - yes
for t in t_l
    @constraint(model, N_t[t] == sum(z[i,t] for i = 1:num_samples))
end

#6 (18) - force prediction for each node with values - yes
for t in t_l
    @constraint(model, l[t] == sum(c_kt[k,t] for k = 1:num_labels))
end

#7 (modified 13) - establish left split constraints - yes
for i in 1:num_samples  
    for t in t_l
        for m in A_left[t]
            @constraint(model, (transpose(a[:,m]) * features[i,:]) + mu  <= b[m] + (2 + mu)*(1 - z[i,t]))
        end
    end
end

#8 (modified 14) - establish right split contraints - yes
for i in 1:num_samples
    for t in t_l
        for m in A_right[t]      
            @constraint(model, transpose(a[:,m]) * features[i,:] >= b[m] - 2*(1 - z[i,t]))
        end
    end
end

#9 (8) - constrain each point in data set so it can only be assigned to one leaf node - yes
for i in 1:num_samples
    @constraint(model, sum(z[i,t] for t in t_l) == 1)
end

#10 (6) - constrain predictions to only be fit into nodes containing points
for i in 1:num_samples
    for t in t_l
        @constraint(model, z[i,t] <= l[t])
    end
end

#11 (7)- constrain number of samples assigned to a given leaf by lower bound 
#s/t number of samples is always greater/equal to min leaf size constant
for t in t_l
    @constraint(model, sum(z[i,t] for i in 1:num_samples) >= leaf_n_min * l[t])
end

#12 - declare a_hat constraints s/t splits are only applied when a given branch is active
for t in t_b
    @constraint(model, sum(a_hat[j,t] for j in 1:num_features) <= d[t])
end

#13 & 14 - constrain a_hat to equal absolute value of a
for j in 1:num_features
    for t in t_b
        @constraint(model, a_hat[j,t] >= a[j,t])
        @constraint(model, a_hat[j,t] >= -1*a[j,t])
    end
end

#15 constrain b[t] values relative to d[t] values 
for j in 1:num_features
    for t in t_b
        @constraint(model, -1*s[j,t] <= a[j,t])
        @constraint(model, a[j,t] <= s[j,t])
    end
end

#16 constrain s relative to d[t] values
for j in 1:num_features
    for t in t_b
        @constraint(model, s[j,t] <= d[t])
    end
end

#17 constrain s sum-wise
for t in t_b
    @constraint(model, sum(s[j,t] for j in 1:num_features) >= d[t])
end

#18 expand previous constraints on b_t to include negative values
for t in t_b
    @constraint(model, -1 * d[t] <= b[t])
    @constraint(model, b[t] <= d[t])
end

#19 constrain branches s/t no splits can occur if a parent node doesn't also split
for t in 2:leaf_branch_split
    parent = t ÷ 2
    @constraint(model, d[t] <= d[parent])
end


In [12]:
@objective(model, Min, (1/l_hat) * sum(L[t] for t in t_l) 
    + alpha * sum(sum(s[j,t] for j in 1:num_features) for t in t_b))


2.6041666666666665 L[4] + 2.6041666666666665 L[5] + 2.6041666666666665 L[6] + 2.6041666666666665 L[7] + 4 s[1,1] + 4 s[2,1] + 4 s[3,1] + 4 s[4,1] + 4 s[5,1] + 4 s[6,1] + 4 s[7,1] + 4 s[8,1] + 4 s[9,1] + 4 s[10,1] + 4 s[11,1] + 4 s[12,1] + 4 s[13,1] + 4 s[1,2] + 4 s[2,2] + 4 s[3,2] + 4 s[4,2] + 4 s[5,2] + 4 s[6,2] + 4 s[7,2] + 4 s[8,2] + 4 s[9,2] + 4 s[10,2] + 4 s[11,2] + 4 s[12,2] + 4 s[13,2] + 4 s[1,3] + 4 s[2,3] + 4 s[3,3] + 4 s[4,3] + 4 s[5,3] + 4 s[6,3] + 4 s[7,3] + 4 s[8,3] + 4 s[9,3] + 4 s[10,3] + 4 s[11,3] + 4 s[12,3] + 4 s[13,3]

In [13]:
optimize!(model)


Academic license - for non-commercial use only
Optimize a model with 2386 rows, 659 columns and 25970 nonzeros
Variable types: 101 continuous, 558 integer (558 binary)
Coefficient statistics:
  Matrix range     [3e-03, 1e+02]
  Objective range  [3e+00, 4e+00]
  Bounds range     [0e+00, 0e+00]
  RHS range        [1e+00, 1e+02]
Variable types: 85 continuous, 574 integer (558 binary)

Root relaxation: objective 0.000000e+00, 560 iterations, 0.01 seconds

    Nodes    |    Current Node    |     Objective Bounds      |     Work
 Expl Unexpl |  Obj  Depth IntInf | Incumbent    BestBd   Gap | It/Node Time

     0     0    0.00000    0    3          -    0.00000      -     -    0s
     0     0    0.00000    0  263          -    0.00000      -     -    0s
     0     0    0.00000    0  263          -    0.00000      -     -    0s
     0     0    0.00000    0  229          -    0.00000      -     -    0s
     0     0    0.00000    0  227          -    0.00000      -     -    0s
     0     0    0.



InterruptException: InterruptException: