## Import Packages

In [240]:
using Gurobi, CSV, DataFrames, JuMP, LinearAlgebra, Distributions, Random, GLMNet

## Read in Data 

### training and validation set created from stable regression (see notebook)

In [241]:
trainx = CSV.read("Data/x_training_stable.csv",DataFrame)
trainy = CSV.read("Data/y_training_stable.csv",DataFrame,header=0);

### Convert data to matrix and vector form 

In [242]:
trainxmat = Matrix(trainx)
trainyarr = Vector(trainy[:,1]);

### Subset the first four columns (numeric) we do transformation on our numeric columns later

In [243]:
trainxmat_modified=trainxmat[:,1:4]
trainxmat_others=trainxmat[:,5:size(trainxmat)[2]]
;


## Holistic Regression

In [244]:
function holistic(x, y, weirdr, K, M)
    
    n, p = size(x)

    m = Model(Gurobi.Optimizer)
    
    set_optimizer_attribute(m, "OutputFlag", 0) 
    
    @variable(m, beta[1:p])
    @variable(m, beta0)
    @variable(m, z[1:p], Bin)
    @variable(m, q[1:p] >= 0)
    @variable(m, t[1:n] >= 0)
    
    @constraint(m, sum(z) <= K)
    pby4 = trunc(Int, 4) ##CHECK IF NOT WORKS +++++++++++++++++++++++++++++++++++++++ 4 or 1
    @constraint(m, [i=1:pby4], sum(z[(i-1)*4+j] for j=1:4) == 1)
    @constraint(m, z[5] == 1) #ensure visibility feature chosen
    
    @constraint(m, [i=1:p], -M*z[i] <= beta[i])
    @constraint(m, [i=1:p], beta[i] <= M*z[i])
    @constraint(m, [i=1:p], beta[i] <= q[i])
    @constraint(m, [i=1:p], -beta[i] <= q[i])

    for i=2:p
        for j=1:(i-1)
            if (cor(x[:,i],x[:,j]) > 0.7)
                @constraint(m, z[i]+z[j] <= 1)
            end
        end
    end
    
    @constraint(m, [i=1:n], (y[i] - sum(x[i,j]*beta[j] for j=1:p) - beta0) <= t[i])
    @constraint(m, [i=1:n], -(y[i] - sum(x[i,j]*beta[j] for j=1:p) - beta0) <= t[i])
    
    @objective(m, Min, 0.5 * sum(t[i]^2 for i=1:n) + weirdr * sum(q[j] for j=1:p))
    optimize!(m)
        
    obj = JuMP.objective_value(m)
    beta0 = JuMP.value(beta0)
    betas = JuMP.value.(beta)
    z = JuMP.value.(z)
        
    return obj, beta0, betas, z
end

holistic (generic function with 2 methods)

## Adding transformations of our 3 numeric columns

In [245]:
function creatematrices(trainxmat,  eps)
    ntrain, ptrain = size(trainxmat)
    #ntest, ptest = size(testxmat)

    modtrainxmat = zeros(ntrain, 4*ptrain)
    #modtestxmat = zeros(ntest, 4*ptest)

    for i = 1:(4)
        modtrainxmat[:,4*(i-1)+1] = trainxmat[:,i]
        modtrainxmat[:,4*(i-1)+2] = trainxmat[:,i].^2
        modtrainxmat[:,4*(i-1)+3] = sqrt.(abs.(trainxmat[:,i]))
        modtrainxmat[:,4*(i-1)+4] = log.(abs.(trainxmat[:,i]) .+ eps)


        end

    return modtrainxmat #, modtestxmat
end

creatematrices (generic function with 1 method)

In [246]:
using Gurobi, CSV, DataFrames, JuMP, LinearAlgebra, Distributions, Random, GLMNet

### Getting the full data (post fransformation)



In [247]:
first_four_transformed = creatematrices(trainxmat_modified,  10e-6) #removed tesstxmat
exttrainxmat=hcat(first_four_transformed, trainxmat_others)

#Export it just in case the kernel is restarted
using Tables
CSV.write("Data/exttrainxmat.csv",  Tables.table(exttrainxmat), writeheader=false)


"Data/exttrainxmat.csv"

### Line to run if things break (delete when everything works well!)

In [248]:
trainy = CSV.read("Data/y_training_stable.csv",DataFrame,header=0)
trainyarr = Vector(trainy[:,1])

exttrainxmat= CSV.read("Data/exttrainxmat.csv",DataFrame,header=0)
;

## Cross validation function

In [249]:
M=100
r=0.1#

n, p = size(exttrainxmat)
numfolds = 10
foldsamps = sample(1:numfolds, n, replace = true)

function cross_val(rho, fold,M)
   # first_four_transformed = creatematrices(trainxmat_modified,  10e-6) #removed tesstxmat
   # exttrainxmat=hcat(first_four_transformed, trainxmat_others)


   


    traintrainx = exttrainxmat[[x for x in 1:n if foldsamps[x] != fold],:]
    trainvalx = exttrainxmat[[x for x in 1:n if foldsamps[x] == fold],:]
    traintrainy = trainyarr[[x for x in 1:n if foldsamps[x] != fold],:]
    trainvaly = trainyarr[[x for x in 1:n if foldsamps[x] == fold],:]
    print("D")


    print("A")
    obj, beta0, betas, z = holistic(traintrainx, traintrainy, rho, 10 M)
    print("B")
    predictions = Matrix(trainvalx) * betas .+ beta0
    mse = mean((predictions .- trainvaly).^2)
    #savemses[j] = savemses[j] + mse
    
    return(beta0, betas, z, trainvalx, trainvaly, mse)
end

LoadError: syntax: missing comma or ) in argument list

### Doing 10 fold  Cross validation to find the best gamma

In [133]:
gamma = collect(0:.01:.2)

avg_mse_gamma = []
for g in gamma  
    mse_gamma_fold = []
    for f in 1:10
        beta0, betas, z, trainvalx, trainvaly, mse = cross_val(g, f, 100);
        push!(mse_gamma_fold, mse)
    end
    push!(avg_mse_gamma, mean(mse_gamma_fold))
end
        

DAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18


LoadError: SYSTEM: show(lasterr) caused an error

## Get the best gamma (0.08)

In [134]:
best_gamma_idx = argmin(avg_mse_gamma)
best_gamma = gamma[best_gamma_idx]

#best gamma is 0.08

LoadError: ArgumentError: collection must be non-empty

## Get the betas for the best gamma

In [251]:
obj, beta0, betas, z = holistic(exttrainxmat, trainyarr, 0.08, 10, M)
;

Academic license - for non-commercial use only - expires 2022-08-18


## Read in the actual validation set (from stable regression/see notebook)

In [252]:
x_val = CSV.read("Data/x_valid_set", DataFrame)
x_val = Matrix(x_val)
x_val_first_four = x_val[:,1:4]
x_val_others = x_val[:,5:size(x_val)[2]]


first_four_transformed = creatematrices(x_val_first_four,  10e-6) #removed tesstxmat
x_val_final=hcat(first_four_transformed, x_val_others)




y_val = CSV.read("Data/y_valid_set", DataFrame,header=0);

## Get the predictions on the actual validation set

In [253]:
#predictions = trainvalx * betas .+ beta0
predictions = Matrix(x_val_final) * betas  .+ beta0
;

In [254]:
nonzerob = findall(x->x==1,z)
betas[nonzerob]
nonzerob

10-element Vector{Int64}:
  2
  5
  9
 14
 20
 22
 24
 44
 46
 52

In [255]:
function compute_r2(predictions, actual, mean_train)
    
    SSres = sum(Matrix((actual .- predictions).^2))
    SStot = sum(Matrix((actual .- mean_train).^2))
    1-SSres/SStot

    return 1-SSres/SStot
end

function compute_MSE(predictions, actual)
    sum_squared_error = sum(Matrix((predictions .- actual).^2))
    MSE = sum_squared_error/size(actual)[1]
    
    return(MSE)
    
end

compute_MSE (generic function with 1 method)

## Get r squared

In [256]:
using Statistics
compute_r2(predictions, y_val, mean(trainyarr))

0.6533891519701033

In [257]:
full_x = vcat(Matrix(exttrainxmat), Matrix(x_val_final))
#CURRENTLY ONLY TRAIN +VALIDATION GOT TO ADD IN THE TEST SET. 
size(full_x)
#column 17 is outlet 13

(8523, 63)

## Retrain on the full training set

In [258]:
#trainyarr = Vector(trainy[:,1]);
valyarr = Vector(y_val[:,1])


full_y_arr = vcat(trainyarr, valyarr)

;

In [232]:
obj, beta0, betas, z = holistic(full_x, full_y_arr, 0.08, 10, M)

Academic license - for non-commercial use only - expires 2022-08-18


(7.505565243973097e9, -510.7681021152996, [0.0, 0.0, 0.0, -9.868355864470479, -100.0, 0.0, 0.0, 0.0, 15.60769360261643, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0  …  -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0])

In [259]:
betas
## different bets getting selected form train and vali veruss full test set. 

63-element Vector{Float64}:
    0.0
   -0.05021890270160881
    0.0
    0.0
 -100.0
    0.0
    0.0
    0.0
   16.212020226789694
    0.0
    0.0
    0.0
    0.0
    ⋮
   46.32016976614781
    0.0
    0.0
    0.0
    0.0
    0.0
    0.0
    0.0
    0.0
    0.0
    0.0
    0.0

## Get store 13

In [260]:
store_13idx = findall(x->x== 1, full_x[:,17])
store_mat = full_x[store_13idx,:]
MRP = store_mat[:,9]

CSV.write("Store/store_13.csv", Tables.table(store_mat))


"Store/store_13.csv"

In [263]:
store_mat

932×63 Matrix{Float64}:
 10.1    102.01    3.17805  2.31254  …  0.0  0.470112    605.877   2.79218
 20.25   410.062   4.5      3.00816     0.0  0.299316   3924.78    2.86481
 16.5    272.25    4.06202  2.80336     0.0  1.31418    1673.65    8.07887
 18.85   355.323   4.34166  2.93651     0.0  2.60489    4728.94   34.6681
  7.105   50.481   2.66552  1.9608      0.0  0.318101    413.91    2.60822
 11.35   128.822   3.36898  2.42922  …  0.0  0.0627191  1909.97    0.929895
 12.15   147.623   3.48569  2.49733     0.0  0.799027   1429.02    7.73478
  9.6     92.16    3.09839  2.26176     0.0  0.0642483  1583.22    1.10372
  5.98    35.7604  2.4454   1.78842     0.0  0.451461    331.061   4.17951
 11.6    134.56    3.40588  2.45101     0.0  0.474577   1650.86    5.82238
  9.195   84.548   3.03233  2.21866  …  0.0  0.432913    470.477   2.409
 12.3    151.29    3.50714  2.5096      0.0  0.853644   1328.32    7.49498
 20.7    428.49    4.54973  3.03013     0.0  1.00847     802.137   1.88786
  ⋮

In [261]:
all_betas = vcat(beta0, betas)
all_betas = Tables.table(reshape(all_betas, length(all_betas), 1))
CSV.write("Data/betas_naive.csv", all_betas, writeheader= false)

"Data/betas_naive.csv"

# Get MSE

In [262]:
compute_MSE(predictions, y_val)

624211.7316713276