## Import Packages

In [1]:
using Gurobi, CSV, DataFrames, JuMP, LinearAlgebra, Distributions, Random, GLMNet

## Read in Data 

### training and validation set created from stable regression (see notebook)

In [2]:
# Training features: the first row of the file supplies the column names.
trainx = CSV.read("Data/x_training_stable_optimpute.csv", DataFrame)
# Training response: read with header=0, i.e. no row is treated as a header.
trainy = CSV.read("Data/y_training_stable_optimpute.csv", DataFrame; header=0);


### Convert data to matrix and vector form 

In [3]:
# Dense feature matrix built from the training DataFrame.
trainxmat = Matrix(trainx)
# The response is the first (only used) column of `trainy`, as a plain Vector.
trainyarr = Vector(trainy[:, 1]);

### Subset the first four (numeric) columns; these are the columns we transform later

In [4]:
# Columns 1-4 are the ones that get transformed later on.
trainxmat_modified = trainxmat[:, 1:4]
# Every remaining column is carried through unchanged.
trainxmat_others = trainxmat[:, 5:end];


## Holistic Regression

In [5]:
"""
    holistic(x, y, weirdr, K, M; cor_threshold=0.7)

Build and solve the holistic-regression mixed-integer model with Gurobi.

The model minimises `0.5 * sum(t.^2) + weirdr * sum(q)`, where `t[i]` bounds the
absolute residual of observation `i` and `q[j]` bounds `abs(beta[j])`, subject to:

- at most `K` selected features (`sum(z) <= K`);
- big-M linking `-M*z[j] <= beta[j] <= M*z[j]`, so unselected coefficients are zero;
- exactly one selected column in each of the first four groups of four consecutive
  columns (columns 1-16);
- column 5 is always selected;
- columns `i`, `j` whose sample correlation exceeds `cor_threshold` are never
  selected together.

# Arguments
- `x`: `n x p` feature matrix with `p >= 16`.
- `y`: response, indexed linearly as `y[1]..y[n]`.
- `weirdr`: weight of the `sum(q)` penalty in the objective.
- `K`: maximum number of selected features.
- `M`: big-M bound on the magnitude of each coefficient.
- `cor_threshold`: correlation above which two columns are mutually exclusive.

# Returns
`(obj, beta0, betas, z)`: objective value, intercept, coefficient vector and the
solver's values for the selection variables.

# Throws
- `ArgumentError` if `x` has fewer than 16 columns.
- `ErrorException` if the solver finishes without a primal solution.
"""
function holistic(x, y, weirdr, K, M; cor_threshold=0.7)
    n, p = size(x)

    # Layout assumed by the group constraints below: the leading columns of `x`
    # are `ngroups` blocks of `ntransforms` consecutive columns.
    ngroups = 4
    ntransforms = 4
    p >= ngroups * ntransforms || throw(ArgumentError(
        "x must have at least $(ngroups * ntransforms) columns, got $p"))

    m = Model(Gurobi.Optimizer)
    set_optimizer_attribute(m, "OutputFlag", 0)

    @variable(m, beta[1:p])
    @variable(m, beta0)
    @variable(m, z[1:p], Bin)
    @variable(m, q[1:p] >= 0)
    @variable(m, t[1:n] >= 0)

    # Sparsity budget.
    @constraint(m, sum(z) <= K)
    # Exactly one column is kept from each leading block.
    @constraint(m, [g=1:ngroups], sum(z[(g-1)*ntransforms+j] for j=1:ntransforms) == 1)
    @constraint(m, z[5] == 1) #ensure visibility feature chosen

    # Big-M link between coefficients and selection indicators; q[i] >= |beta[i]|.
    @constraint(m, [i=1:p], -M*z[i] <= beta[i])
    @constraint(m, [i=1:p], beta[i] <= M*z[i])
    @constraint(m, [i=1:p], beta[i] <= q[i])
    @constraint(m, [i=1:p], -beta[i] <= q[i])

    # Pairwise exclusion of highly correlated columns. Views avoid copying two
    # columns for every pair.
    # NOTE(review): only positive correlation is tested; strongly negatively
    # correlated columns can still be selected together -- confirm this is intended.
    for i in 2:p, j in 1:(i-1)
        if cor(view(x, :, i), view(x, :, j)) > cor_threshold
            @constraint(m, z[i] + z[j] <= 1)
        end
    end

    # t[i] >= |y[i] - x[i,:]'beta - beta0|.
    @constraint(m, [i=1:n], (y[i] - sum(x[i,j]*beta[j] for j=1:p) - beta0) <= t[i])
    @constraint(m, [i=1:n], -(y[i] - sum(x[i,j]*beta[j] for j=1:p) - beta0) <= t[i])

    @objective(m, Min, 0.5 * sum(t[i]^2 for i=1:n) + weirdr * sum(q[j] for j=1:p))
    optimize!(m)

    # Fail with a clear message instead of an opaque error from the value queries.
    if !has_values(m)
        error("holistic: solver returned no solution " *
              "(termination status: $(termination_status(m)))")
    end

    obj = JuMP.objective_value(m)
    intercept = JuMP.value(beta0)
    betas = JuMP.value.(beta)
    selected = JuMP.value.(z)

    return obj, intercept, betas, selected
end

holistic (generic function with 1 method)

## Adding transformations of our 4 numeric columns

In [6]:
#add original variables back in that were not transformed
#trainxmat = the original 4 numeric columns

"""
    creatematrices(trainxmat, eps)

Expand every column of `trainxmat` into four columns.

For input column `i`, output columns `4(i-1)+1 .. 4(i-1)+4` hold, in order:
the column itself, its square, `sqrt(abs(x))`, and `log(abs(x) + eps)`.

# Arguments
- `trainxmat`: `n x p` numeric matrix.
- `eps`: offset added inside the logarithm so that zero entries do not give `-Inf`.

# Returns
An `n x 4p` `Float64` matrix.
"""
function creatematrices(trainxmat,  eps)
    ntrain, ptrain = size(trainxmat)
    modtrainxmat = zeros(ntrain, 4 * ptrain)

    # Iterate over every input column (not a fixed count of 4) so the loop always
    # fills exactly the 4*ptrain columns allocated above.
    for i in 1:ptrain
        col = view(trainxmat, :, i)
        offset = 4 * (i - 1)
        modtrainxmat[:, offset + 1] = col
        modtrainxmat[:, offset + 2] = col .^ 2
        modtrainxmat[:, offset + 3] = sqrt.(abs.(col))
        modtrainxmat[:, offset + 4] = log.(abs.(col) .+ eps)
    end

    return modtrainxmat
end

creatematrices (generic function with 1 method)

In [7]:
using Gurobi, CSV, DataFrames, JuMP, LinearAlgebra, Distributions, Random, GLMNet

### Getting the full data (post transformation)



In [8]:
# Expand the four numeric training columns into their transformed versions.
# Note that the log offset 10e-6 equals 1e-5.
first_four_transformed = creatematrices(trainxmat_modified, 10e-6)
# Full design matrix: transformed columns first, untouched columns after them.
exttrainxmat = hcat(first_four_transformed, trainxmat_others);

## Cross validation function

In [9]:
M = 100          # big-M bound passed to `holistic`
r = 0.1          # NOTE(review): not used by any cell visible here
K = 10           # maximum number of selected features
n, p = size(exttrainxmat)
numfolds = 10

# Fixed seed so the fold assignment (and therefore the selected gamma) is reproducible.
Random.seed!(1234)
# Balanced assignment: every fold label appears floor(n/numfolds) or
# ceil(n/numfolds) times, so no fold can end up empty as it could when labels
# are drawn independently with replacement.
foldsamps = shuffle!([mod1(i, numfolds) for i in 1:n])

# Fit `holistic` on all rows whose fold label differs from `fold` and score the
# fit on the rows labelled `fold`.
#
# Reads the globals `exttrainxmat`, `trainyarr`, `foldsamps`, `n` and `K`.
# `rho` is the penalty weight and `M` the big-M bound handed to `holistic`.
# Returns (beta0, betas, z, trainvalx, trainvaly, mse).
function cross_val(rho, fold,M)
    row_ids = 1:n
    fit_rows = filter(i -> foldsamps[i] != fold, row_ids)
    holdout_rows = filter(i -> foldsamps[i] == fold, row_ids)

    traintrainx = exttrainxmat[fit_rows, :]
    trainvalx = exttrainxmat[holdout_rows, :]
    # Indexing the response vector with a trailing `:` yields one-column matrices.
    traintrainy = trainyarr[fit_rows, :]
    trainvaly = trainyarr[holdout_rows, :]
    print("D")

    print("A")
    obj, beta0, betas, z = holistic(traintrainx, traintrainy, rho, K, M)
    print("B")

    fitted = Matrix(trainvalx) * betas
    predictions = fitted .+ beta0
    squared_errors = (predictions .- trainvaly).^2
    mse = mean(squared_errors)

    return (beta0, betas, z, trainvalx, trainvaly, mse)
end

cross_val (generic function with 1 method)

### Doing 10 fold  Cross validation to find the best gamma

In [26]:
# Candidate penalty weights.
gamma = collect(0:.01:.2)

# avg_mse_gamma[k] is the mean validation MSE over all folds for gamma[k].
avg_mse_gamma = Float64[]
for g in gamma
    mse_gamma_fold = Float64[]
    # Use the configured fold count and big-M bound rather than repeating the
    # literals 10 and 100, so this loop cannot drift out of sync with `foldsamps`.
    for f in 1:numfolds
        mse = last(cross_val(g, f, M))   # mse is the last element of the returned tuple
        push!(mse_gamma_fold, mse)
    end
    push!(avg_mse_gamma, mean(mse_gamma_fold))
end
        

DAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcad

BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAcademic license - for non-commercial use only - expires 2022-08-18
BDAAca

## Get the best gamma (0.08)

In [31]:
# Position of the smallest cross-validated MSE, then the gamma at that position.
_, best_gamma_idx = findmin(avg_mse_gamma)
best_gamma = gamma[best_gamma_idx]

#best gamma is 0.08

0.08

## Get the betas for the best gamma

In [10]:
# Refit on the full training set with the cross-validated penalty weight.
# If the cross-validation cells were skipped in this session, fall back to 0.08,
# the value found in the recorded run.
final_rho = @isdefined(best_gamma) ? best_gamma : 0.08
# Use the K and M set for cross-validation so the final model matches the tuned one.
obj, beta0, betas, z = holistic(exttrainxmat, trainyarr, final_rho, K, M);

Academic license - for non-commercial use only - expires 2022-08-18


## Read in the actual validation set (from stable regression/see notebook)

In [11]:
# Validation features as a dense matrix, split like the training matrix:
# columns 1-4 get transformed, the rest pass through unchanged.
x_val = Matrix(CSV.read("Data/x_valid_set_optimpute.csv", DataFrame))
x_val_first_four = x_val[:, 1:4]
x_val_others = x_val[:, 5:end]

# Note: this rebinds `first_four_transformed`, replacing the training-set value.
first_four_transformed = creatematrices(x_val_first_four, 10e-6)
x_val_final = hcat(first_four_transformed, x_val_others)

# Validation response, kept as a DataFrame (read with header=0).
y_val = CSV.read("Data/y_valid_set_optimpute.csv", DataFrame; header=0);

## Get the predictions on the actual validation set

In [12]:
# Linear predictions for the validation rows: X * betas plus the intercept.
predictions = x_val_final * betas .+ beta0;

In [13]:
# Indices of the selected features. The solver returns binaries as floats that are
# only integral up to its tolerance (for example 0.9999999), so threshold at 0.5
# instead of testing for exact equality with 1.
nonzerob = findall(>(0.5), z)
nonzerob

10-element Vector{Int64}:
  2
  5
  9
 14
 34
 36
 37
 38
 42
 44

In [14]:
# R^2 of `predictions` against `actual`, with `mean_train` as the baseline
# prediction in the total sum of squares: 1 - SSres / SStot.
# Both squared-difference tables go through `Matrix` before being summed.
function compute_r2(predictions, actual, mean_train)
    residual_sq = Matrix((actual .- predictions).^2)
    baseline_sq = Matrix((actual .- mean_train).^2)

    ss_res = sum(residual_sq)
    ss_tot = sum(baseline_sq)

    return 1 - ss_res / ss_tot
end

# Mean squared error: the sum of squared differences between `predictions` and
# `actual`, divided by the number of rows of `actual`.
function compute_MSE(predictions, actual)
    squared_diffs = Matrix((predictions .- actual).^2)
    nrows = size(actual, 1)
    return sum(squared_diffs) / nrows
end

compute_MSE (generic function with 1 method)

## Get r squared

In [15]:
# R^2 on the validation set; the baseline in the total sum of squares is the
# mean of the training response.
compute_r2(predictions, y_val, mean(trainyarr))

0.617076973174755

## Get MSE

In [16]:
# Mean squared error of the validation-set predictions.
compute_MSE(predictions, y_val)

689016.2993123356

In [17]:
# Stack the training rows on top of the validation rows.
# Currently only train + validation; the test set still has to be added.
# Column 17 is outlet 13.
full_x = vcat(exttrainxmat, x_val_final)
size(full_x)

(8523, 55)

### Get Store 13

In [18]:
# Rows whose column-17 entry equals 1.
store_13idx = findall(==(1), full_x[:, 17])
store_mat = full_x[store_13idx, :]
# Column 9 is the untransformed third numeric column in the creatematrices layout.
# NOTE(review): presumably the item MRP, going by the variable name -- confirm.
MRP = store_mat[:, 9]

CSV.write("Store/store_13_impute.csv", Tables.table(store_mat))

"Data/store_13_impute.csv"

In [19]:
# Intercept first, then the feature coefficients, written as a single
# header-less column.
all_betas = vcat(beta0, betas)
all_betas = Tables.table(reshape(all_betas, :, 1))
CSV.write("Data/betas.csv", all_betas; writeheader=false)

"Data/betas.csv"