# Importing Packages

In [4]:
using Pkg

Pkg.add("Random")
Pkg.add("Statistics")
Pkg.add("DataFrames")
Pkg.add("CSV")
Pkg.add("Plots")

[32m[1m    Updating[22m[39m registry at `C:\Users\ultan\.julia\registries\General`
[32m[1m    Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `C:\Users\ultan\.julia\environments\v1.6\Project.toml`
[32m[1m  No Changes[22m[39m to `C:\Users\ultan\.julia\environments\v1.6\Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `C:\Users\ultan\.julia\environments\v1.6\Project.toml`
[32m[1m  No Changes[22m[39m to `C:\Users\ultan\.julia\environments\v1.6\Manifest.toml`
`DataFrames` is pinned at `v1.1.1`: maintaining pinned version
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `C:\Users\ultan\.julia\environments\v1.6\Project.toml`
[32m[1m  No Changes[22m[39m to `C:\Users\ultan\.julia\environments\v1.6\Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m

In [1]:
using CSV, Tables
using DataFrames
using Statistics
using Random

# Import Train and Validation Data

In [2]:
# Read the combined train/validation CSV into a DataFrame, taking the column
# names from the header row.
stable_all_train_val_proc = CSV.read(
    "Data/train_val_optimpute.csv", DataFrame; header=true
);

## Split the features (X) and the target (y)

In [3]:
# y is the outlet-sales target. It is selected by name (the same name used for
# the drop below) rather than by column position, so a reordered input file
# cannot silently yield the wrong target column.
stable_y = stable_all_train_val_proc[:, :Item_Outlet_Sales]

# Drop the target so only the predictors remain. `select!` works in place:
# afterwards stable_x and stable_all_train_val_proc are the same DataFrame.
stable_x = select!(stable_all_train_val_proc, Not(:Item_Outlet_Sales));

In [4]:
#stable_x = CSV.read("stable_.csv", DataFrame,header=true);
#stable_y = CSV.read("train_processed_Y.csv", DataFrame,header=true);
#stableX_test = CSV.read("test_processed_without_Y.csv", DataFrame,header=false);
#stableY_test = CSV.read("test_processed_Y.csv", DataFrame,header=false);

# Accuracy Functions

In [5]:
"""
    compute_mse(X, y, beta, beta0)

Return the mean squared error of the linear predictions
`beta0 .+ Matrix(X) * beta` against the observed responses `y`.

# Arguments
- `X`: predictors, one row per observation; converted with `Matrix(X)`.
- `y`: observed responses; converted with `Array(y)`.
- `beta`: coefficient vector with one entry per column of `X`.
- `beta0`: scalar intercept added to every prediction.

# Returns
The sum of squared residuals divided by the number of rows of `X`.
"""
function compute_mse(X, y, beta, beta0) #inc beta0
    n = size(X, 1)
    # The scalar intercept broadcasts over the predictions, so no n-element
    # intercept vector has to be materialized.
    residuals = beta0 .+ Matrix(X) * beta .- Array(y)
    return sum(abs2, residuals) / n
end

compute_mse (generic function with 1 method)

In [6]:
"""
    compute_r2(X, y, beta, beta0)

Return the coefficient of determination `1 - SSres / SStot` of the linear
predictions `beta0 .+ Matrix(X) * beta` against the observed responses `y`.

# Arguments
- `X`: predictors, one row per observation; converted with `Matrix(X)`.
- `y`: observed responses; converted with `Array(y)`.
- `beta`: coefficient vector with one entry per column of `X`.
- `beta0`: scalar intercept added to every prediction.

# Returns
`1 - SSres / SStot`, where `SSres` is the sum of squared residuals and `SStot`
is the sum of squared deviations of `y` from its mean. When `y` is constant,
`SStot` is zero and the result is not finite.
"""
function compute_r2(X, y, beta, beta0) #inc beta0
    observed = Array(y)
    # The intercept is part of the prediction, so the residual is
    # y - (X*beta + beta0); it must not be added back onto y - X*beta.
    predicted = beta0 .+ Matrix(X) * beta

    SSres = sum(abs2, observed .- predicted)
    SStot = sum(abs2, observed .- Statistics.mean(observed))
    return 1 - SSres / SStot
end

compute_r2 (generic function with 1 method)

In [7]:
using JuMP, Gurobi

In [8]:
# Create an empty JuMP model that uses Gurobi as its optimizer.
model = JuMP.Model(Gurobi.Optimizer)

Academic license - for non-commercial use only - expires 2022-08-18


A JuMP Model
Feasibility problem with:
Variables: 0
Model mode: AUTOMATIC
CachingOptimizer state: EMPTY_OPTIMIZER
Solver name: Gurobi

## Stable Regression

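Fit the stable regression with a 70% training share and identify the toughest training set, i.e. the 70% of observations with the largest absolute residuals; the remaining 30% form the validation set.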

In [9]:
# Gurobi environment created once and reused by every model that
# l2_regression2 builds.
gurobi_env = Gurobi.Env()

"""
    l2_regression2(X, y, rho, split_at; solver_output=0)

Fit a linear model by solving a linear program with Gurobi, then split the
observations into a training and a validation subset by absolute residual.

With `n, p = size(X)` and `k = floor(Int, split_at * n)`, the program is

    min   k*theta + sum(u) + rho*sum(a)
    s.t.  |y[i] - beta0 - X[i, :]' * beta| <= theta + u[i]   for every row i
          |beta[j]| <= a[j]                                  for every column j
          u >= 0

so the coefficients carry an absolute-value penalty weighted by `rho` and the
residuals enter through absolute values (despite the `l2` in the name).

After solving, rows are sorted by absolute residual. The `k` rows with the
largest residuals become the training subset and the other `n - k` rows become
the validation subset.

# Arguments
- `X`: predictors, one row per observation; must support `size(X)`,
  `Matrix(X)` and row indexing `X[rows, :]` (e.g. a DataFrame or a matrix).
- `y`: responses; must support `y[i, 1]`, `Array(y)` and `y[rows, :]`.
- `rho`: weight of the coefficient penalty.
- `split_at`: fraction of rows, in `[0, 1]`, assigned to the training subset.

# Keywords
- `solver_output=0`: value passed to Gurobi's `OutputFlag` attribute.

# Returns
A tuple `(beta0, beta, errors, x_valid_set, y_valid_set, x_training_set,
y_training_set)`: the fitted intercept, the fitted coefficient vector, the
absolute residual of every row, and the two subsets of `X` and `y`.

# Throws
- `ArgumentError` if `split_at` is outside `[0, 1]`.
- `ErrorException` if the solver finishes without a solution.
"""
function l2_regression2(X, y, rho, split_at; solver_output=0)
    0 <= split_at <= 1 ||
        throw(ArgumentError("split_at must lie in [0, 1], got $split_at"))

    n, p = size(X)
    k = floor(Int, split_at * n)  # number of rows in the training subset

    # Convert the predictors once instead of materializing each row again
    # inside every constraint.
    Xmat = Matrix(X)

    # Build model. A factory closure replaces the deprecated `with_optimizer`
    # and still routes every solve through the shared environment.
    model = Model(() -> Gurobi.Optimizer(gurobi_env))
    set_optimizer_attribute(model, "OutputFlag", solver_output)

    # Insert variables
    @variable(model, beta[j=1:p])
    @variable(model, beta0)
    @variable(model, theta)
    @variable(model, a[j=1:p])
    @variable(model, u[i=1:n] >= 0)  # the bound already enforces u >= 0
    @variable(model, d >= 0)

    # a[j] >= |beta[j]|
    @constraint(model, [j=1:p], beta[j] <= a[j])
    @constraint(model, [j=1:p], -beta[j] <= a[j])

    # theta + u[i] >= |residual of row i|
    @expression(
        model,
        resid[i=1:n],
        y[i, 1] - beta0 - sum(Xmat[i, j] * beta[j] for j in 1:p)
    )
    @constraint(model, [i=1:n], resid[i] <= theta + u[i])
    @constraint(model, [i=1:n], -resid[i] <= theta + u[i])

    # NOTE(review): d bounds |beta0| but does not appear in the objective, so
    # these two constraints do not restrict the solution. Kept as in the
    # original; confirm whether an intercept penalty was intended.
    @constraint(model, beta0 <= d)
    @constraint(model, -beta0 <= d)

    # Objective
    @objective(
        model,
        Min,
        k * theta + sum(u[i] for i in 1:n) + rho * sum(a[j] for j in 1:p)
    )

    # Optimize
    optimize!(model)
    has_values(model) || error(
        "l2_regression2: solver returned no solution " *
        "(termination status: $(termination_status(model)))",
    )

    # Extract the estimates under new names so the JuMP variables are not
    # shadowed by plain numbers.
    beta0_hat = value(beta0)
    beta_hat = value.(beta)

    # Absolute residual of every row.
    errors = abs.(beta0_hat .+ Xmat * beta_hat .- Array(y))

    # Row indices ordered from smallest to largest absolute residual.
    indicies = sortperm(errors[:, 1])

    valid_indices = indicies[1:n-k]        # the n - k best-fitted rows
    training_indices = indicies[1+n-k:n]   # the k worst-fitted rows

    x_valid_set, y_valid_set = X[valid_indices, :], y[valid_indices, :]
    x_training_set, y_training_set = X[training_indices, :], y[training_indices, :]

    return (
        beta0_hat,
        beta_hat,
        errors,
        x_valid_set,
        y_valid_set,
        x_training_set,
        y_training_set,
    )
end

Academic license - for non-commercial use only - expires 2022-08-18


l2_regression2 (generic function with 1 method)

### EXPORT TOUGHEST SUBSET 

In [10]:
# Fit with rho = 0.01 and a 0.7 training share, keeping the coefficients, the
# per-row errors, and the validation/training subsets (the toughest training set).
(beta0, beta, errors,
 x_valid_set, y_valid_set,
 x_training_set, y_training_set) = l2_regression2(stable_x, stable_y, 0.01, 0.7);


In [12]:
using Tables

# Training subset: predictors are written with their column names, the
# response is wrapped as a table and written without a header row.
CSV.write("Data/x_training_stable_optimpute.csv", x_training_set)
CSV.write(
    "Data/y_training_stable_optimpute.csv",
    Tables.table(y_training_set);
    writeheader=false,
)

# Validation subset, written the same way.
CSV.write("Data/x_valid_set_optimpute.csv", x_valid_set)
CSV.write(
    "Data/y_valid_set_optimpute.csv",
    Tables.table(y_valid_set);
    writeheader=false,
)

"Data/y_valid_set_optimpute.csv"