# SARAH+ with fine-tuned hyper-parameters

In [None]:
import os
import sys

# Add the parent of the current working directory to the module search path,
# so that packages located one level above this notebook can be imported.
path = os.getcwd()
parent_path = os.path.abspath(os.path.join(path, os.pardir))
sys.path.append(parent_path)

import random

import pprint as pp
import numpy as np
import time
import os
import shutil
from numpy import genfromtxt
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable

# Global PyTorch settings for the whole notebook:
# use float64 as the default floating-point dtype for newly created tensors ...
torch.set_default_dtype(torch.float64)
# ... and restrict PyTorch to a single CPU thread.
torch.set_num_threads(1) #cpu num

import itertools
import numpy.linalg  as lin

import cProfile, pstats

from collections import OrderedDict

from Sparse_Init.sparseinit import *    
from Sparse_Init.sparsedata import *
from Sparse_Init.sparsemodule import * 
from sklearn.preprocessing import normalize

# Use the first CUDA device when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

# Configuration

In [None]:
# Experiment configuration.
algo = 'plus'        # algorithm
dname = 'rcv1'       # dataset name
BS = 64              # mini-batch size
StrongConvex = True  # L2 regularization
# tag of the problem variant: 'reg' with L2 regularization, 'non_reg' without
case = 'reg' if StrongConvex else 'non_reg'

# Load data - users need to download the datasets from LIBSVM
# generate data

In [None]:
# specify data directory
datafolder = '../Data/'+dname+'/' # please download libsvm dataset to this folder before executing this code
# Specify directory to save log files - optional
logfolder = '../Logs/'+dname+'/'+case+'/'+algo+'/'
# to run all hyper-parameters, please use the following log folders
# logfolder = '../AllLogs/'+dname+'/'+case+'/'+algo+'/'

# Create the log folder (and any missing parents). exist_ok=True replaces the
# separate "exists?" check, so two runs started at the same time cannot fail
# on the window between the check and the creation.
os.makedirs(logfolder, exist_ok=True)
    

# dataset files - need to be downloaded from LIBSVM website
# Single-file datasets set `file`; datasets shipped with a separate test split
# set `trfile` / `tefile`.
if dname == 'covtype':
    file = datafolder+'covtype.libsvm.binary.scale.bz2'

if dname == 'ijcnn1':
    trfile = datafolder+'ijcnn1.bz2'
    tefile = datafolder+'ijcnn1.t.bz2'

if dname == 'rcv1':
    trfile = datafolder+'rcv1_train.binary.bz2'
    tefile = datafolder+'rcv1_test.binary.bz2'

if dname == 'news20':
    file = datafolder+'news20.binary.bz2'

if dname == 'real-sim':
    file = datafolder+'real-sim.bz2'


# Choose the loading path from the dataset name instead of a bare try/except:
# the bare except hid real failures (missing file, parse error, Ctrl-C) behind a
# NameError on `trfile`, and could load a stale `file` left over from an earlier
# notebook run with a different dataset.
if dname in ('covtype', 'news20', 'real-sim'):
    # one file holding the whole dataset
    data = SparseData(dname,device,file=file)
    csr = data.read()
    # normalize in place; csr[0] is presumably the sparse feature matrix - TODO confirm
    normalize(csr[0],copy=False)
    data.load(_csr=csr)
elif dname in ('ijcnn1', 'rcv1'):
    # separate training and testing files
    data = SparseData(dname,device,trfile=trfile,tefile=tefile)
    train_csr, test_csr = data.read()
    normalize(train_csr[0],copy=False)
    normalize(test_csr[0],copy=False)
    data.load(_trainCSR=train_csr,_testCSR=test_csr)
else:
    raise ValueError('unknown dataset name: %r' % (dname,))
print(data)

In [None]:
# L2 regularization weight, set to 1/data.trSize
# (trSize is presumably the number of training samples - TODO confirm).
# Only defined in the regularized case; it is passed to ConvexModel as `lam` in the run loop.
if StrongConvex:
    lam = 1/data.trSize

# use best parameters
## please see appendix for best hyper-parameters

In [None]:
# put hyper-parameters into the list
# NOTE(review): the 0 entries look like placeholders to be replaced by the tuned values.
# With a step-size of 0 the updates `wi.sub_(alpha*vi)` in the run loop leave the weights
# unchanged, and with gamma = 0 the inner-loop test `normV<gamma*normV0` never triggers.
LR = [0] # constant step-size
GAMMA = [0] # early stopping parameter

In [None]:
# notebook cell output: display the step-size and gamma lists
LR,GAMMA

#### a. experiment setup 

In [None]:
SEED = list(range(10)) # 10 random seeds: 0, 1, ..., 9

#### b. parameters 

In [None]:
# running budget: TotalEP per (case, dataset) pair, compared against the
# effective-pass counter `ep` in the run loop
_budget_table = {
    ('reg', 'rcv1'): 30.0,
    ('reg', 'ijcnn1'): 20.0,
    ('reg', 'news20'): 40.0,
    ('reg', 'covtype'): 20.0,
    ('reg', 'real-sim'): 20.0,
    ('non_reg', 'rcv1'): 40.0,
    ('non_reg', 'ijcnn1'): 20.0,
    ('non_reg', 'news20'): 50.0,
    ('non_reg', 'covtype'): 20.0,
    ('non_reg', 'real-sim'): 30.0,
}
# TotalEP is only assigned for the combinations listed above
if (case, dname) in _budget_table:
    TotalEP = _budget_table[(case, dname)]

# number of full mini-batches in one pass over the training set
perEpoch = data.trSize//BS

In [None]:
# notebook cell output: display the folder the logs are written to
logfolder

#### c. run 

In [None]:
# Run the algorithm once for every (seed, step-size, gamma) combination and save each run.
#
# Per run:
#   * a RUN-... directory marks the configuration as in progress and a DONE-... directory
#     marks it as finished; a configuration is skipped when either marker or the result
#     file already exists;
#   * each outer iteration copies the weights of `model` into `prev_net`, evaluates
#     LossGrad / ComputeAccuracy on `prev_net` and takes one step along that gradient;
#   * the inner loop takes mini-batch steps along the recursive estimate
#     V <- g(model) - g(prev_net) + V until ||V||^2 < gamma*||V_0||^2, the budget
#     TotalEP+1 is exceeded, or a stop flag is set;
#   * HIST rows are [ep, loss, squared gradient norm, ComputeAccuracy value];
#     STAT rows are [ep, outerT, innerT, seconds since the previous record, 1=outer/0=inner].
for seed,alpha,gamma in itertools.product(SEED,LR,GAMMA):
    timer=[] # timer

    # marker directories and result file of this configuration
    run_status = logfolder+'RUN-lr-%s-gamma-%s-seed-%s/'%(alpha,gamma,seed)
    done_status = logfolder+'DONE-lr-%s-gamma-%s-seed-%s/'%(alpha,gamma,seed)
    savefile = logfolder+'lr-%s-gamma-%s-seed-%s.tar'%(alpha,gamma,seed)

    # skip configurations that are in progress, finished, or already saved
    if os.path.exists(run_status) or os.path.exists(done_status) or os.path.exists(savefile):
        print(done_status)
        continue
    else:
        os.makedirs(run_status)
    print('======\nlr - %s | gamma - %s | seed - %s\n======'%(alpha,gamma,seed))

    # results
    HIST=[]
    STAT=[]

    TIME = time.time()# total run timer

    # initialize random stream
    np.random.seed(seed)
    torch.manual_seed(seed)

    # define one layer model for linear model with logistic regression
    if StrongConvex:
        # L2 regularized case
        model = ConvexModel(data.num_feature,data.num_label,lam=lam,StrongConvex=True).to(device)
        prev_net = ConvexModel(data.num_feature,data.num_label,lam=lam,StrongConvex=True).to(device)
    else:
        # un-regularized case
        model = ConvexModel(data.num_feature,data.num_label).to(device)
        prev_net = ConvexModel(data.num_feature,data.num_label).to(device)

    # for weight wrt features in testing but not in training dataset, set them to ZERO
    if len(data.in_te_not_tr)>0:
        model.del_in_te_not_tr(data.in_te_not_tr)
        # NOTE(review): likely redundant - prev_net's parameters are replaced by copies of
        # model's at the start of every outer iteration; kept to leave behavior untouched.
        prev_net.del_in_te_not_tr(data.in_te_not_tr)

    allSamples = list(range(data.trSize))

    # intialize counter
    ep=0.0 # count effective pass
    innerT=0 # count inner iterations
    outerT=0 # count outer iterations
    st=0 # mini-batch loop counter
    # initialize stopping flag
    converge=False
    fatal=False
    # epoch time - time for one epoch
    epoch_time = time.time()

    t=0

    outer_record=True # for print&save log purpose
    while ep<=TotalEP+1:

        if converge or fatal:
            break

        # snapshot the current weights into prev_net; `wi+0.0` builds a new tensor,
        # so prev_net does not share storage with model
        for wi,pi in zip(model.parameters(),prev_net.parameters()):
            with torch.no_grad():
                pi.set_(wi+0.0)

        # compute batch loss,grad,test
        Loss, V = prev_net.LossGrad(data)
        Grad = np.sum([(gi.data**2).sum().item() for gi in V]) # squared norm of the gradient
        Test = prev_net.ComputeAccuracy(data)

        # outer step along the gradient just computed
        with torch.no_grad():
            for wi,vi in zip(model.parameters(),V):
                wi.sub_(alpha*vi)

        if outer_record:
            timeT = time.time() - epoch_time
            if ep==0 and t==0:
                # very first record of the run: report zero elapsed time
                timeT=0.0
            HIST.append([ep,Loss,Grad,Test])
            STAT.append([ep,outerT,innerT,timeT,1])
            print('outer-ep: %.2f, alpha: %.4f, loss: %.2e, Grad: %.2e, Test: %.4f, Time: %.2f, t: %d'\
                  %(ep,alpha,Loss,Grad,Test,timeT,t))
            epoch_time = time.time()

        # the outer evaluation is charged as one effective pass
        ep+=1.0
        outerT+=data.trSize//BS

        # stop flags: NaN anywhere is fatal, a vanishing gradient counts as converged
        if np.isnan(Loss) or np.isnan(Grad) or np.isnan(Test):
            fatal = True
        if Grad < 1e-15:
            converge=True

        # initialize inner loop
        normV0 = Grad
        normV = normV0
        t=0
        outer_record=True
        # Must be defined even when the inner loop body never runs (e.g. gamma > 1 or a
        # stop flag set above); otherwise the check after the loop reads an undefined
        # or stale value.
        inner_record=False
        # inner loop
        while ep<=TotalEP+1:

            if fatal or converge or normV<gamma*normV0:
                break

            # random mini-batch: reshuffle at the start of each pass; the last batch of a
            # pass takes all remaining samples
            st=st%perEpoch
            if st==0:
                np.random.shuffle(allSamples)
            if st==perEpoch-1:
                sample = allSamples[st*BS:]
            else:
                sample = allSamples[st*BS:(st+1)*BS]

            # NOTE(review): x_sample / y_sample are not used below (LossGrad receives
            # `sample` directly) - confirm data.mb has no needed side effect before removing.
            x_sample,y_sample = data.mb(sample)
            # compute sample grad: g0
            _,g0 = prev_net.LossGrad(data,sample=sample)
            # compute sample grad: g1
            _,g1 = model.LossGrad(data,sample=sample)
            # update recusrive gradient
            V = [g1i.data - g0i.data + vi.data for g1i,g0i,vi in zip(g1,g0,V)]

            normV = np.sum([(vi.data**2).sum().item() for vi in V])

            # keep the pre-step weights in prev_net, then step model along V
            with torch.no_grad():
                for wi,pi,vi in zip(model.parameters(),prev_net.parameters(),V):
                    pi.set_(wi+0.0)
                    wi.sub_(alpha*vi)

            st+=1 # sample counter
            t+=1 # count inner iteration
            innerT+=1
            ep+=BS/data.trSize # count effective pass

            # log on the first inner step and then once every perEpoch inner steps
            inner_record=False
            if (t-1)%perEpoch==0:

                timeT = time.time()-epoch_time

                inner_record=True
                Lossprint, Vprint = model.LossGrad(data)
                Gradprint = np.sum([(gi.data**2).sum().item() for gi in Vprint])
                Testprint = model.ComputeAccuracy(data)

                HIST.append([ep,Lossprint,Gradprint,Testprint])
                STAT.append([ep,outerT,innerT,timeT,0])
                print('inner-ep: %.2f, alpha: %.4f, loss: %.2e, Grad: %.2e, Test: %.4f, Time: %.2f, t: %d'\
                      %(ep,alpha,Lossprint,Gradprint,Testprint,timeT,t))

                epoch_time = time.time()

        # if the last inner step was just logged, skip the next outer record
        if inner_record:
            outer_record=False

    TIME = time.time() - TIME # total running time per run

    # 'end' holds the values from the last outer evaluation
    RESULTS = {
        'parm': [BS,seed,alpha,gamma],
        'end': [Loss,Grad,Test,TIME,converge,fatal],
        'hist': HIST,
        'stat': STAT
    }
    torch.save(RESULTS,savefile)

    # update running status
    if os.path.exists(run_status):
        os.rmdir(run_status)
    if not os.path.exists(done_status):
        os.mkdir(done_status)

In [None]:
# end of the notebook: exit with status 0
exit(0)