# ADAM with fine-tuned hyper-parameters

In [None]:
import os
import sys

# Append the directory one level above the current working directory to the
# module search path (presumably so the local `Sparse_Init` package imported
# further down can be resolved when running from a sub-folder).
path = os.getcwd()
_one_level_up = os.path.join(path, os.pardir)
parent_path = os.path.abspath(_one_level_up)
sys.path.append(parent_path)

import random

import pprint as pp
import numpy as np
import time
import os
import shutil
from numpy import genfromtxt
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable

# Global PyTorch settings for the whole notebook:
# newly created floating-point tensors default to double precision ...
torch.set_default_dtype(torch.float64)
# ... and CPU ops are restricted to a single thread.
torch.set_num_threads(1) #cpu num

import itertools
import numpy.linalg  as lin

import cProfile, pstats

from collections import OrderedDict

from Sparse_Init.sparseinit import *    
from Sparse_Init.sparsedata import *
from Sparse_Init.sparsemodule import * 
from sklearn.preprocessing import normalize

# Run on the first CUDA device when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

# Configuration

In [None]:
# Experiment-wide settings; `algo`, `dname` and `case` are also used to build
# the data / log directory names.
algo = 'Adam'  # algorithm
dname = 'real-sim'  # dataset name
BS = 64  # mini-batch size
StrongConvex = True  # L2 regularization

# 'reg' when L2 regularization is on, 'non_reg' otherwise
case = 'reg' if StrongConvex else 'non_reg'

# Load data - users need to download the datasets from LIBSVM first

In [None]:
# Data directory: the LIBSVM archive(s) for `dname` must be downloaded into
# this folder before executing this code.
datafolder = '../Data/' + dname + '/'

# Directory for the per-run log files (optional), laid out as
# ../Logs/<dataset>/<case>/<algorithm>/
logfolder = '../Logs/' + dname + '/' + case + '/' + algo + '/'
# to run all hyper-parameters, please use the following log folders
# logfolder = '../AllLogs/'+dname+'/'+case+'/'+algo+'/'

# exist_ok avoids the check-then-create race when several runs that share the
# same log folder start at the same time.
os.makedirs(logfolder, exist_ok=True)
    

# LIBSVM archive names for every supported dataset.
# Datasets shipped as one file (no separate test archive is listed for them):
SINGLE_FILE_DATASETS = {
    'covtype': 'covtype.libsvm.binary.scale.bz2',
    'news20': 'news20.binary.bz2',
    'real-sim': 'real-sim.bz2',
}
# Datasets shipped as separate (train, test) archives:
SPLIT_FILE_DATASETS = {
    'ijcnn1': ('ijcnn1.bz2', 'ijcnn1.t.bz2'),
    'rcv1': ('rcv1_train.binary.bz2', 'rcv1_test.binary.bz2'),
}

# Dispatch explicitly on the dataset name instead of relying on a bare
# `except:` to detect that `file` is undefined: this way a genuine failure
# (missing archive, parse error, ...) is reported as-is rather than being
# masked by a NameError raised from the fallback branch.
if dname in SINGLE_FILE_DATASETS:
    file = datafolder + SINGLE_FILE_DATASETS[dname]
    data = SparseData(dname, device, file=file)
    csr = data.read()
    # In-place normalization with sklearn's defaults (each row scaled to unit
    # L2 norm); csr[0] is presumably the feature matrix - confirm in SparseData.
    normalize(csr[0], copy=False)
    data.load(_csr=csr)
elif dname in SPLIT_FILE_DATASETS:
    train_name, test_name = SPLIT_FILE_DATASETS[dname]
    trfile = datafolder + train_name
    tefile = datafolder + test_name
    data = SparseData(dname, device, trfile=trfile, tefile=tefile)
    train_csr, test_csr = data.read()
    # Same in-place normalization, applied to both splits.
    normalize(train_csr[0], copy=False)
    normalize(test_csr[0], copy=False)
    data.load(_trainCSR=train_csr, _testCSR=test_csr)
else:
    raise ValueError(
        'unknown dataset %r; expected one of %s'
        % (dname, sorted(list(SINGLE_FILE_DATASETS) + list(SPLIT_FILE_DATASETS)))
    )
print(data)

In [None]:
# L2 penalty weight, only defined in the regularized case:
# lam = 1/n with n = data.trSize
if StrongConvex:
    lam = 1.0 / data.trSize

# Use the best parameters
## Please see the appendix for the fine-tuned parameters for each dataset and case

In [None]:
# put hyper-parameters into the list
# Every (seed, LR, SCHEDULE) combination is run by the sweep below.
# NOTE: the zeros are placeholders - replace them with the fine-tuned values
# for the chosen dataset/case before running. LR is passed to optim.Adam as
# `lr`, and the current step-size is multiplied by SCHEDULE after each pass.
LR = [0] # initial step-size
SCHEDULE = [0] # schedule to reduce step-size per effective pass

In [None]:
# Echo the hyper-parameter lists that will be swept
print(LR,SCHEDULE)

#### b. parameters 

In [None]:
SEED = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]  # 10 random seeds

# Running budget (number of passes, TotalEP) per case and dataset.
EPOCH_BUDGET = {
    'reg': {
        'rcv1': 30,
        'ijcnn1': 20,
        'news20': 40,
        'covtype': 20,
        'real-sim': 20,
    },
    'non_reg': {
        'rcv1': 40,
        'ijcnn1': 20,
        'news20': 50,
        'covtype': 20,
        'real-sim': 30,
    },
}

# Fail fast on an unsupported case/dataset instead of leaving TotalEP
# undefined (or silently reusing a stale value from an earlier cell run).
try:
    TotalEP = EPOCH_BUDGET[case][dname]
except KeyError:
    raise ValueError(
        'no running budget defined for case=%r, dname=%r' % (case, dname)
    ) from None

# Number of full mini-batches per pass; the trailing partial batch (if any)
# is not counted.
perEpoch = data.trSize // BS

#### c. run 

In [None]:
# Sweep every (seed, initial step-size, schedule) combination. Each combination
# is one independent Adam run whose results are written to its own .tar file.
# Marker directories RUN-<tag>/ and DONE-<tag>/ in `logfolder` record which
# combinations are in progress / finished, so they are skipped on later sweeps.
for seed, lr, schedule in itertools.product(SEED, LR, SCHEDULE):
    initial_lr = lr  # `lr` is overwritten below with the scheduled step-size

    tag = 'lr-%s-schedule-%s-seed-%s' % (initial_lr, schedule, seed)
    run_status = logfolder + 'RUN-' + tag + '/'
    done_status = logfolder + 'DONE-' + tag + '/'
    savefile = logfolder + tag + '.tar'

    # Skip combinations that are already finished or saved.
    if os.path.exists(done_status) or os.path.exists(savefile):
        print(done_status)
        continue
    # Claim the combination by creating its RUN marker. Creating the directory
    # and catching FileExistsError (rather than testing existence first) means
    # two concurrent sweeps cannot both claim the same combination.
    try:
        os.makedirs(run_status)
    except FileExistsError:
        print(done_status)
        continue
    print('======\nlr - %s | schedule - %s | seed - %s\n======'%(initial_lr,schedule,seed))

    # results
    HIST = []  # [ep, Loss, Grad, Test] per evaluation
    STAT = []  # [ep, wall-clock seconds since the previous evaluation]

    TIME = time.time()  # total run time

    np.random.seed(seed)
    torch.manual_seed(seed)

    allSamples = list(range(data.trSize))

    # define one sparse layer model for linear model with logistic regression
    if StrongConvex:
        # L2 regularized case
        model = ConvexModel(data.num_feature, data.num_label, lam=lam, StrongConvex=True).to(device)
    else:
        # non-regularized case
        model = ConvexModel(data.num_feature, data.num_label).to(device)

    # for weight wrt features in testing but not in training dataset, set them to ZERO
    if len(data.in_te_not_tr) > 0:
        model.del_in_te_not_tr(data.in_te_not_tr)

    optimizer = optim.Adam(model.parameters(), lr=lr)

    np.random.shuffle(allSamples)

    # initialize counter
    ep = 0.0  # count effective pass
    # initialize stopping flags
    converge = False
    fatal = False
    loop_time = time.time()

    # TotalEP+1 iterations so that the state after the last budgeted pass is
    # evaluated too. NOTE: a training pass still follows the final evaluation,
    # so TIME includes one pass whose result is never evaluated.
    for EP in range(TotalEP + 1):
        lr = optimizer.param_groups[0]['lr']

        # wall-clock time of the previous iteration (evaluation + training)
        timeT = time.time() - loop_time
        loop_time = time.time()

        if converge or fatal:
            break

        np.random.shuffle(allSamples)

        # evaluate batch loss,grad,test
        Loss, V = model.LossGrad(data)
        # Grad is the sum of squared entries over all tensors in V
        Grad = np.sum([(gi.data**2).sum().item() for gi in V])
        Test = model.ComputeAccuracy(data)

        HIST.append([ep, Loss, Grad, Test])
        STAT.append([ep, timeT])

        print('ep: %.2f, lr: %.4f, loss: %.2e, Grad: %.2e, Test: %.4f, Time: %.2f'\
              %(ep,lr,Loss,Grad,Test,timeT))

        if np.isnan(Loss) or np.isnan(Grad) or np.isnan(Test):
            fatal = True
        if Grad < 1e-15:
            converge = True

        # One pass over the shuffled data in perEpoch full mini-batches
        # (skipped entirely once a stopping flag is set).
        if not (fatal or converge):
            for t in range(perEpoch):
                sample = allSamples[t*BS:(t+1)*BS]
                x_sample, y_sample = data.mb(sample)

                model.zero_grad()
                optimizer.zero_grad()
                pred_sample = model(x_sample)
                loss_sample = model.logloss(pred_sample, y_sample)

                # abort the run as soon as the mini-batch loss is NaN/inf
                loss_value = loss_sample.item()
                if np.isnan(loss_value) or np.isinf(loss_value):
                    fatal = True
                    break

                loss_sample.backward()
                optimizer.step()

                ep += BS/data.trSize  # count epoch

        # update learning rate with schedule
        optimizer.param_groups[0]['lr'] *= schedule

    TIME = time.time() - TIME  # total running time per run

    RESULTS = {
        'parm': [BS, seed, initial_lr, schedule],
        'end': [Loss, Grad, Test, TIME, converge, fatal],
        'hist': HIST,
        'stat': STAT
    }
    torch.save(RESULTS, savefile)

    # update running status: RUN marker -> DONE marker
    if os.path.exists(run_status):
        os.rmdir(run_status)
    if not os.path.exists(done_status):
        os.mkdir(done_status)

In [None]:
# All runs finished: stop here with a success status
exit(0)