In [None]:
#===============================================================================
# IMPORTS
#===============================================================================
from collections import defaultdict
import glob
import numpy as np
from sklearn.datasets import load_svmlight_file, load_svmlight_files
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import pandas as pd
import dro_model as model
from joblib import Parallel, delayed
import multiprocessing
from parallel_process import parallel_classification_table2
import fio
import os

#===============================================================================
# SETTINGS
#===============================================================================
# Directory that is globbed for the '*.txt' dataset files.
DIR_DATA = './dataset/'
# Directory created (if missing) right before the results are cached via fio.
DIR_CACHE = './datacache/'
# Not referenced by any of the cells visible in this notebook.
DIR_FIGURES = './figures/'
# Directory where 'table_2.tex' is written and compiled with pdflatex.
DIR_TABLE = './tables/'
# Number of times parallel_classification_table2 is invoked per dataset.
TOTAL_RUN = 100
# Keys used to index the per-run results; the generated table labels the
# corresponding columns as the l_1, l_2 and l_infinity regularizers.
pnorms = [1, 2, float('Inf')]

In [None]:
#===============================================================================
# Load data, train & test models
#===============================================================================
def _dataset_name(path):
    """Return the base name of ``path`` without its directory or extension."""
    return os.path.splitext(os.path.basename(path))[0]


def _relabel_inplace(y, labels):
    """Overwrite entries of ``y`` equal to labels[0] / labels[1] with -1 / +1.

    Both masks are computed before anything is written, so a value produced
    by the first assignment can never be picked up by the second one (with
    sequential assignments this happens whenever ``labels[1] == -1``).
    Entries matching neither label are left untouched.
    """
    first_mask = y == labels[0]
    second_mask = y == labels[1]
    y[first_mask] = -1
    y[second_mask] = 1


# Training files only; '<name>_test.txt' companions are picked up inside the
# loop. Sorted because glob returns matches in arbitrary order, which would
# otherwise make the processing (and table row) order vary between machines.
FILE_NAMES = sorted(
    fname for fname in glob.glob(DIR_DATA + "*.txt")
    if not fname.endswith('_test.txt'))
NUM_CORES = multiprocessing.cpu_count()
DRSVM_AUC = defaultdict(list)
DRSVM_CVaR = defaultdict(list)  # declared here, not filled in this cell
RSVM_AUC = defaultdict(list)
RSVM_CVaR = defaultdict(list)   # declared here, not filled in this cell
SVM_AUC = defaultdict(list)
SVM_CVaR = defaultdict(list)    # declared here, not filled in this cell
for fname in FILE_NAMES:
    dataset = _dataset_name(fname)
    print(dataset)
    test_fname = os.path.splitext(fname)[0] + '_test.txt'
    # Only the load itself is guarded: a missing companion test file means
    # the dataset ships as a single file.
    try:
        X_train, y_train, X_test, y_test = load_svmlight_files(
            (fname, test_fname))
    except FileNotFoundError:
        is_test = False
    else:
        is_test = True

    # NOTE(review): todense() yields numpy.matrix objects; kept because the
    # expectations of parallel_classification_table2 are not visible here.
    if is_test:
        X_train = X_train.todense()
        X_test = X_test.todense()
        # The test labels are mapped with the label values found in training.
        labels = np.unique(y_train)
        _relabel_inplace(y_train, labels)
        _relabel_inplace(y_test, labels)
        run_args = (X_train, y_train, X_test, y_test)
    else:
        X_data, y_data = load_svmlight_file(fname)
        X_data = X_data.todense()
        labels = np.unique(y_data)
        _relabel_inplace(y_data, labels)
        run_args = (X_data, y_data)

    # NOTE(review): no seed is passed to the workers from this cell; whether
    # the runs are reproducible depends on parallel_classification_table2.
    results = Parallel(n_jobs=NUM_CORES)(
        delayed(parallel_classification_table2)(*run_args)
        for _ in range(TOTAL_RUN))

    # Each run returns a 3-tuple: two containers indexed by p-norm (stored as
    # the DRSVM and RSVM scores) and one value stored as the SVM score.
    drsvm_runs, rsvm_runs, svm_runs = zip(*results)
    DRSVM_AUC[dataset] = {
        pnorm: [run[pnorm] for run in drsvm_runs] for pnorm in pnorms}
    RSVM_AUC[dataset] = {
        pnorm: [run[pnorm] for run in rsvm_runs] for pnorm in pnorms}
    SVM_AUC[dataset] = list(svm_runs)

In [None]:
# reform dictionaries
def _flatten_results(nested):
    """Flatten ``{outer: {inner: values}}`` into ``{(outer, inner): values}``.

    Entries whose value is not a dict are copied through unchanged. This
    keeps the cell idempotent: it rebinds DRSVM_AUC / RSVM_AUC to the
    flattened form, so a second execution receives already-flat input and
    must not try to call ``.items()`` on the stored lists.
    """
    flat = {}
    for outer_key, inner in nested.items():
        if isinstance(inner, dict):
            for inner_key, values in inner.items():
                flat[(outer_key, inner_key)] = values
        else:
            flat[outer_key] = inner
    return flat


DRSVM_AUC = _flatten_results(DRSVM_AUC)
RSVM_AUC = _flatten_results(RSVM_AUC)

In [None]:
# Create the cache directory if needed; exist_ok avoids the check-then-create
# race of a separate os.path.exists() test.
# NOTE(review): presumably fio.cache writes under DIR_CACHE - confirm in fio.
os.makedirs(DIR_CACHE, exist_ok=True)
fio.cache(DRSVM_AUC, 'DRSVM_AUC', msg='saving DRSVM AUC results')
fio.cache(RSVM_AUC, 'RSVM_AUC', msg='saving RSVM AUC results')
fio.cache(SVM_AUC, 'SVM_AUC', msg='saving SVM AUC results')

In [None]:
# Wrap each result dictionary in a pandas DataFrame:
#   df1 <- DRSVM_AUC, df2 <- RSVM_AUC, df3 <- SVM_AUC
df1, df2, df3 = [
    pd.DataFrame(auc_scores)
    for auc_scores in (DRSVM_AUC, RSVM_AUC, SVM_AUC)
]

In [None]:
# Build the LaTeX source of the AUC table (mean +/- std, scaled by 100).
def _format_cell(mean, std):
    """Render one '& $ mean \\pm std $ ' table cell, values scaled by 100."""
    return ('& $ ' + str(round(100 * mean, 2)) +
            ' \\pm ' + str(round(100 * std, 2)) + ' $ ')


# Raw strings keep the LaTeX backslashes literal; the previous non-raw literal
# relied on the invalid escape sequences '\d' and '\h'. Trailing blanks inside
# the quotes are intentional so the emitted text stays unchanged.
header_lines = [
    r"\documentclass{article}",
    r"\usepackage{multirow}",
    r"\usepackage[margin=0.5in]{geometry}",
    r"\begin{document}",
    r"\begin{table} [h] ",
    r"\centering",
    r"\caption{The average Area Under the Curve (AUC) scores",
    # The trial count follows TOTAL_RUN so the caption cannot go stale.
    "evaluated on the test dataset over " + str(TOTAL_RUN) + " trials.}",
    r"\bgroup",
    r"\def\arraystretch{1.1}",
    r"\begin{tabular}{|l|c|cc|cc|cc|}",
    r"\cline{3-8}",
    r"\multicolumn{2}{c|}{} & \multicolumn{2}{c|}{$\ell_1$-regularizer} & ",
    r"\multicolumn{2}{c|}{$\ell_2$-regularizer} & \multicolumn{2}{c|}{$\ell_\infty$-regularizer} \\ ",
    r"\cline{2-8}",
    r"\multicolumn{1}{c|}{} & SVM & RSVM & DRSVM & RSVM & DRSVM & RSVM & DRSVM \\ \hline ",
]
out_text = "\n".join(header_lines) + "\n"

# Column statistics are computed once instead of once per table cell.
svm_mean, svm_std = df3.mean(), df3.std()
rsvm_mean, rsvm_std = df2.mean(), df2.std()
drsvm_mean, drsvm_std = df1.mean(), df1.std()

table_rows = []
# The columns of df3 are the dataset names, so no file-path slicing is needed.
for dataset in df3.columns:
    cells = [dataset.replace("_", " "),
             _format_cell(svm_mean[dataset], svm_std[dataset])]
    for pnorm in pnorms:
        # Per p-norm: RSVM first, then DRSVM, matching the header row.
        cells.append(_format_cell(rsvm_mean[dataset][pnorm],
                                  rsvm_std[dataset][pnorm]))
        cells.append(_format_cell(drsvm_mean[dataset][pnorm],
                                  drsvm_std[dataset][pnorm]))
    table_rows.append(''.join(cells) + '\\\\ \\hline \n')

out_text = (out_text + ''.join(table_rows) +
            '\\end{tabular}\n\\egroup\n\\end{table}\\end{document}')

In [None]:
# Output latex table
os.makedirs(DIR_TABLE, exist_ok=True)
# The context manager closes the file even if the write fails.
with open(os.path.join(DIR_TABLE, 'table_2.tex'), 'w') as tex_file:
    tex_file.write(out_text)

# pdflatex is run from inside DIR_TABLE so its output files land next to the
# .tex source. The previous working directory is always restored, so a failed
# compilation does not leave the kernel in a different directory.
previous_cwd = os.getcwd()
os.chdir(DIR_TABLE)
try:
    exit_status = os.system('pdflatex table_2.tex')
finally:
    os.chdir(previous_cwd)
# An explicit check instead of `assert os.system(...)`: assert statements are
# removed under `python -O`, which would skip the compilation altogether.
if exit_status != 0:
    raise RuntimeError(
        'pdflatex table_2.tex failed with exit status ' + str(exit_status))