# evaluate urls

Prerequisite: the `urls.ipynb` notebook must already have been run.

This notebook does lightweight preprocessing and then evaluates XGBoost on the urls dataset.

#### combine sparse and continuous data for regular runs

In [1]:
# awk program that prints the largest feature index found in its input.
# Each input field is expected to look like "index:value"; the label column
# must already have been removed by the caller.
# The running maximum starts at 0 so that input without any feature tokens
# still prints a number instead of an empty line.
# The text is handed to bash inside single quotes, so it must not contain an
# apostrophe.
awk_max_feature = """
BEGIN { m = 0 }
{
  for (i = 1; i <= NF; ++i) {
    idx = substr($i, 1, index($i, ":") - 1) + 0;
    if (idx > m) m = idx;
  }
}
END { print m }
"""

In [2]:
%%script time bash -s '{awk_max_feature}'

# Scan the sparse files in parallel for their largest feature index; the
# continuous features are shifted past that index in a later cell.
# $1 is the awk program supplied on the magic line above.
set -eu

# Left unquoted below on purpose so the list splits into separate file names.
files="urls-data/sps00.svm urls-data/sps01.svm"
nlines=$(cat $files | wc -l)
echo nlines $nlines

splits=16

# Private scratch directory: chunks left behind by an earlier run can never be
# picked up by the glob below, and everything is removed when the script exits.
chunks=$(mktemp -d /tmp/urls_sps.XXXXXX)
trap 'rm -rf "$chunks"' EXIT

# Round up so that there are at most $splits chunks and the size is never 0.
per_chunk=$(( (nlines + splits - 1) / splits ))
[ "$per_chunk" -gt 0 ] || { echo "no input lines in $files" >&2; exit 1; }

# Drop the label column, then cut the stream into chunks.
cat $files | cut -d" " -f2- | split -l "$per_chunk" - "$chunks/part"

# One awk process per chunk prints that chunk's maximum; the final awk reduces
# the per-chunk maxima to their smallest and largest value.
ls "$chunks"/part* | xargs -P "$splits" -L1 awk "$1" | \
  awk 'NR == 1 { lo = hi = $1 + 0 }
       { v = $1 + 0; if (v < lo) lo = v; if (v > hi) hi = v }
       END { print "min " lo " max " hi }'

nlines 2396130
min 306110 max 3381344


2740.96user 9.81system 6:25.49elapsed 713%CPU (0avgtext+0avgdata 3304maxresident)k
1184inputs+3572008outputs (8major+3471minor)pagefaults 0swaps


In [3]:
# Largest sparse feature index, i.e. the "max" value printed by the feature
# scan cell. Update it here if the sparse files change.
max_sparse_index = 3381344

# awk program that blanks the first field and rewrites every following
# "index:value" field so that its index becomes index + max_sparse_index + 1.
# Only the part before the first ":" is changed; the rest of each field is
# copied through as is.
# The text is handed to bash inside single quotes, so it must not contain an
# apostrophe.
awk_shift = """
{
  $1="";
  for(i = 2; i <= NF; ++i) {
    s = index($i, ":");
    j = substr($i, 1, s - 1) + %d + 1;
    $i = j substr($i, s)
  }
  print
}
""" % max_sparse_index

In [4]:
%%script time bash -s '{awk_shift}'

# For each split, run the continuous file through the awk program in $1 and
# paste the result, line by line, after the matching sparse file.
# The wc calls print sizes before and after as a quick sanity check.
for part in 00 01; do
  cont="urls-data/cont${part}.svm"
  sparse="urls-data/sps${part}.svm"
  merged="urls-data/all${part}.svm"

  wc -l "$cont"
  wc "$sparse"
  awk "$1" < "$cont" | paste -d" " "$sparse" - > "$merged"
  wc "$merged"
done

1677291 urls-data/cont00.svm
   1677291  145867839 1286989217 urls-data/sps00.svm
   1677291  196072305 1862973889 urls-data/all00.svm
718839 urls-data/cont01.svm
   718839  62013131 546637929 urls-data/sps01.svm
   718839  83382469 791571250 urls-data/all01.svm


107.22user 7.15system 1:39.93elapsed 114%CPU (0avgtext+0avgdata 3336maxresident)k
80inputs+5184664outputs (1major+1654minor)pagefaults 0swaps


#### Let's see what xgb does

Maybe try this out on a 16-CPU machine.

In [1]:
%%bash

# Fetch (only if not already present), build and install svmlight-loader.
# Stop at the first failing step rather than carrying on in the wrong
# directory or with a partially built package.
set -euo pipefail

cd /tmp
test -e svmlight-loader || git clone https://github.com/mblondel/svmlight-loader.git
cd svmlight-loader
make
python setup.py build
python setup.py install

python setup.py build_ext --inplace
running build_ext
running build
running build_py
running build_ext
running install
running build
running build_py
running build_ext
running install_lib
running install_egg_info
Removing /home/ubuntu/dev/anaconda3/envs/env2020/lib/python3.7/site-packages/svmlight_loader-0.1-py3.7.egg-info
Writing /home/ubuntu/dev/anaconda3/envs/env2020/lib/python3.7/site-packages/svmlight_loader-0.1-py3.7.egg-info


In [2]:
import sys

# Put the svmlight-loader checkout and its build directory on the import path.
# The membership check keeps re-runs of this cell from appending them again.
if '/tmp/svmlight-loader' not in sys.path:
    sys.path.extend(('/tmp/svmlight-loader', '/tmp/svmlight-loader/build'))

import svmlight_loader

In [8]:
# Settings interpolated into the xgb.py command line below. nthread is also
# used by the csl commands and num_round by eval_model in later cells.
nthread = 64
max_depth = 0
num_round = 100
# Path handed to xgb.py for the baseline model; eval_model later reads this
# file and the JSON file next to it.
baseline = "urls-data/baseline.model"
# Source fragment to be pasted into the body of a generated load() function.
# Its continuation lines carry their own 4-space indent, because the f-string
# templates only indent the line that holds the placeholder.
import_svmlight = """import sys
    if '/tmp/svmlight-loader' not in sys.path:
        sys.path.append('/tmp/svmlight-loader')
        sys.path.append('/tmp/svmlight-loader/build')
    from svmlight_loader import load_svmlight_file"""
# Source of a load() function that returns load_svmlight_file's result for the
# combined training file. It is kept as text so it can be passed to xgb.py on
# the command line and exec'd in this notebook with the file name swapped.
load_sps = f"""
def load():
    {import_svmlight}
    return load_svmlight_file('urls-data/all00.svm')
"""
# NOTE(review): xgb.py is not shown in this notebook. The argument order
# (threads, depth, rounds, model path, loader source) is inferred from the
# variable names -- confirm against the script.
! /usr/bin/time python xgb.py {nthread} {max_depth} {num_round} {baseline} "{load_sps}"

load 120.01790261268616
train X (1677291, 3381408) y (1677291,)
train 267.0287036895752
3092.79user 57.11system 6:28.02elapsed 811%CPU (0avgtext+0avgdata 11653680maxresident)k
0inputs+464outputs (0major+12144782minor)pagefaults 0swaps


In [9]:
from sklearn.metrics import accuracy_score, roc_auc_score
import xgboost as xgb
import json

def eval_model(name, load, rounds=None):
    """Print timings, accuracy and AUC for a saved booster on a test set.

    Reads ``name + '.json'``, prints its ``load`` and ``train`` entries as
    seconds, and passes the remaining entries to ``xgb.Booster`` as parameters
    together with the model file ``name``.

    Args:
        name: path of the saved model; the JSON file is expected next to it.
        load: zero-argument callable returning ``(X, y)`` for the test set.
        rounds: number of boosting rounds used for prediction. Defaults to the
            notebook-level ``num_round``.

    Raises:
        KeyError: if the JSON file has no ``load`` or ``train`` entry.
    """
    if rounds is None:
        rounds = num_round
    print(name)
    with open(name + '.json', 'r') as f:
        param = json.load(f)
    print('load   {: 10.0f} sec'.format(param.pop('load')))
    print('train  {: 10.0f} sec'.format(param.pop('train')))
    gb = xgb.Booster(param, model_file=name)
    X, y = load()
    print('test X {} y {}'.format(X.shape, y.shape))
    dtest = xgb.DMatrix(X, label=y)
    try:
        preds = gb.predict(dtest, ntree_limit=rounds)
    except TypeError:
        # Newer XGBoost releases no longer accept ntree_limit; the same limit
        # is expressed there as a range of boosting iterations.
        preds = gb.predict(dtest, iteration_range=(0, rounds))
    labels = dtest.get_label()
    # Threshold the predictions at 0.5 and compare with the stored labels.
    print('acc', (labels == (preds >= 0.5)).mean())
    print('auc', roc_auc_score(labels, preds))
    
# Take the training loader source, point it at the all01 file, define load()
# from that text, then evaluate the baseline model with it.
test_loader_src = load_sps.replace("all00.svm", "all01.svm")
exec(test_loader_src)
eval_model(baseline, load)

urls-data/baseline.model
load          120 sec
train         267 sec
test X (718839, 3381408) y (718839,)
acc 0.988820862529718
auc 0.9990112327853139


In [14]:
cmd = f"""
RAYON_NUM_THREADS={nthread} /usr/bin/time ./csl/target/release/csl \
  --budget 500 --compress TargetEncode \
  --train ./urls-data/sps00.svm \
  --valid ./urls-data/sps01.svm
"""
! {cmd}

num threads 64
intitialize training file scanner 1ms
first cardinality scan 531ms
num lines 1677291
num words total 144190548
approx unique words 2707692
nfeatures 2707692
edge collection 27s
num unique edges 259136336
avg degree 191
avg nnz per row 85
max nnz per row 381
avg edges per row 3757
adjacency degree 2s
adjacency offsets 2ms
adjacency assign 6s
adjacency list construction 8s
graph coloring 3s
num colors 465
stats for counts: 0%: 1 25%: 1 50%: 2 75%: 2 90%: 4 95%: 8 99%: 79 100%: 1677291
budget 500 >= ncolors 465, will have excess
categorical encoding 4s
out file "./urls-data/sps00.te500.svm" exists, will overwrite
convert training 15s
e2e pipeline time 58s
out file "./urls-data/sps01.te500.svm" exists, will overwrite
convert valid 6s
1640.41user 204.19system 1:04.09elapsed 2877%CPU (0avgtext+0avgdata 16081512maxresident)k
704inputs+17383608outputs (0major+14411075minor)pagefaults 0swaps


In [15]:
te_model = "urls-data/te.model"
load_te = f"""
def load():
    import numpy as np
    {import_svmlight}
    X0, y = load_svmlight_file('urls-data/cont00.svm')
    X1, y1 = load_svmlight_file('urls-data/sps00.te500.svm')
    X0 = X0.todense()
    X1 = X1.todense()
    assert np.all(y == y1)
    X = np.concatenate((X0, X1), axis=1)
    return X, y
"""
! /usr/bin/time python xgb.py {nthread} {max_depth} {num_round} {te_model} "{load_te}"

load 137.73586535453796
train X (1677291, 529) y (1677291,)
train 90.85661792755127
4975.93user 74.15system 3:49.64elapsed 2199%CPU (0avgtext+0avgdata 27917092maxresident)k
0inputs+384outputs (0major+16062789minor)pagefaults 0swaps


In [16]:
# Point the loader source at the 01 files, define load() from that text, then
# evaluate the target-encoded model with it.
te_test_loader_src = load_te.replace("cont00.svm", "cont01.svm").replace("sps00", "sps01")
exec(te_test_loader_src)
eval_model(te_model, load)

urls-data/te.model
load          138 sec
train          91 sec
test X (718839, 529) y (718839,)
acc 0.9129248691292486
auc 0.9938938059551535


In [17]:
dense_model = "urls-data/dense.model"
load_dense = f"""
def load():
    import numpy as np
    {import_svmlight}
    return load_svmlight_file('urls-data/cont00.svm')
"""
! /usr/bin/time python xgb.py {nthread} {max_depth} {num_round} {dense_model} "{load_dense}"
exec(load_dense.replace("cont00.svm", "cont01.svm"))
eval_model(dense_model, load)

load 30.084582090377808
train X (1677291, 64) y (1677291,)
train 33.29405879974365
1197.79user 16.21system 1:04.08elapsed 1894%CPU (0avgtext+0avgdata 3133860maxresident)k
0inputs+800outputs (0major+2212160minor)pagefaults 0swaps
urls-data/dense.model
load           30 sec
train          33 sec
test X (718839, 64) y (718839,)
acc 0.9696274130925006
auc 0.9923397746323106


In [18]:
cmd = f"""
RAYON_NUM_THREADS={nthread} /usr/bin/time ./csl/target/release/csl \
  --budget 16384 --compress SubmodularExpansion \
  --train ./urls-data/sps00.svm \
  --valid ./urls-data/sps01.svm
"""
! {cmd}

num threads 64
intitialize training file scanner 1ms
first cardinality scan 538ms
num lines 1677291
num words total 144190548
approx unique words 2707692
nfeatures 2707692
edge collection 27s
num unique edges 259136336
avg degree 191
avg nnz per row 85
max nnz per row 381
avg edges per row 3757
adjacency degree 2s
adjacency offsets 3ms
adjacency assign 7s
adjacency list construction 10s
graph coloring 3s
num colors 465
sketch collection 3s
counts for feature x: 0%: 1 25%: 1 50%: 2 75%: 2 90%: 4 95%: 8 99%: 79 100%: 1677291
P(y>0|x), in %: 0%: 0 25%: 0 50%: 0 75%: 1 90%: 1 95%: 1 99%: 1 100%: 1
collect input feature stats 54ms
sort features 51ms
accumulate 48ms
heap init 1s
lazy greedy 16ms
extract dictionary 30ms
submodular quantization 1s
categorical encoding 5s
out file "./urls-data/sps00.sm16384.svm" exists, will overwrite
convert training 3s
e2e pipeline time 49s
out file "./urls-data/sps01.sm16384.svm" exists, will overwrite
convert valid 1s
1618.35user 166.76system 0:50.28elapsed

In [20]:
# Path handed to xgb.py for the model trained on the sm16384 features.
sm_model = "urls-data/sm.model"
# Source of a load() function for the sm16384 run. It reads the continuous
# file and the sm16384 file, checks that both carry the same labels, and
# stacks the two feature blocks side by side with scipy.sparse.hstack.
# The file-name fragments "cont00.svm" and "sps00" are rewritten below to
# build the test loader, so they must stay as they are.
load_sm = f"""
def load():
    import numpy as np, scipy.sparse as sps
    {import_svmlight}
    X0, y = load_svmlight_file('urls-data/cont00.svm')
    X1, y1 = load_svmlight_file('urls-data/sps00.sm16384.svm')
    assert np.all(y == y1)
    X = sps.hstack((X0, X1))
    return X, y
"""
# NOTE(review): xgb.py is not shown in this notebook. The argument order
# (threads, depth, rounds, model path, loader source) is inferred from the
# variable names -- confirm against the script.
! /usr/bin/time python xgb.py {nthread} {max_depth} {num_round} {sm_model} "{load_sm}"
# Define load() for the 01 files, then evaluate the model just written.
exec(load_sm.replace("cont00.svm", "cont01.svm").replace("sps00", "sps01"))
eval_model(sm_model, load)

load 121.07721495628357
train X (1677291, 886) y (1677291,)
train 37.399537324905396
1809.10user 58.03system 2:39.22elapsed 1172%CPU (0avgtext+0avgdata 14692196maxresident)k
416inputs+560outputs (2major+8412782minor)pagefaults 0swaps
urls-data/sm.model
load          121 sec
train          37 sec
test X (718839, 886) y (718839,)
acc 0.9857534162726285
auc 0.9989700485474203
