In [2]:
REPO_ROOT = "/usr/src/app"

import json
import math
import pickle
import time

import numpy as np
import pandas as pd
import scipy as sc

from sklearn.svm import *
from sklearn.naive_bayes import BernoulliNB
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import normalize

import theano
import theano.tensor as T
import lasagne

import util

In [3]:
def load_model(model_type, train_size):
    with open("%s/model-data/dataset_%s_%d.pickle" % (REPO_ROOT, model_type, train_size), "r") as f:
        return pickle.load(f)

def concat_models(model_names, train_size):
    datasets = [load_model(name, train_size) for name in model_names]
    
    X_trains = [
        normalize(sc.sparse.csr.csr_matrix(dataset["X_train"]), norm='l2', axis=1)
        for dataset in datasets
    ]
    X_tests = [
        normalize(sc.sparse.csr.csr_matrix(dataset["X_test"]), norm='l2', axis=1)
        for dataset in datasets
    ]
       
    concat_dataset = {
        "X_train": sc.sparse.hstack(X_trains),
        "Y_train": datasets[0]["Y_train"],
        "X_test": sc.sparse.hstack(X_tests),
        "Y_test": datasets[0]["Y_test"],
        "shas_test": datasets[0]["shas_test"],
    }
    
    print "Datasets %s: %s = %s" % (
        model_names,
        " + ".join([str(np.shape(dataset["X_train"])[1]) for dataset in datasets]),
        np.shape(concat_dataset["X_train"])[1])

    print "Labels equal: %s %s" % (
        [np.array_equal(datasets[0]["Y_train"], dataset["Y_train"]) for dataset in datasets[1:]],
        [np.array_equal(datasets[0]["Y_test"], dataset["Y_test"]) for dataset in datasets[1:]])
    
    return concat_dataset

def test_model(dataset, model_type, train_size, model, model_name, output_errors):
    model.fit(dataset["X_train"], dataset["Y_train"])
    test_pred = model.predict(dataset["X_test"])
    test_y = dataset["Y_test"]
    shas = dataset["shas_test"]
        
    accuracy = (float(sum(test_y == test_pred))) / len(test_pred)
    precision = (float(sum((test_y == test_pred) & (test_pred == 1)))) / float(max(1, sum(test_pred == 1)))
    recall = (float(sum((test_y == test_pred) & (test_pred == 1)))) / float(sum(test_y == 1))
    f1 = 2 * (precision * recall) / max(1, precision + recall)

    print "%10s %15s. Train set size %5d. %0.1f%% / %0.1f%% / %0.1f%% (%0.3f)" % (
        model_type,
        model_name,
        train_size,
        accuracy * 100,
        precision * 100,
        recall * 100,
        f1)
        
    output_table.append([
        model_type,
        model_name,
        train_size, 
        accuracy,
        precision,
        recall,
        f1,
    ])
    
    if output_errors:
        # Save 10 errors
        error_shas = np.array(shas)[test_y != test_pred][0:50]
        error_correct = np.array(test_y)[test_y != test_pred][0:50]

        with open("%s/results/model_errors_%s_%s_%d.txt" % (REPO_ROOT, model_type, model_name, train_size), "w") as fout:
            for sha, correct in zip(error_shas, error_correct):
                fout.write("#### %s FLAG: %s ####\n\n" % (sha, "Yes" if correct > 0 else "No"))
                with open("%s/scripts/%s.js" % (REPO_ROOT, sha), "r") as fin:
                    fout.write(fin.read())
                fout.write("\n\n")


In [4]:
def build_mlp(input_var, input_size):
    l_in = lasagne.layers.InputLayer(shape=(None, input_size),
                                     input_var=input_var)
    l_in_drop = lasagne.layers.DropoutLayer(l_in, p=0.2)
    
    l_hid1 = lasagne.layers.DenseLayer(
        l_in_drop, num_units=20,
        nonlinearity=lasagne.nonlinearities.tanh,
        W=lasagne.init.GlorotUniform())
    
    l_hid1_drop = lasagne.layers.DropoutLayer(l_hid1, p=0.5)

    l_hid2 = lasagne.layers.DenseLayer(
        l_hid1_drop, num_units=10,
        nonlinearity=lasagne.nonlinearities.tanh,
        W=lasagne.init.GlorotUniform())
    
    l_hid2_drop = lasagne.layers.DropoutLayer(l_hid2, p=0.5)
    
    l_out = lasagne.layers.DenseLayer(
        l_hid2_drop, num_units=2,
        nonlinearity=lasagne.nonlinearities.softmax)
    
    return l_out

def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    assert np.shape(inputs)[0] == len(targets)
    indices = np.arange(np.shape(inputs)[0])
    if shuffle:
        np.random.shuffle(indices)
    for start_idx in range(0, np.shape(inputs)[0] - batchsize + 1, batchsize):
        excerpt = indices[start_idx:start_idx + batchsize]
        if isinstance(inputs[excerpt], np.ndarray):
            i = inputs[excerpt]
        else:
            i = inputs[excerpt].toarray()
        yield i, targets[excerpt]
    
def test_mlp(dataset, model_type, train_size):
    input_var = T.matrix('inputs')
    target_var = T.lvector('targets')
    # Create neural network model
    network = build_mlp(input_var, np.shape(dataset["X_train"])[1])
    
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.adam(loss, params)
    
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var)
    test_loss = test_loss.mean()
    
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                  dtype=theano.config.floatX)
    
    train_fn = theano.function([input_var, target_var], loss, updates=updates, allow_input_downcast=True)
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc], allow_input_downcast=True)
    
    if isinstance(dataset["X_train"], np.ndarray):
        X_train_flat = dataset["X_train"]
    else:
        X_train_flat = dataset["X_train"].tocsc()

    if isinstance(dataset["X_test"], np.ndarray):
        X_test_flat = dataset["X_test"]
    else:
        X_test_flat = dataset["X_test"].tocsc()

    best_accuracy = 0
    bad_count = 0
    batch_size = min(200, train_size/10)
    for epoch in xrange(999):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train_flat, dataset["Y_train"], batch_size, shuffle=True):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_test_flat, dataset["Y_test"], batch_size, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1
        
        current_accuracy = val_acc / val_batches

        # Then we print the results for this epoch:
        print("Epoch {} took {:.3f}s - accuracy {:.2f} %".format(
            epoch + 1, time.time() - start_time, current_accuracy * 100))
        
        if current_accuracy > best_accuracy:
            best_accuracy = current_accuracy
            bad_count = 0
        else:
            bad_count += 1
            if bad_count > 4:
                break
        
    print "%10s %15s. Train set size %5d. %0.1f%% / %0.1f%% / %0.1f%% (%0.3f)" % (
            model_type,
            "MLP",
            train_size,
            current_accuracy * 100,
            0,
            0,
            0)
    output_table.append([
            model_type,
            "MLP",
            train_size, 
            current_accuracy,
            0,
            0,
            0,
        ])

In [5]:
with open("%s/model-data/metadata.pickle" % (REPO_ROOT,), "r") as f:
    size_data = pickle.load(f)
    
TRAIN_SIZES = size_data["train_sizes"]
TEST_SIZE = size_data["test_size"]

print "Training sizes: %s" % TRAIN_SIZES
print "Test size: %d" % TEST_SIZE

Training sizes: [300, 600, 1200, 2400, 4800, 9600, 19200]
Test size: 3588


In [17]:
output_table = []

for train_size in TRAIN_SIZES:
    for model_type in ["RegEx", "BiRegEx", "TriRegEx", "AST", "BiAST", "TriAST", "Random2Vec", "Word2Vec", "AST2Vec"]:
        dataset = load_model(model_type, train_size)

        test_model(dataset, model_type, train_size,
                   KNeighborsClassifier(2), "KNN",
                   train_size == TRAIN_SIZES[-1])
        
        test_model(dataset, model_type, train_size,
                   BernoulliNB(), "Bernoulli",
                   train_size == TRAIN_SIZES[-1])

        test_model(dataset, model_type, train_size,
                   linear_model.SGDClassifier(n_iter=1000, loss="log"), "SGD",
                   train_size == TRAIN_SIZES[-1])
        
        test_model(dataset, model_type, train_size,
                   RandomForestClassifier(max_depth=15, n_estimators=100, max_features=30), "RandomForest",
                   train_size == TRAIN_SIZES[-1])
        
        test_model(dataset, model_type, train_size,
                   LinearSVC(), "LinearSVC",
                   train_size == TRAIN_SIZES[-1])
                
output = ("Model Type,Model,Training set,Accuracy,Precision,Recall,F1 score\n" +
        "\n".join([",".join([str(s) for s in row]) for row in output_table]))
with open("%s/results/linear_models.csv" % REPO_ROOT, "w") as f:
    f.write(output)

     RegEx             KNN. Train set size   300. 78.0% / 87.3% / 65.7% (0.749)
     RegEx       Bernoulli. Train set size   300. 66.0% / 60.9% / 89.2% (0.724)
     RegEx             SGD. Train set size   300. 79.7% / 83.5% / 73.9% (0.784)
     RegEx    RandomForest. Train set size   300. 81.1% / 84.8% / 75.8% (0.800)
     RegEx       LinearSVC. Train set size   300. 79.4% / 84.6% / 71.9% (0.777)
   BiRegEx             KNN. Train set size   300. 78.1% / 85.7% / 67.4% (0.755)
   BiRegEx       Bernoulli. Train set size   300. 64.0% / 59.1% / 91.3% (0.717)
   BiRegEx             SGD. Train set size   300. 79.9% / 83.5% / 74.4% (0.787)
   BiRegEx    RandomForest. Train set size   300. 80.7% / 84.4% / 75.4% (0.796)
   BiRegEx       LinearSVC. Train set size   300. 79.5% / 84.1% / 72.6% (0.780)
  TriRegEx             KNN. Train set size   300. 78.8% / 87.1% / 67.5% (0.761)
  TriRegEx       Bernoulli. Train set size   300. 63.2% / 58.2% / 93.4% (0.717)
  TriRegEx             SGD. Train set si

In [15]:
output_table = []

train_size = TRAIN_SIZES[-1]
for model_type in ["BiRegEx1K", "BiRegEx4K", "BiRegEx16K", "BiRegEx64K", "BiRegEx256K", "BiRegEx1M"]:
    dataset = load_model(model_type, train_size)

    test_model(dataset, model_type, train_size,
               LinearSVC(), "LinearSVC",
               train_size == TRAIN_SIZES[-1])
                
output = ("Model Type,Model,Training set,Accuracy,Precision,Recall,F1 score\n" +
        "\n".join([",".join([str(s) for s in row]) for row in output_table]))
with open("%s/results/truncated_models.csv" % REPO_ROOT, "w") as f:
    f.write(output)

 BiRegEx1K       LinearSVC. Train set size 19200. 91.8% / 94.4% / 88.8% (0.915)
 BiRegEx4K       LinearSVC. Train set size 19200. 91.8% / 93.9% / 89.4% (0.916)
BiRegEx16K       LinearSVC. Train set size 19200. 92.1% / 93.9% / 89.9% (0.919)
BiRegEx64K       LinearSVC. Train set size 19200. 92.1% / 94.0% / 90.1% (0.920)
BiRegEx256K       LinearSVC. Train set size 19200. 92.1% / 93.9% / 90.0% (0.919)
 BiRegEx1M       LinearSVC. Train set size 19200. 92.1% / 93.8% / 90.1% (0.919)


In [18]:
output_table = []

train_size = TRAIN_SIZES[-1]
for model_type in ["RegEx", "TriRegEx", "AST", "TriAST", "AST2Vec"]:
    dataset = load_model(model_type, train_size)

    test_mlp(dataset, model_type, train_size)
        
output = ("Model Type,Model,Training set,Accuracy,Precision,Recall,F1 score\n" +
        "\n".join([",".join([str(s) for s in row]) for row in output_table]))
with open("%s/results/mlp_models.csv" % REPO_ROOT, "w") as f:
    f.write(output)

In [22]:
output_table = []

for train_size in TRAIN_SIZES:
    for model_type in ["Url3", "Url6", "Url12"]:
        dataset = load_model(model_type, train_size)

        test_model(dataset, model_type, train_size,
                   KNeighborsClassifier(2), "KNN",
                   train_size == TRAIN_SIZES[-1])
        
        test_model(dataset, model_type, train_size,
                   BernoulliNB(), "Bernoulli",
                   train_size == TRAIN_SIZES[-1])

        test_model(dataset, model_type, train_size,
                   linear_model.SGDClassifier(n_iter=1000, loss="log"), "SGD",
                   train_size == TRAIN_SIZES[-1])
        
        test_model(dataset, model_type, train_size,
                   RandomForestClassifier(max_depth=15, n_estimators=100, max_features=30), "RandomForest",
                   train_size == TRAIN_SIZES[-1])
        
        test_model(dataset, model_type, train_size,
                   LinearSVC(), "LinearSVC",
                   train_size == TRAIN_SIZES[-1])
        
output = ("Model Type,Model,Training set,Accuracy,Precision,Recall,F1 score\n" +
        "\n".join([",".join([str(s) for s in row]) for row in output_table]))
with open("%s/results/url_models.csv" % REPO_ROOT, "w") as f:
    f.write(output)

      Url3             KNN. Train set size   300. 80.5% / 82.0% / 78.2% (0.800)
      Url3       Bernoulli. Train set size   300. 71.8% / 92.6% / 47.3% (0.626)
      Url3             SGD. Train set size   300. 83.8% / 86.1% / 80.6% (0.833)
      Url3    RandomForest. Train set size   300. 81.2% / 92.1% / 68.2% (0.784)
      Url3       LinearSVC. Train set size   300. 84.0% / 86.9% / 80.1% (0.834)
      Url6             KNN. Train set size   300. 81.9% / 86.7% / 75.5% (0.807)
      Url6       Bernoulli. Train set size   300. 70.0% / 97.4% / 41.2% (0.579)
      Url6             SGD. Train set size   300. 85.8% / 89.4% / 81.3% (0.852)
      Url6    RandomForest. Train set size   300. 80.0% / 91.8% / 65.9% (0.767)
      Url6       LinearSVC. Train set size   300. 85.7% / 89.4% / 80.9% (0.850)
     Url12             KNN. Train set size   300. 82.2% / 88.8% / 73.6% (0.805)
     Url12       Bernoulli. Train set size   300. 68.7% / 99.6% / 37.6% (0.546)
     Url12             SGD. Train set si

In [7]:
output_table = []

train_size = TRAIN_SIZES[-1]
model_type = "FileSize"
dataset = load_model(model_type, train_size)

test_model(dataset, model_type, train_size,
           KNeighborsClassifier(2), "KNN",
           train_size == TRAIN_SIZES[-1])

test_model(dataset, model_type, train_size,
           BernoulliNB(), "Bernoulli",
           train_size == TRAIN_SIZES[-1])

test_model(dataset, model_type, train_size,
           linear_model.SGDClassifier(n_iter=1000, loss="log"), "SGD",
           train_size == TRAIN_SIZES[-1])

test_model(dataset, model_type, train_size,
           RandomForestClassifier(max_depth=15, n_estimators=100, max_features=15), "RandomForest",
           train_size == TRAIN_SIZES[-1])

test_model(dataset, model_type, train_size,
           LinearSVC(), "LinearSVC",
           train_size == TRAIN_SIZES[-1])
        
output = ("Model Type,Model,Training set,Accuracy,Precision,Recall,F1 score\n" +
        "\n".join([",".join([str(s) for s in row]) for row in output_table]))
with open("%s/results/filesize_models.csv" % REPO_ROOT, "w") as f:
    f.write(output)

  FileSize             KNN. Train set size 19200. 52.5% / 58.2% / 17.6% (0.204)
  FileSize       Bernoulli. Train set size 19200. 61.8% / 61.1% / 64.9% (0.630)
  FileSize             SGD. Train set size 19200. 66.0% / 64.0% / 73.4% (0.683)
  FileSize    RandomForest. Train set size 19200. 66.0% / 64.0% / 73.4% (0.683)
  FileSize       LinearSVC. Train set size 19200. 66.0% / 64.0% / 73.4% (0.683)


In [16]:
output_table = []

train_size = TRAIN_SIZES[-1]
for model_names in [
        ("BiRegEx", "Url6", "FileSize"),
        ("BiRegEx1K", "Url6", "FileSize"),
        ("BiRegEx", "Url6"),
        ("BiRegEx1K", "Url6"),
        ("BiRegEx", "TriAST", "Url6"),
        ("RegEx", "Random2Vec"),
        ("RegEx", "AST"),
        ("BiRegEx", "Word2Vec"),
        ("BiRegEx", "TriAST"),
        ("Word2Vec", "AST2Vec") ]:
    model_type = "-".join(model_names)
    dataset = concat_models(model_names, train_size)

    test_model(dataset, model_type, train_size,
               RandomForestClassifier(max_depth=15, n_estimators=100, max_features=30), "RandomForest",
               False)

    test_model(dataset, model_type, train_size,
               LinearSVC(), "LinearSVC",
               False)
        
output = ("Model Type,Model,Training set,Accuracy,Precision,Recall,F1 score\n" +
        "\n".join([",".join([str(s) for s in row]) for row in output_table]))
with open("%s/results/combined_models.csv" % REPO_ROOT, "w") as f:
    f.write(output)

Datasets ('BiRegEx', 'Url6', 'FileSize'): 500000 + 500000 + 15 = 1000015
Labels equal: [True, True] [True, True]
BiRegEx-Url6-FileSize    RandomForest. Train set size 19200. 81.9% / 93.4% / 68.6% (0.791)
BiRegEx-Url6-FileSize       LinearSVC. Train set size 19200. 96.6% / 97.8% / 95.4% (0.966)
Datasets ('BiRegEx1K', 'Url6', 'FileSize'): 101468 + 500000 + 15 = 601483
Labels equal: [True, True] [True, True]
BiRegEx1K-Url6-FileSize    RandomForest. Train set size 19200. 78.7% / 98.5% / 58.4% (0.733)
BiRegEx1K-Url6-FileSize       LinearSVC. Train set size 19200. 96.4% / 97.5% / 95.3% (0.964)
Datasets ('BiRegEx', 'Url6'): 500000 + 500000 = 1000000
Labels equal: [True] [True]
BiRegEx-Url6    RandomForest. Train set size 19200. 81.5% / 94.5% / 67.0% (0.784)
BiRegEx-Url6       LinearSVC. Train set size 19200. 96.5% / 98.0% / 95.0% (0.965)
Datasets ('BiRegEx1K', 'Url6'): 101468 + 500000 = 601468
Labels equal: [True] [True]
BiRegEx1K-Url6    RandomForest. Train set size 19200. 79.0% / 98.7% / 58

IOError: [Errno 2] No such file or directory: '/usr/src/app/model-data/dataset_Random2Vec_19200.pickle'

In [17]:
def calculate_accuracy(X_test, Y_test, b, w):
    nrows = np.shape(X_test)[0]
    num_correct = 0
    for row in xrange(nrows):
        score = b + np.dot(X_test.getrow(row).toarray().flatten(), w)
        pred = 1 if score > 0 else 0
        actual = Y_test[row]
        correct = (pred == actual)
        if correct:
            num_correct += 1
        
    return float(num_correct) / nrows

def test_thresholds(dataset, b, w):
    max_threshold = np.max(np.abs(w))
    best_threshold = 0
    best_accuracy = 0
    
    if isinstance(dataset["X_test"], np.ndarray):
        X_test_flat = dataset["X_test"]
    else:
        X_test_flat = dataset["X_test"].tocsc()
        
    print "Max weight: %f" % max_threshold
    for threshold in np.arange(0, max_threshold, max_threshold/100):
        wprime = np.array(w)
        wprime[np.abs(wprime) <= threshold] = 0
        nonzero_count = np.sum(wprime != 0)
        accuracy = calculate_accuracy(X_test_flat, dataset["Y_test"], b, wprime)
        best_accuracy = max(best_accuracy, accuracy)
        print "Accuracy at threshold %f (%d): %.1f%%" % (threshold, nonzero_count, accuracy * 100)
        if accuracy > 0.99 * best_accuracy:
            best_threshold = threshold
        else:
            break
    print "Done. Best threshold: %f" % best_threshold
    return best_threshold

def recover_input(tokenized_input, test_vector, vocab_info):
    vocab_size = np.shape(vocab_info["idf"])[0]
    input_vector = np.zeros(vocab_size)

    for tokens in tokenized_input:
        #print "%s -> %s" % (tokens, vocab_info["vocab"].get(tokens, "-"))
        if tokens in vocab_info["vocab"]:
            index = vocab_info["vocab"][tokens]
            idf = vocab_info["idf"][index]
            input_vector[index] += idf

    print "norm: %s" % np.linalg.norm(input_vector)
    input_vector = input_vector / np.linalg.norm(input_vector)

    print "Test vector: %s" % test_vector[(test_vector != 0) | (input_vector != 0)]
    print "Input vector: %s" % input_vector[(test_vector != 0) | (input_vector != 0)]
    
    for idx in xrange(np.shape(test_vector)[0]):
        if test_vector[idx] != 0 and input_vector[idx] == 0:
            print "Test %d %s = %f" % (
                idx, next(v for v, i in vocab_info["vocab"].iteritems() if i == idx), test_vector[idx])
        if test_vector[idx] == 0 and input_vector[idx] != 0:
            print "Input %d %s = %f" % (
                idx, next(v for v, i in vocab_info["vocab"].iteritems() if i == idx), input_vector[idx])

In [19]:
def validate_url6():
    model_type = "Url6"
    train_size = TRAIN_SIZES[-1]
    dataset = load_model(model_type, train_size)

    with open("%s/model-data/vocab_%s_%d.pickle" % (REPO_ROOT, model_type, train_size), "r") as f:
        vocab_info = pickle.load(f)

    # Try to recover a test input
    url = dataset["urls_test"][0]
    url_chars = util.ngramizer(util.tokenize_url, 6)(next(util.parse_url([{"url": url}])))
    test_vector = dataset["X_test"].getrow(0).toarray().flatten()
    recover_input(url_chars, test_vector, vocab_info)

validate_url6()

norm: 266.414786012
Test vector: [ 0.2211524   0.01987661  0.03141249 ...,  0.01292275  0.03264087
  0.05653225]
Input vector: [ 0.2211524   0.01987661  0.03141249 ...,  0.01292275  0.03264087
  0.05653225]


In [20]:
def validate_bigregex():
    model_type = "BiRegEx1K"
    train_size = TRAIN_SIZES[-1]
    dataset = load_model(model_type, train_size)

    with open("%s/model-data/vocab_%s_%d.pickle" % (REPO_ROOT, model_type, train_size), "r") as f:
        vocab_info = pickle.load(f)

    # Try to recover a test input
    sha = dataset["shas_test"][0]
    tokens = util.ngramizer(util.tokenize_js, 2)(next(util.parse_js([{"sha": sha}])))
    test_vector = dataset["X_test"].getrow(0).toarray().flatten()
    recover_input(tokens, test_vector, vocab_info)

validate_bigregex()

norm: 29.5626283938
Test vector: [ 0.23359267  0.05684434  0.13917522  0.14201405  0.14218708  0.13108535
  0.0351596   0.07040361  0.03932155  0.03858764  0.06079984  0.0753506
  0.14037187  0.14218708  0.09358728  0.14218708  0.14218708  0.08745182
  0.14214374  0.09340654  0.14086872  0.07828346  0.06077638  0.13331221
  0.07932212  0.05829805  0.06432907  0.04666207  0.13491761  0.14197093
  0.14020786  0.14028977  0.11483334  0.14218708  0.14066081  0.14218708
  0.13837681  0.14218708  0.13776246  0.14053668  0.14218708  0.14218708
  0.14218708  0.14218708  0.14188486  0.14214374  0.26349541  0.14218708
  0.14218708  0.14171337  0.14201405  0.14218708  0.14218708  0.14201405
  0.14205722  0.12961839  0.13105413  0.07545332  0.06016067  0.14218708
  0.03735115  0.07142665  0.04105859  0.06735573]
Input vector: [ 0.23359267  0.05684434  0.13917522  0.14201405  0.14218708  0.13108535
  0.0351596   0.07040361  0.03932155  0.03858764  0.06079984  0.0753506
  0.14037187  0.14218708  0.0

In [21]:
def create_url_model():
    train_size = TRAIN_SIZES[-1]
    dataset = load_model("Url6", train_size)

    with open("%s/model-data/vocab_%s_%d.pickle" % (REPO_ROOT, "Url6", train_size), "r") as f:
        url_vocab_info = pickle.load(f)

    model = LinearSVC()
    model.fit(dataset["X_train"], dataset["Y_train"])

    b = model.intercept_[0]
    w = model.coef_.flatten()

    best_threshold = test_thresholds(dataset, b, w)

    # Reduce vocabulary using the threshold
    vocab_indices = np.nonzero(np.abs(w) > best_threshold)[0].tolist()

    index_map = {
        vocab_indices[idx]: idx
        for idx in xrange(len(vocab_indices))
    }

    print "%d indices" % len(vocab_indices)

    url_model = {
        "vocab": {
            key: index_map[index]
            for key, index in url_vocab_info["vocab"].iteritems()
            if index in index_map
        },
        "idf": [url_vocab_info["idf"][idx] for idx in vocab_indices],
        "w": [w[idx] for idx in vocab_indices],
        "b": b,
    }

    with open("%s/model-data/url_model.json" % REPO_ROOT, "w") as f:
        json.dump(url_model, f)
    
create_url_model()

Max weight: 3.980340
Accuracy at threshold 0.000000 (416002): 96.5%
Accuracy at threshold 0.039803 (127899): 96.5%
Accuracy at threshold 0.079607 (58556): 96.5%
Accuracy at threshold 0.119410 (30121): 96.3%
Accuracy at threshold 0.159214 (16497): 96.0%
Accuracy at threshold 0.199017 (9596): 95.5%
Done. Best threshold: 0.159214
16497 indices


In [27]:
def create_final_model():
    train_size = TRAIN_SIZES[-1]

    dataset = concat_models(["Url6", "BiRegEx1K", "FileSize"], train_size)

    with open("%s/model-data/vocab_%s_%d.pickle" % (REPO_ROOT, "Url6", train_size), "r") as f:
        url_vocab_info = pickle.load(f)

    with open("%s/model-data/vocab_%s_%d.pickle" % (REPO_ROOT, "BiRegEx1K", train_size), "r") as f:
        script_vocab_info = pickle.load(f)

    size_vocab = {
        2**(n+5): n for n in xrange(0, 15)
    }
    
    model = LinearSVC()
    model.fit(dataset["X_train"], dataset["Y_train"])

    b = model.intercept_[0]
    w = model.coef_.flatten()

    best_threshold = test_thresholds(dataset, b, w)

    # Reduce vocabulary using the threshold
    vocab_indices = np.nonzero(np.abs(w) > best_threshold)[0].tolist()

    url_vocab_size = np.shape(url_vocab_info["idf"])[0]
    script_vocab_size = np.shape(script_vocab_info["idf"])[0]

    url_index_map = {
        vocab_indices[idx]: idx
        for idx in xrange(len(vocab_indices))
        if vocab_indices[idx] < url_vocab_size
    }

    url_new_vocab_size = len(url_index_map)
    
    script_index_map = {
        (vocab_indices[idx] - url_vocab_size): idx - url_new_vocab_size
        for idx in xrange(len(vocab_indices))
        if vocab_indices[idx] >= url_vocab_size and vocab_indices[idx] < url_vocab_size + script_vocab_size
    }
    
    script_new_vocab_size = len(script_index_map)
    
    size_index_map = {
        (vocab_indices[idx] - url_vocab_size - script_vocab_size): idx - url_new_vocab_size - script_new_vocab_size
        for idx in xrange(len(vocab_indices))
        if vocab_indices[idx] >= url_vocab_size + script_vocab_size
    }

    print "%d indices (%d url; %d script; %d size)" % (
        len(vocab_indices), len(url_index_map), len(script_index_map), len(size_index_map))

    final_model = {
        # Just the vocab & weights for URL features
        "url": {
            "vocab": {
                key: url_index_map[index]
                for key, index in url_vocab_info["vocab"].iteritems()
                if index in url_index_map
            },
            "idf": [url_vocab_info["idf"][idx] for idx in vocab_indices[:url_new_vocab_size]],
            "w": [w[idx] for idx in vocab_indices[:url_new_vocab_size]],
        },
        # Just the vocab & weights for the script features
        "script": {
             "vocab": {
                key: script_index_map[index]
                for key, index in script_vocab_info["vocab"].iteritems()
                if index in script_index_map
            },
            "idf": [script_vocab_info["idf"][idx - url_vocab_size] for idx in vocab_indices[url_new_vocab_size:url_new_vocab_size + script_new_vocab_size]],
            "w": [w[idx] for idx in vocab_indices[url_new_vocab_size:url_new_vocab_size + script_new_vocab_size]],
        },
        # Just the indices & weights for the file size features
        "size": {
            "vocab": {
                key: size_index_map[index]
                for key, index in size_vocab.iteritems()
                if index in size_index_map
            },
            "idf": [1 for idx in vocab_indices[url_new_vocab_size + script_new_vocab_size:]],
            "w": [w[idx] for idx in vocab_indices[url_new_vocab_size + script_new_vocab_size:]],
        },
        # The SVM intercept
        "b": b,
    }

    with open("%s/model-data/final_model.json" % REPO_ROOT, "w") as f:
        json.dump(final_model, f)

    
create_final_model()

Datasets ['Url6', 'BiRegEx1K', 'FileSize']: 500000 + 101468 + 15 = 601483
Labels equal: [True, True] [True, True]
Max weight: 3.180620
Accuracy at threshold 0.000000 (506054): 96.6%
Accuracy at threshold 0.031806 (141096): 96.5%
Accuracy at threshold 0.063612 (65634): 96.4%
Accuracy at threshold 0.095419 (34526): 96.3%
Accuracy at threshold 0.127225 (19823): 96.1%
Accuracy at threshold 0.159031 (11937): 95.8%
Accuracy at threshold 0.190837 (7590): 95.4%
Done. Best threshold: 0.159031
11937 indices (7712 url; 4216 script; 9 size)
