In [1]:
# This is the third attempt at training a NN. I am doing sequence padding
# again, but truncating to as low a value as possible to make training
# faster and avoid memory issues (I've been having crashes on the current
# feature set).

In [2]:
# Force Theano to use multiple cores.
# `!OMP_NUM_THREADS=4` only assigned the variable inside a throw-away
# subshell, so the kernel process (and therefore Theano) never saw it.
# Writing to os.environ changes the kernel's own environment. Keep this cell
# ahead of the Keras/Theano imports so the value is in place before any
# OpenMP code is loaded.
import os
os.environ["OMP_NUM_THREADS"] = "4"

In [3]:
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dropout, Dense
import matplotlib.pyplot as plt
from scipy.io import mmread
%matplotlib inline

Using Theano backend.


In [4]:
# Turn on Theano's OpenMP support from Python.
# NOTE(review): Keras was imported in an earlier cell, and the captured output
# there reports the Theano backend, so Theano is presumably already loaded;
# confirm that changing these flags at runtime still takes effect (the
# alternative is passing them via THEANO_FLAGS before start-up).
from theano import config
# Allow ops that support it to use OpenMP.
config.openmp = True
# Size threshold for OpenMP in element-wise ops — presumably a minimum element
# count below which they stay single-threaded; verify against the Theano
# config docs.
config.openmp_elemwise_minsize = 100000

In [5]:
# Seed NumPy's global RNG so the np.random.permutation() used for the
# hold-out split gives the same split on every fresh run.
np.random.seed(42)

In [6]:
# Class names in label order: the position of a name in this list is the
# integer label used for it everywhere else (see classes_to_Y).
malware_classes = [
    "Agent",      # 0
    "AutoRun",    # 1
    "FraudLoad",  # 2
    "FraudPack",  # 3
    "Hupigon",    # 4
    "Krap",       # 5
    "Lipler",     # 6
    "Magania",    # 7
    "None",       # 8
    "Poison",     # 9
    "Swizzor",    # 10
    "Tdss",       # 11
    "VB",         # 12
    "Virut",      # 13
    "Zbot",       # 14
]

# a function for writing predictions in the required format
def write_predictions(predictions, ids, outfile):
    """
    Write predictions to ``outfile`` as a two-column CSV.

    The file starts with the header ``Id,Prediction`` followed by one
    ``<id>,<prediction>`` line per executable.

    predictions -- sized sequence; predictions[i] is the index (within the
                   malware_classes list above) of the class predicted for
                   the executable ids[i]. Values are formatted with %d.
    ids         -- iterable of executable ids, formatted with %s.
    outfile     -- path of the file to write; it will be overwritten.

    Raises ValueError when predictions and ids differ in length. The check
    runs before the file is opened, so a mismatch neither truncates an
    existing file nor leaves a half-written one behind.
    """
    # Materialise ids so its length can be checked even for a generator.
    ids = list(ids)
    if len(predictions) != len(ids):
        raise ValueError(
            "predictions and ids must have the same length (got %d and %d)"
            % (len(predictions), len(ids)))

    with open(outfile, "w") as f:
        # write header
        f.write("Id,Prediction\n")
        for history_id, prediction in zip(ids, predictions):
            f.write("%s,%d\n" % (history_id, prediction))

def classes_to_Y(classes):
    """
    Translate class names into integer labels.

    Every entry of ``classes`` is replaced by its position in the
    malware_classes list; a name that is not in that list raises ValueError
    (from list.index). Returns a NumPy array with one label per entry.
    """
    return np.array([malware_classes.index(name) for name in classes])

In [2]:
# Exploratory only: list the feature files available on disk. The output is
# not captured, so nothing else in the notebook depends on this cell.
!ls ../data/features

100_cutoff_alphabet_19679_padded_len1.npy
100_cutoff_alphabet_19679_padded_len200.npy
100_cutoff_alphabet_19679_padded_len35.npy
100_cutoff_alphabet_19679_padded_len5.npy
100_cutoff_alphabet_19679_word_to_intseq.npy
10_cutoff_word_to_intseq.npy
3_cutoff_word_to_intseq.npy
50_cutoff_word_to_intseq.npy
count_vector_full_10k_features.npy
count_vector_full_10k_features_tfidf.npy
naive_word_hashed_full_features.mtx.tar.gz
test_ids.npy
tfifd_4gram_hashed_full_features.mtx
train_classes.npy
train_ids.npy


In [7]:
# load training classes
# These are class names (one per training example); they are turned into
# integer labels with classes_to_Y, and the number of entries is used to
# split the feature matrix into its train and test rows.
classes = np.load("../data/features/train_classes.npy")

In [8]:
# Read the feature matrix from a Matrix Market file as a SciPy sparse matrix
# (judging by the file name: hashed 4-gram TF-IDF features — confirm).
# Training rows come first and test rows after them; the next cell splits the
# matrix at classes.shape[0].
sparse_mat_train_test = mmread("../data/features/tfifd_4gram_hashed_full_features.mtx")

In [9]:
# mmread hands back a sparse matrix in COO format (per the SciPy docs), which
# cannot be row-sliced. Convert it to CSR and densify the train and test rows
# separately. Building one dense array of *all* rows first would make X and
# X_test mere views of that single buffer, so rebinding X later could never
# release any memory.
n_train = classes.shape[0]
features_csr = sparse_mat_train_test.tocsr()

# pull out training examples (the first n_train rows)
X = features_csr[:n_train, :].toarray()

# every remaining row is a test example
X_test = features_csr[n_train:, :].toarray()
print(X_test.shape)

Y = classes_to_Y(classes)

# One-hot encode the integer labels: row i gets a 1 in column Y[i].
# NOTE(review): 16 columns are allocated although malware_classes has only 15
# entries, so column 15 is never set. The network's output layer and the
# class_weights dict use 16 as well, so the width is kept unchanged here.
Y_hot = np.zeros((n_train, 16))
Y_hot[np.arange(n_train), Y] = 1

print(Y_hot)

(3724, 1048576)
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  1.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  1.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [10]:
# Just to check that worked ok: sample 21's class name, its integer label,
# its one-hot row, and the number of known classes.
# print() calls (rather than print statements) keep this cell runnable after
# the later `from __future__ import print_function` cell has executed in the
# same kernel.
print(classes[21])
print(Y[21])
print(Y_hot[21])
print(len(malware_classes))

# The one-hot row must have its single 1 at the label's position.
assert Y_hot[21].argmax() == Y[21]

None
8
[ 0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]
15


In [11]:
# Now randomly select N_VALIDATE samples to hold out for validation.
N_VALIDATE = 100

# permutation(n) shuffles np.arange(n); the order depends on the NumPy seed
# set near the top of the notebook.
rand_index = np.random.permutation(classes.shape[0])
validate_index = rand_index[:N_VALIDATE]
train_index = rand_index[N_VALIDATE:]

# Indexing with an integer array copies the selected rows.
X_train = X[train_index]
Y_train = Y_hot[train_index]
X_validate = X[validate_index]
Y_validate = Y_hot[validate_index]

print(X_train.shape)
print(Y_train.shape)
print(X_validate.shape)
print(Y_validate.shape)

# Every sample must land in exactly one of the two sets.
assert X_train.shape[0] + X_validate.shape[0] == classes.shape[0]
                                   

(2986, 1048576)
(2986, 16)
(100, 1048576)
(100, 16)


In [12]:
# Clobbering to save memory: rebind the large intermediates so the arrays
# behind them can be reclaimed. X_train / Y_train / X_validate / Y_validate
# were built with integer-array indexing, i.e. they are copies and survive.
# NOTE: a dense buffer is only freed once *no* array viewing it is referenced
# any more (a plain slice of it counts as a view).
# `padding` and `full_features` are not defined in any cell shown here
# (presumably left over from an earlier notebook); they are kept so the cell
# binds the same names as before.
padding = 0
full_features = 0
classes = 0
X = 0
Y_hot = 0
Y = 0
# Previously never released: the loaded sparse matrix and its dense conversion.
sparse_mat_train_test = 0
sparse = 0

In [13]:
batch_size = 150

# Take the layer widths at both ends from the data instead of hard-coding
# 1048576 and 16, so the model stays in step with the feature matrix and the
# one-hot label width.
n_features = X_train.shape[1]
n_outputs = Y_train.shape[1]

# Fully connected net: three tanh hidden layers of shrinking width, each
# followed by dropout, and a softmax layer producing one score per label.
model = Sequential()
model.add(Dense(1000, input_dim=n_features, activation="tanh"))
model.add(Dropout(.5))
model.add(Dense(500, activation="tanh"))
model.add(Dropout(.5))
model.add(Dense(200, activation="tanh"))
model.add(Dropout(.2))
model.add(Dense(n_outputs, activation="softmax"))

In [14]:
# Categorical cross-entropy pairs with the softmax output layer and the
# one-hot targets (Y_train / Y_validate). Accuracy is tracked because the
# ModelCheckpoint callback monitors its validation value ("val_acc").
model.compile(loss='categorical_crossentropy',
             optimizer="adam",
             metrics=["accuracy"])

In [15]:
from keras.callbacks import ProgbarLogger, History, LambdaCallback, ModelCheckpoint

In [16]:
# The __future__ import has to be the first statement: IPython tolerates it
# further down because it compiles a cell statement by statement, but the same
# code exported as a plain script is a SyntaxError.
from __future__ import print_function

import psutil


def summarize(*_ignored):
    """
    Print a one-line resource snapshot.

    The line is a list holding psutil.virtual_memory() and the per-CPU
    utilisation from psutil.cpu_percent(percpu=True). Any positional
    arguments are accepted and ignored so the function can be used directly
    as a Keras LambdaCallback hook. Returns None.
    """
    print([psutil.virtual_memory(), psutil.cpu_percent(percpu=True)])

In [17]:
# fit() installs its own ProgbarLogger (when verbose=1) and History callbacks.
# Listing extra instances here made each progress line appear twice (the
# captured output shows "Epoch 1/5" printed twice), and the extra History
# object was never reachable anyway. Only the callbacks that add something
# are kept.
callbacks = [
    # Print memory / CPU usage around every batch and at the start of each
    # epoch, to help track down the memory problems.
    LambdaCallback(
        on_batch_begin=summarize,
        on_batch_end=summarize,
        on_epoch_begin=summarize,
    ),
    # Save the model whenever validation accuracy reaches a new maximum.
    ModelCheckpoint(
        "bigtfifd_best_weighted.hdf5",
        verbose=1,
        monitor="val_acc",
        mode="max",
        save_best_only=True,
    ),
]

# Per-label loss weights handed to model.fit(class_weight=...). Keys are the
# integer labels, i.e. positions in malware_classes; the names are repeated
# below for readability.
# Presumably the weights are roughly inverse class frequencies — TODO confirm.
class_weights = dict(enumerate([
    14,  # 0  Agent
    32,  # 1  AutoRun
    43,  # 2  FraudLoad
    51,  # 3  FraudPack
    39,  # 4  Hupigon
    41,  # 5  Krap
    30,  # 6  Lipler
    39,  # 7  Magania
    1,   # 8  None
    77,  # 9  Poison
    3,   # 10 Swizzor
    50,  # 11 Tdss
    4,   # 12 VB
    27,  # 13 Virut
    40,  # 14 Zbot
    1,   # 15 (no matching entry in malware_classes)
]))

# Train with the per-class weights, validating on the held-out samples after
# every epoch. The returned History object is bound to a name so the
# per-epoch metrics can be inspected or plotted later without reaching for
# Out[...] / `_`.
history = model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    nb_epoch=5,
    verbose=1,
    callbacks=callbacks,
    class_weight=class_weights,
    validation_data=(X_validate, Y_validate),
)
# Last expression: keeps the History repr as the cell's output, as before.
history

Train on 2986 samples, validate on 100 samples
Epoch 1/5
[svmem(total=270846246912, available=151952228352, percent=43.9, used=109137641472, free=39880314880, active=127155335168, inactive=91049312256, buffers=467398656, cached=121360891904, shared=8891445248), [100.0, 0.6, 0.0, 100.0, 1.5, 82.6, 100.0, 0.0, 100.0, 4.4, 0.0, 100.0, 0.2, 0.0, 0.1, 0.0, 100.0, 100.0, 100.0, 100.0, 0.1, 0.0, 0.0, 0.0, 0.2, 0.2, 0.2, 0.1, 0.0, 0.2, 0.2, 0.0, 100.0, 100.0, 100.0, 100.0, 0.1, 0.0, 0.2, 0.0, 0.5, 1.2, 0.0, 0.1, 0.0, 0.0, 0.0, 0.2, 100.0, 100.0, 100.0, 100.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.2, 0.0, 0.2, 0.0]]
Epoch 1/5
[svmem(total=270846246912, available=150677401600, percent=44.4, used=110412427264, free=38605406208, active=128415502336, inactive=91049291776, buffers=467398656, cached=121361014784, shared=8891445248), [100.0, 0.0, 0.0, 100.0, 0.6, 100.0, 100.0, 0.0, 100.0, 0.0, 0.0, 100.0, 0.0, 0.0, 0.0, 0.0, 100.0, 100.0, 100.0, 100.0, 0.0, 0.0, 0.0, 0.6, 0.0, 0.0, 0.0, 0.0, 0.0, 

<keras.callbacks.History at 0x2b7f7c00b8d0>

In [None]:
# Marker cell to confirm the kernel is responsive after training.
# Must be a call: once `from __future__ import print_function` has run in
# this kernel, the statement form `print "ok"` is a SyntaxError.
print("ok")

In [24]:
# Run the network over the test set ONCE. The original cell made three full
# forward passes (predict, predict_classes, predict_proba) although they all
# derive from the same probabilities: the captured output shows predict() and
# predict_proba() printing identical arrays, and for a multi-unit softmax
# output predict_classes() is the arg-max of those probabilities.
predictions = model.predict(X_test)
print(predictions)

# Predicted label = index of the most probable output for each row.
class_preds = predictions.argmax(axis=-1)
print(class_preds)

# Same array under the name used for the third output file.
class_prob = predictions
print(class_prob)

np.save("../predictions/tfidf_deepnet.npy", predictions)
np.save("../predictions/tfidf_deepnet_seq_class.npy", class_preds)
np.save("../predictions/tfidf_deepnet_probs.npy", class_prob)

[[  2.15688742e-05   1.97208578e-06   5.74652913e-05 ...,   9.61687692e-06
    6.77621429e-05   4.40143449e-05]
 [  5.23514114e-04   1.72804648e-05   2.08137304e-01 ...,   2.16114728e-04
    1.57162343e-04   3.90037138e-04]
 [  3.17032172e-05   2.27671280e-05   8.07671040e-06 ...,   1.56348488e-05
    2.91905144e-06   3.40267184e-06]
 ..., 
 [  1.62160868e-04   2.97555744e-06   4.59415896e-05 ...,   7.12431483e-06
    3.94365961e-05   5.38862660e-05]
 [  1.65346966e-04   2.21393893e-05   3.81413083e-05 ...,   2.36850738e-05
    3.11092685e-06   3.21430957e-06]
 [  1.55904796e-04   1.99106835e-05   2.75537841e-05 ...,   1.11764248e-05
    1.96607743e-06   2.02078536e-06]]
[10  5  8 ..., 10  8  8]
[[  2.15688742e-05   1.97208578e-06   5.74652913e-05 ...,   9.61687692e-06
    6.77621429e-05   4.40143449e-05]
 [  5.23514114e-04   1.72804648e-05   2.08137304e-01 ...,   2.16114728e-04
    1.57162343e-04   3.90037138e-04]
 [  3.17032172e-05   2.27671280e-05   8.07671040e-06 ...,   1.56348488e

In [18]:
# Build the submission file: one predicted label per test id.
test_ids = np.load("../data/features/test_ids.npy")
class_preds = model.predict_classes(X_test)

print(class_preds)
print(test_ids)

# NOTE(review): the network has 16 outputs while malware_classes has 15
# entries, so a predicted label of 15 would have no class name — confirm this
# cannot occur in the submission.
write_predictions(
    class_preds,
    test_ids,
    "../predictions/tfidf_deepnet_class_weight.csv",
)

[10  5  8 ..., 10  8  8]
['e5b875f7e584b29fd9e85c1f232956849aabcb311'
 '18abefbfb74285D709bcf665d594df11bf56e1984'
 '47cd5265b1fc52021c025452e084c405a0a03df1e' ...,
 '6abb75b149d8e39e30c8df2c19bfd96986f0e35b3'
 'f0e968070037717da88665ab091ff2B4973528f30'
 '7b2459e11cac9341a00fa7bDcd5b17618a0b97dc8']


In [None]:
# Reload a saved model from disk.
# NOTE(review): the ModelCheckpoint callback in this notebook writes
# "bigtfifd_best_weighted.hdf5", whereas this cell loads "bigtfifd_best.hdf5",
# a file no visible cell creates. Presumably it comes from an earlier run
# without class weights; confirm this is the checkpoint that is wanted.
from keras.models import load_model
model_best = load_model("bigtfifd_best.hdf5")