In [1]:
# Example Feature Extraction from XML Files
# We count the number of specific system calls made by the programs, and use
# these as our features.

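# This code requires that the unzipped training set is in a folder called "train",
# that the unzipped test set is in a folder called "test", and that the pickled
# tag vocabulary "call_set.p" is in the working directory.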

import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse
import pandas as pd
import cPickle as pickle
import io

import util

## Feature Extraction

In [52]:
# Directories that hold the unzipped XML files for each data split.
TRAIN_DIR = "train"
TEST_DIR = "test"

# Collection of XML tag names; call_feats() emits one feature per entry.
# NOTE: unpickling can execute arbitrary code, so only load a call_set.p that
# you generated yourself.
# The `with` block closes the file handle as soon as the load finishes
# (the bare open() used previously left it open until garbage collection).
with open("call_set.p", "rb") as call_set_file:
    call_set = pickle.load(call_set_file)

In [53]:
def add_to_set(tree):
    """Register every element tag that occurs in ``tree`` in the global ``call_set``.

    Args:
        tree: parsed XML document; it must provide ``iter()`` yielding
            elements that carry a ``tag`` attribute.

    Returns:
        None. ``call_set`` is modified in place.
    """
    tags = (element.tag for element in tree.iter())
    for tag in tags:
        call_set.add(tag)

def create_data_matrix(start_index, end_index, direc="train"):
    """Build a feature matrix from the XML files found in ``direc``.

    Files are visited in ``os.listdir`` order (``.DS_Store`` is ignored) and
    only those whose position lies in ``[start_index, end_index)`` are parsed.
    File names are expected to look like ``<id>.<class>.<anything>``.

    Args:
        start_index: position of the first file to include.
        end_index: position one past the last file to include.
        direc: directory that contains the XML files.

    Returns:
        A tuple ``(X, classes, ids)``:
          X       -- 2-D array with one ``call_feats`` row per parsed file, or
                     ``None`` when no file fell inside the window.
          classes -- numpy array of indices into ``util.malware_classes``;
                     ``-1`` for files labelled ``"X"``.
          ids     -- list of id strings, in the same order as the rows of X.

    Raises:
        AssertionError: if a file's class label is neither listed in
            ``util.malware_classes`` nor equal to ``"X"``.
    """
    rows = []
    classes = []
    ids = []
    i = -1
    # NOTE(review): os.listdir gives no ordering guarantee, so the window
    # [start_index, end_index) selects whatever order the OS reports.
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue

        # i counts real data files only (the .DS_Store entry is not counted).
        i += 1
        if i < start_index:
            continue
        if i >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        # parse file as an xml document
        tree = ET.parse(os.path.join(direc, datafile))
        # (add_to_set(tree) can be called here to collect tags into call_set.)
        rows.append(call_feats(tree))

    # Stack once at the end: calling np.vstack inside the loop recopied the
    # whole matrix for every file (quadratic in the number of files). Stacking
    # a list also yields a 2-D result when only one file was read, whereas the
    # old code returned that single row as a 1-D array.
    X = np.vstack(rows) if rows else None
    return X, np.array(classes), ids

def call_feats(tree):
    """Count how often each tag in the global ``call_set`` occurs in ``tree``.

    Args:
        tree: parsed XML document; it must provide ``iter()`` yielding
            elements that carry a ``tag`` attribute.

    Returns:
        1-D float numpy array with one entry per tag in ``call_set`` (in
        ``list(call_set)`` order). Each entry is the number of elements in the
        document that carry that tag; tags that never occur get 0.
    """
    good_calls = list(call_set)

    # Counter records the true number of occurrences. The earlier hand-rolled
    # dict set a tag to 0 the first time it was seen, so every count came out
    # one too low and a tag seen exactly once looked the same as an absent tag.
    call_counter = Counter(el.tag for el in tree.iter())

    # A Counter returns 0 for keys it has never seen, so no membership test is
    # needed for vocabulary tags missing from this document.
    return np.array([call_counter[call] for call in good_calls], dtype=float)

In [55]:
## Feature extraction
def _dump_pickle(obj, filename):
    """Pickle ``obj`` into ``filename`` and close the file once it is written."""
    # The `with` block guarantees the data is flushed and the handle released;
    # passing a bare io.open(...) to pickle.dump left the file open.
    with io.open(filename, 'wb') as out_file:
        pickle.dump(obj, out_file)


def main(n_train=3085, n_test=3723):
    """Extract features for the training and test directories and pickle them.

    Writes X_train.p, t_train.p, train_ids.p, X_test.p, t_test.p and
    test_ids.p into the current working directory.

    Args:
        n_train: exclusive upper bound on the number of files read from
            TRAIN_DIR. The default 3085 is presumably the size of the
            training set -- TODO confirm.
        n_test: exclusive upper bound on the number of files read from
            TEST_DIR. The default 3723 is presumably the size of the test
            set -- TODO confirm.
    """
    # SAVE TRAINING DATA
    X_train, t_train, train_ids = create_data_matrix(0, n_train, direc=TRAIN_DIR)
    X_train = pd.DataFrame(X_train)
    t_train = pd.DataFrame(t_train, columns=['class'])
    train_ids = pd.DataFrame(train_ids, columns=['id'])

    _dump_pickle(X_train, 'X_train.p')
    _dump_pickle(t_train, 't_train.p')
    _dump_pickle(train_ids, 'train_ids.p')

    # SAVE TESTING DATA
    X_test, t_test, test_ids = create_data_matrix(0, n_test, direc=TEST_DIR)
    X_test = pd.DataFrame(X_test)
    test_ids = pd.DataFrame(test_ids, columns=['id'])

    _dump_pickle(X_test, 'X_test.p')
    # Unlike t_train, t_test is stored as the raw array returned by
    # create_data_matrix rather than as a DataFrame.
    _dump_pickle(t_test, 't_test.p')
    _dump_pickle(test_ids, 'test_ids.p')

In [56]:
# Run the feature extraction only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

## Data Modeling

In [57]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file."""
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # were written by the feature-extraction step of this notebook.
    with open(path, "rb") as in_file:
        return pickle.load(in_file)


# Reload the artefacts written by main() so the modeling cells can start from
# the pickles without repeating the XML parsing.
X_train = _load_pickle("X_train.p")
t_train = _load_pickle("t_train.p")
train_ids = _load_pickle("train_ids.p")

X_test = _load_pickle("X_test.p")
t_test = _load_pickle("t_test.p")
test_ids = _load_pickle("test_ids.p")