In [1]:
# Example Feature Extraction from XML Files
# We count the number of specific system calls made by the programs, and use
# these as our features.

# This code requires that the unzipped training and test sets are in folders
# called "train" and "test", and that "call_set.p" is in the working directory.
import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse
import pandas as pd
import pickle
import io

import util

## Feature Extraction

In [2]:
# Directories that hold the XML files for each split.
TRAIN_DIR = "train"
TEST_DIR = "test"

# Collection of tag names to build count features for (the rest of the
# notebook treats it as a set: it is passed to set() and .add() is called on it).
# NOTE(review): pickle.load can execute arbitrary code -- only load
# call_set.p if it comes from a trusted source.
with open("call_set.p", "rb") as call_set_file:
    call_set = pickle.load(call_set_file)

In [3]:
# Copy of call_set without the "processes" and "all_section" tags.
call_set_nonredundant = set(call_set).difference(("processes", "all_section"))

In [12]:
#
def add_to_set(tree):
    """Add the tag of every element in ``tree`` to the module-level ``call_set``.

    ``tree`` must expose ``iter()`` yielding elements that have a ``tag``
    attribute. ``call_set`` is mutated in place; nothing is returned.
    """
    call_set.update(node.tag for node in tree.iter())

def create_data_matrix(start_index, end_index, direc="train"):
    """Build the feature matrix for a window of the XML files in ``direc``.

    Files are visited in sorted filename order ('.DS_Store' is ignored) and
    only those whose position ``i`` satisfies ``start_index <= i < end_index``
    are parsed.

    Parameters
    ----------
    start_index : int
        Position of the first file to include.
    end_index : int
        Position one past the last file to include.
    direc : str
        Directory containing files named ``<id>.<class>.<...>``.

    Returns
    -------
    X : numpy.ndarray or None
        One row per parsed file, as produced by ``call_feats``; always 2-D,
        or ``None`` when no file fell inside the window.
    classes : numpy.ndarray
        Index of each file's class in ``util.malware_classes``, or -1 for
        files labelled "X".
    ids : list of str
        The id part of each filename, in row order.

    Raises
    ------
    AssertionError
        If a file's class is neither in ``util.malware_classes`` nor "X".
    """
    rows = []
    classes = []
    ids = []

    # os.listdir returns entries in arbitrary order; sorting makes the
    # start_index/end_index window select the same files on every call.
    datafiles = sorted(f for f in os.listdir(direc) if f != '.DS_Store')

    for i, datafile in enumerate(datafiles):
        if i < start_index:
            continue
        if i >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)

        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware
            # classes if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        # parse file as an xml document and turn it into one feature row
        tree = ET.parse(os.path.join(direc, datafile))
        rows.append(call_feats(tree))

    # Stack once at the end: stacking inside the loop re-copies the whole
    # matrix for every file, and left X one-dimensional for a single file.
    X = np.vstack(rows) if rows else None

    return X, np.array(classes), ids

def call_feats(tree, calls=None):
    """Turn one parsed XML document into a 1-D feature vector.

    Every element of ``tree`` is visited once. The result holds, in order:
    one occurrence count per tag in ``calls``, the total number of elements,
    the number of distinct tags, and the fraction of elements carrying a
    ``successful`` attribute whose value sums as success.

    Parameters
    ----------
    tree : object with ``iter()``
        Yields elements exposing ``tag`` and ``attrib`` (e.g. an ElementTree).
    calls : iterable of str, optional
        Tags to emit a count column for, in this order. Defaults to
        ``list(call_set)``, so the column order then follows the iteration
        order of the module-level ``call_set``.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(calls) + 3``.

    Raises
    ------
    ValueError
        If a ``successful`` attribute cannot be converted with ``int``.
    """
    good_calls = list(call_set) if calls is None else list(calls)

    call_counter = Counter()
    n_success, success_denom = 0.0, 0.0
    for el in tree.iter():
        call_counter[el.tag] += 1
        # `in` works on both Python 2 and 3; dict.has_key is Python 2 only.
        if 'successful' in el.attrib:
            success_denom += 1
            n_success += int(el.attrib['successful'])

    total_calls = sum(call_counter.values())
    unq_calls = len(call_counter)
    # A document without any 'successful' attribute used to raise
    # ZeroDivisionError here; report a success rate of 0.0 instead.
    percentage_success = n_success / success_denom if success_denom else 0.0

    # Counter returns 0 for tags that never occurred, without inserting them.
    call_feat_array = np.array([call_counter[call] for call in good_calls],
                               dtype=float)

    # Single-argument form prints the same text on Python 2 and Python 3.
    print("%s %s %s %s %s" % (total_calls, unq_calls, percentage_success,
                              success_denom, n_success))
    call_feat_array = np.append(call_feat_array,
                                [total_calls, unq_calls, percentage_success])

    return call_feat_array

In [14]:
## Feature extraction
def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file even if dumping fails."""
    with io.open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def main(n_train=3086, n_test=3724):
    """Extract features for the train and test directories and pickle them.

    Parameters
    ----------
    n_train : int
        ``end_index`` passed to ``create_data_matrix`` for ``TRAIN_DIR``.
    n_test : int
        ``end_index`` passed to ``create_data_matrix`` for ``TEST_DIR``.

    Writes ``X_train.p``, ``t_train.p``, ``train_ids.p``, ``X_test.p``,
    ``t_test.p`` and ``test_ids.p`` to the working directory.
    """
    # SAVE TRAINING DATA
    X_train, t_train, train_ids = create_data_matrix(0, n_train, direc=TRAIN_DIR)
    X_train = pd.DataFrame(X_train)
    t_train = pd.DataFrame(t_train, columns=['class'])
    train_ids = pd.DataFrame(train_ids, columns=['id'])

    _dump_pickle(X_train, 'X_train.p')
    _dump_pickle(t_train, 't_train.p')
    _dump_pickle(train_ids, 'train_ids.p')

    # SAVE TESTING DATA
    X_test, t_test, test_ids = create_data_matrix(0, n_test, direc=TEST_DIR)
    X_test = pd.DataFrame(X_test)
    test_ids = pd.DataFrame(test_ids, columns=['id'])

    _dump_pickle(X_test, 'X_test.p')
    # NOTE: unlike t_train, t_test is pickled as the raw array returned by
    # create_data_matrix, not as a DataFrame.
    _dump_pickle(t_test, 't_test.p')
    _dump_pickle(test_ids, 'test_ids.p')

In [15]:
# Run the extraction (parses both data directories and writes the six pickle
# files) only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

728 35 1.0 141.0 141.0
12137 64 0.962962962963 783.0 754.0
97 25 0.962962962963 27.0 26.0
484 35 1.0 92.0 92.0
1136 40 0.971264367816 174.0 169.0
23 7 1.0 18.0 18.0
716 34 1.0 143.0 143.0
1313 42 0.852040816327 196.0 167.0
713 34 0.981651376147 109.0 107.0
702 34 1.0 139.0 139.0
702 34 1.0 139.0 139.0
715 34 1.0 143.0 143.0
77 18 1.0 23.0 23.0
9926 56 0.986190089358 1231.0 1214.0
631 42 0.972413793103 145.0 141.0
28 10 1.0 20.0 20.0
706 34 1.0 140.0 140.0
936 27 1.0 35.0 35.0
709 34 1.0 141.0 141.0
117 26 1.0 37.0 37.0
7301 63 0.969655172414 725.0 703.0
97 25 0.962962962963 27.0 26.0
2909 40 0.989189189189 185.0 183.0
716 34 1.0 143.0 143.0
634 42 0.972789115646 147.0 143.0
1342 40 0.973404255319 188.0 183.0
706 34 1.0 139.0 139.0
705 34 1.0 140.0 140.0
2027 51 0.974576271186 118.0 115.0
7132 63 0.968481375358 698.0 676.0
40 13 1.0 14.0 14.0
561 33 0.916666666667 60.0 55.0
704 34 1.0 139.0 139.0
57 17 1.0 22.0 22.0
97 25 0.962962962963 27.0 26.0
192 25 0.965517241379 29.0 28.0
3593 53 

## Data Modeling

In [None]:
# Reload the pickled training features, labels and ids written by main().
# Each file is opened in a `with` block so the handle is closed after loading.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open("X_train.p", "rb") as pickled_file:
    X_train = pickle.load(pickled_file)
with open("t_train.p", "rb") as pickled_file:
    t_train = pickle.load(pickled_file)
with open("train_ids.p", "rb") as pickled_file:
    train_ids = pickle.load(pickled_file)

"""
X_test = pickle.load(open("X_test.p", "rb"))
t_test = pickle.load(open( "t_test.p", "rb" ))
test_ids = pickle.load(open( "test_ids.p", "rb" ))
"""

In [None]:
# Show the feature row with index label 1. `.ix` is deprecated and removed in
# pandas 1.0; on the default integer index it was label-based, so `.loc`
# selects the same row.
X_train.loc[1]