In [2]:
# Example Feature Extraction from XML Files
# We count the number of specific system calls made by the programs, and use
# these as our features.

# This code requires that the unzipped training set is in a folder called "train". 

import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse
import pandas as pd
import cPickle as pickle
import io

import util

## Feature Extraction

In [2]:
TRAIN_DIR = "train"
TEST_DIR = "test"

call_set = pickle.load(open( "call_set.p", "rb" ))

In [96]:
def add_to_set(tree):
    for el in tree.iter():
        call = el.tag
        call_set.add(call)

def create_data_matrix(start_index, end_index, direc="train"):
    X = None
    classes = []
    ids = [] 
    i = -1
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue

        i += 1
        if i < start_index:
            continue 
        if i >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))

        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        # add_to_set(tree)
        this_row = call_feats(tree)
        if X is None:
            X = this_row 
        else:
            X = np.vstack((X, this_row))

    return X, np.array(classes), ids

def call_feats(tree):
    good_calls = list(call_set)
    
    call_counter = {}
    for el in tree.iter():
        call = el.tag
        if call not in call_counter:
            call_counter[call] = 0
        else:
            call_counter[call] += 1

    call_feat_array = np.zeros(len(good_calls))
    for i in range(len(good_calls)):
        call = good_calls[i]
        call_feat_array[i] = 0
        if call in call_counter:
            call_feat_array[i] = call_counter[call]

    return call_feat_array

In [97]:
## Feature extraction
def main():
    # SAVE TRAINING DATA
    X_train, t_train, train_ids = create_data_matrix(0, 3086, direc=TRAIN_DIR)
    X_train = pd.DataFrame(X_train)
    t_train = pd.DataFrame(t_train, columns=['class'])
    train_ids = pd.DataFrame(train_ids,columns=['id'])
          
    filename1 = 'X_train.p'
    filename2 = 't_train.p'
    filename3 = 'train_ids.p'
    
    pickle.dump(X_train, io.open(filename1, 'wb'))
    pickle.dump(t_train, io.open(filename2, 'wb'))
    pickle.dump(train_ids, io.open(filename3, 'wb'))
    
    # SAVE TESTING DATA
    X_test, t_test, test_ids = create_data_matrix(0, 3724, direc=TEST_DIR)
    X_test = pd.DataFrame(X_test)
    test_ids = pd.DataFrame(test_ids,columns=['id'])
    
    filename4 = 'X_test.p'
    filename5 = 't_test.p'
    filename6 = 'test_ids.p'
    
    pickle.dump(X_test, io.open(filename4, 'wb'))
    pickle.dump(t_test, io.open(filename5, 'wb'))
    pickle.dump(test_ids, io.open(filename6, 'wb'))

In [98]:
%%time
if __name__ == "__main__":
    main()

## Data Modeling

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.grid_search import GridSearchCV

In [4]:
X_train = pickle.load(open( "X_train.p", "rb" ))
t_train = np.squeeze(np.asarray(pickle.load(open( "t_train.p", "rb" ))))
train_ids = np.squeeze(np.asarray(pickle.load(open( "train_ids.p", "rb" ))))

X_test = pickle.load(open("X_test.p", "rb"))
t_test = np.squeeze(np.asarray(pickle.load(open( "t_test.p", "rb" ))))
test_ids = np.squeeze(np.asarray(pickle.load(open( "test_ids.p", "rb" ))))

### Logistic Regression

##### L2 (Ridge Regression)

In [107]:
# CV optimize to pick best hyperparamters
log_l2 = LogisticRegression(penalty='l2')
params = {'C':[0.00001, 0.0001, 0.001]}
log_l2 = GridSearchCV(log_l2, param_grid=params, cv=5).fit(X_train,t_train)
print log_l2.grid_scores_
print log_l2.best_score_

[mean: 0.74076, std: 0.00125, params: {'C': 1e-05}, mean: 0.75437, std: 0.01119, params: {'C': 0.0001}, mean: 0.75178, std: 0.01359, params: {'C': 0.001}]
0.754374594945


In [108]:
# Take estimator with best parameters and fit to training data
log_l2_best = log_l2.best_estimator_
log_l2_best = log_l2_best.fit(X_train,t_train)

In [109]:
# Predict on test data and write to file
preds = log_l2_best.predict(X_test)
util.write_predictions(preds, test_ids, "logistic_l2.csv")

##### L1 (Lasso Regression)

In [None]:
# CV optimize to pick best hyperparamters
log_l1 = LogisticRegression(penalty='l1')
params = {'C':[10.]}
log_l1 = GridSearchCV(log_l1, param_grid=params, cv=5).fit(X_train,t_train)
print log_l1.grid_scores_
print log_l1.best_score_

In [None]:
# Take estimator with best parameters and fit to training data
log_l1_best = log_l1.best_estimator_
log_l1_best = log_l1_best.fit(X_train,t_train)

In [None]:
# Predict on test data and write to file
preds = log_l1_best.predict(X_test)
util.write_predictions(preds, test_ids, "logistic_l1.csv")

### Random Forest Classifier

In [None]:
# CV optimize to pick best hyperparameters
rf = RandomForestClassifier(n_estimators=10,random_state=4,verbose=1)
params = {'max_depth':[None,10,15,30],'criteron':['gini','entropy'],'max_features':[None,'auto'],
         'min_samples_split':[2,3,5],'min_samples_leaf':[1,2,5]}
rf = GridSearchCV(rf, param_grid=params, cv=5).fit(X_train,t_train)
print rf.grid_scores_
print rf.best_score_

In [None]:
# Take estimator with best parameters and fit to training data
rf_best = rf.best_estimator_
rf_best = rf_best.fit(X_train,t_train)

In [None]:
# Predict on test data and write to file
preds = rf_best.predict(X_test)
util.write_predictions(preds, test_ids, "rf.csv")