In [1]:
# Example Feature Extraction from XML Files
# We count the number of specific system calls made by the programs, and use
# these as our features.

# This code requires that the unzipped training set is in a folder called "train". 

import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse
import pandas as pd
import cPickle as pickle
import io
from sklearn import grid_search
from sklearn import mixture

import util

## Feature Extraction

In [2]:
TRAIN_DIR = "train"
TEST_DIR = "test"

call_set = pickle.load(open( "call_set.p", "rb" ))

In [3]:
def add_to_set(tree):
    for el in tree.iter():
        call = el.tag
        call_set.add(call)

def create_data_matrix(start_index, end_index, direc="train"):
    X = None
    classes = []
    ids = [] 
    i = -1
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue

        i += 1
        if i < start_index:
            continue 
        if i >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))

        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        # add_to_set(tree)
        this_row = call_feats(tree)
        if X is None:
            X = this_row 
        else:
            X = np.vstack((X, this_row))

    return X, np.array(classes), ids

def call_feats(tree):
    good_calls = list(call_set)
    
    call_counter = {}
    for el in tree.iter():
        call = el.tag
        if call not in call_counter:
            call_counter[call] = 0
        else:
            call_counter[call] += 1

    call_feat_array = np.zeros(len(good_calls))
    for i in range(len(good_calls)):
        call = good_calls[i]
        call_feat_array[i] = 0
        if call in call_counter:
            call_feat_array[i] = call_counter[call]

    return call_feat_array

In [4]:
## Feature extraction
def main():
    # SAVE TRAINING DATA
    X_train, t_train, train_ids = create_data_matrix(0, 3085, direc=TRAIN_DIR)
    X_train = pd.DataFrame(X_train)
    t_train = pd.DataFrame(t_train, columns=['class'])
    train_ids = pd.DataFrame(train_ids,columns=['id'])
          
    filename1 = 'X_train.p'
    filename2 = 't_train.p'
    filename3 = 'train_ids.p'
    
    pickle.dump(X_train, io.open(filename1, 'wb'))
    pickle.dump(t_train, io.open(filename2, 'wb'))
    pickle.dump(train_ids, io.open(filename3, 'wb'))
    
    # SAVE TESTING DATA
    X_test, t_test, test_ids = create_data_matrix(0, 3723, direc=TEST_DIR)
    X_test = pd.DataFrame(X_test)
    test_ids = pd.DataFrame(test_ids,columns=['id'])
    
    filename4 = 'X_test.p'
    filename5 = 't_test.p'
    filename6 = 'test_ids.p'
    
    pickle.dump(X_test, io.open(filename4, 'wb'))
    pickle.dump(t_test, io.open(filename5, 'wb'))
    pickle.dump(test_ids, io.open(filename6, 'wb'))

In [5]:
if __name__ == "__main__":
    main()

ParseError: unclosed token: line 4971, column 0

## Data Modeling

In [4]:
X_train = pickle.load(open( "X_train.p", "rb" ))

TypeError: ('_reconstruct: First argument must be a sub-type of ndarray', <built-in function _reconstruct>, (<class 'pandas.core.index.Int64Index'>, (0,), 'b'))

In [2]:
X_train = pickle.load(open( "X_train.p", "rb" ))
t_train = pickle.load(open( "t_train.p", "rb" ))
train_ids = pickle.load(open( "train_ids.p", "rb" ))

X_test = pickle.load(open("X_test.p", "rb"))
t_test = pickle.load(open( "t_test.p", "rb" ))
test_ids = pickle.load(open( "test_ids.p", "rb" ))

TypeError: ('_reconstruct: First argument must be a sub-type of ndarray', <built-in function _reconstruct>, (<class 'pandas.core.index.Int64Index'>, (0,), 'b'))

In [None]:
X_train = np.asarray(X_train)
t_train = np.as

In [36]:
np.squeeze(np.asarray(t_train))

array([ 8,  6, 12, ..., 10,  8,  8])

## GMM

## Messing around

In [None]:
# Try to optimize the parameter alpha
parameters = {'alpha':[1, 50, 100, 500]}
lasso = linear_model.Lasso()
clf_lasso = grid_search.GridSearchCV(estimator=lasso, param_grid=parameters)
clf_lasso.fit(X_train,Y_train)
print clf_lasso.grid_scores_
print clf_lasso.best_score_

In [106]:
gmm = mixture.GMM(n_components=14,covariance_type='full', n_iter=1000)
gmm_test = gmm.fit(np.asarray(X_train), np.squeeze(np.asarray(t_train)))

In [144]:
gmm_test.score(np.asarray(X_train), np.squeeze(np.asarray(t_train)))
gmm_test.means_?

In [145]:
parameters = {'covariance_type':['full','spherical','tied','diag']}
gmm = mixture.GMM(n_components=14)
gmm_mod = grid_search.GridSearchCV(estimator = gmm, param_grid = parameters)
#gmm_fit = gmm_mod.fit(np.asarray(X_train), np.squeeze(np.asarray(t_train)))
gmm_mod

GridSearchCV(cv=None, error_score='raise',
       estimator=GMM(covariance_type='diag', init_params='wmc', min_covar=0.001,
  n_components=14, n_init=1, n_iter=100, params='wmc', random_state=None,
  thresh=None, tol=0.001),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'covariance_type': ['full', 'spherical', 'tied', 'diag']},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [146]:
from sklearn import mixture
gmm = mixture.GMM(n_components=14, covariance_type='tied')
gmm.fit(np.asarray(X_train), np.squeeze(np.asarray(t_train)))

GMM(covariance_type='tied', init_params='wmc', min_covar=0.001,
  n_components=14, n_init=1, n_iter=100, params='wmc', random_state=None,
  thresh=None, tol=0.001)

In [103]:
parameters = {'covariance_type':['full','spherical','tied','diag']}
gmm = mixture.GMM(n_components=14)
gmm_mod = grid_search.GridSearchCV(estimator=gmm, param_grid=parameters)


In [102]:
grid_search.GridSearchCV?

In [72]:
parameters = {'covariance_type':['full','spherical','tied','diag']}
clf_lasso = grid_search.GridSearchCV(estimator=lasso, param_grid=parameters)
clf_lasso.fit(X_train,Y_train)
print clf_lasso.grid_scores_
print clf_lasso.best_score_
gmm.predict(np.asarray(X_test))

13

In [None]:
parameters = {'alpha':[0.002, 0.004, 0.006, 0.008, 0.01]}
clf_lasso = grid_search.GridSearchCV(estimator=lasso, param_grid=parameters)
clf_lasso.fit(X_train,Y_train)
print clf_lasso.grid_scores_
print clf_lasso.best_score_