In [17]:
# Example Feature Extraction from XML Files
# We count the number of specific system calls made by the programs, and use
# these as our features.

# This code requires that the unzipped training set is in a folder called "train". 
import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse
import pandas as pd
import pickle
import io
from sklearn import grid_search
from sklearn import mixture

import util

## Feature Extraction

In [18]:
TRAIN_DIR = "train"
TEST_DIR = "test"

call_set = pickle.load(open( "call_set.p", "rb" ))

In [19]:
call_set_nonredundant = set(call_set) - set(["processes", "all_section"])

In [20]:
#
def add_to_set(tree):
    for el in tree.iter():
        call = el.tag
        call_set.add(call)

def create_data_matrix(start_index, end_index, direc="train"):
    X = None
    classes = []
    ids = [] 
    i = -1
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue

        i += 1
        if i < start_index:
            continue 
        if i >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))

        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        # add_to_set(tree)
        this_row = call_feats(tree)
        if X is None:
            X = this_row 
        else:
            X = np.vstack((X, this_row))

    return X, np.array(classes), ids

def call_feats(tree):
    good_calls = list(call_set_nonredundant)
    call_counter = {}
    total_calls = 0
    n_success, success_denom = 0.0,0.0
    for el in tree.iter():
        call = el.tag
        if el.attrib.has_key('successful'):
            success_denom += 1
            n_success += int(el.attrib['successful'])
        total_calls += 1
        if call not in call_counter:
            call_counter[call] = 1
        else:
            call_counter[call] += 1
    unq_calls = len(call_counter.keys())
    percentage_success = n_success / success_denom
            
    call_feat_array = np.zeros(len(good_calls))
    for i in range(len(good_calls)):
        call = good_calls[i]
        call_feat_array[i] = 0
        if call in call_counter:
            call_feat_array[i] = call_counter[call]
    #print total_calls, unq_calls, percentage_success, success_denom, n_success
    call_feat_array = np.append(call_feat_array, [total_calls, unq_calls, percentage_success])
    
    return call_feat_array

In [21]:
## Feature extraction
def main():
    # SAVE TRAINING DATA
    '''
    X_train, t_train, train_ids = create_data_matrix(0, 3086, direc=TRAIN_DIR)
    X_train = pd.DataFrame(X_train)
    t_train = pd.DataFrame(t_train, columns=['class'])
    train_ids = pd.DataFrame(train_ids,columns=['id'])
          
    filename1 = 'X_train.p'
    filename2 = 't_train.p'
    filename3 = 'train_ids.p'
    
    pickle.dump(X_train, io.open(filename1, 'wb'))
    pickle.dump(t_train, io.open(filename2, 'wb'))
    pickle.dump(train_ids, io.open(filename3, 'wb'))
    
    '''    
    # SAVE TESTING DATA
    X_test, t_test, test_ids = create_data_matrix(0, 3724, direc=TEST_DIR)
    X_test = pd.DataFrame(X_test)
    test_ids = pd.DataFrame(test_ids,columns=['id'])
    
    filename4 = 'X_test.p'
    filename5 = 't_test.p'
    filename6 = 'test_ids.p'
    
    pickle.dump(X_test, io.open(filename4, 'wb'))
    pickle.dump(t_test, io.open(filename5, 'wb'))
    pickle.dump(test_ids, io.open(filename6, 'wb'))

In [22]:
if __name__ == "__main__":
    main()

ParseError: unclosed token: line 4971, column 0

## Data Modeling

In [2]:
X_train = pickle.load(open( "X_train.p", "rb" ))
t_train = pickle.load(open( "t_train.p", "rb" ))
train_ids = pickle.load(open( "train_ids.p", "rb" ))

"""
X_test = pickle.load(open("X_test.p", "rb"))
t_test = pickle.load(open( "t_test.p", "rb" ))
test_ids = pickle.load(open( "test_ids.p", "rb" ))
"""

'\nX_test = pickle.load(open("X_test.p", "rb"))\nt_test = pickle.load(open( "t_test.p", "rb" ))\ntest_ids = pickle.load(open( "test_ids.p", "rb" ))\n'

In [29]:
"""
X_train = pickle.load(open( "X_train.p", "rb" ))
t_train = pickle.load(open( "t_train.p", "rb" ))
train_ids = pickle.load(open( "train_ids.p", "rb" ))

"""
X_test = pd.read_pickle("X_test.p")
t_test = pickle.load(open( "t_test.p", "rb" ))
test_ids = pd.read_pickle( "test_ids.p" )


## Change data

In [3]:
t_train_squeezed = np.squeeze(np.asarray(t_train))
X_train_array = np.asarray(X_train)

In [None]:
mixture.GMM()

##GMM

In [158]:
# Create a way to determine which GMM parameters are best
# Note that we had to do this because the grid_search will not work for GMM because score returns an array
# Instead we decided to use AIC and BIC
# This averages the AIC and BIC from the same model with different random_states so we could choose the "best model"
def GMMavg(gmm, n_iter, X, t):
    AIC_array = []
    BIC_array = []
    for i in range(n_iter):
        gmm_iter_i = gmm
        gmm_iter_i = gmm.fit(X, t)
        AIC_array.append(gmm_iter_i.aic(X))
        BIC_array.append(gmm_iter_i.bic(X))
    return np.mean(AIC_array), np.mean(BIC_array)

In [156]:
GMMavg(mixture.GMM(n_components=14,covariance_type='diag', n_iter=1000, random_state=None), 25, X_train_array, t_train_squeezed)

(569054.42432612833, 587212.629104194)

In [159]:
GMMavg(mixture.GMM(n_components=14,covariance_type='full', n_iter=1000, random_state=None), 25, X_train_array, t_train_squeezed)

(642539.26815047557, 1139810.9691564385)

In [161]:
GMMavg(mixture.GMM(n_components=14,covariance_type='spherical', n_iter=1000, random_state=None), 25, X_train_array, t_train_squeezed)

(3414361.7436124687, 3423564.5559376762)

In [162]:
GMMavg(mixture.GMM(n_components=14,covariance_type='tied', n_iter=1000, random_state=None), 25, X_train_array, t_train_squeezed)

(1417041.4782957423, 1461027.903894719)

In [164]:
GMMavg(mixture.GMM(n_components=14,covariance_type='diag', n_iter=10000, random_state=None, min_covar=0.0000001), 25, X_train_array, t_train_squeezed)

(-467519.52244008018, -449361.31766201457)

In [218]:
gmm_test = mixture.GMM(n_components=14,covariance_type='diag', n_iter=100000, random_state=None, min_covar=0.0000000001)

In [409]:
skf = StratifiedKFold(t_train_squeezed, n_folds=5, shuffle=True)
# Code credit: http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_classifier.html

train_index, test_index = next(iter(skf))
X_train1 = X_train_array[train_index]
y_train1 = t_train_squeezed[train_index]
X_test1 = X_train_array[test_index]
y_test1 = t_train_squeezed[test_index]

train_index

array([   0,    1,    2, ..., 3082, 3083, 3084])

In [185]:
i = 1
print "data observation: %s" % i
print "actual class: %s" % t_train_squeezed[i]
print "prediction: %s" % gmm_test.predict(np.asarray(X_train))[i]
print "data probability predictions: %s" % gmm_test.predict_proba(np.asarray(X_train))[i]
print "log of probability predictions: %s" % np.log(gmm_test.predict_proba(np.asarray(X_train)))[i]
print "score from GMM.score: %s" % gmm_test.score(np.asarray(X_train))[i]
print "AIC: %s" % gmm_test.aic(X_train_array)
print "BIC: %s" % gmm_test.bic(X_train_array)

data observation: 1
actual class: 6
prediction: 1
data probability predictions: [  0.00000000e+000   1.00000000e+000   0.00000000e+000   0.00000000e+000
   0.00000000e+000   0.00000000e+000   0.00000000e+000   0.00000000e+000
   2.74722721e-163   0.00000000e+000   0.00000000e+000   0.00000000e+000
   0.00000000e+000   0.00000000e+000]
log of probability predictions: [         -inf    0.                  -inf          -inf          -inf
          -inf          -inf          -inf -374.31077804          -inf
          -inf          -inf          -inf          -inf]
score from GMM.score: -99.1689138884
AIC: -103489.204112
BIC: -85330.9993336


## GMM using Cross Validation

In [38]:
string = 'test'
str(2)

'2'

In [39]:
classifiers = dict((covar_type+str(random_state), mixture.GMM(n_components=14, random_state=random_state,
                    covariance_type=covar_type, n_iter=100, min_covar = 0.0000001))
                   for covar_type in ['spherical', 'diag', 'tied', 'full']
                   for random_state in range(3))



In [44]:
# Code credit: http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_classifier.html

# Split 
from sklearn.cross_validation import StratifiedKFold
skf = StratifiedKFold(t_train_squeezed, n_folds=5, shuffle=True, random_state=3)

'''
train_index, test_index = next(iter(skf))
X_train1 = X_train_array[train_index]
y_train1 = t_train_squeezed[train_index]
X_test1 = X_train_array[test_index]
y_test1 = t_train_squeezed[test_index]
'''

classifiers = dict((covar_type+str(random_state), mixture.GMM(n_components=14, random_state=random_state,
                    covariance_type=covar_type, n_iter=100, min_covar = 0.0000001))
                   for covar_type in ['spherical', 'diag', 'tied', 'full']
                   for random_state in range(50))

# Loop over classifiers
for index, (name, classifier) in enumerate(classifiers.items()):
    
    i = 1
    
    # Loop over k-folds
    for train_index, test_index in skf:
        
        train_accuracy = []
        test_accuracy = []
    
        X_train_k, X_test_k = X_train_array[train_index], X_train_array[test_index]
        y_train_k, y_test_k = t_train_squeezed[train_index], t_train_squeezed[test_index]
    
        classifier.fit(X_train_k,y_train_k)
        y_train_pred = classifier.predict(X_train_k)

        x = (np.mean(y_train_pred.ravel() == y_train_k.ravel()) * 100)
        train_accuracy.append(x)
        #print 'Train accuracy for %s %s: %.2f' % (name, i, x)

        y_test_pred = classifier.predict(X_test_k)
        y = (np.mean(y_test_pred.ravel() == y_test_k.ravel()) * 100)
        test_accuracy.append(y)
        #print 'Test accuracy for %s %s: %.2f' % (name, i, y)
        
        i += 1

        #print 'Converged? %s' % classifier.converged_      
    
    print 'MEAN Train accuracy for %s: %.2f' % (name, np.mean(train_accuracy))
    print 'MEAN Test accuracy for %s: %.2f' % (name, np.mean(test_accuracy))

MEAN Train accuracy for tied48: 5.29
MEAN Test accuracy for tied48: 4.91
MEAN Train accuracy for tied49: 1.09
MEAN Test accuracy for tied49: 1.31
MEAN Train accuracy for full21: 3.39
MEAN Test accuracy for full21: 3.60
MEAN Train accuracy for full20: 3.35
MEAN Test accuracy for full20: 3.76
MEAN Train accuracy for full27: 2.95
MEAN Test accuracy for full27: 3.60
MEAN Train accuracy for full26: 7.23
MEAN Test accuracy for full26: 9.33
MEAN Train accuracy for full25: 4.40
MEAN Test accuracy for full25: 4.42
MEAN Train accuracy for full22: 8.81
MEAN Test accuracy for full22: 9.82
MEAN Train accuracy for tied40: 2.91
MEAN Test accuracy for tied40: 3.60
MEAN Train accuracy for tied41: 3.07
MEAN Test accuracy for tied41: 3.44
MEAN Train accuracy for tied42: 5.86
MEAN Test accuracy for tied42: 4.91
MEAN Train accuracy for tied43: 2.67
MEAN Test accuracy for tied43: 3.44
MEAN Train accuracy for tied44: 3.31
MEAN Test accuracy for tied44: 3.93
MEAN Train accuracy for tied45: 8.04
MEAN Test accu

In [50]:
from sklearn.metrics import confusion_matrix
classifiers['diag38'].fit(X_train, t_train)
train_preds = classifiers['diag38'].predict(X_train)
pd.DataFrame(confusion_matrix(t_train, train_preds),columns=util.malware_classes,index=util.malware_classes)

Unnamed: 0,Agent,AutoRun,FraudLoad,FraudPack,Hupigon,Krap,Lipler,Magania,None,Poison,Swizzor,Tdss,VB,Virut,Zbot
Agent,33,42,2,0,0,1,0,0,7,0,3,17,0,9,0
AutoRun,2,6,0,0,0,0,0,0,32,0,1,7,0,2,0
FraudLoad,4,14,0,0,0,0,0,0,2,0,0,6,0,11,0
FraudPack,4,4,0,0,0,0,0,0,7,0,0,17,0,0,0
Hupigon,7,12,0,0,0,0,0,0,15,0,0,2,0,5,0
Krap,2,31,0,0,0,0,0,0,2,0,0,4,0,0,0
Lipler,0,51,0,0,0,0,0,0,1,0,0,1,0,0,0
Magania,30,6,0,0,0,0,0,0,4,0,0,1,0,0,0
,388,199,0,953,2,6,4,1,25,1,0,13,1,16,0
Poison,16,0,0,0,0,0,0,0,3,0,1,1,0,0,0


In [None]:
train_preds = log_l2_best.predict(X_train)
pd.DataFrame(confusion_matrix(t_train, train_preds),columns=util.malware_classes,index=util.malware_classes)

Good Random_states
0:
MEAN Train accuracy for diag: 9.90
MEAN Test accuracy for diag: 9.66


What is EM iteratating to find if y is known???
After cross validation, fit on entire training set before predicting on test set?

In [None]:
import confusion.

train_preds=log_12_best.predict(X_train
                               pd.DataFrame(confusion_matrix(t_train, train_preds), columns=util.malware_classes,index=util.malware_class))