In [1]:
# Example Feature Extraction from XML Files
# We count the number of specific system calls made by the programs, and use
# these as our features.

# This code requires that the unzipped training set is in a folder called "train". 
import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse
import pandas as pd
import pickle
import io
from sklearn import grid_search
from sklearn import mixture
from sklearn.metrics import confusion_matrix
import util
from sklearn import preprocessing
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV

## Feature Extraction

In [8]:
TRAIN_DIR = "train"
TEST_DIR = "test"

call_set = pickle.load(open( "call_set.p", "rb" ))

In [9]:
call_set_nonredundant = set(call_set) - set(["processes", "all_section"])

In [10]:
#
def add_to_set(tree):
    for el in tree.iter():
        call = el.tag
        call_set.add(call)

def create_data_matrix(start_index, end_index, direc="train"):
    X = None
    classes = []
    ids = [] 
    i = -1
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue

        i += 1
        if i < start_index:
            continue 
        if i >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))

        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        # add_to_set(tree)
        this_row = call_feats(tree)
        if X is None:
            X = this_row 
        else:
            X = np.vstack((X, this_row))

    return X, np.array(classes), ids

def call_feats(tree):
    good_calls = list(call_set_nonredundant)
    call_counter = {}
    total_calls = 0
    n_success, success_denom = 0.0,0.0
    for el in tree.iter():
        call = el.tag
        if el.attrib.has_key('successful'):
            success_denom += 1
            n_success += int(el.attrib['successful'])
        total_calls += 1
        if call not in call_counter:
            call_counter[call] = 1
        else:
            call_counter[call] += 1
    unq_calls = len(call_counter.keys())
    percentage_success = n_success / success_denom
            
    call_feat_array = np.zeros(len(good_calls))
    for i in range(len(good_calls)):
        call = good_calls[i]
        call_feat_array[i] = 0
        if call in call_counter:
            call_feat_array[i] = call_counter[call]
    #print total_calls, unq_calls, percentage_success, success_denom, n_success
    call_feat_array = np.append(call_feat_array, [total_calls, unq_calls, percentage_success])
    
    return call_feat_array

In [13]:
## Feature extraction
def main():
    # SAVE TRAINING DATA

    X_train, t_train, train_ids = create_data_matrix(0, 3086, direc=TRAIN_DIR)
    X_train = pd.DataFrame(X_train)
    t_train = pd.DataFrame(t_train, columns=['class'])
    train_ids = pd.DataFrame(train_ids,columns=['id'])
          
    filename1 = 'X_train.p'
    filename2 = 't_train.p'
    filename3 = 'train_ids.p'
    
    pickle.dump(X_train, io.open(filename1, 'wb'))
    pickle.dump(t_train, io.open(filename2, 'wb'))
    pickle.dump(train_ids, io.open(filename3, 'wb'))
     
    # SAVE TESTING DATA
    X_test, t_test, test_ids = create_data_matrix(0, 3724, direc=TEST_DIR)
    X_test = pd.DataFrame(X_test)
    test_ids = pd.DataFrame(test_ids,columns=['id'])
    
    filename4 = 'X_test.p'
    filename5 = 't_test.p'
    filename6 = 'test_ids.p'
    
    pickle.dump(X_test, io.open(filename4, 'wb'))
    pickle.dump(t_test, io.open(filename5, 'wb'))
    pickle.dump(test_ids, io.open(filename6, 'wb'))

In [37]:
if __name__ == "__main__":
    main()

## Load Data, and Standardize

In [14]:
X_train = pd.read_pickle("X_train.p")
t_train = pd.read_pickle("t_train.p" )
train_ids = pd.read_pickle( "train_ids.p" )

X_test = pd.read_pickle("X_test.p")
t_test = pickle.load(open( "t_test.p", "rb" ))
test_ids = pd.read_pickle( "test_ids.p" )

In [15]:
t_train_squeezed = np.squeeze(np.asarray(t_train))
X_train_array = np.asarray(X_train)

t_test_squeezed = np.squeeze(np.asarray(t_test))
X_test_array = np.asarray(X_test)

In [16]:
# Center data
standardizer = preprocessing.StandardScaler().fit(X_train)
X_train_std = standardizer.transform(X_train)
X_test_std = standardizer.transform(X_test)

## Sklearn's GMM

In [46]:
# Code credit: http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_classifier.html

# Split 
from sklearn.cross_validation import StratifiedKFold
skf = StratifiedKFold(t_train_squeezed, n_folds=5, shuffle=True, random_state=3)

classifiers = dict((covar_type+str(random_state), mixture.GMM(n_components=15, random_state=random_state,
                    covariance_type=covar_type, n_iter=100, min_covar = 0.0000001))
                   for covar_type in ['spherical', 'diag', 'tied', 'full']
                   for random_state in range(200,201))

# Loop over classifiers
for index, (name, classifier) in enumerate(classifiers.items()):
    
    i = 1
    
    # Loop over k-folds
    for train_index, test_index in skf:
        
        train_accuracy = []
        test_accuracy = []
    
        X_train_k, X_test_k = X_train_array[train_index], X_train_array[test_index]
        y_train_k, y_test_k = t_train_squeezed[train_index], t_train_squeezed[test_index]
    
        classifier.fit(X_train_k,y_train_k)
        y_train_pred = classifier.predict(X_train_k)

        x = (np.mean(y_train_pred.ravel() == y_train_k.ravel()) * 100)
        train_accuracy.append(x)
        #print 'Train accuracy for %s %s: %.2f' % (name, i, x)

        y_test_pred = classifier.predict(X_test_k)
        y = (np.mean(y_test_pred.ravel() == y_test_k.ravel()) * 100)
        test_accuracy.append(y)
        #print 'Test accuracy for %s %s: %.2f' % (name, i, y)
        
        i += 1

        #print 'Converged? %s' % classifier.converged_      
    
    if np.mean(test_accuracy) > 45.0:
        print 'MEAN Train accuracy for %s: %.2f' % (name, np.mean(train_accuracy))
        print 'MEAN Test accuracy for %s: %.2f' % (name, np.mean(test_accuracy))
    else:
        print 'Less than 45 %s' % name

Less than 45 full200
Less than 45 diag200
Less than 45 tied200
Less than 45 spherical200


MEAN Train accuracy for diag96: 54.71
MEAN Test accuracy for diag96: 54.17

In [11]:
from sklearn.metrics import confusion_matrix
classifiers['diag96'].fit(X_train, t_train)
train_preds = classifiers['diag96'].predict(X_train)
pd.DataFrame(confusion_matrix(t_train, train_preds),columns=util.malware_classes,index=util.malware_classes)

Unnamed: 0,Agent,AutoRun,FraudLoad,FraudPack,Hupigon,Krap,Lipler,Magania,None,Poison,Swizzor,Tdss,VB,Virut,Zbot
Agent,15,37,4,19,2,0,0,0,0,0,3,8,20,6,0
AutoRun,1,4,34,4,0,0,0,0,0,0,1,1,4,1,0
FraudLoad,0,14,0,8,0,0,0,0,0,0,0,1,4,10,0
FraudPack,8,4,0,16,0,0,0,0,0,0,0,0,4,0,0
Hupigon,2,11,0,17,0,0,0,0,0,0,0,2,4,5,0
Krap,0,32,2,2,0,0,0,0,0,0,0,3,0,0,0
Lipler,0,45,0,0,0,0,0,0,0,0,0,2,0,6,0
Magania,0,15,23,2,0,0,0,0,0,0,0,0,1,0,0
,9,215,18,19,0,2,1,1,1,4,0,10,1321,8,0
Poison,1,4,3,1,0,0,0,0,0,0,1,0,11,0,0


## Generative Probablistic Model - Implementation from HW2

In [17]:
# Code Credit:  HW 2

from scipy.stats import multivariate_normal
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as c

class GaussianGenerativeModel:
    def __init__(self, isSharedCovariance=False):
        self.isSharedCovariance = isSharedCovariance

    def fit(self, X, Y):
        nClasses = max(Y) + 1
        nFeatures = X.shape[1]
        N = X.shape[0]
        self.X = X
        self.Y = Y
        self.nClasses = nClasses
        self.nFeatures = nFeatures
        self.N = N
        assert(X.shape[0] == Y.shape[0])

        class_means, shared_covariance = self.__get_mean_and_covariance_matrix(X,
                Y, nFeatures, nClasses)

        # Get p(y) by normalize the distribution of counts
        class_counts = self.__getClassCounts(Y, nClasses)
        p_y = class_counts / (class_counts.sum())

        # Taking the log of the distributions allows us to get away with
        # addition instead of multiplication later.
        b = np.log(p_y)

        # Save all learned parameters
        self.class_means = class_means
        self.shared_covariance = shared_covariance
        self.b = b

    def __get_mean_and_covariance_matrix(self, X, Y, nfeatures, nclasses):
        means = []
        cov = []

        # Only used if self.isSharedCovariance is true
        shared_cov = np.zeros((nfeatures, nfeatures))

        # Filter by class, and calculate mean and covariance of each.
        for c in range(nclasses):
            rows_in_class = X[Y == c]
            means.append(np.mean(rows_in_class, axis=0))
            if self.isSharedCovariance:
                Cov_i = np.cov(rows_in_class.T)
                shared_cov += Cov_i*rows_in_class.shape[0]
            else:
                cov.append(np.cov(rows_in_class.T))

        # Return the shared covariance matrix if set that way, else return the
        # separate matricies.
        if self.isSharedCovariance:
            return np.array(means), shared_cov/X.shape[0]
        return np.array(means), cov

    def __gaussianProb(self, x, means, covariances, nClasses):
        class_probs = np.zeros(nClasses)
        for c in range(nClasses):
            if self.isSharedCovariance:
                class_probs[c] = multivariate_normal.pdf(x, mean=means[c],cov=covariances, allow_singular=True)
            else:
                class_probs[c] = multivariate_normal.pdf(x, mean=means[c],cov=covariances[c], allow_singular=True)
        return np.log(class_probs)

    def __getClassCounts(self, Y, nclasses):
        counts = np.zeros(nclasses)
        for y in Y:
            counts[y] += 1
        return counts

    def predict(self, X_to_predict):
        mus = self.class_means
        Sigma = self.shared_covariance
        b = self.b
        nClasses = mus.shape[0]
        gaussian_probs = np.zeros((X_to_predict.shape[0], nClasses))
        #return X_to_predict, mus, Sigma, nClasses
        for i in range(X_to_predict.shape[0]):
            gaussian_probs[i] = self.__gaussianProb(X_to_predict[i], mus, Sigma,nClasses)
        Y_hats = gaussian_probs + b
        predictions = np.argmax(Y_hats, axis=1)
        return predictions

    def visualize(self, output_file, width=3, show_charts=False):
        X = self.X

        # Create a grid of points
        x_min, x_max = min(X[:, 0] - width), max(X[:, 0] + width)
        y_min, y_max = min(X[:, 1] - width), max(X[:, 1] + width)
        xx,yy = np.meshgrid(np.arange(x_min, x_max, .05), np.arange(y_min,
            y_max, .05))

        # Flatten the grid so the values match spec for self.predict
        xx_flat = xx.flatten()
        yy_flat = yy.flatten()
        X_topredict = np.vstack((xx_flat,yy_flat)).T

        # Get the class predictions
        Y_hat = self.predict(X_topredict)
        Y_hat = Y_hat.reshape((xx.shape[0], xx.shape[1]))

        cMap = c.ListedColormap(['r','b','g'])

        # Visualize them.
        plt.figure()
        plt.pcolormesh(xx,yy,Y_hat, cmap=cMap)
        plt.scatter(X[:, 0], X[:, 1], c=self.Y, cmap=cMap)
        plt.savefig(output_file)
        if show_charts:
            plt.show()
            

In [19]:
# Make a function to perform cross-validation
# Loop over k-folds
def cross_val(classifier, X_train_array, t_train_squeezed, folds):
    # Split to make K-fold Cross Validation
    skf = StratifiedKFold(t_train_squeezed, n_folds=folds, shuffle=True, random_state=3)
    i = 0
    
    train_accuracy = []
    test_accuracy = []
    
    for train_index, test_index in skf:


        X_train_k, X_test_k = X_train_array[train_index], X_train_array[test_index]
        y_train_k, y_test_k = t_train_squeezed[train_index], t_train_squeezed[test_index]

        classifier.fit(X_train_k,y_train_k)
        y_train_pred = classifier.predict(X_train_k)

        x = (np.mean(y_train_pred.ravel() == y_train_k.ravel()) * 100)
        train_accuracy.append(x)
        print 'Train accuracy for fold %s: %.2f' % (i, x)

        y_test_pred = classifier.predict(X_test_k)
        y = (np.mean(y_test_pred.ravel() == y_test_k.ravel()) * 100)
        test_accuracy.append(y)
        print 'Test accuracy for fold %s: %.2f' % (i, y)

        i += 1

        #print 'Converged? %s' % classifier.converged_      

    print 'MEAN Train accuracy: %.2f' % np.mean(train_accuracy)
    print 'MEAN Test accuracy : %.2f' % np.mean(test_accuracy)
    
    return train_accuracy, test_accuracy

In [332]:
# Cross Validation for shared covariance - NOT STANDARDIZED DATA
ggm_shared = GaussianGenerativeModel(isSharedCovariance=True)
gmm_shared_cv = cross_val(ggm_shared, X_train_array, t_train_squeezed, 10)

Train accuracy for fold 1: 84.48
Test accuracy for fold 1: 80.06
Train accuracy for fold 3: 77.32
Test accuracy for fold 3: 75.72
Train accuracy for fold 5: 77.02
Test accuracy for fold 5: 75.48
Train accuracy for fold 7: 76.99
Test accuracy for fold 7: 78.64
Train accuracy for fold 9: 84.13
Test accuracy for fold 9: 81.82
Train accuracy for fold 11: 78.37
Test accuracy for fold 11: 76.95
Train accuracy for fold 13: 76.90
Test accuracy for fold 13: 80.46
Train accuracy for fold 15: 84.53
Test accuracy for fold 15: 81.37
Train accuracy for fold 17: 77.91
Test accuracy for fold 17: 76.14
Train accuracy for fold 19: 84.51
Test accuracy for fold 19: 82.18
MEAN Train accuracy: 80.21
MEAN Test accuracy : 78.88


In [379]:
# Cross Validation for shared covariance - STANDARDIZED DATA
ggm_shared = GaussianGenerativeModel(isSharedCovariance=True)
gmm_shared_cv_std = cross_val(ggm_shared, X_train_std, t_train_squeezed, 10)

Train accuracy for fold 1: 85.67
Test accuracy for fold 1: 81.33
Train accuracy for fold 3: 85.39
Test accuracy for fold 3: 86.26
Train accuracy for fold 5: 85.70
Test accuracy for fold 5: 81.94
Train accuracy for fold 7: 85.02
Test accuracy for fold 7: 85.76
Train accuracy for fold 9: 85.03
Test accuracy for fold 9: 82.79
Train accuracy for fold 11: 85.10
Test accuracy for fold 11: 84.74
Train accuracy for fold 13: 85.25
Test accuracy for fold 13: 87.30
Train accuracy for fold 15: 85.72
Test accuracy for fold 15: 83.33
Train accuracy for fold 17: 85.14
Test accuracy for fold 17: 83.99
Train accuracy for fold 19: 86.27
Test accuracy for fold 19: 82.84
MEAN Train accuracy: 85.43
MEAN Test accuracy : 84.03


In [333]:
# Cross Validation for sep covariance - NOT STANDARDIZED DATA
ggm_sep = GaussianGenerativeModel(isSharedCovariance=False)
gmm_sep_cv = cross_val(ggm_sep, X_train_array, t_train_squeezed, 10)

Train accuracy for fold 1: 78.41
Test accuracy for fold 1: 71.84
Train accuracy for fold 3: 80.35
Test accuracy for fold 3: 72.52
Train accuracy for fold 5: 80.44
Test accuracy for fold 5: 69.35
Train accuracy for fold 7: 73.17
Test accuracy for fold 7: 65.05
Train accuracy for fold 9: 72.35
Test accuracy for fold 9: 63.96
Train accuracy for fold 11: 79.37
Test accuracy for fold 11: 77.60
Train accuracy for fold 13: 80.93
Test accuracy for fold 13: 75.57
Train accuracy for fold 15: 79.75
Test accuracy for fold 15: 71.24
Train accuracy for fold 17: 81.04
Test accuracy for fold 17: 73.53
Train accuracy for fold 19: 80.96
Test accuracy for fold 19: 73.60
MEAN Train accuracy: 78.68
MEAN Test accuracy : 71.43


In [380]:
# Cross Validation for sep covariance - STANDARDIZED DATA
ggm_sep = GaussianGenerativeModel(isSharedCovariance=False)
gmm_sep_cv_std = cross_val(ggm_sep, X_train_std, t_train_squeezed, 10)

Train accuracy for fold 1: 93.86
Test accuracy for fold 1: 80.06
Train accuracy for fold 3: 92.39
Test accuracy for fold 3: 84.03
Train accuracy for fold 5: 92.11
Test accuracy for fold 5: 79.35
Train accuracy for fold 7: 93.19
Test accuracy for fold 7: 86.08
Train accuracy for fold 9: 93.12
Test accuracy for fold 9: 80.84
Train accuracy for fold 11: 92.94
Test accuracy for fold 11: 83.44
Train accuracy for fold 13: 93.70
Test accuracy for fold 13: 85.34
Train accuracy for fold 15: 92.16
Test accuracy for fold 15: 80.72
Train accuracy for fold 17: 93.63
Test accuracy for fold 17: 81.70
Train accuracy for fold 19: 93.60
Test accuracy for fold 19: 83.83
MEAN Train accuracy: 93.07
MEAN Test accuracy : 82.54


In [20]:
# Save Shared to file
ggm_shared = GaussianGenerativeModel(isSharedCovariance=True)
ggm_shared.fit(X_train_std, t_train_squeezed)
VERIFY = ggm_shared.predict(X_train_std)
print "Accuracy fitting training set: %s" % (sum(VERIFY == t_train_squeezed) / float(len(VERIFY)))
preds = ggm_shared.predict(X_test_std)
util.write_predictions(preds, np.squeeze(np.asarray(test_ids)), "gmm_shared.csv")

Accuracy fitting training set: 0.849643551523


In [394]:
# Confusion Matrix - SHARED
pd.DataFrame(confusion_matrix(t_train, VERIFY),columns=util.malware_classes,index=util.malware_classes)

Unnamed: 0,Agent,AutoRun,FraudLoad,FraudPack,Hupigon,Krap,Lipler,Magania,None,Poison,Swizzor,Tdss,VB,Virut,Zbot
Agent,36,1,1,0,5,0,2,1,37,0,9,1,14,3,4
AutoRun,3,37,1,0,0,0,0,0,6,0,0,0,2,0,1
FraudLoad,1,0,15,4,0,0,0,0,10,0,4,2,0,1,0
FraudPack,1,0,1,16,0,1,0,0,8,0,4,0,1,0,0
Hupigon,2,0,0,0,22,0,0,0,11,0,0,0,6,0,0
Krap,4,0,0,0,0,18,0,0,13,0,3,0,1,0,0
Lipler,0,0,0,0,0,0,52,0,1,0,0,0,0,0,0
Magania,0,0,0,0,1,0,0,24,13,0,1,0,2,0,0
,33,1,2,0,1,1,1,4,1479,1,16,2,52,16,0
Poison,1,0,0,0,0,0,0,0,14,2,0,0,4,0,0


In [398]:
# Save Sep to file
ggm_sep = GaussianGenerativeModel(isSharedCovariance=False)
ggm_sep.fit(X_train_std, t_train_squeezed)
VERIFY1 = ggm_sep.predict(X_train_std)
print "Accuracy fitting training set: %s" % (sum(VERIFY1 == t_train_squeezed) / float(len(VERIFY1)))
preds1 = ggm_sep.predict(X_test_std)
util.write_predictions(preds1,  np.squeeze(np.asarray(test_ids)), "gmm_sep.csv")

Accuracy fitting training set: 0.924497731692


In [399]:
# Confusion Matrix - SEP
pd.DataFrame(confusion_matrix(t_train, VERIFY1),columns=util.malware_classes,index=util.malware_classes)

Unnamed: 0,Agent,AutoRun,FraudLoad,FraudPack,Hupigon,Krap,Lipler,Magania,None,Poison,Swizzor,Tdss,VB,Virut,Zbot
Agent,87,0,0,0,0,0,0,0,22,0,0,0,1,4,0
AutoRun,0,43,0,0,0,0,0,0,2,1,0,0,4,0,0
FraudLoad,0,0,30,0,0,1,0,0,5,0,0,0,0,1,0
FraudPack,0,0,0,27,0,1,0,0,4,0,0,0,0,0,0
Hupigon,0,0,0,0,32,0,0,0,7,0,0,0,0,2,0
Krap,0,0,0,0,0,35,0,0,2,0,0,0,0,2,0
Lipler,0,0,0,0,0,0,53,0,0,0,0,0,0,0,0
Magania,0,0,0,0,0,0,0,31,10,0,0,0,0,0,0
,4,18,0,0,0,1,0,0,1490,0,2,2,27,65,0
Poison,0,0,0,0,0,0,0,0,7,10,0,0,1,3,0


## Random Forest

In [8]:
%%time
# fit estimator with best parameters
rf_best1 = RandomForestClassifier(n_estimators=100000,min_samples_split=5,random_state=4).fit(X_train_array,t_train_squeezed)

CPU times: user 7min 28s, sys: 4.43 s, total: 7min 32s
Wall time: 7min 34s


In [None]:
%%time
# Predict on test data
preds = rf_best1.predict(X_test_array)

In [None]:
%%time
# Write output to csv file
util.write_predictions(preds, test_ids, "rf.csv")

In [None]:
%%time
# calculate score of best estimator
rf_best.score(X_train, t_train)

In [None]:
%%time
# make predictions on training set
train_preds = rf_best.predict(X_train)

In [None]:
%%time
# make confusion matrix
pd.DataFrame(confusion_matrix(t_train, train_preds),columns=util.malware_classes,index=util.malware_classes)