In [7]:
# Imports
import numpy as np
import pandas as pd
import csv
import json
import os
import re
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score, classification_report, make_scorer
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import multilayer_perceptron
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate, cross_val_score
import string
# from nltk.corpus import stopwords


filename = "preprocessed_data.txt"





# preprocessing data (optional)

skip the step if dataset.txt already exists

raw data from the Drebin dataset is expected in folder `DATA_DIR`

small and medium datasets are contained in this repository. the full dataset can
be downloaded from the Drebin webpage

in the `DATA_DIR` folder, a file `sha256_family.csv` is expected which indicates the malicious apps
and a subdirectory `feature_vectors` is expected which contains a file for every app
in the dataset

this step reads the raw data and creates a single file in the output path
containing the information from all apps

the resulting file has two columns separated by tab: first the category of
maliciousness (or "benign") and second a space-separated list of features

In [3]:
DATA_DIR = '/home/ckaestne/tmp'
FAMILY_FILENAME = 'sha256_family.csv'
FEATURES = 'feature_vectors'




def generate_data(path, targetFile):
    family_file = os.path.join(path, FAMILY_FILENAME)
    feature_path = os.path.join(path, FEATURES)
    
    # read the list of malicious apps
    with open(family_file) as file:
        reader = csv.reader(file, delimiter=',')
        next(reader, None)  # skip the headers
        malware_dict = {rows[0]: rows[1] for rows in reader}
    
    # load all the feature files in the DATA_DIR/feature_vectors directory
    # write them into single output file
    with open(targetFile, mode='w') as dataset:
        for file in os.listdir(feature_path):
            label = malware_dict.get(file, "benign")
            feature = extract_feature_naive(os.path.join(feature_path, file))
            dataset.write(label + '\t' + feature + "\n")


def extract_feature_naive(filename):
    feature = ''
    with open(filename, mode='r') as file:
        for line in file:
            line = line.strip()
            feature = ' '.join([feature, line.replace(" ", "_")])

    return feature[1:]


generate_data(DATA_DIR, filename)
print('Dataset generation completed.')


FileNotFoundError: [Errno 2] No such file or directory: '/home/ckaestne/tmp/sha256_family.csv'

# loading data and selecting features

the preprocessed data in `filename` contains all the features representing characteristics of the apps.
several characteristics include names and URLs, so the space is not limited to a finite set
of characteristics.

for prediction, it may make sense to select only a subset of these characteristics as features.
the code below makes such a selection and translates the data into a big dataframe with one
column per selected characteristic. that is, we have a potentially large list of (binary) features.

several characteristics are droped (call, activity, url, ...), but could be added back as features
again. It's also possible to further preprocess and group characteristics into fewer features or
into nonboolean features.

the `max_features` parameter of load data restricts the number of features (columns) used
to those that occur most frequently across all apps.


In [4]:
def feature_selection(x):
    # split the long feature string into multiple parts and drop a number of 
    # extra information that we won't use for modeling (calls, activities, urls)
    features = x.split(" ")
    return filter(lambda x: not (x.startswith("call") or x.startswith("activity") or x.startswith("url") or x.startswith("provider") or x.startswith("service_receiver")), features)

def load_data(filename, max_features=None):
    # create a data frame with pandas
    raw_data = pd.read_table(filename, sep='\t', header=None, names=['label', 'features'])
    # convert the labels from strings to binary values
    raw_data.label[raw_data.label != 'benign'] = 'malicious'
    raw_data = raw_data.dropna()

    # transform the data into occurrences,
    # which will be the features that we will feed into the model
    count_vect = CountVectorizer(analyzer=feature_selection, lowercase=False, max_features=max_features)
    counts = count_vect.fit_transform(raw_data['features'])
    
    # convert resulting matrix into dataframe
    df=pd.DataFrame(data=counts.toarray(), columns=count_vect.get_feature_names())
    df['label'] = raw_data.label

    return df.dropna()

df = load_data(filename)
X = df.drop("label", axis=1)
y = df["label"]

print(X.shape)

(5600, 1328)


# simple learning code


In [11]:
# split train vs test data
# from sklearn.model_selection import train_test_split
# from sklearn.neural_network import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                    random_state=0)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler(copy=True, with_mean=True, with_std=True)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

mlp = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)

mlp.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(13, 13, 13), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

predictions = mlp.predict(X_test)
mlp.score(X_test, y_test)

print(classification_report(y_test,predictions))

# fit model
# model = multilayer_perceptron()
# model.fit(X_train,y_train)

# # accuracy of the model
# model.score(X_test, y_test)

              precision    recall  f1-score   support

      benign       0.99      0.99      0.99       545
   malicious       0.63      0.80      0.71        15

    accuracy                           0.98       560
   macro avg       0.81      0.89      0.85       560
weighted avg       0.98      0.98      0.98       560



# learning with crossvalidation

In [6]:
kfold = KFold(10, True, 1)
s_kfold = StratifiedKFold(10, True, 1)
model = MultinomialNB()

acc_scores = cross_val_score(model, X, y, cv=s_kfold)
print('Accuracy: ' + repr(np.mean(acc_scores)))

prec_scores = cross_val_score(model, X, y, scoring=make_scorer(precision_score, pos_label='malicious'), cv=s_kfold)
print('Precision: ' + repr(np.mean(prec_scores)))

recall_scores = cross_val_score(model, X, y, scoring=make_scorer(recall_score, pos_label='malicious'), cv=s_kfold)
print('Recall: ' + repr(np.mean(recall_scores)))

f1_scores = cross_val_score(model, X, y, scoring=make_scorer(fbeta_score, beta=1, pos_label='malicious'), cv=s_kfold)
print('F1 score: ' + repr(np.mean(f1_scores)))



Accuracy: 0.96
Precision: 0.49911380487001794
Recall: 0.7173160173160172
F1 score: 0.586356589296156
