In [23]:
import numpy as np
import pandas as pd 
import pickle
import os, sys
import base64
from os import listdir
from os.path import isfile, join
import binascii
from elftools.elf.elffile import ELFFile, ELFError
from pprint import pprint
from collections import Counter
import random
from sklearn.model_selection import train_test_split
from pickle import dump

random.seed(a=1234567)

In [24]:
# read a file and parse the text section of elf file into a string
# returns empty string when there is no text section, or file is not a valid elf file
def process_file(label, label_list, Binaries_list, filename, file_sections = ['.text', '.comment']):
    out_array = []
    with open(filename, 'rb') as f:
        try:
            elffile = ELFFile(f)
            section_list = {}
            for section in elffile.iter_sections():
                section_list[section.name] = section.data()
            
            for file_section in file_sections:
                try:
                    fbuf = section_list[file_section]
                    out_array.append(fbuf.hex())
                except:
                    out_array.append('')
            
            Binaries_list.append(out_array)  
            label_list.append(label)
            return 
        except ELFError:
            print("Error opening file")
            return 'NOTELF'

In [25]:
# Constant Strings for pre-final data
benign_dir_path = '/Users/gurjotsingh/Downloads/malware_training_data/benign_ELF/'
#benign_dir_path_esc = benign_dir_path.replace(" ", "\ ")
malware_dir_path = '/Users/gurjotsingh/Downloads/malware_training_data/malware_ELF/'
#malware_dir_path_esc = malware_dir_path.replace(" ", "\ ")


# Structures for Data extraction
Binaries_list = [] 
distinct_labels = []
label_list = []
num_duplicates = 0
meta_path = {}


In [26]:
# load all types of malware in separate arrays
malware_dirs = [f for f in listdir(malware_dir_path)]

# labels
distinct_labels = distinct_labels + malware_dirs + ['Benign']

# load all benign files names
benign_file_names = [benign_dir_path + f for f in listdir(benign_dir_path) if isfile(join(benign_dir_path, f))]

# add benign file paths to
meta_path['Benign'] = benign_file_names

#extract relevant sections
extract_sections = [".text",".comment",".note", ".symtab", ".strtab", ".rodata",'.init']

# load all malware file names and 
for i, dir in enumerate(malware_dirs):
    meta_path[dir] = [malware_dir_path + malware_dirs[i] + '/' + 
                     f for f in listdir(malware_dir_path+malware_dirs[i])
                        if isfile(join(malware_dir_path+malware_dirs[i], f))]

# make byte level array for text section of each file
for label in meta_path.keys():
    print(label, len(meta_path[label]))
    label_file_path = meta_path[label]
    for path in label_file_path:
        process_file(label, label_list, Binaries_list, path, extract_sections)

Benign 1915
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
DDoS 661
Error opening file
Error opening file
Backdoor 617
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Trojan 408
Error opening file
Error opening file
Error opening file
Error opening file
Error opening file
Error

In [27]:
print(Counter(label_list))

Counter({'Benign': 1886, 'DDoS': 659, 'Backdoor': 603, 'Botnet': 586, 'Virus': 435, 'Trojan': 373})


In [28]:
Binaries_df = pd.DataFrame(Binaries_list)

In [29]:
label_info = Counter(label_list).values()
print('Label Info: ', label_info)
print('Distinct Labels:', distinct_labels)
#print('Label list: ', label_list)
print('Total Unique Data points:', sum(label_info))
print('Number of Duplicates:', num_duplicates)

min_len =  min(Counter(label_list).values()) #Minimum number of data points per class
print('Minimum number of data points in a class:', min_len)

#Convert string labels to integers 
label_list_numeric = []
for ii in range(0, len(label_list)):
    label_index = distinct_labels.index(label_list[ii])
    label_list_numeric.append(label_index)

class_data_info = []
#picked_data_indices = []
for ii in range(0, len(distinct_labels)):
    class_data_info.append([])
    #picked_data_indices.append([])

#Collecting indices belong to each class    
for ii in range(0, len(label_list)):
    class_data_info[label_list_numeric[ii]].append(ii)

    
#num_distinct_datasets = sum(label_info)//4008
train_dps_per_class = 144
test_dps_per_class = 36
num_expected_dps_per_class = train_dps_per_class + test_dps_per_class
num_distinct_datasets = 1
#num_distinct_datasets = 2

print('Number of distinct datasets possible:', num_distinct_datasets)
print('Number of expected data points per class', num_expected_dps_per_class)
print('binaries length: ', len(Binaries_list))
print('Labels length: ', len(label_list))

Binaries_DataSets_train, Binaries_DataSets_test, Binaries_LabelSets_train, Binaries_LabelSets_test = train_test_split(Binaries_df, label_list)

print('len(Binaries_DataSets_train): ', len(Binaries_DataSets_train))
print('len(Binaries_DataSets_test): ', len(Binaries_DataSets_test))
print('len(Binaries_LabelSets_train): ', len(Binaries_LabelSets_train))
print('len(Binaries_LabelSets_test): ', len(Binaries_LabelSets_test))

with open("DistinctLabels.txt", "wb") as fp:   #Pickling
    pickle.dump(distinct_labels, fp)

Label Info:  dict_values([1886, 659, 603, 373, 586, 435])
Distinct Labels: ['DDoS', 'Backdoor', 'Trojan', 'Botnet', 'Virus', 'Benign']
Total Unique Data points: 4542
Number of Duplicates: 0
Minimum number of data points in a class: 373
Number of distinct datasets possible: 1
Number of expected data points per class 180
binaries length:  4542
Labels length:  4542
len(Binaries_DataSets_train):  3406
len(Binaries_DataSets_test):  1136
len(Binaries_LabelSets_train):  3406
len(Binaries_LabelSets_test):  1136


## TF-IDF N-Gram Features

In [30]:
file_name_train = 'Binaries_DataSets_train'
with open(file_name_train, 'wb') as f:
    pickle.dump(Binaries_DataSets_train, f)

file_name_test = 'Binaries_DataSets_test'
with open(file_name_test, 'wb') as f:
    pickle.dump(Binaries_DataSets_test, f)
    
file_name_train = 'Binaries_LabelSets_train'
with open(file_name_train, 'wb') as f:
    pickle.dump(Binaries_LabelSets_train, f)

file_name_test = 'Binaries_LabelSets_test'
with open(file_name_test, 'wb') as f:
    pickle.dump(Binaries_LabelSets_test, f)
    

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

max_features = 75   
df_train = pd.DataFrame(index=np.arange(Binaries_DataSets_train.shape[0]),columns=np.arange(max_features*Binaries_DataSets_train.shape[1] ))    
df_test = pd.DataFrame(index=np.arange(Binaries_DataSets_test.shape[0]),columns=np.arange(max_features*Binaries_DataSets_test.shape[1] ))   
cumulator=0

for i in range(Binaries_DataSets_train.shape[1]):
    vectorizer_3 = TfidfVectorizer(ngram_range = (1,3), max_features = max_features,lowercase = False, analyzer='char')
    #fit TF-IDF model using training data and transform training data according to the fitted model 
    X_train = vectorizer_3.fit_transform(Binaries_DataSets_train[i])
    No_Columns = pd.DataFrame(X_train.toarray()).shape[1]
    df_train.iloc[: , cumulator:cumulator + No_Columns] = pd.DataFrame(X_train.toarray())
    
    #transform test data using the fitted model
    X_test = vectorizer_3.transform(Binaries_DataSets_test[i])
    df_test.iloc[: , cumulator:cumulator + No_Columns] = pd.DataFrame(X_test.toarray())
    
    cumulator += No_Columns
    
    dump(vectorizer_3, open('trans-'+extract_sections[i]+'.pkl', 'wb'))

df_train.drop(df_train.columns.to_series()[cumulator:], axis=1, inplace=True)
df_test.drop(df_test.columns.to_series()[cumulator:], axis=1, inplace=True)

df_train['label'] = Binaries_LabelSets_train
df_test['label'] = Binaries_LabelSets_test

df_train_name = 'BinaryBlobData_3Gram-TrainSet'
df_train.to_pickle(df_train_name)
df_test.to_csv(df_train_name+".csv")

df_test_name = 'BinaryBlobData_3Gram-TestSet'
df_test.to_pickle(df_test_name)
df_test.to_csv(df_test_name+".csv")

In [36]:
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,474,475,476,477,478,479,480,481,482,label
0,0.506563,0.158906,0.096983,0.004603,0.036185,0.014058,0.00827,0.011009,0.00702,0.022981,...,0.0,0.0,0.0,0.218472,0.152947,0.064736,0.06574,0.072528,0.0,Virus
1,0.776913,0.248151,0.102097,0.00587,0.008865,0.048785,0.019333,0.031271,0.030895,0.038869,...,0.0,0.0,0.273338,0.278223,0.0,0.0,0.0,0.461821,0.301072,Botnet
2,0.737796,0.49849,0.334775,0.019062,0.005499,0.035255,0.017152,0.03724,0.021475,0.030274,...,0.008853,0.0,0.013913,0.026553,0.0,0.0,0.0,0.149855,0.091947,Backdoor
3,0.612405,0.355976,0.250971,0.030575,0.011596,0.030612,0.020274,0.011532,0.013391,0.051799,...,0.059427,0.06043,0.0,0.071295,0.041593,0.052814,0.053633,0.0,0.0,Benign
4,0.790175,0.384125,0.169479,0.032044,0.02686,0.077384,0.03292,0.097904,0.02208,0.043736,...,0.0,0.0,0.038215,0.17504,0.034039,0.0,0.0,0.435821,0.0,Trojan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3401,0.639284,0.381291,0.259547,0.031753,0.004244,0.036066,0.031409,0.02285,0.019083,0.042658,...,0.104624,0.106388,0.0,0.125517,0.073226,0.09298,0.094422,0.0,0.0,Benign
3402,0.629901,0.288886,0.191522,0.003881,0.034313,0.031157,0.010931,0.01252,0.017141,0.033753,...,0.0,0.0,0.039384,0.120264,0.105243,0.0,0.0,0.14972,0.065071,DDoS
3403,0.583543,0.300195,0.208972,0.041722,0.006849,0.028732,0.015814,0.008621,0.008765,0.074791,...,0.102561,0.104291,0.0,0.123042,0.071783,0.091147,0.092561,0.102119,0.133147,Virus
3404,0.658666,0.314726,0.216913,0.003739,0.036219,0.022373,0.010476,0.016101,0.011272,0.026435,...,0.0,0.0,0.078719,0.150235,0.105176,0.0,0.0,0.0,0.0,Botnet


In [37]:
df_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,474,475,476,477,478,479,480,481,482,label
0,0.57751,0.57735,0.57719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Virus
1,0.638206,0.39012,0.276454,0.030793,0.012869,0.029044,0.022886,0.007198,0.016364,0.054142,...,0.060323,0.06134,0.094797,0.108553,0.04222,0.053609,0.054441,0.0,0.0,Benign
2,0.612908,0.36678,0.271867,0.036458,0.003295,0.02141,0.011873,0.001843,0.00823,0.067466,...,0.057142,0.058105,0.0,0.068553,0.039993,0.050782,0.05157,0.0,0.0,Benign
3,0.599128,0.345023,0.24478,0.028818,0.002387,0.01432,0.002458,0.0,0.014312,0.050077,...,0.057372,0.05834,0.045081,0.068829,0.040155,0.050987,0.051778,0.057125,0.0,Benign
4,0.561915,0.257127,0.182469,0.002624,0.029568,0.033529,0.015756,0.014856,0.011934,0.031802,...,0.0,0.0,0.0,0.192587,0.134826,0.057066,0.057951,0.0,0.0,Trojan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131,0.5605,0.312415,0.222664,0.035486,0.004675,0.021352,0.011378,0.009391,0.015288,0.067076,...,0.058601,0.059589,0.046046,0.105454,0.041015,0.052079,0.052887,0.0,0.0,Benign
1132,0.573214,0.332221,0.228735,0.035779,0.018649,0.020371,0.00562,0.009159,0.005635,0.058606,...,0.059649,0.060655,0.0,0.07156,0.041748,0.053011,0.053833,0.0,0.0,Benign
1133,0.528933,0.206484,0.130698,0.003739,0.035177,0.022126,0.012163,0.016068,0.013025,0.022154,...,0.0,0.0,0.0,0.182226,0.127572,0.053996,0.054833,0.0,0.0,Trojan
1134,0.528556,0.160488,0.101002,0.012128,0.012691,0.022204,0.007187,0.010648,0.005706,0.052608,...,0.0,0.0,0.025196,0.134645,0.089773,0.028498,0.0,0.063856,0.0,Trojan


In [38]:
### Classification Report Function

In [39]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, \
precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt

def plot_m2m_curve(ax, xs, ys, x_metric, y_metric, title):
    """ Helper function for plotting metric-to-metric curve. """
    # Plot metric-to-metric curve
    ax.plot(xs, ys)
    
    # Make sub-plot square
    ax.set_aspect("equal")
    # Set axes labels
    ax.set_xlabel(x_metric)
    ax.set_ylabel(y_metric)
    # Set title
    ax.set_title(title)
def evaluate_model(model, name, feat_test, y_test):
    """ Evaluate a classification model on the test set, then print and plot metrics. """
    # Make prediction from features
    pred_test = model.predict(feat_test)
    
    print(f"\n[ Evaluation result for {name} ]")
    # Print classification report
    print("Classification report:")
    print(classification_report(y_test, pred_test))

## Load BinaryBlob dataset and corresponding distinct labels (Types of Architectures)

In [40]:
import pandas as pd
import numpy as np
import pickle
import random
random.seed(a= 1234567)


print('Experiment in progres ...')

#Combining (1,2)-gram and 3-gram features to make the complete Training set 
df_train = pd.read_pickle('BinaryBlobData_3Gram-TrainSet')

#Combining (1,2)-gram and 3-gram features to make the complete Test set 
df_test = pd.read_pickle('BinaryBlobData_3Gram-TestSet')

print("df_test.shape: ", df_test.shape)


## Visualize the dataset in tabular format
print('Total Number of Training data samples:', df_train.shape[0])
print('Total Number of Testing data samples:', df_test.shape[0])
print('Number of features per Train data sample:', df_train.shape[1]-1)
print('Number of features per Test data sample:', df_test.shape[1]-1)
print('Distinct labels (Architecture Types):\n',distinct_labels)

## Converting labels  to integer values

def label_to_numeric(row):
    """Convert label to integers"""
    index_ = distinct_labels.index(row["label"])
    return index_

df_train["label"] = df_train.apply(label_to_numeric, axis=1)
df_test["label"] = df_test.apply(label_to_numeric, axis=1)

Experiment in progres ...
df_test.shape:  (1136, 484)
Total Number of Training data samples: 3406
Total Number of Testing data samples: 1136
Number of features per Train data sample: 483
Number of features per Test data sample: 483
Distinct labels (Architecture Types):
 ['DDoS', 'Backdoor', 'Trojan', 'Botnet', 'Virus', 'Benign']


In [42]:
## Removing randomly choosen class of data from the dataset and varying training data and evaluating accuracy for fixed (25%) test data
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier


import copy 

label_numerals = list(range(0,len(distinct_labels)))

df_train_tmp = copy.deepcopy(df_train)
df_test_tmp = copy.deepcopy(df_test)

print('Number of Labels:', len(label_numerals))
current_labels = [val for kk, val in enumerate(distinct_labels) if kk in label_numerals]
print('Labels:', current_labels)

#Train and Test labels
y_train = df_train_tmp.pop("label").values
y_test = df_test_tmp.pop("label").values

#Train and Test data
X_train = df_train_tmp.values
X_test = df_test_tmp.values


#DT
DT_model = DecisionTreeClassifier(random_state=1234567)
DT_model.fit(X_train, y_train)
evaluate_model(DT_model, "Decision Tree Classifier", X_test, y_test) 
DT_acc = np.round(DT_model.score(X_test, y_test)*100,2)

#RF
RF_model = RandomForestClassifier(random_state=1234567)
RF_model.fit(X_train, y_train)
evaluate_model(RF_model, "Random Forest Classifier", X_test, y_test) 
RF_acc = np.round(RF_model.score(X_test, y_test)*100,2)

#XGB
XGB_model = xgb.XGBClassifier(objective="multi:softprob", random_state=1234567, use_label_encoder=False, n_estimators = 100)
XGB_model.fit(X_train, y_train)
evaluate_model(XGB_model, "XGBoost Classifier", X_test, y_test) 
XGB_acc = np.round(XGB_model.score(X_test, y_test)*100,2)

#KNN
KNN_model = KNeighborsClassifier()
KNN_model.fit(X_train, y_train)
evaluate_model(KNN_model, "K NearestNeighbors Classifier", X_test, y_test)
KNN_acc = np.round(KNN_model.score(X_test, y_test)*100,2)

print('Experiment  Complete !!!')


Number of Labels: 6
Labels: ['DDoS', 'Backdoor', 'Trojan', 'Botnet', 'Virus', 'Benign']

[ Evaluation result for Decision Tree Classifier ]
Classification report:
              precision    recall  f1-score   support

           0       0.79      0.86      0.83       160
           1       0.75      0.67      0.70       150
           2       0.41      0.63      0.50       105
           3       0.84      0.71      0.77       153
           4       0.60      0.50      0.55       111
           5       0.98      0.96      0.97       457

    accuracy                           0.80      1136
   macro avg       0.73      0.72      0.72      1136
weighted avg       0.82      0.80      0.80      1136


[ Evaluation result for Random Forest Classifier ]
Classification report:
              precision    recall  f1-score   support

           0       0.91      0.94      0.93       160
           1       0.87      0.73      0.79       150
           2       0.51      0.60      0.55       105
  

In [44]:
#MLPClassifier
MLPC_model = MLPClassifier(hidden_layer_sizes=(150,100,50), max_iter=300, activation = 'relu', solver='adam', random_state=1234567)
#MLPC_model = MLPClassifier(hidden_layer_sizes=(150,150,100,100,50,50), max_iter=500, activation = 'relu', solver='adam', random_state=1234567)
MLPC_model.fit(X_train, y_train)
evaluate_model(MLPC_model, "MLPClassifier", X_test, y_test)
MLPC_acc = np.round(MLPC_model.score(X_test, y_test)*100,2)

print('Experiment Complete !!!')


[ Evaluation result for MLPClassifier ]
Classification report:
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       160
           1       0.59      0.89      0.71       150
           2       0.65      0.59      0.62       105
           3       0.93      0.73      0.82       153
           4       0.70      0.47      0.56       111
           5       0.97      0.98      0.97       457

    accuracy                           0.84      1136
   macro avg       0.79      0.76      0.77      1136
weighted avg       0.85      0.84      0.84      1136

Experiment Complete !!!


In [45]:
from sklearn.ensemble import StackingClassifier

estimators = [('RF', RandomForestClassifier(random_state=1234567)),
              ('XGB', xgb.XGBClassifier(objective="multi:softprob", random_state=1234567, use_label_encoder=False, n_estimators = 100)),
              ('KNN', KNeighborsClassifier())]

Stacking_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(random_state=123))
Stacking_model.fit(X_train, y_train)
evaluate_model(Stacking_model, "Stacking Classifier", X_test, y_test) 
Stacking_model.score(X_test, y_test)


from sklearn.ensemble import BaggingClassifier
bagging_model = BaggingClassifier(random_state = 123, n_estimators = 100)
bagging_model.fit(X_train, y_train)
evaluate_model(bagging_model, "Bagging Classifier", X_test, y_test) #100 desicion trees
bagging_model.score(X_test, y_test)


[ Evaluation result for Stacking Classifier ]
Classification report:
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       160
           1       0.86      0.76      0.81       150
           2       0.52      0.63      0.57       105
           3       0.88      0.85      0.86       153
           4       0.68      0.67      0.67       111
           5       0.98      0.98      0.98       457

    accuracy                           0.86      1136
   macro avg       0.81      0.80      0.80      1136
weighted avg       0.87      0.86      0.87      1136


[ Evaluation result for Bagging Classifier ]
Classification report:
              precision    recall  f1-score   support

           0       0.91      0.92      0.91       160
           1       0.86      0.75      0.80       150
           2       0.51      0.62      0.56       105
           3       0.90      0.84      0.87       153
           4       0.63      0.70      0.67    

0.8591549295774648