In [None]:
#cloning the malware dataset into the google colab environment.

!git clone https://github.com/iosifache/DikeDataset.git

In [None]:
# The code section contains all the pip install commands.

!pip install pefile
!pip install numpy
!pip install pandas
!pip install tensorflow

In [28]:
# This section contains all the import(s).

import pefile
import numpy as np
import pandas as pd
import os
import random
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [13]:
# This cell contains all the constants used.

N = 200
Trainsize = 80
Testsize = 20
Decimal = 7
Maxvalue = 255
Layer1 = 8
Layer2 = 4
Layer3 = 1

benign_directory = '/content/DikeDataset/files/benign'
malware_directory = '/content/DikeDataset/files/malware'
labels_directory = '/content/DikeDataset/labels/malware.csv'

In [14]:
# This section extracts the data for a particular section name.

def extract_section_data(pe, section_name):
  for section in pe.sections:
    if section.Name.decode('utf-8').strip('\x00') == section_name:
      # Found the section with section name, converting the section data into hex and returning.
      return bytes.fromhex(section.get_data().hex())

  return None

In [15]:
# extract data from "text" section of the executable, convert it into an integer list, normalize and return

def get_normalized_data(file_path, section_name):
  try:
    pe = pefile.PE(file_path)
    section_data = extract_section_data(pe, section_name)
    integer_list = [int(byte) for byte in section_data]
    normalized_data = [round(value / Maxvalue, Decimal) for value in integer_list]
    return normalized_data
  except:
    print("Something unexpected happened")

In [16]:
# There are different types of malware files, consider only the trojan files for now.

categorized_data = pd.read_csv(labels_directory)

# sort the data as per the probability of it being a malware from the trojan family
categorized_data_sorted = categorized_data.sort_values(by = 'trojan', ascending = False)

trojan_files = categorized_data_sorted['hash'][:N]

# The 'files' variable contains N number of file names (without extension) from the trojan family
files = np.array(trojan_files)

In [17]:
# Get the text section data from the files which are contained in the 'files' variable
trojan_text_section = [] # this will be used as a list of lists where each list will contain the data about a particular trojan file
file_extension = '.exe'
section_name = '.text'
for name in files:
  relative_file_path = os.path.join(malware_directory, name + file_extension)
  byte_stream_data = get_normalized_data(relative_file_path, section_name)
  if byte_stream_data:
    # For the given file name found the byte stream data.
    trojan_text_section.append(byte_stream_data)
  else:
    print("No Data Found for the given file: ", name + file_extension)


In [None]:
# Till now found the trojan family malware data, now need a set of bening executables data.
benign_file_text_section = []
benign_dataset = os.listdir(benign_directory)
sample_benign_dataset = random.sample(benign_dataset, N)
for name in sample_benign_dataset:
  relative_file_path = os.path.join(benign_directory, name)
  byte_stream_data = get_normalized_data(relative_file_path, section_name)
  if byte_stream_data:
    benign_file_text_section.append(byte_stream_data)
  else:
    print("No Data Found for the given benign file: ", name)

print(len(benign_file_text_section))
print(len(trojan_text_section))

In [37]:
# Neural network training
# dataset - benign_file_text_section, trojan_text_section
# forming x_train and y_train

benign_features = max(len(list) for list in benign_file_text_section)
malware_features = max(len(list) for list in trojan_text_section)

number_of_features = max(benign_features, malware_features)

for i in range(0, len(benign_file_text_section)):
  arr = np.array(benign_file_text_section[i])
  benign_file_text_section[i] = np.pad(arr, (0, number_of_features-len(arr)), mode='constant')

for i in range(len(trojan_text_section)):
  arr = np.array(trojan_text_section[i])
  trojan_text_section[i] = np.pad(arr, (0, number_of_features-len(arr)), mode='constant')

benign_file_text_section = np.array(benign_file_text_section)
trojan_text_section = np.array(trojan_text_section)

x_train = np.vstack((benign_file_text_section[:Trainsize, :], trojan_text_section[:Trainsize, :]))
y_train = np.hstack((np.zeros(Trainsize), np.ones(Trainsize)))

x_test = np.vstack((benign_file_text_section[-Testsize:, :], trojan_text_section[-Testsize:, :]))
y_test = np.hstack((np.zeros(Testsize), np.ones(Testsize)))

In [None]:
# building the neural net

model = models.Sequential([
    layers.Dense(Layer1, activation='relu', input_shape=(number_of_features,)),  # Adjust input shape based on your features
    layers.Dense(Layer2, activation='relu'),
    layers.Dense(Layer3, activation='sigmoid')  # Binary classification, use 'softmax' for multi-class
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',  # Use 'categorical_crossentropy' for multi-class
              metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train, epochs = 50, batch_size = 16, validation_split= 0.2)

In [40]:
# Test the model by doing predictions

predictions = model.predict(x_test)
binary_predictions = (predictions > 0.5).astype(int)

accuracy = accuracy_score(y_test, binary_predictions)
precision = precision_score(y_test, binary_predictions)
recall = recall_score(y_test, binary_predictions)
f1 = f1_score(y_test, binary_predictions)
conf_matrix = confusion_matrix(y_test, binary_predictions)

print("Evaluation Metrics:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)

Evaluation Metrics:
Accuracy: 0.8
Precision: 0.9285714285714286
Recall: 0.65
F1-Score: 0.7647058823529412
Confusion Matrix:
[[19  1]
 [ 7 13]]
