In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import requests
import shutil
import random
import json
import pprint
from tqdm import tqdm
import re
import statistics 
from collections import defaultdict as dd
import sklearn
from sklearn import preprocessing
import pickle
import time
from ast import literal_eval 
from distutils.dir_util import copy_tree

In [None]:
# Mount Google Drive at /content/drive so the dataset archives stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Folder on the mounted Drive that holds the dataset archives; listing it is a quick
# check that the mount succeeded and the archives are present.
dataset_path = "drive/My Drive/Malware_Analysis_Dataset"
os.listdir(dataset_path)

In [None]:
!apt-get install p7zip-full

In [None]:
# !wget https://mettl-miscellaneous-public.s3.ap-south-1.amazonaws.com/client_public_data/369004/Static_Analysis_Data.7z
# !p7zip -d Static_Analysis_Data.7z
!p7zip -d /content/drive/My\ Drive/Malware_Analysis_Dataset/Static_Analysis_Data.7z -o /content/

In [None]:
# !wget https://mettl-miscellaneous-public.s3.ap-south-1.amazonaws.com/client_public_data/369004/Dynamic_Analysis_Data_Part1.7z
# !p7zip -d Dynamic_Analysis_Data_Part1.7z
!p7zip -d /content/drive/My\ Drive/Malware_Analysis_Dataset/Dynamic_Analysis_Dataset_Part1.7z -o /content/

In [None]:
# !wget https://mettl-miscellaneous-public.s3.ap-south-1.amazonaws.com/client_public_data/369004/Dynamic_Analysis_Dataset_Part2.7z
# !p7zip -d Dynamic_Analysis_Dataset_Part2.7z
!p7zip -d /content/drive/My\ Drive/Malware_Analysis_Dataset/Dynamic_Analysis_Dataset_Part2.7z -o /content/

In [None]:
!du -sh static/
!du -sh dynamic/

In [None]:
from google.colab import drive
# drive.mount('/content/drive')
# drive.flush_and_unmount()

In [None]:
!gsutil cp Dynamic_Analysis_Data_Part1.7z Dynamic_Analysis_Data_Part2.7z Static_Analysis_Data.7z drive/My\ Drive/Malware_Analysis_Dataset

In [None]:
# --- Inputs: where the extracted archives are expected to live --------------
dyn_analysis_1 = "/content/Dynamic_Analysis_Data_Part1"
ben_dyn_pth1 = "/content/Dynamic_Analysis_Data_Part1/Benign"
mal_dyn_pth1 = "/content/Dynamic_Analysis_Data_Part1/Malware"

dyn_analysis_2 = "/content/Dynamic_Analysis_Data_Part2"
ben_dyn_pth2 = "/content/Dynamic_Analysis_Data_Part2/Benign"
mal_dyn_pth2 = "/content/Dynamic_Analysis_Data_Part2/Malware"

stat_analysis = "/content/Static_Analysis_Data"
ben_stat_pth = "/content/Static_Analysis_Data/Benign"
mal_stat_pth = "/content/Static_Analysis_Data/Malware"


# --- Outputs: <kind>/<train|test>/<benign|malware> ---------------------------
dynamic_analysis = "/content/dynamic"
static_analysis = "/content/static"

# Start from a clean slate: any previous train/test split is discarded.
if os.path.exists(dynamic_analysis):
  shutil.rmtree(dynamic_analysis)
if os.path.exists(static_analysis):
  shutil.rmtree(static_analysis)


dynamic_train = os.path.join(dynamic_analysis, "train")
dynamic_test = os.path.join(dynamic_analysis, "test")
static_train = os.path.join(static_analysis, "train")
static_test = os.path.join(static_analysis, "test")
dynamic_train_benign = os.path.join(dynamic_train, "benign")
dynamic_train_malware = os.path.join(dynamic_train, "malware")
dynamic_test_benign = os.path.join(dynamic_test, "benign")
dynamic_test_malware = os.path.join(dynamic_test, "malware")
static_train_benign = os.path.join(static_train, "benign")
static_train_malware = os.path.join(static_train, "malware")
static_test_benign = os.path.join(static_test, "benign")
static_test_malware = os.path.join(static_test, "malware")

dirs = [dynamic_analysis, static_analysis, dynamic_train, dynamic_test, static_train, static_test, dynamic_train_benign,
        dynamic_train_malware, dynamic_test_benign,dynamic_test_malware, static_train_benign, static_train_malware,
        static_test_benign, static_test_malware]

# `directory` (not `dir`) so the builtin dir() is not shadowed for the rest of the notebook.
for directory in dirs:
  os.makedirs(directory, exist_ok=True)

In [None]:
def moveFiles(CUR_PATH, TO_PATH):
  """Move every entry located directly inside CUR_PATH into TO_PATH.

  Each entry keeps its own name; the destination path is TO_PATH/<name>.
  Returns None.
  """
  for entry in os.listdir(CUR_PATH):
    origin = os.path.join(CUR_PATH, entry)
    target = os.path.join(TO_PATH, entry)
    shutil.move(origin, target)

#merge all malwares to same folder
def merge_malwares(malware_dir):
  """Flatten the sub-folders of ``malware_dir`` into ``malware_dir`` itself.

  The contents of every immediate sub-directory are moved one level up and the
  emptied sub-directory is removed. Entries that are not directories are left
  untouched; previously such an entry made ``os.listdir`` raise
  ``NotADirectoryError`` and aborted the merge half-way.
  """
  for malware in os.listdir(malware_dir):
    malware_path = os.path.join(malware_dir, malware)
    if not os.path.isdir(malware_path):
      # Already at the top level - nothing to flatten.
      continue
    moveFiles(malware_path, malware_dir)
    os.rmdir(malware_path)

def split_train_test(source, train_dir, test_dir, split_size=0.75, seed=None):
  """Randomly split the entries of ``source`` between ``train_dir`` and ``test_dir``.

  Args:
    source: directory whose entries are distributed; it is deleted afterwards.
    train_dir: destination of the first ``int(split_size * n)`` shuffled entries.
    test_dir: destination of the remaining entries.
    split_size: fraction of entries that go to ``train_dir``.
    seed: optional seed for a private random generator, making the split
      reproducible. With the default ``None`` the global ``random`` state is
      used, as before.

  Both destination directories are created when missing. Without that,
  ``shutil.move`` would rename each file *to* the destination path instead of
  moving it *into* a directory.
  """
  # os.listdir returns entries in arbitrary order; sort first so that a given
  # seed always produces the same split.
  files = sorted(os.listdir(source))
  rng = random.Random(seed) if seed is not None else random
  rng.shuffle(files)
  split_index = int(split_size*len(files))

  os.makedirs(train_dir, exist_ok=True)
  os.makedirs(test_dir, exist_ok=True)
  for file in files[:split_index]:
    shutil.move(os.path.join(source, file), train_dir)
  for file in files[split_index:]:
    shutil.move(os.path.join(source, file), test_dir)
  shutil.rmtree(source)

In [None]:
# Flatten the sub-folders of each Malware directory so that all samples sit
# directly inside it before the train/test split.
merge_malwares(mal_dyn_pth1)
merge_malwares(mal_dyn_pth2)
merge_malwares(mal_stat_pth)

In [None]:
# Distribute every source folder over the train/test output tree (default 75/25).
# Both dynamic parts feed the same dynamic train/test folders.
split_train_test(mal_dyn_pth1, dynamic_train_malware, dynamic_test_malware)
split_train_test(ben_dyn_pth1, dynamic_train_benign, dynamic_test_benign)
split_train_test(mal_dyn_pth2, dynamic_train_malware, dynamic_test_malware)
split_train_test(ben_dyn_pth2, dynamic_train_benign, dynamic_test_benign)
split_train_test(mal_stat_pth, static_train_malware, static_test_malware)
split_train_test(ben_stat_pth, static_train_benign, static_test_benign)
# The split already removed the Benign/Malware sub-folders; drop the now-empty roots too.
shutil.rmtree(dyn_analysis_1)
shutil.rmtree(dyn_analysis_2)
shutil.rmtree(stat_analysis)

In [None]:
# Sample counts per split, printed in the order:
# train-benign, test-benign, train-malware, test-malware.
print("Dynamic: ", end="")
print(len(os.listdir(dynamic_train_benign)),len(os.listdir(dynamic_test_benign)), len(os.listdir(dynamic_train_malware)),len(os.listdir(dynamic_test_malware)))
print("Static: ", end="")
print(len(os.listdir(static_train_benign)),len(os.listdir(static_test_benign)), len(os.listdir(static_train_malware)),len(os.listdir(static_test_malware)))

In [None]:
# Names of the aggregate behaviour counters produced by get_file_system_features
# (one feature column per name).
file_features_needed = [
  "files_modified", "files_opened", "files_copied", "files_failed", "files_created", "files_written",
  "files_exists", "files_deleted", "dll_loaded","regkey_read", "regkey_opened", "regkey_written","regkey_deleted",
  "directory_enumerated", "directory_removed", "directory_created"
]
# File extensions tracked individually for the modified/opened/created/deleted counters.
# NOTE: "com" is listed twice; this is harmless because the list is only used for
# membership tests and as a suffix for dictionary keys.
file_exts = [
  "bak", "bat", "bmp", "cfg", "clb", "com", "com", "dat", "db", "dll", "doc", 
  "docx", "exe", "ico", "ime", "inf", "ini", "jpeg", "jpg", "js", "lnk", "log", 
  "otf", "pdf", "pnf", "png", "reg", "rtf", "sav", "sys", "tmp", "txt", "xls", "xlsx", "xml"
]
# API names counted individually by get_processes_features (one feature column each).
# NOTE: "OpenSCManagerA" appears twice near the end of the list; the duplicate has no
# effect because the names are only used as dictionary keys and for membership tests.
api_calls = [
  "NtOpenSection", "NtWaitForSingleObject", "GetAsyncKeyState", 
  "NtDeleteValueKey", "WSARecv", "getaddrinfo", "InternetGetConnectedState", 
  "NtCreateEvent", "GetFileVersionInfoSizeW", "GetAdaptersAddresses", 
  "NtMakeTemporaryObject", "NtRenameKey", "HttpSendRequestA", 
  "GetLocalTime", "NetUserGetLocalGroups", "FindFirstFileExW", 
  "CryptRetrieveObjectByUrlW", "NtReadVirtualMemory", "HttpAddRequestHeadersA", 
  "RegOpenKeyExW", "NtDelayExecution", "InternetCrackUrlA", "SetErrorMode", "ShellExecuteExW", 
  "RegOpenKeyExA", "HttpSendRequestW", "HttpAddRequestHeadersW", "GetCursorPos", "JsEval", 
  "GetUserNameW", "WinHttpSetTimeouts", "WaitForDebugEvent", "FindWindowExA", "GetUserNameA", 
  "NtCreateFile", "TransmitFile", "GetSystemTimeAsFileTime", "WinHttpOpen", "NtLoadDriver", 
  "GetDiskFreeSpaceA", "NtCreateProcess", "NtDeleteKey", "WinHttpQueryHeaders", 
  "InternetSetOptionA", "CryptGenKey", "recvfrom", "CryptEncrypt", "sendto", "NtSuspendThread",
  "NtQueryInformationFile", "RegCreateKeyExW", "GetSystemTime", "DeviceIoControl", 
  "WSASendTo", "FindFirstChangeNotificationW", "NtQueryKey", "OpenServiceA", 
  "WriteProcessMemory", "WSARecvFrom", "NtSetContextThread", "HttpEndRequestW", 
  "RegQueryValueExA", "RemoveDirectoryW", "EnumWindows", "OpenServiceW", "NtSetValueKey", 
  "LookupPrivilegeValueW", "NtQueryValueKey", "RegCreateKeyExA", "RemoveDirectoryA", 
  "HttpEndRequestA", "RegQueryValueExW", "WSASocketW", "NetUserGetInfo", "SetWindowsHookExW",
  "ExitWindowsEx", "WSASend", "WinHttpGetProxyForUrl", "StartServiceA", 
  "NtDeviceIoControlFile", "NtReadFile", "CryptCreateHash", "FindWindowExW", "NtWriteFile"
  , "LdrGetDllHandle", "WinHttpSendRequest", "RtlDecompressBuffer", "NtQuerySystemInformation", 
  "NtEnumerateValueKey", "CreateDirectoryExW", "CreateThread", "NtLoadKey", 
  "SetupDiGetClassDevsA", "SetUnhandledExceptionFilter", "NtQuerySystemTime", 
  "GetVolumeNameForVolumeMountPointW", "DnsQuery_A", "CryptDecrypt", "recv", 
  "SetupDiGetClassDevsW", "NtProtectVirtualMemory", "SHGetFolderPathW", "RegDeleteValueW",
  "GetDiskFreeSpaceExA", "socket", "RegSetValueExW", "WriteConsoleA", 
  "LdrGetProcedureAddress", "NtOpenThread", "CopyFileA", "CopyFileW", 
  "RegSetValueExA", "GetDiskFreeSpaceExW", "NtEnumerateKey", "NtOpenDirectoryObject", 
  "LdrLoadDll", "NtWriteVirtualMemory", "URLDownloadToFileW", "WriteConsoleW",
  "CreateToolhelp32Snapshot", "SendNotifyMessageA", "RegCloseKey", "NtOpenEvent", 
  "NtSetInformationFile", "HttpSendRequestExW", "NtCreateKey", "WinHttpConnect", 
  "MoveFileWithProgressW", "ioctlsocket", "WSAStartup", "NtTerminateThread",
  "DbgUiWaitStateChange", "NtTerminateProcess", "send", "shutdown", 
  "SendNotifyMessageW", "COleScript_ParseScriptText", "HttpSendRequestExA", 
  "select", "NtQueryFullAttributesFile", "CreateRemoteThread", "GetSystemMetrics", 
  "NtQueueApcThread", "WSASocketA", "CreateServiceA", "WinHttpSetOption", 
  "InternetCloseHandle", "DeleteFileA", "NtLoadKey2", "CryptExportKey", 
  "CryptImportPublicKeyInfo", "NtAllocateVirtualMemory", "ReadProcessMemory", 
  "CreateDirectoryW", "DeleteFileW", "VirtualProtectEx", "CreateServiceW", "listen", 
  "NtCreateThread", "GetComputerNameW", "NtResumeThread", "CryptAcquireContextA",
  "setsockopt", "InternetReadFile", "CoCreateInstance", "RegEnumKeyExW", "FindNextFileW",
  "ObtainUserAgentString", "CryptAcquireContextW", "DnsQuery_W", "NtCreateNamedPipeFile"
  , "GetComputerNameA", "NtReplaceKey", "RegEnumKeyExA", "closesocket", 
  "NtGetContextThread", "RtlCreateUserThread", "RegEnumValueW", "NtCreateSection", 
  "StartServiceW", "WinHttpGetIEProxyConfigForCurrentUser", "SetWindowsHookExA",
  "NtOpenMutant", "InternetOpenA", "NtDeleteFile", "NSPStartup", "IsDebuggerPresent", 
  "RegEnumValueA", "WinHttpReceiveResponse", "InternetOpenW", "CreateProcessInternalW", 
  "connect", "RegDeleteKeyA", "NtDuplicateObject", "RegNotifyChangeKeyValue", 
  "NtQueryMultipleValueKey", "HttpOpenRequestA", "OpenSCManagerW", "GetSystemInfo", 
  "NtCreateProcessEx", "accept", "FindWindowW", "ControlService", "NtClose",
  "RegDeleteKeyW", "CryptHashData", "NtOpenProcess", "FindWindowA", "HttpOpenRequestW",
  "NtFreeVirtualMemory", "Process32NextW", "GetLastInputInfo", "InternetConnectW", 
  "UnhookWindowsHookEx", "InternetWriteFile", "GetDiskFreeSpaceW", "NtSaveKeyEx", 
  "RegEnumKeyW", "InternetConnectA", "NtSaveKey", "SetWindowLongA", 
  "CDocument_write", "WSAConnect", "RegDeleteValueA", "CopyFileExW", 
  "NtMapViewOfSection", "SetupDiGetDeviceRegistryPropertyW", "Process32FirstW", 
  "DeleteService", "LsaOpenPolicy", "NtOpenFile", "RegQueryInfoKeyW",
  "NtUnmapViewOfSection", "NtQueryDirectoryFile", "NetGetJoinInformation", 
  "FindFirstFileExA", "gethostbyname", "DecodeImage", "NtQueryAttributesFile", 
  "RegQueryInfoKeyA", "NtCreateMutant", "GetAddrInfoW", "InternetOpenUrlA", 
  "WSAAccept", "bind", "NtOpenKey", "InternetCrackUrlW", "DnsQuery_UTF8", 
  "CoInternetSetFeatureEnabled", "NtResumeProcess", "OpenSCManagerA", 
  "GetFileVersionInfoW", "CryptDecodeObjectEx", "InternetOpenUrlW", 
  "OpenSCManagerA", "WinHttpOpenRequest", "SetupDiGetDeviceRegistryPropertyA"
]
# Call categories for which get_processes_features emits a "cat_count_<name>" and a
# "cat_per_<name>" feature.
call_categories = [
  "system", "filesystem", "browser", "com", "crypto", "process", "synchronization", 
  "registry", "misc", "services", "windows", "device", "network", "threading", 
  "hooking", "__notification__"
]

# Signature names counted individually by get_misc_features (one feature column each);
# signatures with any other name only contribute to the mean severity.
cuckoo_signatures = [
  "recon_beacon", "recon_checkip", "mimics_agent", "antiav_detectreg", "packer_upx",
  "packer_vmprotect", "packer_armadillo_regkey", "removes_zoneid_ads", "antiemu_wine_func",
  "network_tor", "browser_helper_object", "disables_wfp", "antivirus_virustotal", "bootkit",
  "disables_browser_warn", "browser_addon", "antiav_avast_libs", "disables_system_restore",
  "antivm_generic_disk_setupapi", "antivm_vmware_files", "packer_entropy", "browser_startpage", "recon_fingerprint",
  "banker_spyeye_mutexes", "disables_uac", "banker_zeus_mutex", "bitcoin_opencl", "modify_uac_prompt",
  "antivm_vmware_devices", "infostealer_browser", "antisandbox_unhook", "antiav_servicestop",
  "spoofs_procname", "infostealer_mail", "persistence_ads", "persistence_service", "stealth_file",
  "sniffer_winpcap", "driver_load", "spreading_autoruninf", "recon_programs", "antiav_detectfile",
  "rat_xtreme_mutexes", "packer_armadillo_mutex", "deepfreeze_mutex", "injection_createremotethread", "modifies_certs",
  "antivm_generic_services", "antivm_generic_diskreg", "process_interest", "antivm_generic_bios", 
  "antisandbox_sleep", "network_icmp","injection_explorer", "darkcomet_regkeys", "antisandbox_suspend", 
  "network_tor_service", "copies_self", "pdf_page", "antianalysis_detectreg", "stealth_hiddenreg", "mimics_filetime",
  "rat_pcclient","reads_self", "modify_proxy", "stealth_network", "antisandbox_mouse_hook",
  "antisandbox_sunbelt_libs", "antisandbox_productid", "network_http", "stealth_hide_notifications",
  "antisandbox_sboxie_libs", "browser_security", "stealth_window", "ransomware_recyclebin", "deletes_self", 
  "banker_cridex", "banker_zeus_p2p", "stealth_webhistory", "rat_plugx_mutexes","antidbg_devices", 
  "antivm_generic_scsi", "exec_crash", "antivm_generic_disk", "encrypted_ioc", "network_bind", "dropper",
  "antivm_generic_cpu", "creates_nullvalue", "injection_rwx", "antidbg_windows", "disables_windowsupdate",
  "rat_poisonivy_mutexes", "polymorphic", "modify_security_center_warnings", "prevents_safeboot", "infostealer_im",
  "infostealer_bitcoin", "injection_runpe", "rat_spynet", "virus", "persistence_autorun", "infostealer_keylog",
  "multiple_useragents", "bypass_firewall", "origin_langid", "process_needed", "infostealer_ftp",
  "bot_russkill", "rat_fynloski_mutexes", "antiemu_wine_reg", "stealth_timeout"
]

In [None]:
#extract network features
def get_network_features(data):
  """Extract network-activity counters from one parsed analysis report.

  Args:
    data: report dictionary; ``data["network"]`` must exist. Protocol sections
      missing from it are treated as empty.

  Returns:
    defaultdict mapping feature name to count. The key set and key order are
    the same for every report, so rows built from these dictionaries always
    line up column-wise:
      * "udp", "irc", "http", "smtp", "tcp", "icmp", "hosts", "dns", "domains":
        number of entries in the section of the same name;
      * "ips": number of distinct ``src`` values among the UDP entries;
      * "udp_dest_ports": number of distinct ``dst`` values among the UDP entries;
      * "A", "AAAA", "MX", "SRV", "TXT", "PTR": DNS entries per record type;
      * "dom_freq_1" .. "dom_freq_4": domains by number of dots in the name
        (1, 2, 3, and 4-or-more respectively).
  """
  dns_req_types = ["A", "AAAA", "MX", "SRV", "TXT", "PTR"]
  file_features = dd(int)
  network = data["network"]

  udp_packets = network.get("udp", [])
  file_features["udp"] = len(udp_packets)
  file_features["ips"] = len({packet["src"] for packet in udp_packets})
  # NOTE(review): despite the name this counts distinct "dst" values, not ports.
  # Kept as-is so the feature's meaning does not change - confirm the intent.
  file_features["udp_dest_ports"] = len({packet["dst"] for packet in udp_packets})

  for section in ("irc", "http", "smtp", "tcp", "icmp", "hosts", "dns"):
    file_features[section] = len(network.get(section, []))

  for record_type in dns_req_types:
    file_features[record_type] = 0
  for level in (1, 2, 3, 4):
    file_features["dom_freq_" + str(level)] = 0

  for query in network.get("dns", []):
    # Only the known record types are counted. Counting every type would add
    # a new key for each unexpected one and give samples different columns.
    if query["type"] in dns_req_types:
      file_features[query["type"]] += 1

  domains = network.get("domains", [])
  file_features["domains"] = len(domains)
  for entry in domains:
    domain_levels = entry["domain"].split(".")
    if len(domain_levels) >= 5:
      file_features["dom_freq_4"] += 1
    elif len(domain_levels) >= 2:
      file_features["dom_freq_" + str(len(domain_levels) - 1)] += 1
  return file_features

def get_file_system_features(data):
  """Extract file-system, registry and DLL counters from one parsed report.

  Args:
    data: report dictionary; ``data["behavior"]["generic"]`` is a list whose
      entries may carry a ``"summary"`` dictionary of activity lists.

  Returns:
    defaultdict with, in this order:
      * "files_modified_<ext>", "files_opened_<ext>", "files_created_<ext>",
        "files_deleted_<ext>" for every extension in ``file_exts``;
      * every name in ``file_features_needed``.
    All keys are always present, also when the report has no generic entries.

  Every counter is summed over all ``generic`` entries. Before, the aggregate
  counters were reset for each entry and therefore reflected only the last
  one, while the per-extension counters already accumulated.
  """
  file_features = dd(int)
  for prefix in ("files_modified_", "files_opened_", "files_created_", "files_deleted_"):
    for ext in file_exts:
      file_features[prefix + ext] = 0
  for name in file_features_needed:
    file_features[name] = 0

  # (key in the report summary, aggregate feature, per-extension prefix or None)
  summary_fields = [
    ("file_recreated", "files_modified", "files_modified_"),
    ("file_opened", "files_opened", "files_opened_"),
    ("file_copied", "files_copied", None),
    ("file_failed", "files_failed", None),
    ("file_created", "files_created", "files_created_"),
    ("file_written", "files_written", None),
    ("file_exists", "files_exists", None),
    ("file_deleted", "files_deleted", "files_deleted_"),
    ("dll_loaded", "dll_loaded", None),
    ("regkey_opened", "regkey_opened", None),
    ("regkey_read", "regkey_read", None),
    ("regkey_written", "regkey_written", None),
    ("regkey_deleted", "regkey_deleted", None),
    ("directory_enumerated", "directory_enumerated", None),
    ("directory_removed", "directory_removed", None),
    ("directory_created", "directory_created", None),
  ]

  for generic in data["behavior"]["generic"]:
    # An entry without a summary simply contributes nothing.
    summary = generic.get("summary", {})
    for summary_key, feature, ext_prefix in summary_fields:
      if summary_key not in summary:
        continue
      entries = summary[summary_key]
      file_features[feature] += len(entries)
      if ext_prefix is None:
        continue
      for file in entries:
        # Text after the last dot; a path without a dot yields the whole path,
        # which never matches a tracked extension.
        ext = file.split(".")[-1]
        if ext in file_exts:
          file_features[ext_prefix + ext] += 1
  return file_features

def get_processes_features(data):
  """Extract API-call counters from one parsed analysis report.

  Args:
    data: report dictionary; ``data["behavior"]["processes"]`` is a list of
      processes, each with a ``"calls"`` list whose items carry ``"category"``
      and ``"api"``.

  Returns:
    defaultdict with, for every category in ``call_categories``,
    "cat_count_<category>" (number of calls) and "cat_per_<category>" (share
    of all recorded calls, 0 when nothing was recorded), followed by one
    counter per name in ``api_calls``.

  The share is computed once over the calls of all processes. Before, counts
  accumulated over all processes were divided by the call count of a single
  process, which gave shares above 1 for multi-process samples.
  """
  file_features = dd(int)
  for category in call_categories:
    file_features["cat_count_"+category] = 0
    file_features["cat_per_"+category] = 0
  for api in api_calls:
    file_features[api] = 0

  # Sets make the per-call membership tests O(1); reports can hold many calls.
  tracked_categories = set(call_categories)
  tracked_apis = set(api_calls)

  total_calls = 0
  for process in data["behavior"]["processes"]:
    calls = process["calls"]
    total_calls += len(calls)
    for call in calls:
      if call["category"] in tracked_categories:
        file_features["cat_count_"+call["category"]] += 1
      if call["api"] in tracked_apis:
        file_features[call["api"]] += 1

  if total_calls:
    for category in call_categories:
      file_features["cat_per_"+category] = file_features["cat_count_"+category]/total_calls
  return file_features

def get_misc_features(data):
  """Extract signature, antivirus and process-level summary features.

  Args:
    data: report dictionary with ``"signatures"`` (list of dicts carrying
      ``"severity"`` and ``"name"``) and ``"behavior"`` (``"generic"`` and
      ``"processes"`` lists). ``"virustotal"`` is optional.

  Returns:
    defaultdict with one counter per name in ``cuckoo_signatures`` followed by
      * "severity": mean severity over all signatures (0 when there are none);
      * "scans_perc": percentage of virustotal scans flagged as detected
        (0 when no scan results are available);
      * "dll_loaded": number of ``dll_loaded`` entries summed over all generic
        summaries;
      * "processes": number of processes in the report.
  """
  file_features = dd(int)
  for name in cuckoo_signatures:
    file_features[name] = 0

  severity = []
  for signature in data["signatures"]:
    severity.append(signature['severity'])
    name = signature["name"]
    if name in cuckoo_signatures:
      file_features[name] += 1
  file_features["severity"] = statistics.mean(severity) if severity else 0

  # Scan results are optional; a missing or malformed section counts as "no scans".
  virus_detected_count = 0
  total_scans = 0
  virustotal = data.get("virustotal")
  virus_scans = virustotal.get("scans") if isinstance(virustotal, dict) else None
  if isinstance(virus_scans, dict):
    total_scans = len(virus_scans)
    for result in virus_scans.values():
      if isinstance(result, dict) and result.get("detected") is True:
        virus_detected_count += 1
  # max(..., 1) avoids dividing by zero when there were no scans.
  file_features["scans_perc"] = (virus_detected_count/max(total_scans, 1))*100

  dll_loaded = 0
  for behaviour in data["behavior"]["generic"]:
    try:
      dll_loaded += len(behaviour["summary"]["dll_loaded"])
    except (KeyError, TypeError):
      # Entry without a summary or without loaded DLLs: nothing to add.
      continue
  file_features["dll_loaded"] = dll_loaded

  file_features["processes"] = len(data["behavior"]["processes"])
  return file_features


In [None]:
# Sanity check: run each extractor on one training report and show how many
# features it yields. The extractors take a parsed report (a dict), not a
# directory path, so one sample is loaded first.
sample_reports = sorted(os.listdir(dynamic_train_malware))
if sample_reports:
  with open(os.path.join(dynamic_train_malware, sample_reports[0]), "r") as f:
    sample_data = json.load(f)

  filesystem_features = get_file_system_features(sample_data)
  print(len(filesystem_features.keys()))

  process_features = get_processes_features(sample_data)
  print(len(process_features.keys()))

  network_features = get_network_features(sample_data)
  print(len(network_features.keys()))
else:
  print("No reports found in", dynamic_train_malware)

In [None]:
def get_features(path):
  """Extract the combined dynamic features of every report file in ``path``.

  Args:
    path: directory containing one JSON report per sample.

  Returns:
    List with one feature dictionary per file, in ``os.listdir`` order. Each
    dictionary is produced by ``get_features_of_file``, so single-file and
    whole-folder extraction cannot drift apart.
  """
  return [get_features_of_file(path, file) for file in tqdm(os.listdir(path))]

def get_features_of_file(folder, file):
  """Load one JSON report and merge the output of all four feature extractors.

  Args:
    folder: directory that contains the report.
    file: file name of the report inside ``folder``.

  Returns:
    defaultdict holding the file-system, network, process and misc features.
    When two extractors emit the same key, the later one in that order wins.
  """
  merged = dd()
  report_path = os.path.join(folder, file)
  with open(report_path, "r") as f:
    data = json.load(f)
    per_group = [
      get_file_system_features(data),
      get_network_features(data),
      get_processes_features(data),
      get_misc_features(data),
    ]
    for group in per_group:
      merged.update(group)
  return merged

In [None]:
import multiprocessing
from joblib import Parallel, delayed

# Extract dynamic features for every malware training report in parallel,
# using one joblib worker per CPU core. `benigns` and `num_cores` are reused by the next cell.
malwares = os.listdir(dynamic_train_malware)
benigns = os.listdir(dynamic_train_benign)
num_cores = multiprocessing.cpu_count()
malware_results = Parallel(n_jobs=num_cores)(delayed(get_features_of_file)(dynamic_train_malware, i) for i in tqdm(malwares))


In [None]:
# Same extraction for the benign training reports (relies on `benigns` and `num_cores` from the previous cell).
benign_results = Parallel(n_jobs=num_cores)(delayed(get_features_of_file)(dynamic_train_benign, i) for i in tqdm(benigns, position=0, leave=True))

In [None]:
# Persist the dynamic training features under train_features/, which is where
# the training cell further down loads them from.
os.makedirs("train_features", exist_ok=True)
with open('train_features/dynamic_malware_features.p', 'wb') as fp:
    pickle.dump(malware_results, fp, protocol=pickle.HIGHEST_PROTOCOL)
with open('train_features/dynamic_benign_features.p', 'wb') as fp:
    pickle.dump(benign_results, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Header fields read by get_features_static. Each name is looked up in the text
# of the static dump and the token that follows it is stored as the value.
# "CompileTimeIndicator", "NumberOfDLLs" and "NumberOfFunctions" are derived
# by get_features_static rather than read directly.
meta_data_features = [
  "NumberOfSections",
  "TimeDateStamp",
  "CompileTimeIndicator", # 1 if timedatestamp > curdate
  "AddressOfEntryPoint",
  "BaseOfCode",
  "NumberOfSymbols",
  "MajorLinkerVersion",
  "MinorLinkerVersion",
  "MajorOperatingSystemVersion",
  "MinorOperatingSystemVersion",
  "MajorImageVersion",
  "MinorImageVersion",
  "CheckSum",
  "NumberOfDLLs",
  "NumberOfFunctions"
] 

# DLL names (lower case) for which get_features_static keeps an individual import counter.
dlls_required = [
  'kernel32.dll',
 'advapi32.dll',
 'user32.dll',
 'gdi32.dll',
 'ws2_32.dll',
 'ntdll.dll',
 'crypt32.dll',
 'shell32.dll',
 'wsock32.dll',
 'wininet.dll',
 'msvcrt.dll'
]

# NOTE(review): not referenced anywhere else in the visible part of the notebook -
# confirm whether it is still needed.
file_packing_features = [
  "SizeOfRawData"
]

# Imported function names for which get_features_static keeps an individual
# counter. Matching is by exact string equality with the imported name.
#
# Several names are listed more than once ('FindWindow', 'CreateRemoteThread',
# 'WriteProcessMemory', 'ReadProcessMemory', 'OpenProcess', 'NtOpenProcess',
# 'NtReadVirtualMemory', 'NtWriteVirtualMemory'); duplicates have no effect
# because the names end up as dictionary keys.
#
# NOTE(review): the following entries look like misspelled names or like two
# names fused by a missing comma, in which case they can never match an import -
# confirm and correct (correcting them changes the feature columns, so models
# and cached features must be regenerated):
#   'NtSettInformationProcessDebugActiveProcess', 'LoadResourceVirtualProtectEx',
#   'CommandLineToArg', 'ConnetNamedPipe', 'PeekNamedPike',
#   'inetaddrInternetReadFie', 'CryptAcqureContext', 'ControlServiceOpenSCManager',
#   'MapViewofFile', 'SfcTerminateWeatherThread', 'WriteProcessMemoryResumeThread'
function_calls = [
 'RegCloseKey',
 'RegOpenKey',
 'RegQueryValue',
 'RegSetValue',
 'RtlCreateRegistryKey',
 'RtlWriteRegistryValue',
 'CheckRemoteDebuggerPresent',
 'FindWindow',
 'GetLastError',
 'IsDebuggerPresent',
 'sleep',
 'OutputDebugString',
 'GetAdaptersInfo',
 'FindWindow',
 'GetTickCount',
 'NtSettInformationProcessDebugActiveProcess',
 'QueryPerformanceCounter',
 'NtQueryInformationProcess',
 'VirtualAllocEx',
 'LoadLibrary',
 'VirtualFree',
 'GetProcAddress',
 'LdrLoadDll',
 'LoadResourceVirtualProtectEx',
 'CommandLineToArg',
 'ShellExecute',
 'system',
 'WinExec',
 'SetWindowsHook',
 'RegisterHotKey',
 'GetKeyState',
 'MapVirtualKey',
 'listen',
 'socket',
 'accept',
 'bind',
 'connect',
 'send',
 'recv',
 'FtpPutFile',
 'InternetOpen',
 'InternetOpenUrl',
 'InternetWriteFile',
 'ConnetNamedPipe',
 'PeekNamedPike',
 'gethostbyname',
 'inetaddrInternetReadFie',
 'BitBlt',
 'GetDC',
 'CryptDecrypt',
 'CryptGenRandom',
 'CryptAcqureContext',
 'SetPrivilege',
 'LookupPrivilege',
 'CreateRemoteThread',
 'WriteProcessMemory',
 'ReadProcessMemory',
 'OpenProcess',
 'NtOpenProcess',
 'NtReadVirtualMemory',
 'NtWriteVirtualMemory',
 'CreateFile',
 'CreateFileMapping',
 'CreateMutex',
 'CreateProcess',
 'CreateService',
 'ControlServiceOpenSCManager',
 'StartServiceCtrlDispatcher',
 'CreateRemoteThread',
 'WriteProcessMemory',
 'ReadProcessMemory',
 'OpenProcess',
 'NtOpenProcess',
 'NtReadVirtualMemory',
 'NtWriteVirtualMemory',
 'MapViewofFile',
 'Module32First',
 'Module32Next',
 'OpenMutex',
 'OpenProcess',
 'QueueUserAPC',
 'SetFileTime',
 'SfcTerminateWeatherThread',
 'SuspendThread',
 'Thread32First',
 'Thread32Next',
 'WriteProcessMemoryResumeThread',
 'DllCanUnloadNow',
 'DllGetClassObject',
 'DllInstall',
 'DllRegisterServer',
 'DllUnregisterServer',
 'NetScheduleJobAdd',
 'FindFirstFile',
 'FindNextFile',
 'FindResource',
 'WSAStartup'
]

In [None]:
def get_dll_and_function(line):
  """Split the first whitespace-separated token of ``line`` into DLL and function.

  The token is cut at its dots: the first two pieces, re-joined with a dot and
  lower-cased, form the DLL name; the last piece is the function name.

  Returns:
    Tuple ``(dll, function)``.

  Raises:
    IndexError: if ``line`` holds no non-whitespace characters.
  """
  first_token = line.split()[0]
  pieces = first_token.split(".")
  dll_name = ".".join(pieces[:2]).lower()
  return dll_name, pieces[-1]

def _parse_header_value(params, key):
  """Return the literal that follows ``key`` in ``params``, or None if absent or unparseable."""
  position = params.index(key) + 1
  if position >= len(params):
    return None
  try:
    return literal_eval(params[position])
  except (ValueError, SyntaxError):
    return None


def get_features_static(lines):
  """Extract static features from the text lines of one sample's dump.

  Args:
    lines: iterable of text lines.

  Returns:
    defaultdict with one counter per name in ``function_calls``, one per name
    in ``dlls_required`` and one value per name in ``meta_data_features``:
      * header fields take the literal that follows the field name on its line
        (the last occurrence wins);
      * "CompileTimeIndicator" is 1 when the parsed "TimeDateStamp" lies in the
        future, else 0;
      * "NumberOfDLLs" / "NumberOfFunctions" are the numbers of distinct DLLs
        and functions seen on import lines (lines containing ``Hint[<digits>]``).

  Behaviour fixed relative to the previous version:
    * an import line is counted once; it used to be counted once per entry of
      ``meta_data_features`` because the check sat inside that loop;
    * field names are matched as whole tokens, so a name that merely contains
      a field name no longer raises ``ValueError``;
    * a field whose value is missing or not a literal is skipped instead of
      aborting the whole sample.
  """
  file_features = dd(int)
  dlls = set()
  functions = set()
  dll_func_regex = re.compile(r'Hint\[\d*\]')
  for x in function_calls:
    file_features[x] = 0
  for x in dlls_required:
    file_features[x] = 0
  for x in meta_data_features:
    file_features[x] = 0

  now_time = int(time.time())
  for line in lines:
    # Drop colons so that "Name:" becomes the plain token "Name".
    line = line.replace(":","")
    params = line.split()

    for x in meta_data_features:
      if x not in params:
        continue
      value = _parse_header_value(params, x)
      if value is None:
        continue
      file_features[x] = value
      if x == "TimeDateStamp" and isinstance(value, (int, float)):
        file_features["CompileTimeIndicator"] = 1 if value > now_time else 0

    if dll_func_regex.search(line):
      dll, function = get_dll_and_function(line)
      dlls.add(dll)
      functions.add(function)
      if dll in dlls_required:
        file_features[dll] += 1
      if function in function_calls:
        file_features[function] += 1

  file_features["NumberOfDLLs"] = len(dlls)
  file_features["NumberOfFunctions"] = len(functions)
  return file_features

def get_static_file_features(path, folder):
  """Read one sample folder and return its static features.

  Args:
    path: directory holding the sample folders.
    folder: name of the sample folder inside ``path``.

  Returns:
    Feature dictionary produced by ``get_features_static``.

  Raises:
    IndexError: if the sample folder holds fewer than two entries.

  The second entry of the *sorted* folder listing is parsed. ``os.listdir``
  returns names in arbitrary order, so without sorting the file picked by
  index 1 could differ between runs or file systems.
  NOTE(review): this assumes the dump to parse is the alphabetically second
  file of each sample folder - confirm against the dataset layout.
  """
  sample_dir = os.path.join(path, folder)
  files = sorted(os.listdir(sample_dir))
  file = files[1]
  with open(os.path.join(sample_dir, file)) as f:
    lines = f.readlines()
  return get_features_static(lines)

In [None]:
# Inputs and accumulators for static feature extraction on the training split:
# dir1 holds the malware sample folders, dir2 the benign ones.
dir1 = static_train_malware
dir2 = static_train_benign

static_benign_features = []
static_malware_features = []

malwares = os.listdir(dir1)
benigns = os.listdir(dir2)

In [None]:
# Extract static features for every benign training sample. A sample that
# cannot be parsed is removed from the training folder.
for folder in tqdm(benigns,position=0, leave=True):
  try:
    feats = get_static_file_features(dir2, folder)
    static_benign_features.append(feats)
  except Exception as err:
    # `except Exception` (not a bare except): interrupting the cell must never
    # end up deleting the sample that happened to be in progress.
    tqdm.write("Removing unparseable benign sample %s (%r)" % (folder, err))
    shutil.rmtree(os.path.join(dir2, folder))

In [None]:
# Extract static features for every malware training sample. A sample that
# cannot be parsed is removed from the training folder.
for folder in tqdm(malwares,position=0, leave=True):
  try:
    feats = get_static_file_features(dir1, folder)
    static_malware_features.append(feats)
  except Exception as err:
    # `except Exception` (not a bare except): interrupting the cell must never
    # end up deleting the sample that happened to be in progress.
    tqdm.write("Removing unparseable malware sample %s (%r)" % (folder, err))
    shutil.rmtree(os.path.join(dir1, folder))

In [None]:
# Write the static training features to intermediate JSON files; the next cell
# converts them to pickles and deletes the JSON.
with open('static_benign_features.json', 'w') as fp:
    json.dump(static_benign_features, fp)
with open('static_malware_features.json', 'w') as fp:
    json.dump(static_malware_features, fp)

In [None]:
# Convert the intermediate JSON feature files into pickles under
# train_features/ and delete the JSON afterwards.
os.makedirs("train_features", exist_ok=True)

with open('static_benign_features.json', 'r') as fp:
    data = json.load(fp)
with open("train_features/static_benign_features.p", "wb") as f:
    pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
# Remove the source only after it is closed and the pickle is written, so a
# failed dump cannot lose the features.
os.remove('static_benign_features.json')

with open('static_malware_features.json', 'r') as fp:
    data = json.load(fp)
with open("train_features/static_malware_features.p", "wb") as f:
    pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
os.remove('static_malware_features.json')

In [None]:
# Assemble a small hand-out sample under /content/testing: the first five
# entries of each test split (static samples are folders, dynamic ones are files).
s1 = os.listdir(static_test_benign)[:5]
s2 = os.listdir(static_test_malware)[:5]
s3 = os.listdir(dynamic_test_benign)[:5]
s4 = os.listdir(dynamic_test_malware)[:5]

# Show the names of the benign picks (static + dynamic).
x = s1 + s3
x.sort()
print(x)

# NOTE: os.mkdir raises FileExistsError if this cell is run a second time.
os.mkdir("/content/testing")
move_to = "/content/testing"

# Static samples: copy each sample folder with its contents.
for x in s1:
  os.mkdir(os.path.join(move_to, x))
  copy_tree(os.path.join(static_test_benign, x), os.path.join(move_to, x))

for x in s2:
  os.mkdir(os.path.join(move_to, x))
  copy_tree(os.path.join(static_test_malware, x), os.path.join(move_to, x))

# Dynamic samples: copy the report files.
for x in s3:
  shutil.copy(os.path.join(dynamic_test_benign, x), move_to)

for x in s4:
  shutil.copy(os.path.join(dynamic_test_malware, x), move_to)


In [None]:
!zip -r /content/testing.zip /content/testing

In [None]:
!pip install -U scikit-learn

In [None]:
# The static training features are pickled under train_features/ by the
# conversion cell above, so they are loaded from there.
static_malware_path = "train_features/static_malware_features.p"
static_benign_path = "train_features/static_benign_features.p"

In [None]:
# Load the pickled static features and build the labelled training set
# (label 1 = malware, 0 = benign).
with open(static_malware_path, "rb") as f:
  static_malware_data = pickle.load(f)
with open(static_benign_path, "rb") as f:
  static_benign_data = pickle.load(f)


static_malware = pd.DataFrame(static_malware_data)
malware_label = [1]*len(static_malware_data)
static_malware["label"] = malware_label

static_benign = pd.DataFrame(static_benign_data)
benign_label = [0]*len(static_benign_data)
static_benign["label"] = benign_label

# Malware rows first, then benign rows; the labels are split off into their own series.
static_dataset = pd.concat([static_malware, static_benign], axis=0)
static_train_labels = static_dataset["label"]
static_train_data = static_dataset.drop(["label"], axis=1)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
# Train the static-feature classifier.
clf = RandomForestClassifier()
clf.fit(static_train_data, static_train_labels)

# No earlier cell creates "models/", so make sure it exists before writing into it.
os.makedirs("models", exist_ok=True)
with open("models/static_model.p", "wb") as f:
  pickle.dump(clf,f,protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Extract static features for the test split. Samples that cannot be parsed
# are skipped (and reported); they are not deleted here.
test_malwares = os.listdir(static_test_malware)
test_benigns = os.listdir(static_test_benign)
static_benign_features = []
for folder in tqdm(test_benigns,position=0, leave=True):
  try:
    feats = get_static_file_features(static_test_benign, folder)
    static_benign_features.append(feats)
  except Exception as err:
    # `except Exception` keeps the cell interruptible, unlike a bare except.
    tqdm.write("Skipping unparseable benign test sample %s (%r)" % (folder, err))
static_malware_features = []
for folder in tqdm(test_malwares,position=0, leave=True):
  try:
    feats = get_static_file_features(static_test_malware, folder)
    static_malware_features.append(feats)
  except Exception as err:
    tqdm.write("Skipping unparseable malware test sample %s (%r)" % (folder, err))

In [None]:
# Build the static test set (benign rows first, then malware; label 0 = benign,
# 1 = malware), cache it, and evaluate the static classifier on it.
static_benign_test = pd.DataFrame(static_benign_features)
static_malware_test = pd.DataFrame(static_malware_features)
static_test_data = pd.concat([static_benign_test, static_malware_test])
static_test_labels = np.array([0]*len(static_benign_test) + [1]*len(static_malware_test))

if os.path.isdir("test_features") == False:
  os.mkdir("test_features")


with open("test_features/static_test_features.p", "wb") as f:
  pickle.dump((static_test_data, static_test_labels),f,protocol=pickle.HIGHEST_PROTOCOL)


# The bare array (.values) is passed, so columns are matched to the model by
# position only; this relies on train and test frames sharing the column order.
preds_static = clf.predict(static_test_data.values)
print(sklearn.metrics.classification_report(static_test_labels, preds_static))

In [None]:
# Load the pickled dynamic training features from train_features/ and build the
# training matrix (malware rows first; label 1 = malware, 0 = benign).
dynamic_malware_train_data = []
dynamic_benign_train_data = []

with open("train_features/dynamic_malware_features.p", "rb") as f:
  dynamic_malware_train_data = pickle.load(f)
with open("train_features/dynamic_benign_features.p", "rb") as f:
  dynamic_benign_train_data = pickle.load(f)

dyn_mal = pd.DataFrame(dynamic_malware_train_data)
dyn_ben = pd.DataFrame(dynamic_benign_train_data)
dynamic_train_labels = np.array([1]*len(dyn_mal) + [0]*len(dyn_ben))
dynamic_train_data = pd.concat([dyn_mal, dyn_ben])

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# Create the output directory before the (potentially slow) fit so that a
# missing "models/" folder cannot discard the trained classifier at dump time.
os.makedirs("models", exist_ok=True)

# Fit a default-parameter random forest on the dynamic-analysis training set.
clf1 = RandomForestClassifier()
clf1.fit(dynamic_train_data, dynamic_train_labels)

# Persist the fitted model; the inference cell reloads it from this exact path.
with open("models/dynamic_model.p", "wb") as f:
  pickle.dump(clf1, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Extract dynamic features for every benign test report.
benf = [
  get_features_of_file(dynamic_test_benign, report)
  for report in tqdm(os.listdir(dynamic_test_benign), position=0, leave=True)
]
# Same extraction for the malware test reports.
malf = [
  get_features_of_file(dynamic_test_malware, report)
  for report in tqdm(os.listdir(dynamic_test_malware), position=0, leave=True)
]

In [None]:
# Assemble the dynamic test set: benign rows first (label 0), malware rows
# after them (label 1), matching the order used in the concat below.
benign_dynamic_data = pd.DataFrame(benf)
malware_dynamic_data = pd.DataFrame(malf)
dynamic_test_labels = np.array([0] * len(benign_dynamic_data) + [1] * len(malware_dynamic_data))
dynamic_test_data = pd.concat([benign_dynamic_data, malware_dynamic_data])

# Cache the extracted test features alongside the static ones.
with open("test_features/dynamic_test_features.p", "wb") as f:
  pickle.dump((dynamic_test_data, dynamic_test_labels), f, protocol=pickle.HIGHEST_PROTOCOL)

# Evaluate the dynamic model (``clf1``) on the held-out samples.
preds_dynamic = clf1.predict(dynamic_test_data)
print(sklearn.metrics.classification_report(dynamic_test_labels, preds_dynamic))

In [None]:
# dynamic_test_data, dynamic_test_labels
# static_test_data, static_test_labels
# dynamic_train_data, dynamic_train_labels
# static_train_data, static_train_labels

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
# Baseline: default-parameter SVC trained on the dynamic features.
# ``fit`` returns the estimator itself, so it can be chained onto construction.
svc = SVC().fit(dynamic_train_data, dynamic_train_labels)
# Last expression of the cell: score on the dynamic test split.
svc.score(dynamic_test_data, dynamic_test_labels)

In [None]:
# Comparison: default-parameter random forest on the same dynamic features.
# ``fit`` returns the estimator itself, so it can be chained onto construction.
rfc = RandomForestClassifier().fit(dynamic_train_data, dynamic_train_labels)
# Last expression of the cell: score on the dynamic test split.
rfc.score(dynamic_test_data, dynamic_test_labels)

In [None]:
def get_score(model, x_train, y_train, x_test, y_test):
  """Fit ``model`` on the training split and evaluate it on the test split.

  Args:
    model: Estimator exposing ``fit``, ``predict`` and ``score``.
    x_train, y_train: Training features and labels passed to ``model.fit``.
    x_test, y_test: Evaluation features and labels.

  Returns:
    A 2-tuple ``(score, report)``: the value of ``model.score`` on the test
    split and the ``classification_report`` for the test predictions.
  """
  model.fit(x_train, y_train)
  test_predictions = model.predict(x_test)
  test_score = model.score(x_test, y_test)
  report = sklearn.metrics.classification_report(y_true=y_test, y_pred=test_predictions)
  return (test_score, report)

In [None]:
from sklearn.metrics import classification_report, accuracy_score, make_scorer

def my_scorer(y_true, y_pred):
  """Accuracy metric that also prints a classification report.

  Wrapped with ``make_scorer`` in the cross-validation cells, so a report is
  printed each time the scorer is invoked.

  Args:
    y_true: Ground-truth labels.
    y_pred: Predicted labels.

  Returns:
    The accuracy of ``y_pred`` against ``y_true``.
  """
  report = classification_report(y_true, y_pred)
  print(report)
  accuracy = accuracy_score(y_true, y_pred)
  return accuracy

In [None]:
from sklearn.model_selection import cross_val_score
# Pool the dynamic train and test splits (rows stacked, labels in the same
# order) and cross-validate a fresh random forest on the combined set.
dynamic_labels = np.concatenate((dynamic_train_labels, dynamic_test_labels))
dynamic_data = pd.concat([dynamic_train_data, dynamic_test_data])
cross_val_score(
  RandomForestClassifier(),
  dynamic_data,
  dynamic_labels,
  scoring=make_scorer(my_scorer),
)

In [None]:
# Pool the static train and test splits (rows stacked, labels in the same
# order) and cross-validate a fresh random forest on the combined set.
static_labels = np.concatenate((static_train_labels, static_test_labels))
static_data = pd.concat([static_train_data, static_test_data])
cross_val_score(
  RandomForestClassifier(),
  static_data,
  static_labels,
  scoring=make_scorer(my_scorer),
)

In [None]:
def get_as_string(a):
  r"""Quote every word token of ``a`` and join the results into one string.

  Tokens are the matches of the regex ``\w+``. Each token is wrapped in double
  quotes and followed by ``", "`` (so the result keeps a trailing ``", "``).

  Args:
    a: Text to tokenize.

  Returns:
    The concatenated quoted tokens, or ``''`` when ``a`` has no word tokens.
  """
  return ''.join('"' + token + '", ' for token in re.findall(r'\w+', a))

In [None]:
# Inference configuration.
# NOTE: ``dataset_path`` rebinds the name used for the Drive dataset folder at
# the top of the notebook; from here on it refers to the samples to classify.
dataset_path = "testing/"

# Every directory entry is classified by the driver loop further down.
files = os.listdir(dataset_path)

# Locations of the pickled classifiers read by ``load_models``.
static_model_path = "models/static_model.p"
dynamic_model_path = "models/dynamic_model.p"


def classify_dynamic(file):
    """Predict the class of one dynamic-analysis sample.

    Args:
        file: Name of an entry inside ``dataset_path``.

    Returns:
        The label predicted by ``dynamic_model`` (the driver loop treats 0 as
        benign and anything else as malware). If feature extraction or
        prediction fails, a random 0/1 guess is returned instead so that every
        sample still receives an answer.
    """
    try:
        file_features = get_features_of_file(dataset_path, file)
        data = pd.DataFrame([file_features])
        return dynamic_model.predict(data.values.reshape(1, -1))[0]
    except Exception as exc:
        # Best-effort fallback is intentional, but it is narrowed from a bare
        # ``except`` so Ctrl-C / SystemExit propagate, and it is reported so
        # that a systematic failure is not mistaken for real predictions.
        print("classify_dynamic: random fallback for %r (%r)" % (file, exc))
        return random.randint(0, 1)


def classify_static(folder):
    """Predict the class of one static-analysis sample.

    Args:
        folder: Name of an entry inside ``dataset_path``.

    Returns:
        The label predicted by ``static_model`` (the driver loop treats 0 as
        benign and anything else as malware). If feature extraction or
        prediction fails, a random 0/1 guess is returned instead so that every
        sample still receives an answer.
    """
    try:
        file_features = get_static_file_features(dataset_path, folder)
        data = pd.DataFrame([file_features])
        return static_model.predict(data.values.reshape(1, -1))[0]
    except Exception as exc:
        # Best-effort fallback is intentional, but it is narrowed from a bare
        # ``except`` so Ctrl-C / SystemExit propagate, and it is reported so
        # that a systematic failure is not mistaken for real predictions.
        print("classify_static: random fallback for %r (%r)" % (folder, exc))
        return random.randint(0, 1)


def write_csv(output):
    """Write a mapping to ``output.csv`` in the current working directory.

    One ``key,value`` line is written per entry, in the mapping's iteration
    order, with no header row and no quoting. An existing file is overwritten.

    Args:
        output: Mapping whose keys and values are formatted with ``%s``.
    """
    with open("output.csv", 'w') as csv_file:
        csv_file.writelines(
            "%s,%s\n" % (name, label) for name, label in output.items()
        )


def load_models():
    """Unpickle the static and dynamic classifiers from their configured paths.

    Reads ``static_model_path`` and ``dynamic_model_path`` (module-level
    settings). Unpickling can execute arbitrary code, so these files must be
    ones produced by this notebook or otherwise trusted.

    Returns:
        A 2-tuple ``(static_model, dynamic_model)``.
    """
    loaded = []
    # Order matters: callers unpack the result as (static, dynamic).
    for model_path in (static_model_path, dynamic_model_path):
        with open(model_path, "rb") as model_file:
            loaded.append(pickle.load(model_file))
    return tuple(loaded)


# Verdict per sample id; ids that are never assigned read back as "Benign".
output = dd(lambda: "Benign")
static_model, dynamic_model = load_models()

for file in files:
    # Plain files go to the dynamic classifier; every other directory entry
    # goes to the static classifier.
    is_plain_file = os.path.isfile(os.path.join(dataset_path, file))
    classifier = classify_dynamic if is_plain_file else classify_static
    prediction = classifier(file)
    # The sample id is the entry name up to its first dot.
    key = file.split(".")[0]
    output[key] = "Benign" if prediction == 0 else "Malware"
