# PE files sorter v1.2

In [None]:
%%time
import pefile
import os
import array
import sys
import argparse
import os
import shutil
import struct
from os import path
import glob
import time
import sys
import hashlib
from collections import defaultdict

# Machine-type codes compared against the value returned by GetBinaryType().
# Written in hex; the decimal equivalents are 332, 512 and 34404.
IMAGE_FILE_MACHINE_I386 = 0x014C
IMAGE_FILE_MACHINE_IA64 = 0x0200
IMAGE_FILE_MACHINE_AMD64 = 0x8664

class bcolors:
    """ANSI escape sequences used to colour the script's console output.

    Prefix a message with one of the colour codes and append ``ENDC`` to
    restore the terminal's default rendering.
    """

    BLUE = '\x1b[94m'
    GREEN = '\x1b[92m'
    RED = '\x1b[91m'
    WARNING = '\x1b[93m'
    FAIL = '\x1b[91m'  # same sequence as RED
    ENDC = '\x1b[0m'   # resets all attributes
    
def chunk_reader(fobj, chunk_size=1024):
    """Yield successive pieces of ``fobj``, each read with ``read(chunk_size)``.

    Iteration stops at the first empty (falsy) read, i.e. at end of file.
    """
    piece = fobj.read(chunk_size)
    while piece:
        yield piece
        piece = fobj.read(chunk_size)

def md5(fname):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is opened in binary mode and fed to the hash in 4096-byte
    blocks, so it is never loaded into memory in one piece.
    """
    digest = hashlib.md5()
    with open(fname, 'rb') as stream:
        block = stream.read(4096)
        while block:
            digest.update(block)
            block = stream.read(4096)
    return digest.hexdigest()


def remove_duplicates(path):
    """Delete files under ``path`` whose content duplicates another file.

    The directory tree is walked recursively and every file is hashed with
    ``md5()``. Files sharing a digest are treated as duplicates: the first
    one encountered is kept and all later ones are removed from disk.

    Args:
        path: Root directory to scan. (Inside this function the name shadows
            the module-level ``path`` import; ``os.path`` is used instead.)

    Returns:
        None. Progress and errors are reported on stdout. A file that cannot
        be read or removed is reported and skipped; it does not stop the
        processing of the remaining files.
    """
    if not os.path.isdir(path):  # make sure the given directory exists
        print('specified directory does not exist!')
        return

    md5_dict = defaultdict(list)
    for root, _dirs, files in os.walk(path):  # os.walk descends into subdirectories too
        for filename in files:
            filepath = os.path.join(root, filename)
            print(bcolors.GREEN + "Working: %s." % filepath + bcolors.ENDC)
            try:
                md5_dict[md5(filepath)].append(filepath)
            except OSError as err:
                # Unreadable file: report why and keep scanning.
                print(bcolors.RED + "Working Error: %s" % err + bcolors.ENDC)

    for file_list in md5_dict.values():
        # Keep the first path of each group; everything after it is a duplicate.
        for item in file_list[1:]:
            print(bcolors.BLUE + "Duplicate: %s. (removing it...)" % item + bcolors.ENDC)
            try:
                os.remove(item)
            except OSError as err:
                # Handled per file so one failure does not abandon the
                # remaining duplicates of the same group.
                print(bcolors.RED + "Removing Error: %s" % err + bcolors.ENDC)

    print('Done!')


def ChehckIsPEObject(filename):
    """Return True if ``filename`` is an existing PE executable or DLL.

    Args:
        filename: Path of the file to inspect.

    Returns:
        True when the path is a regular file that ``pefile`` parses and
        reports as an EXE or a DLL. False when the path does not exist, is
        not a regular file, is not a valid PE, or is a PE of another kind.
        The reason for a False result is printed to stdout.
    """
    # check if the input file exists
    if not os.path.exists(filename):
        # The original called an argparse parser that is never created;
        # report the problem directly instead of raising NameError.
        print("cannot access input file '%s'" % filename)
        return False

    # check if the input is a regular file
    if not os.path.isfile(filename):
        print('The given arg is not a file.')
        return False

    try:
        pe = pefile.PE(filename)
    except pefile.PEFormatError as e:
        print("[-] PEFormatError: %s" % e.value)
        return False

    try:
        if not (pe.is_exe() or pe.is_dll()):
            print('Input file should be executable (PE format: exe or dll)')
            return False
    finally:
        # Release the parsed file so it is not kept open after the check.
        pe.close()

    return True

def GetBinaryType(filepath):
    """Return the machine-type code stored in a PE file's header.

    Reads the 32-bit little-endian header offset stored at file offset 60,
    then the 16-bit little-endian value located 4 bytes past that offset
    (per the PE format: just after the 4-byte PE signature). The result is
    meant to be compared with the ``IMAGE_FILE_MACHINE_*`` constants to tell
    x86 from x64 binaries.

    Args:
        filepath: Path of the file to read.

    Returns:
        The machine code as an int.

    Raises:
        OSError: If the file cannot be opened or read.
        struct.error: If the file is too short to contain the fields.
    """
    # ``with`` guarantees the handle is closed even when a read/unpack fails.
    with open(filepath, "rb") as f:
        f.seek(60)
        header_offset = struct.unpack("<L", f.read(4))[0]
        f.seek(header_offset + 4)
        return struct.unpack("<H", f.read(2))[0]


def _copy_sample(src, dest_dir, copier=shutil.copy):
    """Copy ``src`` into ``dest_dir`` under its own base name.

    A failed copy is reported on stdout instead of being raised, so one
    unwritable sample does not abort the whole sorting run.
    """
    try:
        copier(src, os.path.join(dest_dir, os.path.basename(src)))
    except OSError as err:
        print(bcolors.RED + "Error: %s" % err + bcolors.ENDC)


if __name__ == '__main__':

    # Which sample set to process: legitimate (clean) files or malware
    legitimate = True

    # Also copy non-PE objects to the not_pe / not_pe_clean dir
    copy_not_pe = False

    # Remove duplicate files (matched by MD5 hash) instead of sorting
    clean_duplicates = False

    if legitimate:
        root_dir = 'Z:/training_set/not_classified_clean/'
        not_pe_dir = 'Z:/training_set/not_pe_clean/'
        net_pe_dir = 'Z:/training_set/net_pe_clean/'
        x32_dll_dir = 'Z:/training_set/x32_dll_clean/'
        x64_dll_dir = 'Z:/training_set/x64_dll_clean/'
        x32_exe_dir = 'Z:/training_set/x32_exe_clean/'
        x64_exe_dir = 'Z:/training_set/x64_exe_clean/'
    else:
        root_dir = 'Z:/training_set/VirusSignList_Free_140817/'
        not_pe_dir = 'Z:/training_set/not_pe/'
        net_pe_dir = 'Z:/training_set/net_pe/'
        x32_dll_dir = 'Z:/training_set/x32_dll/'
        x64_dll_dir = 'Z:/training_set/x64_dll/'
        x32_exe_dir = 'Z:/training_set/x32_exe/'
        x64_exe_dir = 'Z:/training_set/x64_exe/'

    if clean_duplicates:
        # Deduplication mode: clean the source directory and do nothing else.
        remove_duplicates(root_dir)
    else:
        # (machine type, kind) -> (label used in the log line, destination dir)
        native_targets = {
            (IMAGE_FILE_MACHINE_I386, "EXE"): ("x32 EXE", x32_exe_dir),
            (IMAGE_FILE_MACHINE_I386, "DLL"): ("x32 DLL", x32_dll_dir),
            (IMAGE_FILE_MACHINE_AMD64, "EXE"): ("x64 EXE", x64_exe_dir),
            (IMAGE_FILE_MACHINE_AMD64, "DLL"): ("x64 DLL", x64_dll_dir),
        }

        # scan all regular files directly inside the selected dir
        entries = [entry for entry in os.scandir(root_dir + '.') if entry.is_file()]

        for i, entry in enumerate(entries):
            src = entry.path
            abs_path = os.path.abspath(src)
            print("%d. %s" % (i, src))

            # Anything that is not a PE EXE/DLL is reported and optionally copied aside.
            if not ChehckIsPEObject(src):
                print(bcolors.RED + "%s is NOT a PE file" % abs_path + bcolors.ENDC)
                if copy_not_pe:
                    _copy_sample(src, not_pe_dir, shutil.copy2)
                continue

            # load the PE file; skip it if parsing fails rather than
            # continuing with an undefined or stale ``pe`` object
            try:
                pe = pefile.PE(src)
            except (OSError, pefile.PEFormatError) as err:
                print("[-] pefile open file error: %s" % err)
                continue

            try:
                # Data directory 14 is the CLR runtime header; a non-empty
                # entry marks the file as a .NET assembly.
                try:
                    clr_dir = pe.OPTIONAL_HEADER.DATA_DIRECTORY[14]
                    is_dotnet = clr_dir.VirtualAddress != 0 and clr_dir.Size != 0
                except (AttributeError, IndexError):
                    print(bcolors.RED + "%s error" % abs_path + bcolors.ENDC)
                    is_dotnet = False

                if pe.is_exe():
                    kind = "EXE"
                elif pe.is_dll():
                    kind = "DLL"
                else:
                    kind = None
            finally:
                # Always release the parsed file, even if inspection raised.
                pe.close()

            # .NET assemblies go to their own dir regardless of architecture.
            if is_dotnet:
                print(bcolors.BLUE + "%s PE is .NET executable" % abs_path + bcolors.ENDC)
                _copy_sample(src, net_pe_dir)
                continue

            # Native binaries are sorted by architecture and kind.
            target = native_targets.get((GetBinaryType(src), kind))
            if target is None:
                print(bcolors.WARNING + "%s PE has an unsupported machine type, skipped" % abs_path + bcolors.ENDC)
                continue

            label, dest_dir = target
            print(bcolors.GREEN + "%s PE is %s" % (abs_path, label) + bcolors.ENDC)
            _copy_sample(src, dest_dir)

        print(bcolors.BLUE + "DONE." + bcolors.ENDC)