In [1]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm

In [4]:
# In order to optimize memory usage it is useful to make certain features get certain types
# the dictionary below has been taken from https://www.kaggle.com/theoviel/load-the-totality-of-the-data
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float32',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int16',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float64', # was 'float32'
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float32', # was 'float16'
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float32', # was 'float16'
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float64', # was 'float32'
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float64', # was 'float32'
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32', # was 'float16'
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32', # was 'float16'
        'Census_InternalPrimaryDisplayResolutionVertical':      'float32', # was 'float16'
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float64', # was 'float32'
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }


# provide either train or test .csv file here:
malware = pd.read_csv('train.csv', dtype=dtypes)
#malware_test = pd.read_csv('test.csv', dtype=dtypes)


# this column is 100% repeated so we mighta as well drop it here
malware.drop(['Census_OSArchitecture'], axis = 1, inplace = True)
#malware_test.drop(['Census_OSArchitecture'], axis = 1, inplace = True)

In [3]:
nums = ['Census_ProcessorCoreCount',
        'Census_PrimaryDiskTotalCapacity',
        'Census_SystemVolumeTotalCapacity',
        'Census_TotalPhysicalRAM',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches',
        'Census_InternalPrimaryDisplayResolutionHorizontal',
        'Census_InternalPrimaryDisplayResolutionVertical',
        'Census_InternalBatteryNumberOfCharges']


## apparently these might be the true numericals

# All the columns which have a real continuous data
#continuous_columns = [ 
#    'Census_ProcessorCoreCount',
#    'Census_PrimaryDiskTotalCapacity',
#    'Census_SystemVolumeTotalCapacity',
#    'Census_TotalPhysicalRAM',
#    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
#    'Census_InternalPrimaryDisplayResolutionHorizontal',
#    'Census_InternalPrimaryDisplayResolutionVertical',
#    'Census_InternalBatteryNumberOfCharges',
#    'Census_OSBuildNumber',
#    'Census_OSBuildRevision',
#    'Census_ThresholdOptIn',
#    'OsBuild'
#]

In [4]:
# handle numerical columns first
# to be fair we only need to deal with missing data (or maybe even not? LGBM can handle them)
# we can keep these columns in floats because we have saved a lot of space
# by encoding categories with integers!

for column in nums:
    if malware[column].isnull().any():
        malware[column].fillna(malware[column].mean(), inplace=True)

In [5]:
for col in nums:
    print(col, malware[col].isnull().any())
    
# for some reason there seems to be a bug when we want to take a sum of the non-NaN
# values in columns of type float16 - the sum is always infinity and so the mean is NaN...
# however LGBM can handle missing data? so we are more or less okay with it?

Census_ProcessorCoreCount True
Census_PrimaryDiskTotalCapacity False
Census_SystemVolumeTotalCapacity False
Census_TotalPhysicalRAM False
Census_InternalPrimaryDiagonalDisplaySizeInInches False
Census_InternalPrimaryDisplayResolutionHorizontal False
Census_InternalPrimaryDisplayResolutionVertical False
Census_InternalBatteryNumberOfCharges False


In [6]:
# LGBM can handle categorical variables encoded with integers
# that's wonderful news !

all_cols = [col for col in malware.columns if col != "MachineIdentifier"]

for col in all_cols:
    if col not in nums:
        if malware[col].isnull().any():
            malware[col] = malware[col].astype('category')
            malware[col] = malware[col].cat.add_categories("NaN")
            malware[col].fillna("NaN", inplace=True)
            malware[col] = malware[col].cat.codes
        else:
            malware[col] = malware[col].astype('category')
            malware[col] = malware[col].cat.codes
            
# getting back the original values through:
# malware[col].cat.categories[0]

In [8]:
with open('LGBM_test.pickle', 'wb') as handle:
    pickle.dump(malware, handle, protocol=pickle.HIGHEST_PROTOCOL)