In [0]:
# Mount Google Drive inside the Colab VM at /content/drive.
# The CSV files read later in this notebook live under
# '/content/drive/My Drive/Microsoft Malware/'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import math
import gc
# Turn automatic garbage collection on and run one collection pass right away.
# gc.collect() is the last expression, so its return value is shown as the
# cell output.
gc.enable()
gc.collect()

0

In [0]:
# Columns that get FREQUENCY ENCODING (one "<col>_FE" feature each).
FE = [
    'AppVersion',
    'AvSigVersion',
    'Census_OSVersion',
]

# Columns that get ONE-HOT ENCODING of their selected values.
# The order matters: the per-column dictionaries produced while encoding the
# training data are matched to this list by position.
OHE = [
    'IsProtected',
    'Census_IsAlwaysOnAlwaysConnectedCapable',
    'Census_FlightRing',
    'Census_HasOpticalDiskDrive',
    'Census_OSArchitecture',
    'Census_GenuineStateName',
    'Census_IsTouchEnabled',
    'Census_InternalBatteryType',
    'SmartScreen',
    'Wdft_IsGamer',
    'AVProductsInstalled',
    'Census_PowerPlatformRoleName',
    'OrganizationIdentifier',
    'AVProductStatesIdentifier',
    'Census_PrimaryDiskTypeName',
    'Census_MDC2FormFactor',
    'Census_ProcessorCoreCount',
    'Census_ChassisTypeName',
    'Census_InternalBatteryNumberOfCharges',
    'Census_ActivationChannel',
    'Census_IsSecureBootEnabled',
    'Census_InternalPrimaryDisplayResolutionHorizontal',
    'Census_TotalPhysicalRAM',
    'Census_OSBranch',
    'Census_OSWUAutoUpdateOptionsName',
    'OsPlatformSubRelease',
    'OsBuildLab',
    'Census_OSEdition',
    'Census_OSUILocaleIdentifier',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'Census_PrimaryDiskTotalCapacity',
    'Census_FirmwareManufacturerIdentifier',
    'Census_OSInstallTypeName',
    'LocaleEnglishNameIdentifier',
    'Wdft_RegionIdentifier',
    'GeoNameIdentifier',
    'Census_OSBuildRevision',
    'Census_OEMNameIdentifier',
    'CountryIdentifier',
    'Census_OEMModelIdentifier',
    'Census_ProcessorModelIdentifier',
    'CityIdentifier',
    'Census_FirmwareVersionIdentifier',
    'Census_SystemVolumeTotalCapacity',
]

In [0]:
# LOAD ALL AS CATEGORIES
# Every feature column is read as a pandas category; the machine id is read as
# a string and the label as int8.  'HasDetections' is added last, so it stays
# the final key of the dict.
dtypes = dict.fromkeys(FE + OHE, 'category')
dtypes.update({'MachineIdentifier': 'str', 'HasDetections': 'int8'})

In [0]:
# Run a garbage-collection pass; the return value is shown as the cell output.
gc.collect()

72115

In [0]:
# LOAD CSV FILE
# Only the columns named in `dtypes` are read, each with the dtype given there.
df_train = pd.read_csv(
    '/content/drive/My Drive/Microsoft Malware/train.csv',
    usecols=dtypes.keys(),
    dtype=dtypes,
)
print ('Loaded',df_train.shape[0],'rows of TRAIN.CSV!')
# Last expression of the cell: its return value is displayed as the output.
gc.collect()

Loaded 8921483 rows of TRAIN.CSV!


358365

In [0]:
# DOWNSAMPLE
# Keep a random subset of the training rows for fitting and validation.
sm = 2000000
# random_state pins the draw so the same rows are selected on every run.
# min(...) stops sample() from raising when fewer than `sm` rows are available.
df_train = df_train.sample(min(sm, len(df_train)), random_state=42)
# Report the number of rows actually kept (equal to `sm` unless the frame was smaller).
print ('Only using',len(df_train),'rows to train and validate')
x=gc.collect()

Only using 2000000 rows to train and validate


In [0]:
import math

# CHECK FOR NAN
def nan_check(x):
    """Return True only when ``x`` is a float holding NaN.

    Non-float values (strings, ints, None, ...) always give False, so the
    helper is safe to call on arbitrary category values.
    """
    return isinstance(x, float) and math.isnan(x)

# FREQUENCY ENCODING
def encode_FE(df,col,verbose=1):
    """Add a frequency-encoded copy of ``col`` to ``df`` (modifies ``df`` in place).

    Each value is replaced by the number of rows in ``df`` holding that value,
    divided by the count of the most frequent value, so the most common value
    maps to 1.0.  The counts come from ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read ``col`` from and to add the new column to.
    col : str
        Name of the column to encode.
    verbose : int, default 1
        When equal to 1, print a one-line confirmation.

    Returns
    -------
    list of str
        Single-element list with the name of the new column, ``col + "_FE"``.
    """
    d = df[col].value_counts(dropna=False)
    n = col+"_FE"
    # On a categorical column, map() returns another categorical whenever every
    # category has a distinct count, and a categorical cannot be divided.
    # Casting to float first keeps the division valid for every input dtype and
    # leaves the result unchanged in the cases that already worked.
    # NOTE(review): how missing values in a categorical column are mapped may
    # depend on the pandas version - confirm if the encoded columns contain NaN.
    df[n] = df[col].map(d).astype('float64')/d.max()
    if verbose==1:
        print('FE encoded',col)
    return [n]

# ONE-HOT-ENCODE ALL CATEGORY VALUES THAT COMPRISE MORE THAN
# "FILTER" PERCENT OF TOTAL DATA AND HAS SIGNIFICANCE GREATER THAN "ZVALUE"
def encode_OHE(df, col, filter, zvalue, tar='HasDetections', m=0.5, verbose=1):
    """One-hot encode the frequent, target-relevant values of ``col`` (modifies ``df`` in place).

    Values are visited from most to least frequent.  A value gets its own int8
    indicator column ``col + '_BE_' + str(value)`` when both conditions hold:

    * it occurs in at least ``filter * len(df)`` rows, and
    * the mean of ``tar`` over those rows differs from ``m`` by more than
      ``zvalue * 0.5 / sqrt(count)``.

    The scan stops at the first value below the frequency threshold.  When the
    column has two or more distinct values, the last entry of the value counts
    is never examined.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``col`` and ``tar``; indicator columns are added to it.
    col : str
        Column to encode.  Missing values are counted as a value of their own.
    filter : float
        Minimum share of rows a value must cover.
    zvalue : float
        Multiplier on the ``0.5 / sqrt(count)`` term of the significance test.
    tar : str, default 'HasDetections'
        Target column whose mean is compared against ``m``.
    m : float, default 0.5
        Reference target mean.
    verbose : int, default 1
        When equal to 1, print how many indicators were created.

    Returns
    -------
    list
        ``[names, values]``: the list of created column names and a dict keyed
        by the encoded values (each mapped to 1), in creation order.
    """
    counts = df[col].value_counts(dropna=False)
    vals = len(counts)
    th = filter * len(df)
    n = []; d = {}
    # Iterating (value, count) pairs avoids looking a value up again by label,
    # which is what used to require a try / bare-except fallback.
    for ct, (x, cnt) in enumerate(counts.items(), start=1):
        # Counts are sorted in descending order, so nothing after the first
        # too-rare (or empty) value can qualify.
        if cnt < th or cnt <= 0:
            break
        sd = zvalue * 0.5/ math.sqrt(cnt)
        # One boolean mask serves both the target mean and the indicator column.
        if nan_check(x):
            mask = df[col].isna()
        else:
            mask = df[col]==x
        r = df.loc[mask, tar].mean()
        if abs(r-m)>sd:
            nm = col+'_BE_'+str(x)
            df[nm] = mask.astype('int8')
            n.append(nm)
            d[x] = 1
        # Stop before the final entry of the value counts.
        if (ct+1)>=vals:
            break
    if verbose==1:
        print('OHE encoded',col,'- Created',len(d),'booleans')
    return [n,d]

# ONE-HOT-ENCODING from dictionary
def encode_OHE_test(df,col,dt):
    """Create the indicator columns listed in ``dt`` for column ``col`` of ``df``.

    ``dt`` is iterated in order (for a dict, its keys) and each entry is passed
    to ``encode_BE``.  Returns the names of the created columns, in that order.
    """
    created = []
    for value in dt:
        created.extend(encode_BE(df, col, value))
    return created

# BOOLEAN ENCODING
def encode_BE(df,col,val):
    """Add an int8 indicator column to ``df`` marking the rows where ``col`` equals ``val``.

    A NaN ``val`` marks the rows where ``col`` is missing instead.  The new
    column is named ``col + "_BE_" + str(val)``; its name is returned in a
    single-element list.
    """
    flag_name = col+"_BE_"+str(val)
    matches = df[col].isna() if nan_check(val) else (df[col]==val)
    df[flag_name] = matches.astype('int8')
    return [flag_name]

In [0]:
# `cols` collects the names of all engineered features; `dd` keeps, for each
# OHE column (same order as OHE), the dict of values that received an indicator.
cols, dd = [], []

# ENCODE NEW
for fe_col in FE:
    cols.extend(encode_FE(df_train, fe_col))
for ohe_col in OHE:
    created, kept_values = encode_OHE(df_train, ohe_col, 0.005, 5)
    cols.extend(created)
    dd.append(kept_values)
print('Encoded',len(cols),'new variables')
x = gc.collect()
# REMOVE OLD
originals = FE+OHE
for name in originals:
    del df_train[name]
print('Removed original',len(originals),'variables')
x = gc.collect()

FE encoded AppVersion
FE encoded AvSigVersion
FE encoded Census_OSVersion
OHE encoded IsProtected - Created 2 booleans
OHE encoded Census_IsAlwaysOnAlwaysConnectedCapable - Created 2 booleans
OHE encoded Census_FlightRing - Created 1 booleans
OHE encoded Census_HasOpticalDiskDrive - Created 1 booleans
OHE encoded Census_OSArchitecture - Created 2 booleans
OHE encoded Census_GenuineStateName - Created 1 booleans
OHE encoded Census_IsTouchEnabled - Created 1 booleans
OHE encoded Census_InternalBatteryType - Created 4 booleans
OHE encoded SmartScreen - Created 4 booleans
OHE encoded Wdft_IsGamer - Created 2 booleans
OHE encoded AVProductsInstalled - Created 3 booleans
OHE encoded Census_PowerPlatformRoleName - Created 4 booleans
OHE encoded OrganizationIdentifier - Created 3 booleans
OHE encoded AVProductStatesIdentifier - Created 9 booleans
OHE encoded Census_PrimaryDiskTypeName - Created 3 booleans
OHE encoded Census_MDC2FormFactor - Created 6 booleans
OHE encoded Census_ProcessorCoreCo

In [0]:
# Run a garbage-collection pass; the return value is shown as the cell output.
gc.collect()

0

In [0]:
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from keras.callbacks import LearningRateScheduler
from keras.optimizers import Adam

#SPLIT TRAIN AND VALIDATION SET
# 70/30 hold-out split on the engineered feature columns.  random_state pins
# the shuffle so the validation rows, and therefore the reported validation
# metrics, are the same on every run.
X_train, X_val, Y_train, Y_val = train_test_split(
    df_train[cols], df_train['HasDetections'], test_size = 0.3, random_state = 42)

# BUILD MODEL
# Two hidden blocks of Dense(100) -> Dropout(0.4) -> BatchNormalization -> ReLU,
# followed by one sigmoid unit that outputs the detection probability.
model = Sequential()
model.add(Dense(100,input_dim=len(cols)))
model.add(Dropout(0.4))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(100))
model.add(Dropout(0.4))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer=Adam(lr=0.01), loss="binary_crossentropy", metrics=["accuracy"])
# Learning-rate schedule: starts at 1e-2 and is multiplied by 0.95 per epoch
# (the callback passes the epoch index as x).
annealer = LearningRateScheduler(lambda x: 1e-2 * 0.95 ** x)

# TRAIN MODEL
# The three callbacks below are disabled; their classes are not imported in this cell.
#earlyStopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='min')
#mcp_save = ModelCheckpoint('.mdl_wts.hdf5', save_best_only=True, monitor='val_loss', mode='min')
#reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=7, verbose=1, epsilon=1e-4, mode='min')
# Last expression of the cell: the History object returned by fit() is displayed.
model.fit(X_train,Y_train, batch_size=32, epochs = 8, callbacks=[annealer], validation_data = (X_val,Y_val), verbose=1)

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 1400000 samples, validate on 600000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7fb0517a3908>

In [0]:
# Drop the training frame and the split objects before the test set is read;
# the return value of gc.collect() is shown as the cell output.
del df_train, X_train, X_val, Y_train, Y_val
gc.collect()

49

In [0]:
# Stream TEST.CSV in chunks, encode each chunk like the training data and
# store the model's predictions in `pred` (one row per test row).
n_test_rows = 7853253   # length of the preallocated prediction array
pred = np.zeros((n_test_rows,1))
chunksize = 500000
# The test file has no label column: read every configured column except
# 'HasDetections', named explicitly rather than assumed to be the last key.
test_usecols = [c for c in dtypes if c != 'HasDetections']
# dd holds one value-dict per OHE column, matched by position.
assert len(dd) == len(OHE), 'dd must hold one dictionary per OHE column'
offset = 0   # number of test rows predicted so far
test_chunks = pd.read_csv('/content/drive/My Drive/Microsoft Malware/test.csv', 
            chunksize = chunksize, usecols=test_usecols, dtype=dtypes)
for part, df_test in enumerate(test_chunks, start=1):
    print ('Loaded',len(df_test),'rows of TEST.CSV!')
    # ENCODE TEST
    # Rebuilt per chunk in the same order as during training (FE columns
    # first, then the indicators of each OHE column).
    cols = []
    # NOTE(review): frequencies are computed from the current chunk only, not
    # from the training data, so "<col>_FE" values are on a different basis
    # than the ones the model was fitted on - confirm this is intended.
    for x in FE:
        cols += encode_FE(df_test,x,verbose=0)
    for x, kept_values in zip(OHE, dd):
        cols += encode_OHE_test(df_test,x,kept_values)
    # PREDICT TEST
    # Slice bounds follow the rows actually read, so a short final chunk needs
    # no special handling.
    end = offset + len(df_test)
    pred[offset:end] = model.predict(df_test[cols])
    print('  encoded and predicted part',part)
    offset = end
# Any rows left at their initial 0.0 would silently end up in the submission.
assert offset == n_test_rows, 'predicted %d rows, expected %d' % (offset, n_test_rows)

Loaded 500000 rows of TEST.CSV!
  encoded and predicted part 1
Loaded 500000 rows of TEST.CSV!
  encoded and predicted part 2
Loaded 500000 rows of TEST.CSV!
  encoded and predicted part 3
Loaded 500000 rows of TEST.CSV!
  encoded and predicted part 4
Loaded 500000 rows of TEST.CSV!
  encoded and predicted part 5
Loaded 500000 rows of TEST.CSV!
  encoded and predicted part 6
Loaded 500000 rows of TEST.CSV!
  encoded and predicted part 7
Loaded 500000 rows of TEST.CSV!
  encoded and predicted part 8
Loaded 500000 rows of TEST.CSV!
  encoded and predicted part 9
Loaded 500000 rows of TEST.CSV!
  encoded and predicted part 10
Loaded 500000 rows of TEST.CSV!
  encoded and predicted part 11
Loaded 500000 rows of TEST.CSV!
  encoded and predicted part 12
Loaded 500000 rows of TEST.CSV!
  encoded and predicted part 13
Loaded 500000 rows of TEST.CSV!
  encoded and predicted part 14
Loaded 500000 rows of TEST.CSV!
  encoded and predicted part 15
Loaded 353253 rows of TEST.CSV!
  encoded and pre

In [0]:
# Run a garbage-collection pass; the return value is shown as the cell output.
gc.collect()

16

In [0]:
# Drop the reference to the last test chunk left over from the prediction loop,
# then collect; the return value of gc.collect() is shown as the cell output.
del df_test
gc.collect()

682

In [0]:
# Build the submission: machine ids from the sample submission file plus the
# predicted probabilities.
# NOTE(review): assumes sample_submission.csv lists machines in the same order
# as test.csv - confirm.
df_test = pd.read_csv('/content/drive/My Drive/Microsoft Malware/sample_submission.csv', usecols=['MachineIdentifier'])
# `pred` is allocated as an (n_rows, 1) array; flatten it so the column
# assignment does not depend on pandas accepting a 2-D array for one column.
flat_pred = np.ravel(pred)
assert len(flat_pred) == len(df_test), 'prediction count does not match the submission template'
df_test['HasDetections'] = flat_pred
df_test.to_csv('NN@2.csv', index=False)