In [1]:
import pandas as pd
import gc 
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }


train = pd.read_csv('train.csv', dtype=dtypes, low_memory=True)
train['MachineIdentifier'] = train.index.astype('uint32')
test  = pd.read_csv('test.csv',  dtype=dtypes, low_memory=True)
test['MachineIdentifier']  = test.index.astype('uint32')

gc.collect()



0

In [2]:

gc.collect()

0

In [3]:
print(train['MachineIdentifier'])

0                0
1                1
2                2
3                3
4                4
            ...   
8921478    8921478
8921479    8921479
8921480    8921480
8921481    8921481
8921482    8921482
Name: MachineIdentifier, Length: 8921483, dtype: uint64


In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
for usecol in train.columns.tolist()[1:-1]:

    train[usecol] = train[usecol].astype('str')
    test[usecol] = test[usecol].astype('str')
    le = LabelEncoder().fit(
            np.unique(train[usecol].unique().tolist()+
                      test[usecol].unique().tolist()))
    train[usecol] = le.transform(train[usecol])+1
    test[usecol]  = le.transform(test[usecol])+1
    agg_tr = (train
              .groupby([usecol])
              .aggregate({'MachineIdentifier':'count'})
              .reset_index()
              .rename({'MachineIdentifier':'Train'}, axis=1))
    agg_te = (test
              .groupby([usecol])
              .aggregate({'MachineIdentifier':'count'})
              .reset_index()
              .rename({'MachineIdentifier':'Test'}, axis=1))

    agg = pd.merge(agg_tr, agg_te, on=usecol, how='outer').replace(np.nan, 0)
    #Select values with more than 1000 observations
    agg = agg[(agg['Train'] > 1200)].reset_index(drop=True)
    agg['Total'] = agg['Train'] + agg['Test']
    #Drop unbalanced values
    agg = agg[(agg['Train'] / agg['Total'] > 0.25) & (agg['Train'] / agg['Total'] < 0.9)]
    agg[usecol+'Copy'] = agg[usecol]

    train[usecol] = (pd.merge(train[[usecol]], 
                              agg[[usecol, usecol+'Copy']], 
                              on=usecol, how='left')[usecol+'Copy']
                     .replace(np.nan, 0).astype('int').astype('category'))

    test[usecol]  = (pd.merge(test[[usecol]], 
                              agg[[usecol, usecol+'Copy']], 
                              on=usecol, how='left')[usecol+'Copy']
                     .replace(np.nan, 0).astype('int').astype('category'))

    del le, agg_tr, agg_te, agg, usecol
    gc.collect()
          
y_train = np.array(train['HasDetections'])

del train['HasDetections'], train['MachineIdentifier'], test['MachineIdentifier']
gc.collect()

0

In [5]:
from scipy.sparse import vstack

ohe = OneHotEncoder(categories='auto', sparse=True, dtype='uint8').fit(train)
m = 100000
train = vstack([ohe.transform(train[i*m:(i+1)*m]) for i in range(train.shape[0] // m + 1)])
test  = vstack([ohe.transform(test[i*m:(i+1)*m])  for i in range(test.shape[0] // m +  1)])

gc.collect()

0

In [6]:
print(train.shape[1])
print(len(y_train))
cols=train.shape[1]

#train=train.reshape(train.shape[0],train.shape[1],1)
print(train.shape)
#print(train[0])

7036
8921483
(8921483, 7036)


In [11]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation, Input, Add
from keras.layers import Conv1D, MaxPooling1D, Flatten
from keras.models import Model

model_input = Input(shape=(cols,), name='model_input_h')

dense11 = Dense(1024, name='dense1_1')(model_input)
dropo11=Dropout(0.4)(dense11)
batn11=BatchNormalization()(dropo11)
act11=Activation('relu')(batn11)

dense12 = Dense(512, name='dense1_2')(act11)
dropo12=Dropout(0.4)(dense12)
batn12=BatchNormalization()(dropo12)
act12=Activation('relu')(batn12)

dense13 = Dense(cols, name='dense1_3')(act12)
dropo13=Dropout(0.4)(dense13)
batn13=BatchNormalization()(dropo13)
act13=Activation('relu')(batn13)

m1=Add()([model_input,act13])
act14=Activation('softmax')(m1)

dense21 = Dense(1024, name='dense2_1')(act14)
dropo21=Dropout(0.4)(dense21)
batn21=BatchNormalization()(dropo21)
act21=Activation('relu')(batn21)

dense22 = Dense(512, name='dense2_2')(act21)
dropo22=Dropout(0.4)(dense22)
batn22=BatchNormalization()(dropo22)
act22=Activation('relu')(batn22)

dense23 = Dense(cols, name='dense2_3')(act22)
dropo23=Dropout(0.4)(dense23)
batn23=BatchNormalization()(dropo23)
act23=Activation('relu')(batn23)

m2=Add()([model_input,act23])
act2=Activation('softmax')(m2)

dense31 = Dense(256, name='dense31')(act2)
dropo31=Dropout(0.4)(dense31)
batn31=BatchNormalization()(dropo31)
act31=Activation('relu')(batn31)

dense32 = Dense(128, name='dense32')(act31)
dropo32=Dropout(0.4)(dense32)
batn32=BatchNormalization()(dropo32)
act32=Activation('relu')(batn32)

dense33 = Dense(64, name='dense33')(act32)
dropo33=Dropout(0.4)(dense33)
batn33=BatchNormalization()(dropo33)
act33=Activation('relu')(batn33)


model_output=Dense(1, activation='sigmoid')(act33)
model= Model(inputs=model_input,
                      outputs=model_output,
                      name='model')



model.compile(optimizer='adam', loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
model_input_h (InputLayer)      (None, 7036)         0                                            
__________________________________________________________________________________________________
dense1_1 (Dense)                (None, 1024)         7205888     model_input_h[0][0]              
__________________________________________________________________________________________________
dropout_10 (Dropout)            (None, 1024)         0           dense1_1[0][0]                   
__________________________________________________________________________________________________
batch_normalization_10 (BatchNo (None, 1024)         4096        dropout_10[0][0]                 
______________________________________________________________________________________________

model_input = Input(shape=(cols,), name='model_input_h')

dense11 = Dense(cols//2, name='dense1_1')(model_input)
dropo11=Dropout(0.4)(dense11)
batn11=BatchNormalization()(dropo11)
act11=Activation('relu')(batn11)

dense12 = Dense(cols//4, name='dense1_2')(act11)
dropo12=Dropout(0.4)(dense12)
batn12=BatchNormalization()(dropo12)
act12=Activation('relu')(batn12)

dense32 = Dense(1024, name='dense32')(act12)
dropo32=Dropout(0.4)(dense32)
batn32=BatchNormalization()(dropo32)
act32=Activation('relu')(batn32)

dense33 = Dense(512, name='dense33')(act32)
dropo33=Dropout(0.4)(dense33)
batn33=BatchNormalization()(dropo33)
act33=Activation('relu')(batn33)

dense34 = Dense(256, name='dense34')(act33)
dropo34=Dropout(0.4)(dense34)
batn34=BatchNormalization()(dropo34)
act34=Activation('relu')(batn34)

dense35 = Dense(128, name='dense35')(act34)
dropo35=Dropout(0.4)(dense35)
batn35=BatchNormalization()(dropo35)
act35=Activation('relu')(batn35)

dense36 = Dense(32, name='dense36')(act35)
dropo36=Dropout(0.4)(dense36)
batn36=BatchNormalization()(dropo36)
act36=Activation('relu')(batn36)


model_output=Dense(1, activation='sigmoid')(act36)
model= Model(inputs=model_input,
                      outputs=model_output,
                      name='model')



model.compile(optimizer='adam', loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

dense21 = Dense(128, name='dense2_1')(m1)
dropo21=Dropout(0.4)(dense21)
batn21=BatchNormalization()(dropo21)
act21=Activation('relu')(batn21)

dense22 = Dense(256, name='dense2_2')(act21)
dropo22=Dropout(0.4)(dense22)
batn22=BatchNormalization()(dropo22)
act22=Activation('relu')(batn22)

dense23 = Dense(128, name='dense2_3')(act22)
dropo23=Dropout(0.4)(dense23)
batn23=BatchNormalization()(dropo23)
act23=Activation('relu')(batn23)

m2=Add()([dense21,act23])




dense21 = Dense(1024, name='dense2_1')(m1)
dropo21=Dropout(0.4)(dense21)
batn21=BatchNormalization()(dropo21)
act21=Activation('relu')(batn21)

dense22 = Dense(2048, name='dense2_2')(act21)
dropo22=Dropout(0.4)(dense22)
batn22=BatchNormalization()(dropo22)
act22=Activation('relu')(batn22)

dense23 = Dense(1024, name='dense2_3')(act22)
dropo23=Dropout(0.4)(dense23)
batn23=BatchNormalization()(dropo23)
act23=Activation('relu')(batn23)

m2=Add()([dense21,act23])

model = Sequential()
model.add(Dense(100,input_dim=cols))
model.add(Dropout(0.4))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(100))
model.add(Dropout(0.4))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(1, activation='sigmoid'))

In [12]:
gc.collect()
history=model.fit(train,  # training data
          y_train,                       # labels (left shift of the target sequences)
          batch_size=500, epochs=2, validation_split=0.1,shuffle=True)

Train on 8029334 samples, validate on 892149 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x1f096361048>

In [None]:
gc.collect()
submission = pd.read_csv('sample_submission.csv')
submission['HasDetections'] = model.predict(test)
submission.to_csv('test_submission11.csv', index=False)

In [None]:
gc.collect()