In [1]:
import pandas as pd, sys
from ctypes import cdll, CDLL
# Load libc and probe for malloc_trim(). OSError means the library could not be
# loaded, AttributeError means the symbol is missing; either way `libc` stays
# None and the monkeypatch below is skipped.
try:
    libc = CDLL("libc.so.6")
    libc.malloc_trim(0)
except (OSError, AttributeError):
    libc = None

__old_del = getattr(pd.DataFrame, '__del__', None)
# Re-running this cell would otherwise capture our own wrapper as the "old"
# finalizer, and the wrapper would then call itself recursively. Unwrap it so
# the patch stays idempotent.
if getattr(__old_del, '_trims_malloc', False):
    __old_del = getattr(__old_del, '_wrapped_del', None)

def __new_del(self, _previous=__old_del, _libc=libc):
    """Finalizer for pd.DataFrame: run the previous __del__ (if any), then malloc_trim(0).

    `_previous` and `_libc` are bound at definition time so that the finalizer
    does not depend on module globals that may be rebound by a cell re-run.
    """
    if _previous:
        _previous(self)
    if _libc is not None:
        _libc.malloc_trim(0)

# Markers used by the re-run check above.
__new_del._trims_malloc = True
__new_del._wrapped_del = __old_del

if libc:
    print('Applying monkeypatch for pd.DataFrame.__del__', file=sys.stderr)
    pd.DataFrame.__del__ = __new_del
else:
    print('Skipping monkeypatch for pd.DataFrame.__del__: libc or malloc_trim() not found', file=sys.stderr)

Applying monkeypatch for pd.DataFrame.__del__


In [2]:
# Set to True to read only the first 10,000 rows of each CSV for a quick run.
Debug = False

import numpy as np, pandas as pd, gc, random
import matplotlib.pyplot as plt

def load(x):
    """Column filter for read_csv(usecols=...): keep every column except the ignored ones.

    Returns False for 'MachineIdentifier' and True for any other column name.
    """
    skipped = ('MachineIdentifier',)
    return x not in skipped

# Read the training set with every column as 'category'; in Debug mode only the
# first 10,000 rows are read (nrows=None means "read everything").
df_train = pd.read_csv('/home/centos/moon/ktw/microsoft-malware-prediction/data/train.csv',
                       dtype='category', usecols=load, nrows=10000 if Debug else None)
df_train['HasDetections'] = df_train['HasDetections'].astype('int8')
# Overwrite the AvSigVersion of row 5244810 and drop the category it used to
# hold. The row only exists when the full file is loaded, hence the guard.
if 5244810 in df_train.index:
    df_train.loc[5244810,'AvSigVersion'] = '1.273.1144.0'
    # Assign the result back instead of using inplace=True, which is deprecated
    # for categorical accessor methods and no longer accepted by pandas 2.x.
    df_train['AvSigVersion'] = df_train['AvSigVersion'].cat.remove_categories('1.2&#x17;3.1144.0')

# Read the test set the same way as the training set: all columns as
# 'category', limited to the first 10,000 rows when Debug is on.
df_test = pd.read_csv(
    '/home/centos/moon/ktw/microsoft-malware-prediction/data/test.csv',
    dtype='category',
    usecols=load,
    nrows=10000 if Debug else None,
)

print('Loaded',len(df_train),'rows of TRAIN and',len(df_test),'rows of TEST')

  res = method(*args, **kwargs)


Loaded 8921483 rows of TRAIN and 7853253 rows of TEST


In [3]:
def encode_FE(df,col):
    """Frequency-encode `col` within `df`.

    Adds a float32 column named `<col>_FE` holding, for each row, the relative
    frequency of that row's value in `df[col]` (missing values are counted as
    their own value). Returns the new column name wrapped in a list.
    """
    encoded_name = col+'_FE'
    frequencies = df[col].value_counts(dropna=False, normalize=True).to_dict()
    df[encoded_name] = df[col].map(frequencies).astype('float32')
    return [encoded_name]

def encode_FE2(df1, df2, col):
    """Frequency-encode `col` using the combined value distribution of both frames.

    Adds a float32 column named `<col>_FE2` to `df1` and to `df2`; each row gets
    the relative frequency of its value across the concatenation of
    `df1[col]` and `df2[col]` (missing values are counted as their own value).
    Returns the new column name wrapped in a list.
    """
    encoded_name = col+'_FE2'
    pooled = pd.concat([df1[col],df2[col]])
    frequencies = pooled.value_counts(dropna=False, normalize=True).to_dict()
    for frame in (df1, df2):
        frame[encoded_name] = frame[col].map(frequencies).astype('float32')
    return [encoded_name]

def factor_data(df_train, df_test, col):
    """Replace `col` in both frames with integer labels shared across train and test.

    The values of both frames are factorized together in sorted order. Labels
    start at 1 (0 stays unused), and missing values receive the largest label.
    Both frames are modified in place; nothing is returned.
    """
    n_train = len(df_train)
    stacked = pd.concat([df_train[col],df_test[col]],axis=0)
    codes, _ = stacked.factorize(sort=True)
    # factorize marks missing values with -1, so after the shift they sit at 0
    # and real values start at 1.
    codes = codes + 1
    # Move the missing-value label from 0 to one past the largest real label.
    nan_label = codes.max() + 1
    codes = np.where(codes == 0, nan_label, codes)
    df_train[col] = codes[:n_train]
    df_test[col] = codes[n_train:]
    
def reduce_memory(df,col):
    """Downcast `df[col]` in place to the smallest of uint8 / uint16 / uint32 that fits its maximum."""
    peak = df[col].max()
    # Default to the widest type; switch to the first narrower one that fits.
    target = 'uint32'
    for limit, dtype_name in ((256, 'uint8'), (65536, 'uint16')):
        if peak < limit:
            target = dtype_name
            break
    df[col] = df[col].astype(target)
        
def relax_data(df_train, df_test, col):
    """Collapse rare or train/test-imbalanced values of `col` into a single label 0.

    A value is collapsed when any of the following holds:
      * it occurs in fewer than len(df_train)/10000 training rows;
      * its training count, scaled by len(df_test)/len(df_train), is less than
        one third of its test count;
      * that scaled training count is more than three times its test count.
    The surviving values (together with the replacement 0) are relabelled by
    their sorted order, then both frames are updated in place and downcast
    with reduce_memory(). Nothing is returned.
    """
    # Name the key column 'index' and the count column explicitly so the merge
    # does not depend on how value_counts().reset_index() labels its output,
    # which differs between pandas 1.x and 2.x.
    cv1 = df_train[col].value_counts().rename_axis('index').reset_index(name='train')
    cv2 = df_test[col].value_counts().rename_axis('index').reset_index(name='test')
    cv3 = pd.merge(cv1,cv2,on='index',how='outer')
    factor = len(df_test)/len(df_train)
    # Values present in only one frame have a count of 0 in the other. Assign
    # the result back rather than relying on inplace=True on a column selection.
    cv3['train'] = cv3['train'].fillna(0)
    cv3['test'] = cv3['test'].fillna(0)
    scaled_train = factor*cv3['train']
    cv3['remove'] = (
        (cv3['train'] < len(df_train)/10000)
        | (scaled_train < cv3['test']/3)
        | (scaled_train > 3*cv3['test'])
    )
    # Keep the original value where it survives, otherwise substitute 0.
    cv3['new'] = cv3['index'].where(~cv3['remove'], 0)
    cv3['new'],_ = cv3['new'].factorize(sort=True)
    cv3.set_index('index',inplace=True)
    cc = cv3['new'].to_dict()
    df_train[col] = df_train[col].map(cc)
    reduce_memory(df_train,col)
    df_test[col] = df_test[col].map(cc)
    reduce_memory(df_test,col)
    
def display_memory(df_train, df_test):
    """Print the row count and deep memory usage (in units of 1e6 bytes) of each frame."""
    reports = (
        (df_train, 'rows of training data use'),
        (df_test, 'rows of test data use'),
    )
    for frame, label in reports:
        megabytes = frame.memory_usage(deep=True).sum()//1e6
        print(len(frame), label, megabytes, 'Mb memory!')

def categorize(df_train, df_test, cols):
    """Convert each column named in `cols` to pandas 'category' dtype in both frames, in place."""
    for col in cols:
        for frame in (df_train, df_test):
            frame[col] = frame[col].astype('category')

In [4]:
from datetime import datetime, date, timedelta

# Each .npy file holds one pickled lookup object inside a 0-d array; indexing
# with [()] unwraps it. The objects are used as mappers from a version string
# to its date (presumably dicts; confirm against the data files).
datedictAS = np.load(
    '/home/centos/moon/ktw/microsoft-malware-prediction/data/AvSigVersionTimestamps.npy',
    allow_pickle=True,
)[()]
datedictOS = np.load(
    '/home/centos/moon/ktw/microsoft-malware-prediction/data/OSVersionTimestamps.npy',
    allow_pickle=True,
)[()]

# Attach both dates to train and test; DateAS is added before DateOS.
for frame in (df_train, df_test):
    frame['DateAS'] = frame['AvSigVersion'].map(datedictAS)
    frame['DateOS'] = frame['Census_OSVersion'].map(datedictOS)

# AppVersion2 is the second dot-separated component of AppVersion as an integer.
# The builtin int replaces np.int, which was only an alias for it, has been
# deprecated since NumPy 1.20 and is absent from later NumPy releases.
for frame in (df_train, df_test):
    frame['AppVersion2'] = frame['AppVersion'].map(lambda x: int(x.split('.')[1]))

# Lag1: number of whole weeks between the DateAS and DateOS columns.
for frame in (df_train, df_test):
    gap = frame['DateAS'] - frame['DateOS']
    frame['Lag1'] = gap
    frame['Lag1'] = frame['Lag1'].map(lambda delta: delta.days//7)

# Lag5: number of days from DateAS to a reference date that differs per
# dataset (the test date was marked "PUBLIC TEST" by the author). Negative
# values are clamped to 0; float32 keeps room for NaN.
reference_dates = (
    (df_train, datetime(2018,7,26)),
    (df_test, datetime(2018,9,27)),
)
for frame, reference in reference_dates:
    frame['Lag5'] = reference - frame['DateAS']
    frame['Lag5'] = frame['Lag5'].map(lambda delta: delta.days//1)
    frame.loc[ frame['Lag5']<0, 'Lag5' ] = 0
    frame['Lag5'] = frame['Lag5'].astype('float32')

# driveA: system-volume capacity as a fraction of primary-disk capacity.
# driveB: primary-disk capacity minus system-volume capacity.
# Both are computed in float and stored as float32.
for frame in (df_train, df_test):
    system_volume = frame['Census_SystemVolumeTotalCapacity'].astype('float')
    primary_disk = frame['Census_PrimaryDiskTotalCapacity'].astype('float')
    frame['driveA'] = (system_volume/primary_disk).astype('float32')
    frame['driveB'] = (primary_disk - system_volume).astype('float32')

# Names of the engineered columns created above, grouped for later processing.
cols6 = ['Lag1']
cols8 = ['Lag5', 'driveB', 'driveA']

# The intermediate date columns and lookup objects are no longer needed.
for frame in (df_train, df_test):
    del frame['DateAS'], frame['DateOS']
del datedictAS, datedictOS
x = gc.collect()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from ipykernel import kernelapp as app


In [5]:
# cols3 collects the names of all frequency-encoded columns.
cols3 = []

# Per-frame encoding: train and test are each encoded with their own frequencies.
FE = [
    'Census_OSVersion',
    'Census_OSBuildRevision',
    'Census_InternalBatteryNumberOfCharges',
    'AvSigVersion',
    'Lag1',
]
for col in FE:
    cols3.extend(encode_FE(df_train, col))
    encode_FE(df_test, col)

# Joint encoding: frequencies are computed over train and test together.
FE2 = [
    'CountryIdentifier',
    'Census_InternalBatteryNumberOfCharges',
]
for col in FE2:
    cols3.extend(encode_FE2(df_train, df_test, col))

In [6]:
# Columns that are later aliased as cols2 and converted to 'category' dtype.
CE = [
    'CountryIdentifier',
    'SkuEdition',
    'Firewall',
    'Census_ProcessorCoreCount',
    'Census_OSUILocaleIdentifier',
    'Census_FlightRing',
]

In [7]:
# cols: every remaining raw column, i.e. not the target and not in one of the
# explicitly handled groups.
excluded = ['HasDetections'] + CE + cols3 + cols6 + cols8
cols = [name for name in df_train.columns if name not in excluded]
cols2 = CE
# NOTE(review): the removal counter starts at 1 rather than 0, presumably to
# account for a column dropped before this cell; confirm.
ct = 1

# Drop columns whose first value_counts entry (missing values included) covers
# more than 98% of the training rows.
for col in list(cols):
    rate = df_train[col].value_counts(normalize=True, dropna=False).iloc[0]
    if not rate > 0.98:
        continue
    del df_train[col]
    del df_test[col]
    cols.remove(col)
    ct += 1

# Drop a hand-picked set of additional columns.
rmv3 = ['Census_OSSkuName', 'OsVer', 'Census_OSArchitecture', 'Census_OSInstallLanguageIdentifier']
rmv4 = ['SMode']
for col in rmv3 + rmv4:
    del df_train[col]
    del df_test[col]
    cols.remove(col)
    ct += 1

print('Removed',ct,'variables')
x = gc.collect()

Removed 18 variables


In [8]:
# Columns that get integer labels: raw columns, categorical columns and Lag1.
label_cols = cols + cols2 + cols6

print('Factorizing...')
for col in label_cols:
    factor_data(df_train, df_test, col)

print('Relaxing data...')
for col in cols + cols2:
    relax_data(df_train, df_test, col)

print('Optimizing memory...')
for col in label_cols:
    for frame in (df_train, df_test):
        reduce_memory(frame, col)

categorize(df_train, df_test, cols2)

print('Number of variables is', len(label_cols) + len(cols3) + len(cols8))
display_memory(df_train, df_test)

Factorizing...
Relaxing data...
Optimizing memory...
Number of variables is 76
8921483 rows of training data use 1025.0 Mb memory!
7853253 rows of test data use 895.0 Mb memory!


In [9]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

# Accumulates the per-fold test predictions; it holds the SUM over folds, so
# whoever consumes it must divide by the number of folds.
pred_val = np.zeros(len(df_test))
# A fixed random_state makes the shuffled fold assignment reproducible.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# One feature list used for splitting, fitting, validation and prediction.
features = cols+cols2+cols3+cols6+cols8

ct = 0
for idxT, idxV in folds.split(df_train[features], df_train['HasDetections']):
    # TRAIN LGBM
    ct += 1; print('####### FOLD ',ct,'#########')
    # split() yields positional indices, so rows are selected with iloc; loc
    # would only be equivalent while df_train keeps a default 0..n-1 index.
    df_trainA = df_train.iloc[idxT]
    df_trainB = df_train.iloc[idxV]
    model = lgb.LGBMClassifier(n_estimators=10000, colsample_bytree=0.5, objective='binary', num_leaves=2048,
            max_depth=-1, learning_rate=0.04)
    # NOTE(review): `verbose` and `early_stopping_rounds` are passed to fit()
    # as in the original run; newer LightGBM releases expect callbacks
    # instead. Confirm against the installed version.
    h=model.fit(df_trainA[features], df_trainA['HasDetections'], eval_metric='auc',
            eval_set=[(df_trainB[features], df_trainB['HasDetections'])], verbose=200,
            early_stopping_rounds=100)

    # PREDICT TEST
    del df_trainA, df_trainB; x=gc.collect()
    chunk = 1000000
    print('Predicting test...')
    # Predict in fixed-size chunks and add each chunk's positive-class
    # probability into the matching slice of pred_val.
    for start in range(0, len(df_test), chunk):
        stop = min(start + chunk, len(df_test))
        pred_val[start:stop] += model.predict_proba(df_test.iloc[start:stop][features])[:,1]

####### FOLD  1 #########




[200]	valid_0's auc: 0.74054	valid_0's binary_logloss: 0.595471
[400]	valid_0's auc: 0.743858	valid_0's binary_logloss: 0.592108
[600]	valid_0's auc: 0.744298	valid_0's binary_logloss: 0.59161
[800]	valid_0's auc: 0.744347	valid_0's binary_logloss: 0.591506
Predicting test...
####### FOLD  2 #########




[200]	valid_0's auc: 0.740718	valid_0's binary_logloss: 0.595204
[400]	valid_0's auc: 0.744029	valid_0's binary_logloss: 0.59182
[600]	valid_0's auc: 0.744474	valid_0's binary_logloss: 0.591322
[800]	valid_0's auc: 0.744575	valid_0's binary_logloss: 0.591152
Predicting test...
####### FOLD  3 #########




[200]	valid_0's auc: 0.73987	valid_0's binary_logloss: 0.595791
[400]	valid_0's auc: 0.743192	valid_0's binary_logloss: 0.592453
[600]	valid_0's auc: 0.743564	valid_0's binary_logloss: 0.592043
[800]	valid_0's auc: 0.743703	valid_0's binary_logloss: 0.591869
Predicting test...
####### FOLD  4 #########




[200]	valid_0's auc: 0.740546	valid_0's binary_logloss: 0.595354
[400]	valid_0's auc: 0.743828	valid_0's binary_logloss: 0.592012
[600]	valid_0's auc: 0.744291	valid_0's binary_logloss: 0.591509
[800]	valid_0's auc: 0.744425	valid_0's binary_logloss: 0.591325
Predicting test...
####### FOLD  5 #########




[200]	valid_0's auc: 0.740478	valid_0's binary_logloss: 0.59534
[400]	valid_0's auc: 0.743681	valid_0's binary_logloss: 0.592099
[600]	valid_0's auc: 0.744219	valid_0's binary_logloss: 0.59152
[800]	valid_0's auc: 0.74429	valid_0's binary_logloss: 0.591394
[1000]	valid_0's auc: 0.744343	valid_0's binary_logloss: 0.591307
Predicting test...


In [10]:
del df_train; x=gc.collect()
# Reload only the two columns needed for the submission, limited to as many
# rows as there are predictions.
df_test = pd.read_csv('/home/centos/moon/ktw/microsoft-malware-prediction/data/test.csv',
            usecols=['MachineIdentifier','AvSigVersion'], nrows=len(pred_val))

from datetime import datetime
# allow_pickle=True unpickles the file's contents; only load files you trust.
datedictAS = np.load('/home/centos/moon/ktw/microsoft-malware-prediction/data/AvSigVersionTimestamps.npy', allow_pickle=True)[()]
df_test['Date'] = df_test['AvSigVersion'].map(datedictAS)
# pred_val holds the sum of the fold predictions; 5.0 must match the number of
# folds used in training (n_splits=5).
df_test['HasDetections'] = pred_val / 5.0

# X: days elapsed between the cutoff timestamp and the row's signature date
# (negative when the date is before the cutoff); rows without a date get 0.
df_test['X'] = df_test['Date'] - datetime(2018,11,20,4,0)
df_test['X'] = df_test['X'].map(lambda x: x.total_seconds()/86400)
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection, which is not guaranteed to update the frame.
df_test['X'] = df_test['X'].fillna(0)

# F: scale factor that is 1 up to the cutoff, falls linearly to 0 over the
# next `s` days and stays 0 afterwards. Clipping 1 - X/s to [0, 1] yields
# exactly that piecewise shape.
s = 5.813888
df_test['F'] = (1 - df_test['X']/s).clip(lower=0, upper=1)
df_test['HasDetections'] *= df_test['F']

In [11]:
# Write the identifier and the adjusted prediction as the submission file,
# without the row index.
submission = df_test[['MachineIdentifier','HasDetections']]
submission.to_csv(
    '/home/centos/moon/ktw/microsoft-malware-prediction/data/submission11.csv',
    index=False,
)