In [1]:
# usual imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Mount Google Drive at /content/gdrive; the files read and written below live under that path.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
from datetime import datetime

# Post-process an existing submission: scale each test prediction by a factor F that
# decays linearly from 1 to 0 over `s` days after a cutoff date, based on the date
# looked up for the machine's AvSigVersion, then write the corrected file.
submission = pd.read_csv('/content/gdrive/My Drive/Notebooks Colab/testPredsNNPytorchKaggle.csv')
df_test = pd.read_csv('/content/gdrive/My Drive/Notebooks Colab/test.csv', usecols=['MachineIdentifier','AvSigVersion'])
# `[()]` extracts the single object stored in the .npy file
# (presumably a dict AvSigVersion -> timestamp, given how it is used with .map below).
datedictAS = np.load('/content/gdrive/My Drive/Notebooks Colab/AvSigVersionTimestamps.npy', allow_pickle=True)[()]

df_test['Date'] = df_test['AvSigVersion'].map(datedictAS)
# NOTE(review): the assignment aligns on the row index, so it assumes the submission
# rows are in the same order as test.csv. The division by 5 presumably averages
# five summed predictions -- confirm both.
df_test['HasDetections'] = submission['HasDetections'] / 5.0

# X = days elapsed since the cutoff (2018-11-20 04:00); negative before the cutoff.
df_test['X'] = df_test['Date'] - datetime(2018,11,20,4,0)
df_test['X'] = df_test['X'].map(lambda x: x.total_seconds() / 86400)
# Versions without a known date are treated as "at the cutoff" (X = 0, so F = 1).
# The result is assigned back: an in-place fillna on `df_test['X']` acts on a
# temporary object and is silently lost when pandas Copy-on-Write is active.
df_test['X'] = df_test['X'].fillna(0)

# Length of the decay window in days.
s = 5.813888
# F is 1 for X <= 0, falls linearly to 0 at X = s, and stays 0 afterwards.
df_test['F'] = (1 - df_test['X']/s).clip(lower=0, upper=1)
df_test['HasDetections'] *= df_test['F']
df_test[['MachineIdentifier','HasDetections']].to_csv('/content/gdrive/My Drive/Notebooks Colab/testPredsNNPytorchKaggle_corrected.csv', index=False)

In [4]:
# Preview the first rows of the two columns that were written to the corrected submission.
df_test[['MachineIdentifier','HasDetections']].head()

Unnamed: 0,MachineIdentifier,HasDetections
0,0000010489e3af074adeac69c53e555e,0.001588857
1,00000176ac758d54827acd545b6315a5,5.497672e-06
2,0000019dcefc128c2d4387c1273dae1d,2.363148e-06
3,0000055553dc51b1295785415f1a224d,7.174444e-10
4,00000574cefffeca83ec8adf9285b2bf,0.07921727


In [5]:
import pandas as pd
import numpy as np
import pickle
import gc

In [6]:
# Mount Google Drive at /content/gdrive (same call as in the earlier cell; the recorded
# output shows the drive was already mounted at this point).
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Data loading

In [7]:
def load(x):
    """Column filter for ``pd.read_csv(usecols=...)``.

    Returns False for 'MachineIdentifier' (the column is not loaded) and True
    for every other column name.
    """
    skipped = ('MachineIdentifier',)
    return x not in skipped

# Read every column except MachineIdentifier (filtered out by `load`) as category dtype;
# the target is then converted to a small integer type.
df_train = pd.read_csv('/content/gdrive/My Drive/Notebooks Colab/train.csv',dtype='category',usecols=load)
df_train['HasDetections'] = df_train['HasDetections'].astype('int8')

# Repair one row whose AvSigVersion is stored as a corrupted string.
if 5244810 in df_train.index:
    # Assigning into a categorical column requires the new value to be an existing
    # category (presumably '1.273.1144.0' occurs elsewhere in the data -- confirm).
    df_train.loc[5244810,'AvSigVersion'] = '1.273.1144.0'
    # `inplace=` was removed from Categorical methods in pandas 2.0, so the result is
    # assigned back; the membership check avoids a ValueError if the corrupted
    # category is not present.
    if '1.2&#x17;3.1144.0' in df_train['AvSigVersion'].cat.categories:
        df_train['AvSigVersion'] = df_train['AvSigVersion'].cat.remove_categories('1.2&#x17;3.1144.0')
df_test = pd.read_csv('/content/gdrive/My Drive/Notebooks Colab/test.csv',dtype='category',usecols=load)
# `[()]` extracts the single object stored in each .npy file (used below as
# version -> timestamp lookups).
datedictAS = np.load('/content/gdrive/My Drive/Notebooks Colab/AvSigVersionTimestamps.npy', allow_pickle=True)[()]
datedictOS = np.load('/content/gdrive/My Drive/Notebooks Colab/OSVersionTimestamps.npy', allow_pickle=True)[()]

# Pre-processing 

## Feature engineering:

Getting dates for OS and Defender version

In [8]:
# Look up a date for each Defender signature version and each OS version,
# for the training frame first and then the test frame.
for frame in (df_train, df_test):
    frame['DateAS'] = frame['AvSigVersion'].map(datedictAS)
for frame in (df_train, df_test):
    frame['DateOS'] = frame['Census_OSVersion'].map(datedictOS)

The second number in Defender version tells us if it is up to date

In [9]:
# AppVersion2 = second dot-separated component of AppVersion, as an integer.
# The builtin `int` replaces `np.int`: the alias was only ever the builtin and was
# removed in NumPy 1.24, where the original expression raises AttributeError.
df_train['AppVersion2'] = df_train['AppVersion'].map(lambda x: int(x.split('.')[1]))
df_test['AppVersion2'] = df_test['AppVersion'].map(lambda x: int(x.split('.')[1]))

Check if Defender is behind OS update

In [10]:
# Lag1 = whole weeks between the OS-version date and the Defender-signature date
# (floor division, so negative gaps round towards minus infinity).
for frame in (df_train, df_test):
    frame['Lag1'] = frame['DateAS'] - frame['DateOS']
    frame['Lag1'] = frame['Lag1'].map(lambda delta: delta.days // 7)

Check the ratio of the OS partition size to the total hard drive space. _"Savvy users install multiple operating systems and have a lower ratio. Savvy users have reduced HasDetections."_

In [11]:
# driveA = system volume capacity divided by primary disk capacity, stored as float32.
for frame in (df_train, df_test):
    system_volume = frame['Census_SystemVolumeTotalCapacity'].astype('float')
    primary_disk = frame['Census_PrimaryDiskTotalCapacity'].astype('float')
    frame['driveA'] = system_volume / primary_disk
    frame['driveA'] = frame['driveA'].astype('float32')

Amount of hard drive space not used by the OS partition. _"Responsible users manage their hard drives well. Responsible users have reduced HasDetections."_

In [12]:
# driveB = primary disk capacity minus system volume capacity, stored as float32.
for frame in (df_train, df_test):
    primary_disk = frame['Census_PrimaryDiskTotalCapacity'].astype('float')
    system_volume = frame['Census_SystemVolumeTotalCapacity'].astype('float')
    frame['driveB'] = primary_disk - system_volume
    frame['driveB'] = frame['driveB'].astype('float32')

# Names of the engineered columns, grouped for the later processing steps.
cols6 = ['Lag1']
cols8 = ['driveB', 'driveA']

# The intermediate date columns and lookup tables are no longer needed.
for frame in (df_train, df_test):
    del frame['DateAS'], frame['DateOS']
del datedictAS, datedictOS

x = gc.collect()

More popular countries are attacked more frequently, so there is a correlation between the infection rate and how often a country code appears. Apparently, there is also a correlation between the number of battery charges and the detection rate. Maybe because it tells us whether the machine is a laptop or a desktop PC?

In [14]:
# Accumulates the names of the frequency-encoded columns returned by encode_FE2 below.
cols3 = []

def encode_FE2(df1, df2, col):
    """Add a frequency-encoded copy of ``col`` to both frames, in place.

    The relative frequency of every value (missing values included) is computed
    over the two frames stacked together and stored as float32 in a new column
    named ``col + '_FE2'``.

    Returns a one-element list holding the new column name.
    """
    new_name = col + '_FE2'
    pooled = pd.concat([df1[col], df2[col]])
    frequency = pooled.value_counts(dropna=False, normalize=True).to_dict()
    for frame in (df1, df2):
        frame[new_name] = frame[col].map(frequency)
        frame[new_name] = frame[new_name].astype('float32')
    return [new_name]


# Columns that receive a frequency-encoded companion column.
FE2 = ['CountryIdentifier', 'Census_InternalBatteryNumberOfCharges']
for col in FE2:
    cols3.extend(encode_FE2(df_train, df_test, col))

## Data cleaning

Removing apparently irrelevant variables. These are variables which are highly correlated with other variables or have more than 98% of their values in one category. I know we were unsure about removing these, but several notebooks showed that they can be removed without affecting performance.

In [15]:
# FACTORIZE
def factor_data(df_train, df_test, col):
    """Replace ``col`` in both frames with shared integer labels, in place.

    Train and test values are stacked and factorized together (sorted), so the
    same value gets the same label in both frames. Real values are numbered
    1..k, label 0 is left unused, and missing values receive k+1.
    """
    n_train = len(df_train)
    stacked = pd.concat([df_train[col], df_test[col]], axis=0)
    codes, _ = stacked.factorize(sort=True)
    # Missing values come back as -1; shifting by one puts them at 0 and
    # the real labels at 1..k.
    codes = codes + 1
    # Move the missing-value marker from 0 to one past the largest real label.
    codes = np.where(codes == 0, codes.max() + 1, codes)
    df_train[col] = codes[:n_train]
    df_test[col] = codes[n_train:]
    
# OPTIMIZE MEMORY
def reduce_memory(df,col):
    """Cast ``df[col]`` in place to the narrowest of uint8 / uint16 / uint32.

    The choice depends only on the column maximum: below 256 -> uint8,
    below 65536 -> uint16, anything else -> uint32.
    """
    largest = df[col].max()
    if largest < 256:
        target = 'uint8'
    elif largest < 65536:
        target = 'uint16'
    else:
        target = 'uint32'
    df[col] = df[col].astype(target)
        
# REDUCE CATEGORY CARDINALITY
def relax_data(df_train, df_test, col):
    """Collapse rare or train/test-imbalanced values of ``col`` into label 0, in place.

    A value is collapsed when any of these holds:

    * it occurs in fewer than 1/10000 of the training rows;
    * its training count, rescaled to the test set size, is less than a third
      of its test count;
    * its rescaled training count is more than three times its test count.

    The surviving values (plus 0 for the collapsed ones) are re-numbered with
    consecutive integers in sorted order, both frames are mapped to the new
    labels and the column is down-cast with ``reduce_memory``.

    The count table is built directly from the two ``value_counts`` Series, so it
    does not depend on the column names produced by ``reset_index`` (these
    changed in pandas 2.0 and broke the former rename + merge on 'index').
    """
    counts = pd.DataFrame({
        'train': df_train[col].value_counts(),
        'test': df_test[col].value_counts(),
    }).fillna(0)  # a value absent from one frame has count 0 there

    # Rescale training counts to the size of the test set before comparing.
    factor = len(df_test)/len(df_train)
    scaled_train = factor*counts['train']
    remove = (
        (counts['train'] < len(df_train)/10000)
        | (scaled_train < counts['test']/3)
        | (scaled_train > 3*counts['test'])
    )

    # Collapsed values become 0; kept values keep their label until the
    # sorted factorize re-numbers everything densely.
    kept = counts.index.to_series().where(~remove, 0)
    codes, _ = pd.factorize(kept, sort=True)
    cc = dict(zip(counts.index, codes))

    df_train[col] = df_train[col].map(cc)
    reduce_memory(df_train,col)
    df_test[col] = df_test[col].map(cc)
    reduce_memory(df_test,col)
    
# DISPLAY MEMORY STATISTICS
def display_memory(df_train, df_test):
    """Print the row count and deep memory footprint (whole megabytes) of both frames."""
    reports = (
        (df_train, 'rows of training data use'),
        (df_test, 'rows of test data use'),
    )
    for frame, label in reports:
        megabytes = frame.memory_usage(deep=True).sum()//1e6
        print(len(frame), label, megabytes, 'Mb memory!')

# CONVERT TO CATEGORIES
def categorize(df_train, df_test, cols):
    """Convert each column named in ``cols`` to category dtype in both frames, in place."""
    for name in cols:
        for frame in (df_train, df_test):
            frame[name] = frame[name].astype('category')
        

# Columns that are converted to category dtype at the end of this cell (via cols2).
CE = ['CountryIdentifier', 'SkuEdition', 'Firewall', 'Census_ProcessorCoreCount', 'Census_OSUILocaleIdentifier', 'Census_FlightRing']

# cols: every training column that is neither the target, nor in CE, nor one of the
# engineered columns collected in cols3 / cols6 / cols8.
cols = [x for x in df_train.columns if x not in ['HasDetections']+CE+cols3+cols6+cols8]
# cols2 is another name for the same list object as CE (not a copy).
cols2 = CE
# NOTE(review): the counter starts at 1, so the number printed below is one more than
# the columns deleted in this cell -- confirm whether that is intentional.
ct = 1
    
# Drop columns whose most frequent value (NaN counted as a value) covers more than 98%
# of the training rows. Iterating over a copy allows removing from `cols` in the loop.
for col in cols.copy():
    rate = df_train[col].value_counts(normalize=True, dropna=False).values[0]
    if rate > 0.98:
        del df_train[col]
        del df_test[col]
        cols.remove(col)
        ct += 1

# Hand-picked columns to drop as well. This assumes none of them was already removed
# by the loop above; otherwise the `del` raises KeyError.
rmv3=['Census_OSSkuName', 'OsVer', 'Census_OSArchitecture', 'Census_OSInstallLanguageIdentifier']
rmv4=['SMode']
for col in rmv3+rmv4:
    del df_train[col]
    del df_test[col]
    cols.remove(col)
    ct +=1
    
print('Removed',ct,'variables')
x=gc.collect()

# Give every remaining column (including Lag1) integer labels shared by train and test.
print('Factorizing...')
for col in cols+cols2+cols6:
    factor_data(df_train, df_test, col)
# Collapse rare / imbalanced labels; Lag1 (cols6) is not relaxed.
print('Relaxing data...')
for col in cols+cols2: relax_data(df_train, df_test, col)
# relax_data already down-casts the columns it touches; this pass also covers cols6.
print('Optimizing memory...')
for col in cols+cols2+cols6:
    reduce_memory(df_train, col)
    reduce_memory(df_test, col)
# Converting 6 variables to categorical
categorize(df_train, df_test, cols2)
    
print('Number of variables is',len(cols+cols2+cols3+cols6+cols8))
display_memory(df_train, df_test)

Removed 18 variables
Factorizing...
Relaxing data...
Optimizing memory...
Number of variables is 70
8921483 rows of training data use 811.0 Mb memory!
7853253 rows of test data use 706.0 Mb memory!


In [18]:
# Keep only the listed feature columns, in the same order for both frames.
# NOTE(review): 'HasDetections' is in none of these lists, so the target column is
# dropped from df_train here -- confirm it is kept elsewhere.
selected_columns = cols+cols2+cols3+cols6+cols8
df_train = df_train[selected_columns]
df_test = df_test[selected_columns]

In [19]:
# Print dtype, cardinality and NaN presence for every training column; columns
# without a `.cat` accessor (i.e. non-categorical ones) also get their value range.
for column in df_train:
    series = df_train[column]
    summary = f"{column} ({series.dtype}):\n Cardinality: {series.nunique()} \t Has NaN: {series.isna().any()}"
    if not hasattr(series, 'cat'):
        summary += f", \t range: [{np.min(series)}, {np.max(series)}]"
    print(summary)

EngineVersion (uint8):
 Cardinality: 30 	 Has NaN: False, 	 range: [0, 29]
AppVersion (uint8):
 Cardinality: 47 	 Has NaN: False, 	 range: [0, 46]
AvSigVersion (uint8):
 Cardinality: 166 	 Has NaN: False, 	 range: [0, 165]
RtpStateBitfield (uint8):
 Cardinality: 8 	 Has NaN: False, 	 range: [0, 7]
DefaultBrowsersIdentifier (uint8):
 Cardinality: 40 	 Has NaN: False, 	 range: [0, 39]
AVProductStatesIdentifier (uint8):
 Cardinality: 207 	 Has NaN: False, 	 range: [0, 206]
AVProductsInstalled (uint8):
 Cardinality: 6 	 Has NaN: False, 	 range: [0, 5]
AVProductsEnabled (uint8):
 Cardinality: 6 	 Has NaN: False, 	 range: [0, 5]
CityIdentifier (uint16):
 Cardinality: 1273 	 Has NaN: False, 	 range: [0, 1272]
OrganizationIdentifier (uint8):
 Cardinality: 24 	 Has NaN: False, 	 range: [0, 23]
GeoNameIdentifier (uint8):
 Cardinality: 153 	 Has NaN: False, 	 range: [0, 152]
LocaleEnglishNameIdentifier (uint8):
 Cardinality: 123 	 Has NaN: False, 	 range: [0, 122]
Platform (uint8):
 Cardinality: 

In [20]:
# Print dtype, cardinality and NaN presence for every test column; columns without a
# `.cat` accessor (i.e. non-categorical ones) also get their value range.
# The range is taken from df_test itself (it previously read df_train, so the
# reported range did not describe the test data).
for column in df_test:
    if hasattr(df_test[column], 'cat'):
        print(f"{column} ({df_test[column].dtype}):\n Cardinality: {df_test[column].nunique()} \t Has NaN: {df_test[column].isna().any()}")
    else:
        print(f"{column} ({df_test[column].dtype}):\n Cardinality: {df_test[column].nunique()} \t Has NaN: {df_test[column].isna().any()}, \t range: [{np.min(df_test[column])}, {np.max(df_test[column])}]")

EngineVersion (uint8):
 Cardinality: 30 	 Has NaN: False, 	 range: [0, 29]
AppVersion (uint8):
 Cardinality: 47 	 Has NaN: False, 	 range: [0, 46]
AvSigVersion (uint8):
 Cardinality: 166 	 Has NaN: False, 	 range: [0, 165]
RtpStateBitfield (uint8):
 Cardinality: 8 	 Has NaN: False, 	 range: [0, 7]
DefaultBrowsersIdentifier (uint8):
 Cardinality: 40 	 Has NaN: False, 	 range: [0, 39]
AVProductStatesIdentifier (uint8):
 Cardinality: 207 	 Has NaN: False, 	 range: [0, 206]
AVProductsInstalled (uint8):
 Cardinality: 6 	 Has NaN: False, 	 range: [0, 5]
AVProductsEnabled (uint8):
 Cardinality: 6 	 Has NaN: False, 	 range: [0, 5]
CityIdentifier (uint16):
 Cardinality: 1273 	 Has NaN: False, 	 range: [0, 1272]
OrganizationIdentifier (uint8):
 Cardinality: 24 	 Has NaN: False, 	 range: [0, 23]
GeoNameIdentifier (uint8):
 Cardinality: 153 	 Has NaN: False, 	 range: [0, 152]
LocaleEnglishNameIdentifier (uint8):
 Cardinality: 123 	 Has NaN: False, 	 range: [0, 122]
Platform (uint8):
 Cardinality: 

In [21]:
# Persist both processed frames together as one (train, test) tuple.
processed_frames = (df_train, df_test)
with open('/content/gdrive/My Drive/Notebooks Colab/kaggleDF.pickle', 'wb') as out_file:
    pickle.dump(processed_frames, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# NaN filtering (Optional with LGBM I think (?)) and scale normalization (necessary for parametric models)

In [None]:
def standardNaNFilling(X):
    """Fill missing values in every column of ``X`` in place.

    * categorical columns: missing entries become the explicit category "NaN"
      (the category is added only if it is not already there, so the function
      can be run again on the same frame);
    * integer columns (including pandas nullable integers): missing entries
      become the most frequent value;
    * float columns: missing entries become the column mean.

    Columns of any other dtype are left untouched. Returns None.
    """
    for column in X:
        values = X[column]
        if hasattr(values, 'cat'):
            if "NaN" not in values.cat.categories:
                values = values.cat.add_categories("NaN")
            X[column] = values.fillna("NaN")
        elif pd.api.types.is_integer_dtype(values.dtype):
            # mode() returns a Series; passing it to fillna directly would fill by
            # index alignment, so take the first (smallest) modal value as a scalar.
            modes = values.mode()
            if len(modes) > 0:
                X[column] = values.fillna(modes.iloc[0])
        elif pd.api.types.is_float_dtype(values.dtype):
            X[column] = values.fillna(values.mean())

def scaleNormalization(xTrain, xTest):
    normConstants = {}
    for column in xTrain:
        if not hasattr(xTrain[column], 'cat') and np.max(np.abs(xTrain[column])) > 1:
            normConstants[column] = (np.mean(xTrain[column]), np.std(xTrain[column]))
            if xTrain[column].dtype in ['uint8', 'int8', 'uint16', 'int16', 'float16']:
                columnDType = 'float16'
            elif xTrain[column].dtype in ['uint32', 'int32', 'float32']:
                columnDType = 'float32'
            elif xTrain[column].dtype in ['uint64', 'int64', 'float64']:
                columnDType = 'float64'
            xTrain[column] = (xTrain[column].astype(columnDType) - normConstants[column][0]) / normConstants[column][1]
            xTest[column] = (xTest[column].astype(columnDType) - normConstants[column][0]) / normConstants[column][1]