In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
import lightgbm as lgb
from sklearn.metrics import (roc_curve, auc, accuracy_score, roc_auc_score)
import random
import shap

Define the supposedly numerical columns:

In [2]:
nums = ['Census_ProcessorCoreCount',
        'Census_PrimaryDiskTotalCapacity',
        'Census_SystemVolumeTotalCapacity',
        'Census_TotalPhysicalRAM',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches',
        'Census_InternalPrimaryDisplayResolutionHorizontal',
        'Census_InternalPrimaryDisplayResolutionVertical',
        'Census_InternalBatteryNumberOfCharges']

## apparently these might be the true numericals
#nums_2 = [  # All the columns which have a real continuous data
#    'Census_ProcessorCoreCount',
#    'Census_PrimaryDiskTotalCapacity',
#    'Census_SystemVolumeTotalCapacity',
#    'Census_TotalPhysicalRAM',
#    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
#    'Census_InternalPrimaryDisplayResolutionHorizontal',
#   'Census_InternalPrimaryDisplayResolutionVertical',
#    'Census_InternalBatteryNumberOfCharges',
#    'Census_OSBuildNumber',
#    'Census_OSBuildRevision',
#    'Census_ThresholdOptIn',
#    'OsBuild'
#]

Load the dataset with integer-encoded categorical variables to run through the LGBM classifier

In [3]:
# LOAD INTEGER ENCODED TRAININIG DATA 

with open('LGBM_train.pickle', 'rb') as handle:
    malware_train = pickle.load(handle)
    
cat_cols = [col for col in malware_train.columns if col not in nums and col != 'HasDetections']
#cat_cols_2 = [col for col in malware_train.columns if col not in nums_2 and col != 'HasDetections']

targets = malware_train["HasDetections"]
malware_train.drop(["HasDetections"], axis = 1, inplace = True)

Define the dictionary with the hyperparameters for the LGBM classifier obtained through Randomized Grid Search with 50 repetitions. In each, the training and validation was performed on 5% and 0.5% of the original dataset respectively (original dataset had almost 9 million observations).

In [4]:
int_enc_params = {
    'subsample': 1.0, 
    'reg_alpha': 100,
    'num_leaves': 90,
    'n_estimators': 200,
    'min_child_samples': 1700, 
    'learning_rate': 0.1, 
    'colsample_bytree': 0.9,
    'boosting_type': 'gbdt',
    'objective' : 'binary',
    'n_jobs' : -1}

### Don't know if the dictionary below is correct - commented out just in case
#clean_params = {
#    'objective': 'binary',
#    'subsample': 0.9,
#     'reg_alpha': 5, 
#     'num_leaves': 110, 
#     'n_estimators': 200, 
#     'min_child_samples': 900,
#     'learning_rate': 0.1,
#     'colsample_bytree': 0.4, 
#     'boosting_type': 'gbdt'}

Create an LGBM Classifier instance and set the best hyperparameters determined by RGS

In [5]:
int_enc_lgbm = lgb.LGBMClassifier()
int_enc_lgbm.set_params(**int_enc_params)
#radom_lgbm.set_params(**{"early_stopping_rounds": None})

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=1700, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=200, n_jobs=-1, num_leaves=90,
               objective='binary', random_state=None, reg_alpha=100,
               reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

Train the model with 15% of the data used as hold-out set

In [6]:
x_train, x_val, y_train, y_val = train_test_split(malware_train, targets, 
                                                test_size=0.15, stratify=targets,
                                                  random_state = 42)

In [7]:
int_enc_lgbm.fit(x_train, y_train,
                early_stopping_rounds = 100,
                eval_set=[(x_val, y_val)],
                feature_name = x_train.columns.to_list(),
                categorical_feature = cat_cols,
                eval_metric = 'auc', 
                verbose = 0)

with open('supposedly_best_lgbm.pickle', 'wb') as handle:
    pickle.dump(int_enc_lgbm, handle, protocol=pickle.HIGHEST_PROTOCOL)

Using categorical_feature in Dataset.
categorical_feature in Dataset is overridden.
New categorical_feature is ['AVProductStatesIdentifier', 'AVProductsEnabled', 'AVProductsInstalled', 'AppVersion', 'AutoSampleOptIn', 'AvSigVersion', 'Census_ActivationChannel', 'Census_ChassisTypeName', 'Census_DeviceFamily', 'Census_FirmwareManufacturerIdentifier', 'Census_FirmwareVersionIdentifier', 'Census_FlightRing', 'Census_GenuineStateName', 'Census_HasOpticalDiskDrive', 'Census_InternalBatteryType', 'Census_IsAlwaysOnAlwaysConnectedCapable', 'Census_IsFlightingInternal', 'Census_IsFlightsDisabled', 'Census_IsPenCapable', 'Census_IsPortableOperatingSystem', 'Census_IsSecureBootEnabled', 'Census_IsTouchEnabled', 'Census_IsVirtualDevice', 'Census_IsWIMBootEnabled', 'Census_MDC2FormFactor', 'Census_OEMModelIdentifier', 'Census_OEMNameIdentifier', 'Census_OSBranch', 'Census_OSBuildNumber', 'Census_OSBuildRevision', 'Census_OSEdition', 'Census_OSInstallLanguageIdentifier', 'Census_OSInstallTypeName',

In [11]:
######## SUBMISSION ##########

# LOAD INTEGER-ENCODED TEST DATA
with open('LGBM_test.pickle', 'rb') as handle:
    test = pickle.load(handle)
    
features = [col for col in test.columns if col != "MachineIdentifier"]
probs = int_enc_lgbm.predict_proba(test[features])

sub_df = pd.read_csv("sample_submission.csv")
sub_df['HasDetections'] = probs
sub_df.to_csv('lgbm_submission.csv', index=False)

The score achieved with the above method was 0.48004. We will now try to understand what was happening 'under the hood'.