In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import gc
import os
# Show which files are present in the input directory.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.
# Plotting, gradient boosting, and scikit-learn splitting / metric helpers.
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

Credit to Theo Viel for defining dtypes.

In [None]:
# Maps each CSV column name to the pandas dtype string used when the training
# file is read (this dict is passed as `dtype=` to pd.read_csv).
# NOTE(review): several *Identifier columns are declared float16. Half precision
# represents integers exactly only up to 2048, so distinct identifier values above
# that could collapse to the same float — confirm the value ranges of those columns.
# NOTE(review): float dtypes are presumably used where a column contains NaN
# (plain integer dtypes cannot hold NaN) — verify against the data.
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

In [None]:
# Read the training CSV using the explicit `dtypes` mapping; the %time magic
# reports how long the read takes.
%time df_train = pd.read_csv('../input/train.csv', dtype=dtypes)

In [None]:
# Display summary statistics for the training frame.
df_train.describe()

In [None]:
# Run a garbage-collection pass. The call parentheses are required: a bare
# `gc.collect` only evaluates to (and displays) the function object without
# collecting anything.
gc.collect()

In [None]:
# Preview the first 10 rows of the training frame.
df_train.head(10)

Numerous NaN values are visible above; they will have to be dealt with at feature-creation time. Let's look at the shape of the data and how it is split between the two "HasDetections" classes, 0 and 1.

In [None]:
# (rows, columns) of the training frame.
df_train.shape

In [None]:
# Count of rows per value of the target column.
df_train['HasDetections'].value_counts()

In [None]:
# Feature list: every column of the training frame except the target.
# NOTE(review): 'MachineIdentifier' stays in this list; it looks like a per-row
# identifier — confirm it is really meant to be a model input.
columns = df_train.columns
col = columns.tolist()
col.remove('HasDetections')

In [None]:
# Hold out 20% of the rows as a test set, then split 20% of the remaining rows
# off as a validation set (0.64 / 0.16 / 0.20 of the data for train / val / test).
# Both calls use a fixed random_state so the split is repeatable.
# X_train / y_train are deliberately rebound by the second call, so the
# intermediate 80% split is not kept under a separate name.
X_train, X_test, y_train, y_test = train_test_split(df_train[col], df_train['HasDetections'], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
# Shapes of the train / test / validation feature frames, in that order.
tuple(frame.shape for frame in (X_train, X_test, X_val))

In [None]:
# Shapes of the train / test / validation target series, in that order.
tuple(target.shape for target in (y_train, y_test, y_val))

In [None]:
print('Train: ')
# Baseline LightGBM binary classifier; hyper-parameters are collected in one
# mapping so they can be read (and tuned) in a single place.
baseline_params = dict(
    objective='binary',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=110,
)
gbm = lgb.LGBMClassifier(**baseline_params)
# Fit on the training split, tracking AUC on the validation split and stopping
# early if it has not improved for 100 rounds. The fitted estimator is the
# cell's displayed value.
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='auc',
    early_stopping_rounds=100,
)

In [None]:
# Best evaluation score(s) recorded on the eval_set during fitting.
gbm.best_score_

In [None]:
# Map each feature name to its importance. The names are taken from X_train —
# the frame the model was fitted on — instead of df_train, which also contains
# the target column and therefore only lines up with the importances when the
# target happens to be the last column.
feat_imp = dict(zip(X_train.columns, list(gbm.feature_importances_)))

In [None]:
print('Predict: ')
# Score the held-out test split with the positive-class probability.
# log_loss and roc_auc_score are meant to be fed probabilities/scores, whereas
# predict() returns hard 0/1 labels, which distorts both metrics.
# No num_iteration override is passed, so the prediction is no longer limited
# to the first 10 boosting rounds; the library default (best iteration when
# early stopping recorded one) is used instead.
y_pred = gbm.predict_proba(X_test)[:, 1]

# Feature names come from the frame the model was fitted on, so they cannot
# drift out of step with the importance vector (df_train also holds the target).
feature_names = X_train.columns
importances = list(gbm.feature_importances_)

print('log_loss: ', log_loss(y_test, y_pred))
print('roc_auc_score: ', roc_auc_score(y_test, y_pred))
print('# Features : ', len(importances))
print('Features : ', feature_names)
print('Importance : ', importances)
print('Feature importance : ', dict(zip(feature_names, importances)))

In [None]:
# Parameter mapping handed to lgb.train for the second, larger model.
model_params = dict(
    boosting_type='gbdt',
    objective='binary',
    learning_rate=0.05,
    max_depth=11,
    num_leaves=2048,
    n_estimators=120,
    min_child_weight=50,
    min_child_samples=20,
    bagging_fraction=0.8,
    feature_fraction=0.8,
    bagging_freq=5,
    bagging_seed=2018,
    reg_alpha=0.005,
    reg_lambda=0.1,
    random_state=1,
    metric="binary_logloss",
)

In [None]:
# Wrap the splits as LightGBM Datasets. The held-out Datasets pass the training
# Dataset as `reference`, as LightGBM expects for validation data.
gbm_train = lgb.Dataset(X_train, y_train)
gbm_val = lgb.Dataset(X_val, y_val, reference=gbm_train)
# Still defined so any later cell that uses it keeps working, but it no longer
# takes part in training.
gbm_test = lgb.Dataset(X_test, y_test, reference=gbm_train)

# Early stopping is monitored on the validation split. Monitoring the test
# split would let test data influence model selection and make any later
# test-set score optimistic.
gbm = lgb.train(model_params,
                gbm_train,
                valid_sets=[gbm_val],
                early_stopping_rounds=100)

In [None]:
# Best score(s) recorded on the validation set(s) for the trained booster.
gbm.best_score

In [None]:
print('Save model: ')
# Write the trained booster to model.txt in the working directory.
gbm.save_model('model.txt')
# List the working directory so the saved file is visible in the cell output.
!ls -al