# Final Project: MicroSoft Malware Prediction - lightgbm.

### Name: Zhengyan Zhuo


In [1]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import lightgbm as lgb
import xgboost as xgb
import time
import datetime
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, KFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import gc
from tqdm import tqdm_notebook
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings("ignore")

import logging
logging.basicConfig(filename='log.txt',level=logging.DEBUG, format='%(asctime)s %(message)s')

## 1. Data preparation

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'category',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'category',
        'AVProductStatesIdentifier':                            'category',
        'AVProductsInstalled':                                  'category',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'category',
        'CityIdentifier':                                       'category',
        'OrganizationIdentifier':                               'category',
        'GeoNameIdentifier':                                    'category',
        'LocaleEnglishNameIdentifier':                          'category',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'category',
        'OsSuite':                                              'category',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'category',
        'IeVerIdentifier':                                      'category',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'category',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'category',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'category',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

In [0]:
train = pd.read_csv('/content/drive/My Drive/train.csv', dtype=dtypes, low_memory=True)
test  = pd.read_csv('/content/drive/My Drive/test.csv', dtype=dtypes, low_memory=True)

In [0]:
train = pd.read_csv('C:/Users/ilbn2/Downloads/microsoft-malware-prediction/train.csv', dtype=dtypes, low_memory=True)
test  = pd.read_csv('C:/Users/ilbn2/Downloads/microsoft-malware-prediction/test.csv', dtype=dtypes, low_memory=True)

In [5]:
stats = []
for col in train.columns:
    stats.append((col, train[col].nunique(), train[col].isnull().sum() * 100 / train.shape[0], train[col].value_counts(normalize=True, dropna=False).values[0] * 100, train[col].dtype))
    
stats_df = pd.DataFrame(stats, columns=['Feature', 'Unique_values', 'Percentage of missing values', 'Percentage of values in the biggest category', 'type'])
stats_df.sort_values('Percentage of missing values', ascending=False)

Unnamed: 0,Feature,Unique_values,Percentage of missing values,Percentage of values in the biggest category,type
28,PuaMode,2,99.974119,99.974119,category
41,Census_ProcessorClass,3,99.589407,99.589407,category
8,DefaultBrowsersIdentifier,2017,95.141637,95.141637,category
68,Census_IsFlightingInternal,2,83.044030,83.044030,float16
52,Census_InternalBatteryType,78,71.046809,71.046809,category
...,...,...,...,...,...
1,ProductName,6,0.000000,98.935569,category
45,Census_HasOpticalDiskDrive,2,0.000000,92.281272,int8
54,Census_OSVersion,469,0.000000,15.845202,category
55,Census_OSArchitecture,3,0.000000,90.858045,category


In [6]:
good_cols = list(train.columns)
unbalanced = []
unbalanced2 = []
for col in train.columns:
    rate = train[col].value_counts(normalize=True, dropna=False).values[0]
    if rate > 0.9:
        unbalanced.append(col)
    elif rate > 0.8:
        unbalanced2.append(col)
print(unbalanced)
print(unbalanced2)
for col in unbalanced:
    good_cols.remove(col)
for col in unbalanced2:
    good_cols.remove(col)

['ProductName', 'IsBeta', 'RtpStateBitfield', 'IsSxsPassiveMode', 'DefaultBrowsersIdentifier', 'AVProductsEnabled', 'HasTpm', 'Platform', 'Processor', 'OsVer', 'IsProtected', 'AutoSampleOptIn', 'PuaMode', 'SMode', 'Firewall', 'UacLuaenable', 'Census_DeviceFamily', 'Census_ProcessorClass', 'Census_HasOpticalDiskDrive', 'Census_OSArchitecture', 'Census_IsPortableOperatingSystem', 'Census_IsFlightsDisabled', 'Census_FlightRing', 'Census_IsVirtualDevice', 'Census_IsPenCapable', 'Census_IsAlwaysOnAlwaysConnectedCapable']
['Census_ProcessorManufacturerIdentifier', 'Census_GenuineStateName', 'Census_IsFlightingInternal', 'Census_IsTouchEnabled']


In [0]:
def partitionby_hasdetection(col, top_n=10):
    top_n = top_n if train[col].nunique() > top_n else train[col].nunique()
    top_cat = list(train[col].value_counts(dropna=False).index[:top_n])
    df0 = train.loc[(train[col].isin(top_cat)) & (train['HasDetections'] == 1), col].value_counts().head(10).sort_index()
    df1 = train.loc[(train[col].isin(top_cat)) & (train['HasDetections'] == 0), col].value_counts().head(10).sort_index()
    data = [go.Bar(x=df0.index, y=df0.values, name='Has Detections'),
                   go.Bar(x=df1.index, y=df1.values, name='No Detections')]
    layout = go.Layout(dict(title = f"Counts of {col} by top-{top_n} categories",
                                xaxis = dict(title = f'{col}',
                                             showgrid=False,
                                             zeroline=False,
                                             showline=False,),
                                yaxis = dict(title = 'Counts',
                                             showgrid=False,
                                             zeroline=False,
                                             showline=False,),
                                ),
                           legend=dict(orientation="v"), barmode='group')
        
    py.iplot(dict(data=data, layout=layout))

In [8]:
for col in unbalanced2:
    partitionby_hasdetection(col)

In [0]:
def plot_feature(col, top_n=10):
    top_n = top_n if train[col].nunique() > top_n else train[col].nunique()
    print(f"{col} has {train[col].nunique()} unique values and type: {train[col].dtype}.")
    print(train[col].value_counts(normalize=True, dropna=False).head())
    df = train.groupby([col]).agg({'HasDetections': ['count', 'mean']})
    df = df.sort_values(('HasDetections', 'count'), ascending=False).head(top_n).sort_index()
    data = [go.Bar(x=df.index, y=df['HasDetections']['count'].values, name='counts'),
            go.Scatter(x=df.index, y=df['HasDetections']['mean'], name='Detections rate', yaxis='y2')]
    layout = go.Layout(dict(title = f"Counts of {col} by top-{top_n} categories and mean target value",
                       xaxis = dict(title = f'{col}', showgrid=False, zeroline=False,showline=False,),
                       yaxis = dict(title = 'Counts',showgrid=False,zeroline=False,showline=False,),
                       yaxis2=dict(title='Detections rate', overlaying='y', side='right')),legend=dict(orientation="v"))
    py.iplot(dict(data=data, layout=layout))

In [0]:
plot_feature('AppVersion')

AppVersion has 110 unique values and type: category.
4.18.1807.18075    0.576050
4.18.1806.18062    0.095380
4.12.16299.15      0.040338
4.10.209.0         0.030539
4.13.17134.1       0.028837
Name: AppVersion, dtype: float64


In [0]:
plot_feature('Census_OSVersion')

Census_OSVersion has 469 unique values and type: category.
10.0.17134.228    0.158452
10.0.17134.165    0.100848
10.0.16299.431    0.061262
10.0.17134.285    0.052713
10.0.16299.547    0.038878
Name: Census_OSVersion, dtype: float64


In [0]:
plot_feature('AvSigVersion')

AvSigVersion has 8531 unique values and type: category.
1.273.1420.0    0.011469
1.263.48.0      0.010987
1.275.1140.0    0.010899
1.275.727.0     0.010362
1.273.371.0     0.009748
Name: AvSigVersion, dtype: float64


In [0]:
plot_feature('Census_TotalPhysicalRAM')

Census_TotalPhysicalRAM has 3446 unique values and type: float32.
4096.0     0.458950
8192.0     0.246204
2048.0     0.123015
16384.0    0.059582
6144.0     0.044687
Name: Census_TotalPhysicalRAM, dtype: float64


In [0]:
good_cols.append('DefaultBrowsersIdentifier')
good_cols.append('AVProductsEnabled')
good_cols.append('Processor')
good_cols.append('IsProtected')
good_cols.append('Census_ProcessorClass')
good_cols.append('Census_OSArchitecture')
good_cols.append('Census_IsVirtualDevice')
good_cols.append('Census_IsAlwaysOnAlwaysConnectedCapable')
good_cols.append('Census_IsFlightingInternal')
good_cols.append('Census_IsTouchEnabled')
good_cols.remove('HasDetections')
#good_cols.remove('MachineIdentifier')

In [0]:
y = train['HasDetections']
train = train[good_cols]
test = test[good_cols]
good_cols.remove('MachineIdentifier')

In [11]:
print(train.columns, train.shape)
print(test.columns, test.shape)

Index(['MachineIdentifier', 'EngineVersion', 'AppVersion', 'AvSigVersion',
       'AVProductStatesIdentifier', 'AVProductsInstalled', 'CountryIdentifier',
       'CityIdentifier', 'OrganizationIdentifier', 'GeoNameIdentifier',
       'LocaleEnglishNameIdentifier', 'OsBuild', 'OsSuite',
       'OsPlatformSubRelease', 'OsBuildLab', 'SkuEdition', 'IeVerIdentifier',
       'SmartScreen', 'Census_MDC2FormFactor', 'Census_OEMNameIdentifier',
       'Census_OEMModelIdentifier', 'Census_ProcessorCoreCount',
       'Census_ProcessorModelIdentifier', 'Census_PrimaryDiskTotalCapacity',
       'Census_PrimaryDiskTypeName', 'Census_SystemVolumeTotalCapacity',
       'Census_TotalPhysicalRAM', 'Census_ChassisTypeName',
       'Census_InternalPrimaryDiagonalDisplaySizeInInches',
       'Census_InternalPrimaryDisplayResolutionHorizontal',
       'Census_InternalPrimaryDisplayResolutionVertical',
       'Census_PowerPlatformRoleName', 'Census_InternalBatteryType',
       'Census_InternalBatteryNumberOf

In [0]:
train['OsBuildLab'] = train['OsBuildLab'].cat.add_categories(['0.0.0.0.0-0'])
train['OsBuildLab'] = train['OsBuildLab'].fillna('0.0.0.0.0-0')
test['OsBuildLab'] = test['OsBuildLab'].cat.add_categories(['0.0.0.0.0-0'])
test['OsBuildLab'] = test['OsBuildLab'].fillna('0.0.0.0.0-0')

In [0]:
def exp_feature(df):
    
    df['primary_drive_c_ratio'] = df['Census_SystemVolumeTotalCapacity']/ df['Census_PrimaryDiskTotalCapacity']
    df['non_primary_drive_MB'] = df['Census_PrimaryDiskTotalCapacity'] - df['Census_SystemVolumeTotalCapacity']
    df['aspect_ratio'] = df['Census_InternalPrimaryDisplayResolutionHorizontal']/ df['Census_InternalPrimaryDisplayResolutionVertical']
    df['ram_per_processor'] = df['Census_TotalPhysicalRAM']/ df['Census_ProcessorCoreCount']
    df['new_num_0'] = df['Census_InternalPrimaryDiagonalDisplaySizeInInches'] / df['Census_ProcessorCoreCount']
    df['new_num_1'] = df['Census_ProcessorCoreCount'] * df['Census_InternalPrimaryDiagonalDisplaySizeInInches']
    
    df['EngineVersion_2'] = df['EngineVersion'].apply(lambda x: x.split('.')[2]).astype('category')
    df['EngineVersion_3'] = df['EngineVersion'].apply(lambda x: x.split('.')[3]).astype('category')

    df['AppVersion_1'] = df['AppVersion'].apply(lambda x: x.split('.')[1]).astype('category')
    df['AppVersion_2'] = df['AppVersion'].apply(lambda x: x.split('.')[2]).astype('category')
    df['AppVersion_3'] = df['AppVersion'].apply(lambda x: x.split('.')[3]).astype('category')

    df['AvSigVersion_0'] = df['AvSigVersion'].apply(lambda x: x.split('.')[0]).astype('category')
    df['AvSigVersion_1'] = df['AvSigVersion'].apply(lambda x: x.split('.')[1]).astype('category')
    df['AvSigVersion_2'] = df['AvSigVersion'].apply(lambda x: x.split('.')[2]).astype('category')

    df['OsBuildLab_0'] = df['OsBuildLab'].apply(lambda x: x.split('.')[0]).astype('category')
    df['OsBuildLab_1'] = df['OsBuildLab'].apply(lambda x: x.split('.')[1]).astype('category')
    df['OsBuildLab_2'] = df['OsBuildLab'].apply(lambda x: x.split('.')[2]).astype('category')
    df['OsBuildLab_3'] = df['OsBuildLab'].apply(lambda x: x.split('.')[3]).astype('category')

    df['Census_OSVersion_0'] = df['Census_OSVersion'].apply(lambda x: x.split('.')[0]).astype('category')
    df['Census_OSVersion_1'] = df['Census_OSVersion'].apply(lambda x: x.split('.')[1]).astype('category')
    df['Census_OSVersion_2'] = df['Census_OSVersion'].apply(lambda x: x.split('.')[2]).astype('category')
    df['Census_OSVersion_3'] = df['Census_OSVersion'].apply(lambda x: x.split('.')[3]).astype('category')
    
    df['Census_IsFlightingInternal'] = df['Census_IsFlightingInternal'].fillna(1)
    df['Census_ThresholdOptIn'] = df['Census_ThresholdOptIn'].fillna(1)
    df['Census_IsWIMBootEnabled'] = df['Census_IsWIMBootEnabled'].fillna(1)
    df['Wdft_IsGamer'] = df['Wdft_IsGamer'].fillna(0)
    
    return df

In [0]:
train = exp_feature(train)
test = exp_feature(test)

In [0]:
usecols = list(train.columns)
usecols.remove('primary_drive_c_ratio')
usecols.remove('non_primary_drive_MB')
usecols.remove('aspect_ratio')
usecols.remove('ram_per_processor')
usecols.remove('new_num_0')
usecols.remove('new_num_1')
#usecols.remove('HasDetections')
usecols.remove('MachineIdentifier')

In [0]:
add_cat_feats = [
 'Census_OSBuildRevision',
 'OsBuildLab',
 'SmartScreen',
'AVProductsInstalled']
for col1 in add_cat_feats:
    for col2 in add_cat_feats:
        if col1 < col2:
            train[col1 + '__' + col2] = train[col1].astype(str) + train[col2].astype(str)
            train[col1 + '__' + col2] = train[col1 + '__' + col2].astype('category')
            
            test[col1 + '__' + col2] = test[col1].astype(str) + test[col2].astype(str)
            test[col1 + '__' + col2] = test[col1 + '__' + col2].astype('category')

In [0]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=True).sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [18]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)
gc.collect()

Mem. usage decreased to 2511.28 Mb (0.7% reduction)
Mem. usage decreased to 2251.73 Mb (0.7% reduction)


56392

In [19]:
print('Transform all features to category.\n')
for usecol in usecols:

    train[usecol] = train[usecol].astype('str')
    test[usecol] = test[usecol].astype('str')
    
    #Label Encoding
    le = LabelEncoder().fit(
            np.unique(train[usecol].unique().tolist()+
                      test[usecol].unique().tolist()))

    train[usecol] = le.transform(train[usecol])+1
    test[usecol]  = le.transform(test[usecol])+1

    fil_tr = (train
              .groupby([usecol])
              .aggregate({'MachineIdentifier':'count'})
              .reset_index()
              .rename({'MachineIdentifier':'Train'}, axis=1))
    fil_te = (test
              .groupby([usecol])
              .aggregate({'MachineIdentifier':'count'})
              .reset_index()
              .rename({'MachineIdentifier':'Test'}, axis=1))

    sparse_filter = pd.merge(fil_tr, fil_te, on=usecol, how='outer').replace(np.nan, 0)
    sparse_filter = sparse_filter[(sparse_filter['Train'] > 1000)].reset_index(drop=True)
    sparse_filter['Total'] = sparse_filter['Train'] + sparse_filter['Test']
    sparse_filter = sparse_filter[(sparse_filter['Train'] / sparse_filter['Total'] > 0.2) & (sparse_filter['Train'] / sparse_filter['Total'] < 0.8)]
    sparse_filter[usecol+'Copy'] = sparse_filter[usecol]

    train[usecol] = (pd.merge(train[[usecol]], 
                              sparse_filter[[usecol, usecol+'Copy']], 
                              on=usecol, how='left')[usecol+'Copy']
                     .replace(np.nan, 0).astype('int').astype('category'))

    test[usecol]  = (pd.merge(test[[usecol]], 
                              sparse_filter[[usecol, usecol+'Copy']], 
                              on=usecol, how='left')[usecol+'Copy']
                     .replace(np.nan, 0).astype('int').astype('category'))

    del le, fil_tr, fil_te, sparse_filter, usecol
    gc.collect()
          
gc.collect()


Transform all features to category.



0

In [20]:
del train['MachineIdentifier'], test['MachineIdentifier']
gc.collect()
train.head(20)

Unnamed: 0,EngineVersion,AppVersion,AvSigVersion,AVProductStatesIdentifier,AVProductsInstalled,CountryIdentifier,CityIdentifier,OrganizationIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,OsBuild,OsSuite,OsPlatformSubRelease,OsBuildLab,SkuEdition,IeVerIdentifier,SmartScreen,Census_MDC2FormFactor,Census_OEMNameIdentifier,Census_OEMModelIdentifier,Census_ProcessorCoreCount,Census_ProcessorModelIdentifier,Census_PrimaryDiskTotalCapacity,Census_PrimaryDiskTypeName,Census_SystemVolumeTotalCapacity,Census_TotalPhysicalRAM,Census_ChassisTypeName,Census_InternalPrimaryDiagonalDisplaySizeInInches,Census_InternalPrimaryDisplayResolutionHorizontal,Census_InternalPrimaryDisplayResolutionVertical,Census_PowerPlatformRoleName,Census_InternalBatteryType,Census_InternalBatteryNumberOfCharges,Census_OSVersion,Census_OSBranch,Census_OSBuildNumber,Census_OSBuildRevision,Census_OSEdition,Census_OSSkuName,Census_OSInstallTypeName,...,Wdft_IsGamer,Wdft_RegionIdentifier,DefaultBrowsersIdentifier,AVProductsEnabled,Processor,IsProtected,Census_ProcessorClass,Census_OSArchitecture,Census_IsVirtualDevice,Census_IsAlwaysOnAlwaysConnectedCapable,Census_IsFlightingInternal,Census_IsTouchEnabled,primary_drive_c_ratio,non_primary_drive_MB,aspect_ratio,ram_per_processor,new_num_0,new_num_1,EngineVersion_2,EngineVersion_3,AppVersion_1,AppVersion_2,AppVersion_3,AvSigVersion_0,AvSigVersion_1,AvSigVersion_2,OsBuildLab_0,OsBuildLab_1,OsBuildLab_2,OsBuildLab_3,Census_OSVersion_0,Census_OSVersion_1,Census_OSVersion_2,Census_OSVersion_3,Census_OSBuildRevision__OsBuildLab,Census_OSBuildRevision__SmartScreen,OsBuildLab__SmartScreen,AVProductsInstalled__Census_OSBuildRevision,AVProductsInstalled__OsBuildLab,AVProductsInstalled__SmartScreen
0,0,0,0,26978,2,145,24112,10,226,81,7,4,5,308,7,28,21,3,1231,0,31,1312,6049,1,0,3147,29,143,450,1687,2,72,29959,0,13,122,0,22,18,7,...,1,2,2610,2,2,2,4,1,1,1,2,1,0.62793,177489.0,1.599609,1024.0,4.726562,75.625,0,0,9,0,0,2,0,0,35,2,2,18,1,1,122,0,16517134.1.amd64fre.rs4_release.180410-1804,165nan,17134.1.amd64fre.rs4_release.180410-1804nan,1165,117134.1.amd64fre.rs4_release.180410-1804,1nan
1,57,30,6466,26978,2,216,41795,10,23,242,7,4,5,308,7,28,21,9,1231,0,31,1370,6049,1,3718,3147,43,84,374,1554,4,72,2,412,13,122,2,22,18,3,...,1,14,2610,2,2,2,4,1,1,1,2,1,0.214722,374555.0,1.77832,1024.0,3.474609,55.59375,56,0,4,8,2,2,45,2215,35,2,2,18,1,1,122,2,117134.1.amd64fre.rs4_release.180410-1804,1nan,17134.1.amd64fre.rs4_release.180410-1804nan,11,117134.1.amd64fre.rs4_release.180410-1804,1nan
2,0,0,0,26978,2,208,0,10,257,225,7,13,5,308,5,28,18,3,2230,162982,31,955,597,2,17014,3147,29,173,930,82,2,72,29959,0,13,122,0,5,3,7,...,1,9,2610,2,2,2,4,1,1,1,2,1,0.995117,566.0,1.777344,1024.0,5.375,86.0,0,0,9,0,0,2,0,0,35,2,2,18,1,1,122,0,16517134.1.amd64fre.rs4_release.180410-1804,165RequireAdmin,17134.1.amd64fre.rs4_release.180410-1804Requir...,1165,117134.1.amd64fre.rs4_release.180410-1804,1RequireAdmin
3,0,0,0,26978,2,210,62100,52,21,19,7,4,5,308,7,28,10,3,397,131543,31,1243,3173,3,0,3147,41,139,374,1554,2,72,29959,0,13,122,0,22,18,7,...,1,9,2610,2,2,2,4,1,1,1,2,1,0.952148,11359.0,1.77832,1024.0,4.625,74.0,0,0,9,0,0,2,0,0,35,2,2,18,1,1,122,0,22817134.1.amd64fre.rs4_release.180410-1804,228ExistsNotSet,17134.1.amd64fre.rs4_release.180410-1804Exists...,1228,117134.1.amd64fre.rs4_release.180410-1804,1ExistsNotSet
4,0,0,0,26978,2,90,76743,52,198,254,7,13,5,308,5,28,18,9,397,0,31,1461,6049,1,3013,4214,47,89,374,1554,4,62,1,0,13,122,0,5,3,8,...,1,1,2610,2,2,2,4,1,1,1,1,1,0.213623,375040.0,1.77832,1536.0,3.5,56.0,0,0,9,0,0,2,0,0,35,2,2,18,1,1,122,0,19117134.1.amd64fre.rs4_release.180410-1804,191RequireAdmin,17134.1.amd64fre.rs4_release.180410-1804Requir...,1191,117134.1.amd64fre.rs4_release.180410-1804,1RequireAdmin
5,0,0,0,26978,2,220,0,19,31,29,7,4,5,308,7,28,18,3,1804,0,17,3239,597,2,16761,4947,29,173,930,82,2,72,29959,0,13,122,0,22,18,7,...,1,7,2610,2,2,2,4,1,1,1,1,1,0.993164,802.0,1.777344,4096.0,10.75,43.0,0,0,9,0,0,2,0,0,35,2,2,18,1,1,122,0,16517134.1.amd64fre.rs4_release.180410-1804,165RequireAdmin,17134.1.amd64fre.rs4_release.180410-1804Requir...,1165,117134.1.amd64fre.rs4_release.180410-1804,1RequireAdmin
6,0,0,0,20716,3,199,0,52,284,268,7,13,5,308,5,28,21,9,1804,0,17,3573,6049,1,0,3147,43,125,611,1687,4,72,1,0,13,122,0,5,3,3,...,1,2,2610,2,2,2,4,1,1,1,2,1,0.961914,18238.0,1.777344,2048.0,8.601562,34.40625,0,0,9,0,0,2,0,0,35,2,2,18,1,1,122,0,16517134.1.amd64fre.rs4_release.180410-1804,165nan,17134.1.amd64fre.rs4_release.180410-1804nan,2165,217134.1.amd64fre.rs4_release.180410-1804,2nan
7,0,0,0,26978,2,220,43652,19,31,29,3,13,2,164,5,348,18,9,2483,0,17,2324,4181,1,0,3147,43,108,374,1554,4,62,1,163,2,47,1,5,3,9,...,1,7,2610,2,2,2,4,1,1,1,1,1,0.952637,14438.0,1.77832,2048.0,7.75,31.0,0,0,9,0,0,2,0,0,22,1,2,3,1,1,47,1,014393.0.amd64fre.rs1_release.160715-1616,0RequireAdmin,14393.0.amd64fre.rs1_release.160715-1616Requir...,10,114393.0.amd64fre.rs1_release.160715-1616,1RequireAdmin
8,0,0,0,26978,2,73,0,19,119,82,7,4,5,308,7,28,18,9,993,0,31,1792,4181,1,0,3147,43,109,930,82,4,62,1,0,13,122,0,22,18,8,...,1,7,2610,2,2,2,4,1,1,1,2,1,0.995605,1353.0,1.777344,1024.0,3.900391,62.40625,0,0,9,0,0,2,0,0,35,2,2,18,1,1,122,0,25417134.1.amd64fre.rs4_release.180410-1804,254RequireAdmin,17134.1.amd64fre.rs4_release.180410-1804Requir...,1254,117134.1.amd64fre.rs4_release.180410-1804,1RequireAdmin
9,0,0,0,22402,3,216,0,19,23,242,5,13,4,294,5,351,18,9,2537,68991,31,1343,8637,1,0,4947,43,108,374,1554,4,62,1,364,11,102,242,8,6,9,...,2,14,2610,2,2,2,4,1,1,1,2,1,0.213135,750617.0,1.77832,2048.0,3.875,62.0,0,0,9,0,0,2,0,2794,33,284,2,15,1,1,102,242,43116299.431.amd64fre.rs3_release_svc_escrow.1...,431RequireAdmin,16299.431.amd64fre.rs3_release_svc_escrow.1805...,2431,216299.431.amd64fre.rs3_release_svc_escrow.180...,2RequireAdmin


In [0]:
n_fold = 5
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)
# folds = TimeSeriesSplit(n_splits=5)

In [0]:
def predict_chunk(model, test):
  initial_idx = 0
  chunk_size = 1000000
  current_pred = np.zeros(len(test))
  while initial_idx < test.shape[0]:
    final_idx = min(initial_idx + chunk_size, test.shape[0])
    idx = range(initial_idx, final_idx)
    current_pred[idx] = model.predict_proba(test.iloc[idx])[:,1]
    initial_idx = final_idx
  return current_pred


In [0]:
def train_model(X, label):
  print('\nLightGBM\n')
  prediction = np.zeros(len(test))
  for fold_n, (train_index, valid_index) in enumerate(folds.split(X, label)):
    gc.collect()
    print('Fold', fold_n + 1, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = label.iloc[train_index], label.iloc[valid_index]
    gc.collect()
    lgb_model = lgb.LGBMClassifier(max_depth=-1,
                                   n_estimators=30000,
                                   learning_rate=0.05,
                                   num_leaves=2**12-1,
                                   colsample_bytree=0.28,
                                   objective='binary', 
                                   n_jobs=-1)
                                   
    #xgb_model = xgb.XGBClassifier(max_depth=6,
    #                              n_estimators=30000,
    #                              colsample_bytree=0.2,
    #                              learning_rate=0.1,
    #                              objective='binary:logistic', 
    #                              n_jobs=-1)
    
                               
    lgb_model.fit(X_train, y_train, eval_metric='auc', 
                  eval_set=[(X_valid, y_valid)], 
                  verbose=100, early_stopping_rounds=100)             
    #xgb_model.fit(X_train, y_train, eval_metric='auc', 
    #              eval_set=[(X_valid, y_valid)], 
    #              verbose=1000, early_stopping_rounds=300)

    prediction += predict_chunk(lgb_model, test)
    #prediction += predict_chunk(xgb_model, test)
            
    del X_train, X_valid, y_train, y_valid
  prediction /= n_fold
  return prediction



In [24]:
print(train.shape)

(8921483, 89)


In [0]:
pre1 = train_model(train, y)
submission = pd.read_csv('/content/drive/My Drive/sample_submission.csv')
submission['HasDetections'] = pre1
submission.to_csv('/content/drive/My Drive/final_test.csv', index=False)


LightGBM

Fold 1 started at Sun Dec  1 21:01:36 2019
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.603952	valid_0's auc: 0.730456
[200]	valid_0's binary_logloss: 0.601434	valid_0's auc: 0.732643
[300]	valid_0's binary_logloss: 0.601457	valid_0's auc: 0.732695
Early stopping, best iteration is:
[249]	valid_0's binary_logloss: 0.601094	valid_0's auc: 0.733038
Fold 2 started at Sun Dec  1 21:51:00 2019
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.604287	valid_0's auc: 0.730139
[200]	valid_0's binary_logloss: 0.600993	valid_0's auc: 0.733213
[300]	valid_0's binary_logloss: 0.601203	valid_0's auc: 0.732988
Early stopping, best iteration is:
[240]	valid_0's binary_logloss: 0.600845	valid_0's auc: 0.733355
Fold 3 started at Sun Dec  1 22:36:50 2019
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.604536	valid_0's auc: 0.729915
[200]	valid_0's 