# Sai Ram

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
#import xgboost as xgb
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
#from sklearn.metrics import roc_auc_score
import gc
gc.enable()

In [3]:
# Column -> dtype map handed to pd.read_csv so the CSVs are parsed straight into
# compact dtypes (categories, small ints, half/single precision floats).
# NOTE(review): float16 cannot represent every integer above 2048, so any
# identifier-style column stored as float16 whose ids exceed that would have
# distinct ids collapsed on load -- TODO confirm the value ranges of those columns.
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

In [None]:
print('Download Train and Test Data.\n')
# Both CSVs are parsed with the same dtype map; afterwards the MachineIdentifier
# column is overwritten with the row index stored as uint32.
read_opts = dict(dtype=dtypes, low_memory=True)
train = pd.read_csv('../input/train.csv', **read_opts)
test = pd.read_csv('../input/test.csv', **read_opts)
train['MachineIdentifier'] = train.index.astype('uint32')
test['MachineIdentifier'] = test.index.astype('uint32')


Download Train and Test Data.



In [None]:
print('Download Train and Test Data.\n')
# NOTE(review): the preceding cell performs this same load; it is repeated here so
# this cell can be re-run on its own after `train`/`test` have been consumed below.
read_opts = dict(dtype=dtypes, low_memory=True)
train = pd.read_csv('../input/train.csv', **read_opts)
test = pd.read_csv('../input/test.csv', **read_opts)
# Overwrite the MachineIdentifier column with the row index stored as uint32.
train['MachineIdentifier'] = train.index.astype('uint32')
test['MachineIdentifier'] = test.index.astype('uint32')

gc.collect()

print('Transform all features to category.\n')
# Every column except the first and the last one of `train` is treated as a
# categorical feature (assumes those two are MachineIdentifier and HasDetections
# -- TODO confirm against the CSV column order).
for usecol in train.columns.tolist()[1:-1]:

    train[usecol] = train[usecol].astype('str')
    test[usecol] = test[usecol].astype('str')

    #Fit LabelEncoder on the union of train and test values so both share one code space
    le = LabelEncoder().fit(
            np.unique(train[usecol].unique().tolist()+
                      test[usecol].unique().tolist()))

    #At the end 0 will be used for dropped values
    train[usecol] = le.transform(train[usecol])+1
    test[usecol]  = le.transform(test[usecol])+1

    # Per-code row counts in each data set (MachineIdentifier is only used as a non-null column to count).
    agg_tr = (train
              .groupby([usecol])
              .aggregate({'MachineIdentifier':'count'})
              .reset_index()
              .rename({'MachineIdentifier':'Train'}, axis=1))
    agg_te = (test
              .groupby([usecol])
              .aggregate({'MachineIdentifier':'count'})
              .reset_index()
              .rename({'MachineIdentifier':'Test'}, axis=1))

    # Codes missing from one side get a count of 0 there.
    agg = pd.merge(agg_tr, agg_te, on=usecol, how='outer').fillna(0)
    #Select values with more than 1000 observations
    agg = agg[agg['Train'] > 1000]
    #Drop unbalanced values: keep codes whose train share of all rows lies strictly between 20% and 80%
    train_share = agg['Train'] / (agg['Train'] + agg['Test'])
    kept_codes = agg.loc[(train_share > 0.2) & (train_share < 0.8), usecol]

    # Codes that did not survive the filters collapse into the shared code 0.
    # A membership test replaces the former left-merge against a copy column,
    # which wrote into a filtered slice (SettingWithCopyWarning) and joined
    # every row of both frames once per feature.
    train[usecol] = (train[usecol]
                     .where(train[usecol].isin(kept_codes), 0)
                     .astype('int').astype('category'))

    test[usecol]  = (test[usecol]
                     .where(test[usecol].isin(kept_codes), 0)
                     .astype('int').astype('category'))

    del le, agg_tr, agg_te, agg, train_share, kept_codes
    gc.collect()
          
# Take the label out of the frame: pop returns the column and removes it in one step.
y_train = np.array(train.pop('HasDetections'))
train_ids, test_ids = train.index, test.index

# MachineIdentifier is dropped from both frames so only feature columns remain.
del train['MachineIdentifier']
del test['MachineIdentifier']
gc.collect()

print("If you don't want use Sparse Matrix choose Kernel Version 2 to get simple solution.\n")

print('--------------------------------------------------------------------------------------------------------')
print('Transform Data to Sparse Matrix.')
print('Sparse Matrix can be used to fit a lot of models, eg. XGBoost, LightGBM, Random Forest, K-Means and etc.')
print('To concatenate Sparse Matrices by column use hstack()')
print('Read more about Sparse Matrix https://docs.scipy.org/doc/scipy/reference/sparse.html')
print('Good Luck!')
print('--------------------------------------------------------------------------------------------------------')

#Fit OneHotEncoder
# Sparse output is the encoder's default, so the explicit `sparse=True` is left
# out: newer scikit-learn releases renamed that keyword to `sparse_output`.
# NOTE(review): the encoder is fitted on train only; a category that appears in
# `test` but not in `train` would make transform raise -- confirm this cannot happen.
ohe = OneHotEncoder(categories='auto', dtype='uint8').fit(train)

#Transform data using small groups to reduce memory usage
# Chunk starts are generated with a stepped range so no empty trailing chunk is
# produced when the row count is an exact multiple of m (transform rejects 0 rows).
m = 100000
train = vstack([ohe.transform(train.iloc[start:start + m]) for start in range(0, train.shape[0], m)])
test  = vstack([ohe.transform(test.iloc[start:start + m])  for start in range(0, test.shape[0], m)])
save_npz('train.npz', train, compressed=True)
save_npz('test.npz',  test,  compressed=True)

# The matrices are reloaded from the .npz files later, so the in-memory copies can go.
del ohe, train, test
gc.collect()

# Stratified, shuffled 5-fold split of the training rows with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Running sum of per-fold test probabilities; it is divided by `counter` after the loop.
lgb_test_result = np.zeros(test_ids.shape[0])
counter = 0

print('\nLightGBM\n')

for fit_rows, val_rows in skf.split(train_ids, y_train):

    print('Fold {}\n'.format(counter + 1))

    # Reload the one-hot matrix from disk and gather this fold's rows in chunks of m.
    train = load_npz('train.npz')
    n_fit_chunks = fit_rows.shape[0] // m + 1
    n_val_chunks = val_rows.shape[0] // m + 1
    X_fit = vstack([train[fit_rows[k*m:(k+1)*m]] for k in range(n_fit_chunks)])
    X_val = vstack([train[val_rows[k*m:(k+1)*m]] for k in range(n_val_chunks)])
    X_fit = csr_matrix(X_fit, dtype='float32')
    X_val = csr_matrix(X_val, dtype='float32')
    y_fit = y_train[fit_rows]
    y_val = y_train[val_rows]

    # Free the full matrix before training.
    del train
    gc.collect()

    lgb_model = lgb.LGBMClassifier(objective='binary',
                                   n_estimators=30000,
                                   learning_rate=0.05,
                                   max_depth=-1,
                                   num_leaves=2**12-1,
                                   colsample_bytree=0.28,
                                   n_jobs=-1)

    # Validation AUC is logged every 100 rounds; training stops after 100 rounds
    # without improvement.
    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords depend on
    # the installed LightGBM version -- confirm before upgrading.
    lgb_model.fit(X_fit, y_fit,
                  eval_set=[(X_val, y_val)],
                  eval_metric='auc',
                  verbose=100,
                  early_stopping_rounds=100)

    # Drop the fold data before the test matrix is loaded.
    del X_fit, X_val, y_fit, y_val, fit_rows, val_rows
    gc.collect()

    test = load_npz('test.npz')
    test = csr_matrix(test, dtype='float32')
    lgb_test_result += lgb_model.predict_proba(test)[:,1]
    counter += 1

    del test
    gc.collect()

    #Stop fitting to prevent time limit error
    #if counter == 3 : break

# Average the accumulated per-fold test probabilities and write them into the
# sample submission's HasDetections column.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as test.csv -- confirm.
mean_test_pred = lgb_test_result / counter
submission = pd.read_csv('../input/sample_submission.csv')
submission['HasDetections'] = mean_test_pred
submission.to_csv('lgb_submission.csv', index=False)

print('\nDone.')

In [None]:
import dask.dataframe as dd

In [None]:
import time
starttime = time.time()

In [None]:
time.time() - starttime

In [None]:
random_state = 42
np.random.seed(random_state)

# **Strategy**
## **Fingerprint-like computation** - append different variables as a single identifier


In [None]:
# Column -> dtype map for the second pipeline's CSV load.
# MachineIdentifier is read as a plain object column, identifier-style and flag
# columns as 'category', and the size/capacity/resolution measurements as float32.
dtypes = {
        'MachineIdentifier':                                    'object',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'category',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'category',
        'AVProductStatesIdentifier':                            'category',
        'AVProductsInstalled':                                  'category',
        'AVProductsEnabled':                                    'category',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'category',
        'OrganizationIdentifier':                               'category',
        'GeoNameIdentifier':                                    'category',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'category',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'category',
        'IeVerIdentifier':                                      'category',
        'SmartScreen':                                          'category',
        'Firewall':                                             'category',
        'UacLuaenable':                                         'category',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'category',
        'Census_OEMModelIdentifier':                            'category',
        'Census_ProcessorCoreCount':                            'category',
        'Census_ProcessorManufacturerIdentifier':               'category',
        'Census_ProcessorModelIdentifier':                      'category',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float32',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'category',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'category',
        'Census_IsFlightsDisabled':                             'category',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'category',
        'Census_FirmwareManufacturerIdentifier':                'category',
        'Census_FirmwareVersionIdentifier':                     'category',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'category',
        'Census_IsVirtualDevice':                               'category',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'category',
        'Wdft_IsGamer':                                         'category',
        'Wdft_RegionIdentifier':                                'category',
        'HasDetections':                                        'int8'
        }

In [None]:
# dtypes = {
#         'MachineIdentifier':                                    'category',
#         'ProductName':                                          'category',
#         'EngineVersion':                                        'category',
#         'AppVersion':                                           'category',
#         'AvSigVersion':                                         'category',
#         'IsBeta':                                               'int8',
#         'RtpStateBitfield':                                     'float16',
#         'IsSxsPassiveMode':                                     'int8',
#         'DefaultBrowsersIdentifier':                            'float16',
#         'AVProductStatesIdentifier':                            'float32',
#         'AVProductsInstalled':                                  'float16',
#         'AVProductsEnabled':                                    'float16',
#         'HasTpm':                                               'int8',
#         'CountryIdentifier':                                    'int16',
#         'CityIdentifier':                                       'float32',
#         'OrganizationIdentifier':                               'float16',
#         'GeoNameIdentifier':                                    'float16',
#         'LocaleEnglishNameIdentifier':                          'int8',
#         'Platform':                                             'category',
#         'Processor':                                            'category',
#         'OsVer':                                                'category',
#         'OsBuild':                                              'int16',
#         'OsSuite':                                              'int16',
#         'OsPlatformSubRelease':                                 'category',
#         'OsBuildLab':                                           'category',
#         'SkuEdition':                                           'category',
#         'IsProtected':                                          'float16',
#         'AutoSampleOptIn':                                      'int8',
#         'PuaMode':                                              'category',
#         'SMode':                                                'float16',
#         'IeVerIdentifier':                                      'float16',
#         'SmartScreen':                                          'category',
#         'Firewall':                                             'float16',
#         'UacLuaenable':                                         'float32',
#         'Census_MDC2FormFactor':                                'category',
#         'Census_DeviceFamily':                                  'category',
#         'Census_OEMNameIdentifier':                             'float16',
#         'Census_OEMModelIdentifier':                            'float32',
#         'Census_ProcessorCoreCount':                            'float16',
#         'Census_ProcessorManufacturerIdentifier':               'float16',
#         'Census_ProcessorModelIdentifier':                      'float16',
#         'Census_ProcessorClass':                                'category',
#         'Census_PrimaryDiskTotalCapacity':                      'float32',
#         'Census_PrimaryDiskTypeName':                           'category',
#         'Census_SystemVolumeTotalCapacity':                     'float32',
#         'Census_HasOpticalDiskDrive':                           'int8',
#         'Census_TotalPhysicalRAM':                              'float32',
#         'Census_ChassisTypeName':                               'category',
#         'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
#         'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
#         'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
#         'Census_PowerPlatformRoleName':                         'category',
#         'Census_InternalBatteryType':                           'category',
#         'Census_InternalBatteryNumberOfCharges':                'float32',
#         'Census_OSVersion':                                     'category',
#         'Census_OSArchitecture':                                'category',
#         'Census_OSBranch':                                      'category',
#         'Census_OSBuildNumber':                                 'int16',
#         'Census_OSBuildRevision':                               'int32',
#         'Census_OSEdition':                                     'category',
#         'Census_OSSkuName':                                     'category',
#         'Census_OSInstallTypeName':                             'category',
#         'Census_OSInstallLanguageIdentifier':                   'float16',
#         'Census_OSUILocaleIdentifier':                          'int16',
#         'Census_OSWUAutoUpdateOptionsName':                     'category',
#         'Census_IsPortableOperatingSystem':                     'int8',
#         'Census_GenuineStateName':                              'category',
#         'Census_ActivationChannel':                             'category',
#         'Census_IsFlightingInternal':                           'float16',
#         'Census_IsFlightsDisabled':                             'float16',
#         'Census_FlightRing':                                    'category',
#         'Census_ThresholdOptIn':                                'float16',
#         'Census_FirmwareManufacturerIdentifier':                'float16',
#         'Census_FirmwareVersionIdentifier':                     'float32',
#         'Census_IsSecureBootEnabled':                           'int8',
#         'Census_IsWIMBootEnabled':                              'float16',
#         'Census_IsVirtualDevice':                               'float16',
#         'Census_IsTouchEnabled':                                'int8',
#         'Census_IsPenCapable':                                  'int8',
#         'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
#         'Wdft_IsGamer':                                         'float16',
#         'Wdft_RegionIdentifier':                                'float16',
#         'HasDetections':                                        'int8'
#         }

In [None]:
# read data
#
# Loaded eagerly with pandas instead of dask: the cells that follow call
# pandas-only operations on these frames (`.index.tolist()`,
# `drop(..., inplace=True)`, `pd.concat`, `.loc` assignment), which lazy dask
# frames do not provide. MachineIdentifier stays a regular column here; it is
# moved into the index by the next cell.
df_train = pd.read_csv('../input/train.csv', dtype=dtypes, low_memory=True)
df_test = pd.read_csv('../input/test.csv', dtype=dtypes, low_memory=True)

In [None]:
# set_index returns a new frame, so the result has to be rebound -- calling it
# without assignment leaves the original index in place.
# The test frame gets the same index because later cells separate train from
# test by index membership and read the test ids from df_test.index.
# The column check keeps the cell safe to re-run once the index is already set.
if 'MachineIdentifier' in df_train.columns:
    df_train = df_train.set_index('MachineIdentifier')
if 'MachineIdentifier' in df_test.columns:
    df_test = df_test.set_index('MachineIdentifier')

In [None]:
# Snapshot of the training frame's index labels, taken before train and test are
# stacked; a later cell uses it with index.isin(...) to split them apart again.
train_index = df_train.index.tolist()

In [None]:
# del test_index

In [None]:
# Label column as the smallest unsigned integer dtype that fits its values.
# pd.to_numeric is applied once to the whole Series: Series.apply would invoke it
# separately for every element, which is slow and does not downcast the column.
target = pd.to_numeric(df_train.HasDetections.astype(int), downcast='unsigned')

In [None]:
# The label now lives in `target`; remove it from the feature frame.
df_train = df_train.drop(columns='HasDetections')

In [None]:
# Stack the test rows underneath the train rows so the encodings that follow are
# computed over both sets together. Despite its name, df_train holds train AND
# test rows from here until the index-based split further down.
df_train = pd.concat([df_train,df_test])

In [None]:
del df_test

In [None]:
# df_train.head()

In [None]:
df_train.shape

In [None]:
# Re-cast every object-typed column to 'category'.
object_cols = df_train.columns[df_train.dtypes == 'object'].tolist()
for col in object_cols:
    df_train[col] = df_train[col].astype('category')

In [None]:
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a DataFrame or Series as an 'x.xx MB' string.

    For a DataFrame the per-column figures from ``memory_usage(deep=True)`` are
    summed; any other object is assumed to be a Series, whose ``memory_usage``
    already returns a single number. Megabytes are computed as bytes / 1024**2.
    """
    reported = pandas_obj.memory_usage(deep=True)
    total_bytes = reported.sum() if isinstance(pandas_obj, pd.DataFrame) else reported
    return "{:03.2f} MB".format(total_bytes / 1024 ** 2)

In [None]:
mem_usage(df_train)

In [None]:
# df_train.head()

In [None]:
del dtypes

In [None]:
# Debug aid: print every name in the current namespace with its sys.getsizeof size.
from __future__ import print_function
import sys

# Iterate over a list snapshot: the loop itself binds `var` and `obj` in this same
# namespace, and iterating the live dict while it grows raises
# "RuntimeError: dictionary changed size during iteration".
for var, obj in list(locals().items()):
    print(var, sys.getsizeof(obj))

In [None]:
# del _46

In [None]:
# pd.set_option('display.max_rows', 30)
# pd.set_option('display.max_columns', 500)

In [None]:
# Columns kept as numeric measurements; every other column is turned into
# categorical codes further down.
numeric = [
    'Census_PrimaryDiskTotalCapacity',
    'Census_SystemVolumeTotalCapacity',
    'Census_TotalPhysicalRAM',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'Census_InternalPrimaryDisplayResolutionHorizontal',
    'Census_InternalPrimaryDisplayResolutionVertical',
    'Census_InternalBatteryNumberOfCharges',
]

In [None]:
# df_train = df_train.loc[:,~df_train.columns.isin(numeric)].apply(lambda x: pd.factorize(x)[0]).apply(pd.to_numeric, downcast = 'unsigned')

In [None]:
# df_train.shape

In [None]:
# Numeric measurement columns: missing values become 0, values are truncated to
# integers, then stored in the smallest unsigned dtype that fits.
# Each column is assigned by name so it is replaced together with its dtype; a
# whole-block `.loc[:, mask] = ...` assignment may write into the existing float
# columns instead (pandas-version dependent), silently discarding the downcast.
for col in df_train.columns[df_train.columns.isin(numeric)]:
    df_train[col] = pd.to_numeric(df_train[col].fillna(0).astype(int), downcast='unsigned')

In [None]:
# df_train = pd.concat([df_train, df_train_numrtic], axis=1, sort=False)

In [None]:
# del df_train_numrtic

In [None]:
# df_train.columns.tolist().isin(numeric)
# np.setdiff1d(df_train.columns.tolist(),numeric)

In [None]:
# pd.Categorical(df_train[column]).codes

In [None]:
# Replace every non-numeric column by its integer category codes
# (pd.Categorical assigns -1 to missing values).
numeric_names = set(numeric)
for column in [name for name in df_train.columns.tolist() if name not in numeric_names]:
    df_train[column] = pd.Categorical(df_train[column]).codes

In [None]:
df_train.shape

In [None]:
del numeric

In [None]:
# Separate the stacked frame again: rows whose index label was recorded in
# train_index go back to df_train, all remaining rows become df_test.
in_train = df_train.index.isin(train_index)
df_test = df_train.loc[~in_train]
df_train = df_train.loc[in_train]
del in_train

In [None]:
del train_index

In [None]:
df_train = pd.concat([df_train, target], axis=1, sort=False)

In [None]:
del target

In [None]:
# df_train = pd.concat([df_train_obj, df_train_num, target], axis=1, sort=False)
# df_test = pd.concat([df_test_obj, df_test_num], axis=1, sort=False)

In [None]:
# df_test.head()

In [None]:
mem_usage(df_test)

In [None]:
mem_usage(df_train)

In [None]:
# df_test.to_csv('../input/df_test.csv')
# df_train.to_hdf('../input/df_train.csv')

In [None]:
test_ID = df_test.index.values
Y = df_train.HasDetections.values.astype(int)

In [None]:
# Move the index back into an ordinary column on both frames.
df_train = df_train.reset_index()
df_test = df_test.reset_index()

In [None]:
df_train = df_train.drop(['MachineIdentifier','HasDetections'], axis=1)
df_test = df_test.drop(['MachineIdentifier'], axis=1)

In [None]:
# df_train = df_train.fillna(value=0)
# df_test = df_test.fillna(value=0)

In [None]:
# One array holding the train rows followed by the test rows; a later cell cuts it
# at len(Y) to recover the two parts.
# NOTE(review): `.values` on a frame with mixed column dtypes yields a single
# common dtype for the whole array -- check the resulting dtype and memory use.
X = pd.concat([df_train,df_test], axis=0, sort=False, ignore_index=True).values

In [None]:
del df_train, df_test

In [None]:
# start training of GaussianNB

from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import QuantileTransformer

start_time = time.time()

In [None]:
# The first len(Y) rows of the stacked array are the training rows, the rest are test rows.
# NOTE(review): X is rebound to its own prefix, so running this cell a second
# time leaves Xt empty -- re-create X before re-running.
n_train = len(Y)
X, Xt = X[:n_train], X[n_train:]

In [None]:
# Stage 1 model: quantile-map every feature to a normal distribution, then fit
# Gaussian naive Bayes on the result.
to_normal = QuantileTransformer(output_distribution='normal')
clf = make_pipeline(to_normal, GaussianNB())
clf.fit(X, Y)

In [None]:
# Residual of the naive Bayes stage on the training rows: label minus the
# predicted probability of class 1 (in-sample predictions, no hold-out).
y_diff = Y - clf.predict_proba( X )[:,1]

In [None]:
yt_nb = clf.predict_proba( Xt )[:,1]

In [None]:
# From here on Y is the naive Bayes residual, which is what the LightGBM
# regression below is trained on -- it no longer holds the 0/1 labels.
Y = y_diff

In [None]:
# start training of LightGBM

import lightgbm as lgb
from sklearn.model_selection import KFold

In [None]:
# Stage 2: LightGBM regression on the naive Bayes residuals, one model per fold.
# `valid` accumulates each model's prediction for the TEST rows (Xt); it is
# averaged over n_predict in the next cell.
n_predict = 0
valid = np.zeros( (len(test_ID),) )

# Contiguous, unshuffled folds. random_state is deliberately not passed: it has no
# effect without shuffling, and recent scikit-learn raises ValueError for
# KFold(shuffle=False, random_state=...).
folds = KFold(n_splits=10, shuffle=False)

for fold_id, (IDX_train, IDX_test) in enumerate(folds.split(Y)):
    X_train = X[IDX_train]
    X_test = X[IDX_test]
    Y_train = Y[IDX_train]
    Y_test = Y[IDX_test]

    # Heavily regularised stumps; the seeds vary per fold.
    # NOTE(review): LightGBM only applies bagging_fraction when bagging_freq is
    # also non-zero, and bagging_freq is not set here -- confirm whether row
    # subsampling is intended.
    lgb_params = {
        "objective" : "regression",
        "metric" : "mse",
        "max_depth" : 2,
        "num_leaves" : 2,
        "learning_rate" : 0.055,
        "bagging_fraction" : 0.3,
        "feature_fraction" : 0.15,
        "lambda_l1" : 5,
        "lambda_l2" : 5,
        "bagging_seed" : fold_id+random_state,
        "verbosity" : 1,
        "seed": fold_id+random_state
    }

    lgtrain = lgb.Dataset(X_train, label=Y_train)
    lgtest = lgb.Dataset(X_test, label=Y_test)
    evals_result = {}
    # Up to 35000 rounds, stopping after 500 rounds without improvement.
    # NOTE(review): these keyword arguments depend on the installed LightGBM
    # version -- confirm before upgrading.
    lgb_clf = lgb.train(lgb_params, lgtrain, 35000,
                        valid_sets=[lgtrain, lgtest],
                        early_stopping_rounds=500,
                        verbose_eval=2000,
                        evals_result=evals_result)
    valid += lgb_clf.predict( Xt ).reshape((-1,))
    n_predict += 1
    # Stop starting new folds once 6900 seconds have passed since start_time.
    if time.time() - start_time > 6900:
        break

In [None]:
# Final score = naive Bayes test probability + mean LightGBM residual correction,
# clipped into [0, 1].
# NOTE(review): `valid` is rebound to the blended result, so running this cell
# twice divides and adds yt_nb again -- re-run the fold loop first.
valid = np.clip(valid / n_predict + yt_nb, 0.0, 1.0)

residual_submission = pd.DataFrame({'MachineIdentifier':test_ID,'HasDetections':valid})
residual_submission.to_csv('submission.csv',index=False)
print('done.')