In [1]:
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.externals import joblib
import lightgbm as lgb
import xgboost as xgb
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
import warnings
warnings.filterwarnings("ignore")
import gc

In [2]:
# Compact dtypes for the CSV columns, grouped by storage type.
# The resulting mapping is passed to pd.read_csv(dtype=...) when the
# train/test files are loaded.
_columns_by_dtype = {
    np.int8: (
        'IsBeta',
        'RtpStateBitfield',
        'IsSxsPassiveMode',
        'AVProductsInstalled',
        'AVProductsEnabled',
        'HasTpm',
        'OrganizationIdentifier',
        'IsProtected',
        'AutoSampleOptIn',
        'SMode',
        'Firewall',
        'Census_ProcessorManufacturerIdentifier',
        'Census_HasOpticalDiskDrive',
        'Census_OSInstallLanguageIdentifier',
        'Census_IsPortableOperatingSystem',
        'Census_IsFlightingInternal',
        'Census_IsFlightsDisabled',
        'Census_ThresholdOptIn',
        'Census_IsSecureBootEnabled',
        'Census_IsWIMBootEnabled',
        'Census_IsVirtualDevice',
        'Census_IsTouchEnabled',
        'Census_IsPenCapable',
        'Census_IsAlwaysOnAlwaysConnectedCapable',
        'Wdft_IsGamer',
        'Wdft_RegionIdentifier',
        'HasDetections',
        'Census_InternalBatteryType_informed',
        'ProductName_index',
        'Platform_index',
        'Processor_index',
        'OsPlatformSubRelease_index',
        'SkuEdition_index',
        'PuaMode_index',
        'SmartScreen_index',
        'Census_MDC2FormFactor_index',
        'Census_DeviceFamily_index',
        'Census_ProcessorClass_index',
        'Census_PrimaryDiskTypeName_index',
        'Census_ChassisTypeName_index',
        'Census_PowerPlatformRoleName_index',
        'Census_InternalBatteryType_index',
        'Census_OSArchitecture_index',
        'Census_OSBranch_index',
        'Census_OSEdition_index',
        'Census_OSSkuName_index',
        'Census_OSInstallTypeName_index',
        'Census_OSWUAutoUpdateOptionsName_index',
        'Census_GenuineStateName_index',
        'Census_ActivationChannel_index',
        'Census_FlightRing_index',
        'OSBuild_diff',
        'OSBuild_fulldiff',
        'OsBuildLab_dayOfWeek',
        'AvSigVersion_dayOfWeek',
    ),
    np.int16: (
        'DefaultBrowsersIdentifier',
        'CountryIdentifier',
        'GeoNameIdentifier',
        'LocaleEnglishNameIdentifier',
        'OsBuild',
        'OsSuite',
        'IeVerIdentifier',
        'Census_OEMNameIdentifier',
        'Census_ProcessorCoreCount',
        'Census_ProcessorModelIdentifier',
        'Census_InternalPrimaryDisplayResolutionHorizontal',
        'Census_InternalPrimaryDisplayResolutionVertical',
        'Census_OSBuildNumber',
        'Census_OSUILocaleIdentifier',
        'Census_FirmwareManufacturerIdentifier',
        'AvSigVersion_diff',
        'AvSigVersion_fulldiff',
        'OsBuildLab_difftotal',
        'DateAvSigVersion_difftotal',
        'DateAvSigVersion_fulldifftotal',
        'OsBuildLab_fulldifftotal',
    ),
    np.int32: (
        'AVProductStatesIdentifier',
        'CityIdentifier',
        'UacLuaenable',
        'Census_OEMModelIdentifier',
        'Census_PrimaryDiskTotalCapacity',
        'Census_SystemVolumeTotalCapacity',
        'Census_TotalPhysicalRAM',
        'Census_InternalBatteryNumberOfCharges',
        'Census_OSBuildRevision',
        'Census_FirmwareVersionIdentifier',
    ),
    np.float16: (
        'Census_InternalPrimaryDiagonalDisplaySizeInInches',
        'DateAvSigVersion_ratio',
        'OsBuildLab_ratio',
        'DateAvSigVersion_fullratio',
        'OsBuildLab_fullratio',
    ),
}

# Flatten {dtype: columns} into the {column: dtype} form expected by read_csv.
dict_dtypes = {
    column: dtype
    for dtype, columns in _columns_by_dtype.items()
    for column in columns
}

In [5]:
# Directory whose *.csv files are read and concatenated into the training frame.
path = '../../data/train_final_0'

In [6]:
# Load every CSV file under `path` and stack them into one training DataFrame.
# glob returns files in arbitrary order, so the list is sorted to make the
# row order of `train` identical from run to run.
allFiles = sorted(glob.glob(path + "/*.csv"))
if not allFiles:
    raise FileNotFoundError("No CSV files found in {!r}".format(path))

list_ = []
for n, file_ in enumerate(allFiles):
    list_.append(pd.read_csv(file_, dtype=dict_dtypes, low_memory=True))
    if n % 10 == 0:
        print(n)  # progress marker every 10 files

train = pd.concat(list_, axis=0, ignore_index=True)

# The per-file frames are no longer needed; keeping them referenced would hold
# a second copy of the whole dataset in memory next to `train`.
del list_

0
10
20
30


In [7]:
train.shape

(8921483, 100)

In [8]:
# Work on a random subsample of at most 100000 rows and release the full frame.
# The fixed seed makes the subsample (and every score computed from it)
# reproducible; min() avoids a ValueError if fewer rows were loaded.
train_sub = train.sample(n=min(100000, len(train)), random_state=42)
del train
gc.collect()

10

In [9]:
train_sub.shape

(100000, 100)

In [12]:
# Feature columns: every column except the machine identifier and the target.
non_feature_cols = ('MachineIdentifier', 'HasDetections')
sel_cols = [col for col in train_sub.columns if col not in non_feature_cols]

In [13]:
# Split the subsample into the feature matrix and the target vector.
X_train = train_sub[sel_cols]
y_train = train_sub['HasDetections']

In [12]:
train_ids = X_train.index

In [13]:
# 5-fold stratified splitter; shuffling with a fixed seed keeps the fold
# assignment the same on every run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Displays the number of splits (5) as a quick sanity check.
skf.get_n_splits(train_ids, y_train)

5

Búsqueda de los mejores parámetros

In [15]:
# Hyper-parameter grid explored by the grid search.
#
# LightGBM treats `feature_fraction` and `colsample_bytree` as aliases of the
# same column-subsampling parameter. Listing both multiplied the number of
# candidates by 3 without exploring any additional configuration, so their
# values are merged under the scikit-learn style name `colsample_bytree`.
params = {
    'learning_rate': [0.1, 0.05],
    'num_leaves': [91, 103],
    'colsample_bytree': [0.3, 0.5, 0.6, 0.9, 1.0],
    'bagging_fraction': [0.8, 1],
    'max_depth': [19, -1],
    'min_data_in_leaf': [30, 20],
}

# Number of cross-validation folds used for each grid candidate.
kFolds = 5

In [16]:
# Base LightGBM classifier handed to the grid search. These settings stay
# fixed for every candidate; only the keys listed in `params` are varied.
fixed_lgb_settings = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    n_estimators=1000,
    bagging_freq=1,
    verbose=50,
    n_jobs=-1,
)
mdl_lgb = lgb.LGBMClassifier(**fixed_lgb_settings)

In [17]:
# Exhaustive search over `params`, scoring each candidate by ROC AUC with
# `kFolds`-fold cross-validation.
# n_jobs=1 evaluates the candidates one at a time; the estimator itself was
# created with n_jobs=-1.
lgb_model = GridSearchCV(
    estimator=mdl_lgb,
    param_grid=params,
    n_jobs=1,
    cv=kFolds,
    verbose=1,
    scoring='roc_auc'
)

In [18]:
lgb_model.fit(X_train, y_train)

Fitting 5 folds for each of 1536 candidates, totalling 7680 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

In [14]:
# Classifier with hand-picked hyper-parameters, trained in the K-fold loop.
# Note: this rebinds `lgb_model`, which earlier referred to the GridSearchCV object.
lgb_model = lgb.LGBMClassifier(
    objective='binary',
    n_estimators=2000,
    learning_rate=0.05,
    num_leaves=91,
    max_depth=-1,
    colsample_bytree=0.28,
    n_jobs=-1,
)

In [15]:
# Stratified K-fold training with early stopping on each fold's held-out part.
# NOTE: the same estimator is refit on every fold, so after the loop
# `lgb_model` holds the model trained on the last fold only.
fold_results = []  # one (fold number, best iteration, best validation AUC) per fold

for fold, (train_index, test_index) in enumerate(skf.split(train_ids, y_train), start=1):

    print('Fold {}\n'.format(fold))

    X_fit, X_val = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_fit, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

    lgb_model.fit(X_fit, y_fit, eval_metric='auc',
                  eval_set=[(X_val, y_val)],
                  verbose=100, early_stopping_rounds=20)

    # Keep the fold outcome; it was previously visible only in the training log.
    fold_results.append((fold,
                         lgb_model.best_iteration_,
                         lgb_model.best_score_['valid_0']['auc']))

    # Release the fold slices before the next split is materialised.
    del X_fit, X_val, y_fit, y_val, train_index, test_index
    gc.collect()

print('Mean validation AUC over {} folds: {:.6f}'.format(
    len(fold_results), np.mean([auc for _, _, auc in fold_results])))

Fold 1

Training until validation scores don't improve for 20 rounds.
[100]	valid_0's binary_logloss: 0.618625	valid_0's auc: 0.716233
[200]	valid_0's binary_logloss: 0.608583	valid_0's auc: 0.725045
[300]	valid_0's binary_logloss: 0.604336	valid_0's auc: 0.729486
[400]	valid_0's binary_logloss: 0.601855	valid_0's auc: 0.732357
[500]	valid_0's binary_logloss: 0.600296	valid_0's auc: 0.734215
[600]	valid_0's binary_logloss: 0.599311	valid_0's auc: 0.735326
[700]	valid_0's binary_logloss: 0.598489	valid_0's auc: 0.7363
[800]	valid_0's binary_logloss: 0.598024	valid_0's auc: 0.736847
[900]	valid_0's binary_logloss: 0.59758	valid_0's auc: 0.737375
[1000]	valid_0's binary_logloss: 0.597154	valid_0's auc: 0.737883
[1100]	valid_0's binary_logloss: 0.596755	valid_0's auc: 0.738358
[1200]	valid_0's binary_logloss: 0.596402	valid_0's auc: 0.738772
[1300]	valid_0's binary_logloss: 0.596163	valid_0's auc: 0.739058
[1400]	valid_0's binary_logloss: 0.595887	valid_0's auc: 0.73937
[1500]	valid_0's bi

In [17]:
# Print each training column next to its importance in the fitted model.
importance_by_feature = zip(X_train.columns, lgb_model.feature_importances_)
for feature_name, importance in importance_by_feature:
    print(feature_name, importance)

IsBeta 0
RtpStateBitfield 1365
IsSxsPassiveMode 525
DefaultBrowsersIdentifier 1447
AVProductStatesIdentifier 8784
AVProductsInstalled 1449
AVProductsEnabled 751
HasTpm 191
CountryIdentifier 8252
CityIdentifier 5093
OrganizationIdentifier 1978
GeoNameIdentifier 4427
LocaleEnglishNameIdentifier 5066
OsBuild 1157
OsSuite 872
IsProtected 615
AutoSampleOptIn 5
SMode 624
IeVerIdentifier 2749
Firewall 636
UacLuaenable 367
Census_OEMNameIdentifier 3562
Census_OEMModelIdentifier 4840
Census_ProcessorCoreCount 1442
Census_ProcessorManufacturerIdentifier 280
Census_ProcessorModelIdentifier 5333
Census_PrimaryDiskTotalCapacity 3366
Census_SystemVolumeTotalCapacity 5609
Census_HasOpticalDiskDrive 492
Census_TotalPhysicalRAM 2240
Census_InternalPrimaryDiagonalDisplaySizeInInches 4819
Census_InternalPrimaryDisplayResolutionHorizontal 2152
Census_InternalPrimaryDisplayResolutionVertical 1984
Census_InternalBatteryNumberOfCharges 3044
Census_OSBuildNumber 1194
Census_OSBuildRevision 5395
Census_OSInsta

In [20]:
# Persist the fitted classifier to disk so it can be reloaded for scoring.
joblib.dump(lgb_model, '../saved_models/lgbc_model_0.pkl')

['../saved_models/lgbc_model_0.pkl']

In [5]:
# Reload the classifier saved above; `model` is what scores the test set.
# joblib.load unpickles the file, so only load files you produced yourself.
model = joblib.load('../saved_models/lgbc_model_0.pkl')

In [23]:
# Drop the training matrices before the test data is loaded.
del X_train, y_train
gc.collect()

0

In [6]:
# Directory whose *.csv files are read and concatenated into the test frame.
# NOTE(review): the training data was read from '../../data/...' while this
# uses '../data/...' — confirm both are right for the notebook's working directory.
path = '../data/test_final_0'

In [7]:
# Load every CSV file under `path` and stack them into one test DataFrame.
# glob returns files in arbitrary order, so the list is sorted to make the
# row order of `test` (and of the submission built from it) reproducible.
allFiles = sorted(glob.glob(path + "/*.csv"))
if not allFiles:
    raise FileNotFoundError("No CSV files found in {!r}".format(path))

list_ = []
for n, file_ in enumerate(allFiles):
    list_.append(pd.read_csv(file_, dtype=dict_dtypes, low_memory=True))
    if n % 10 == 0:
        print(n)  # progress marker every 10 files

test = pd.concat(list_, axis=0, ignore_index=True)

# The per-file frames are no longer needed; keeping them referenced would hold
# a second copy of the whole dataset in memory next to `test`.
del list_

0
10
20
30


In [9]:
test.shape

(7853253, 100)

In [8]:
# Feature columns for scoring: every column except the machine identifier and
# the target.
# NOTE(review): the list is derived from the test frame itself — confirm the
# resulting columns and their order match what the model was trained on.
non_feature_cols = ('MachineIdentifier', 'HasDetections')
sel_cols = [col for col in test.columns if col not in non_feature_cols]

In [10]:
# Keep the feature matrix and the machine identifiers (needed for the
# submission file), then release the full test frame.
X_test = test[sel_cols]
X_machines = test['MachineIdentifier']
del test
gc.collect()

0

In [11]:
preds = model.predict_proba(X_test)

In [12]:
len(preds)

7853253

In [13]:
preds_1 = preds[:,1]

In [14]:
len(preds_1)

7853253

In [15]:
len(X_machines)

7853253

In [16]:
# Submission frame: each machine identifier paired with the second column of
# predict_proba (preds[:, 1]).
df_prds = pd.DataFrame({'MachineIdentifier': X_machines, 'HasDetections': preds_1})

In [17]:
df_prds.head()

Unnamed: 0,MachineIdentifier,HasDetections
0,000037d9ea7b7cfbf476b433586e5113,0.446491
1,000055ea914f7ef72f9ad26b5cdf4195,0.389861
2,00009cf494f93eab0732fe30814791ec,0.536218
3,00009dbf12762fca1d5d6624ca4a260c,0.482819
4,0000fae061a473c3ce173f5e545c36fe,0.590998


In [18]:
# Write the submission file without the row index.
# `index` is documented as a bool; the previous index=None only worked because
# None is falsy.
df_prds.to_csv('../submissions/lgb_model_0.csv', index=False)