In [3]:
import pandas as pd
import numpy as np
import lightgbm as lgb
#import xgboost as xgb
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
#from sklearn.metrics import roc_auc_score
import gc
gc.enable()

dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }


In [8]:

print('Download Train and Test Data.\n')
train = pd.read_csv('/home/ryan/cs/datasets/microsoft/train.csv', dtype=dtypes, low_memory=True)
train['MachineIdentifier'] = train.index.astype('uint32')
test  = pd.read_csv('/home/ryan/cs/datasets/microsoft/test.csv',  dtype=dtypes, low_memory=True)
test['MachineIdentifier']  = test.index.astype('uint32')

gc.collect()



Download Train and Test Data.



202205

In [10]:
train.head()

Unnamed: 0,MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,...,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,0,5,0,0,0,1,7,0,,53447.0,...,36144.0,0,,0.0,0,0,0.0,0.0,10.0,0
1,1,5,57,30,6466,1,7,0,,53447.0,...,57858.0,0,,0.0,0,0,0.0,0.0,8.0,0
2,2,5,0,0,0,1,7,0,,53447.0,...,52682.0,0,,0.0,0,0,0.0,0.0,3.0,0
3,3,5,0,0,0,1,7,0,,53447.0,...,20050.0,0,,0.0,0,0,0.0,0.0,3.0,1
4,4,5,0,0,0,1,7,0,,53447.0,...,19844.0,0,0.0,0.0,0,0,0.0,0.0,1.0,1


In [27]:
col = 'Census_OSBuildRevision'
train[col] = train[col].astype('str')
test[col] = test[col].astype('str')

In [28]:
train[col].head()

0    165
1      1
2    165
3    228
4    191
Name: Census_OSBuildRevision, dtype: object

In [29]:
test[col].head()

0    1387
1     611
2    2189
3     371
4     371
Name: Census_OSBuildRevision, dtype: object

In [34]:
le = LabelEncoder().fit(np.unique(train[col].unique().tolist()+test[col].unique().tolist()))

In [37]:
train[col] = le.transform(train[col])+1
test[col] = le.transform(test[col])+1

In [38]:
train[col].head()

0     83
1      2
2     83
3    181
4    152
Name: Census_OSBuildRevision, dtype: int64

In [39]:
test[col].head()

0     52
1    276
2    174
3    232
4    232
Name: Census_OSBuildRevision, dtype: int64

In [44]:
agg_tr = train.groupby([col]).aggregate({'MachineIdentifier':'count'})

In [46]:
type(agg_tr)

pandas.core.frame.DataFrame

In [49]:
agg_tr

Unnamed: 0_level_0,MachineIdentifier
Census_OSBuildRevision,Unnamed: 1_level_1
1,166369
2,106583
3,940
4,9178
5,960
6,143
7,1506
8,3424
9,16
11,5571


In [51]:
agg_tr = agg_tr.reset_index()
agg_tr

Unnamed: 0,Census_OSBuildRevision,MachineIdentifier
0,1,166369
1,2,106583
2,3,940
3,4,9178
4,5,960
5,6,143
6,7,1506
7,8,3424
8,9,16
9,11,5571


In [53]:
agg_tr = agg_tr.rename({'MachineIdentifier':'Train'}, axis=1)


In [54]:
agg_tr

Unnamed: 0,Census_OSBuildRevision,Train
0,1,166369
1,2,106583
2,3,940
3,4,9178
4,5,960
5,6,143
6,7,1506
7,8,3424
8,9,16
9,11,5571


In [57]:
agg_te = (test
              .groupby([col])
              .aggregate({'MachineIdentifier':'count'})
              .reset_index()
              .rename({'MachineIdentifier':'Test'}, axis=1))

In [58]:
agg_te

Unnamed: 0,Census_OSBuildRevision,Test
0,1,166919
1,2,179005
2,3,1240
3,4,8550
4,5,987
5,6,20
6,7,425
7,8,419
8,9,4
9,10,87


In [60]:
agg = pd.merge(agg_tr, agg_te, on=col, how='outer').replace(np.nan, 0)

In [61]:
agg

Unnamed: 0,Census_OSBuildRevision,Train,Test
0,1,166369.0,166919.0
1,2,106583.0,179005.0
2,3,940.0,1240.0
3,4,9178.0,8550.0
4,5,960.0,987.0
5,6,143.0,20.0
6,7,1506.0,425.0
7,8,3424.0,419.0
8,9,16.0,4.0
9,11,5571.0,3791.0


In [62]:
agg = agg[(agg['Train'] > 1000)].reset_index(drop=True)

In [63]:
agg

Unnamed: 0,Census_OSBuildRevision,Train,Test
0,1,166369.0,166919.0
1,2,106583.0,179005.0
2,4,9178.0,8550.0
3,7,1506.0,425.0
4,8,3424.0,419.0
5,11,5571.0,3791.0
6,12,25401.0,14498.0
7,14,23002.0,23058.0
8,15,10994.0,7304.0
9,16,4137.0,2962.0


In [64]:
agg['Total'] = agg['Train'] + agg['Test']
    #Drop unbalanced values
agg = agg[(agg['Train'] / agg['Total'] > 0.2) & (agg['Train'] / agg['Total'] < 0.8)]
agg

Unnamed: 0,Census_OSBuildRevision,Train,Test,Total
0,1,166369.0,166919.0,333288.0
1,2,106583.0,179005.0,285588.0
2,4,9178.0,8550.0,17728.0
3,7,1506.0,425.0,1931.0
5,11,5571.0,3791.0,9362.0
6,12,25401.0,14498.0,39899.0
7,14,23002.0,23058.0,46060.0
8,15,10994.0,7304.0,18298.0
9,16,4137.0,2962.0,7099.0
10,17,1295.0,792.0,2087.0


In [65]:
train[col].shape

(8921483,)

In [67]:
agg[col+'Copy'] = agg[col]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [68]:
merged = pd.merge(train[[col]], agg[[col, col+'Copy']], on=col, how='left')

In [69]:
merged

Unnamed: 0,Census_OSBuildRevision,Census_OSBuildRevisionCopy
0,83,
1,2,2.0
2,83,
3,181,
4,152,
5,83,
6,83,
7,1,1.0
8,207,
9,242,242.0


In [79]:
print('Transform all features to category.\n')
for usecol in train.columns.tolist()[58:-1]:
    print(usecol)

    train[usecol] = train[usecol].astype('str')
    test[usecol] = test[usecol].astype('str')
    
    #Fit LabelEncoder
    le = LabelEncoder().fit(
            np.unique(train[usecol].unique().tolist()+
                      test[usecol].unique().tolist()))

    #At the end 0 will be used for dropped values
    train[usecol] = le.transform(train[usecol])+1
    test[usecol]  = le.transform(test[usecol])+1

    agg_tr = (train
              .groupby([usecol])
              .aggregate({'MachineIdentifier':'count'})
              .reset_index()
              .rename({'MachineIdentifier':'Train'}, axis=1))
    agg_te = (test
              .groupby([usecol])
              .aggregate({'MachineIdentifier':'count'})
              .reset_index()
              .rename({'MachineIdentifier':'Test'}, axis=1))

    agg = pd.merge(agg_tr, agg_te, on=usecol, how='outer').replace(np.nan, 0)
    #Select values with more than 1000 observations
    agg = agg[(agg['Train'] > 1000)].reset_index(drop=True)
    agg['Total'] = agg['Train'] + agg['Test']
    #Drop unbalanced values

    agg = agg[(agg['Train'] / agg['Total'] > 0.2) & (agg['Train'] / agg['Total'] < 0.8)]

    agg[usecol+'Copy'] = agg[usecol]

    train[usecol] = (pd.merge(train[[usecol]], 
                              agg[[usecol, usecol+'Copy']], 
                              on=usecol, how='left')[usecol+'Copy']
                     .replace(np.nan, 0).astype('int').astype('category'))

    test[usecol]  = (pd.merge(test[[usecol]], 
                              agg[[usecol, usecol+'Copy']], 
                              on=usecol, how='left')[usecol+'Copy']
                     .replace(np.nan, 0).astype('int').astype('category'))

#     del le, agg_tr, agg_te, agg, usecol
    gc.collect()
    break
    
          
y_train = np.array(train['HasDetections'])
train_ids = train.index
test_ids  = test.index

# del train['HasDetections'], train['MachineIdentifier'], test['MachineIdentifier']
gc.collect()


Transform all features to category.

Census_OSBuildRevision


0

In [80]:
train['Census_OSBuildRevision']

0            1
1          104
2            1
3            1
4            1
5            1
6            1
7           79
8            1
9           96
10           1
11           1
12           1
13           1
14           1
15           1
16         114
17           1
18          53
19          77
20          79
21           1
22           1
23           1
24          39
25          32
26           1
27         111
28           1
29          70
          ... 
8921453     53
8921454      1
8921455     59
8921456      1
8921457    104
8921458     73
8921459    134
8921460     53
8921461    105
8921462     54
8921463     77
8921464     96
8921465     53
8921466     91
8921467     89
8921468    104
8921469    134
8921470      1
8921471      1
8921472    111
8921473     96
8921474     77
8921475     51
8921476      6
8921477     33
8921478     96
8921479    132
8921480     89
8921481    105
8921482      1
Name: Census_OSBuildRevision, Length: 8921483, dtype: category
Categories (135, int64

In [None]:

#Fit OneHotEncoder
ohe = OneHotEncoder(categories='auto', sparse=True, dtype='uint8').fit(train)

#Transform data using small groups to reduce memory usage
m = 100000
train = vstack([ohe.transform(train[i*m:(i+1)*m]) for i in range(train.shape[0] // m + 1)])
test  = vstack([ohe.transform(test[i*m:(i+1)*m])  for i in range(test.shape[0] // m +  1)])
save_npz('train.npz', train, compressed=True)
save_npz('test.npz',  test,  compressed=True)

del ohe, train, test
gc.collect()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
skf.get_n_splits(train_ids, y_train)

lgb_test_result  = np.zeros(test_ids.shape[0])
#lgb_train_result = np.zeros(train_ids.shape[0])
#xgb_test_result  = np.zeros(test_ids.shape[0])
#xgb_train_result = np.zeros(train_ids.shape[0])
counter = 0

print('\nLightGBM\n')

for train_index, test_index in skf.split(train_ids, y_train):
    
    print('Fold {}\n'.format(counter + 1))
    
    train = load_npz('train.npz')
    X_fit = vstack([train[train_index[i*m:(i+1)*m]] for i in range(train_index.shape[0] // m + 1)])
    X_val = vstack([train[test_index[i*m:(i+1)*m]]  for i in range(test_index.shape[0] //  m + 1)])
    X_fit, X_val = csr_matrix(X_fit, dtype='float32'), csr_matrix(X_val, dtype='float32')
    y_fit, y_val = y_train[train_index], y_train[test_index]
    
    del train
    gc.collect()

    lgb_model = lgb.LGBMClassifier(max_depth=-1,
                                   n_estimators=30000,
                                   learning_rate=0.05,
                                   num_leaves=2**12-1,
                                   colsample_bytree=0.28,
                                   objective='binary', 
                                   n_jobs=-1)
                                   
    #xgb_model = xgb.XGBClassifier(max_depth=6,
    #                              n_estimators=30000,
    #                              colsample_bytree=0.2,
    #                              learning_rate=0.1,
    #                              objective='binary:logistic', 
    #                              n_jobs=-1)
    
                               
    lgb_model.fit(X_fit, y_fit, eval_metric='auc', 
                  eval_set=[(X_val, y_val)], 
                  verbose=100, early_stopping_rounds=100)
                  
    #xgb_model.fit(X_fit, y_fit, eval_metric='auc', 
    #              eval_set=[(X_val, y_val)], 
    #              verbose=1000, early_stopping_rounds=300)

    #lgb_train_result[test_index] += lgb_model.predict_proba(X_val)[:,1]
    #xgb_train_result[test_index] += xgb_model.predict_proba(X_val)[:,1]
    
    del X_fit, X_val, y_fit, y_val, train_index, test_index
    gc.collect()
    
    test = load_npz('test.npz')
    test = csr_matrix(test, dtype='float32')
    lgb_test_result += lgb_model.predict_proba(test)[:,1]
    #xgb_test_result += xgb_model.predict_proba(test)[:,1]
    counter += 1
    
    del test
    gc.collect()
    
    #Stop fitting to prevent time limit error
    #if counter == 3 : break

#print('\nLigthGBM VAL AUC Score: {}'.format(roc_auc_score(y_train, lgb_train_result)))
#print('\nXGBoost VAL AUC Score: {}'.format(roc_auc_score(y_train, xgb_train_result)))


#submission['HasDetections'] = xgb_test_result / counter
#submission.to_csv('xgb_submission.csv', index=False)
#submission['HasDetections'] = 0.5 * lgb_test_result / counter  + 0.5 * xgb_test_result / counter 
##submission.to_csv('lgb_xgb_submission.csv', index=False)

print('\nDone.')

In [9]:
submission = pd.read_csv('/home/ryan/cs/datasets/microsoft/sample_submission.csv')
submission['HasDetections'] = lgb_test_result / counter
submission.to_csv('lgb_submission.csv', index=False)