In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
#import xgboost as xgb
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
#from sklearn.metrics import roc_auc_score
import gc
from functools import cmp_to_key
import time
import matplotlib.pyplot as plt
from numba import jit
from functools import reduce
gc.enable()

dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }


In [2]:
print('Download Train and Test Data.\n')
%time train = pd.read_csv('/home/ryan/cs/datasets/microsoft/train.csv', dtype=dtypes, low_memory=True)
train['MachineIdentifier'] = train.index.astype('uint32')
%time test  = pd.read_csv('/home/ryan/cs/datasets/microsoft/test.csv',  dtype=dtypes, low_memory=True)
test['MachineIdentifier']  = test.index.astype('uint32')
gc.collect()

Download Train and Test Data.

CPU times: user 1min 15s, sys: 2.39 s, total: 1min 17s
Wall time: 1min 19s
CPU times: user 1min 7s, sys: 1.76 s, total: 1min 8s
Wall time: 1min 9s


201019

In [3]:
if 5244810 in train.index:
    train.loc[5244810,'AvSigVersion'] = '1.273.1144.0'
    train['AvSigVersion'].cat.remove_categories('1.2&#x17;3.1144.0',inplace=True)

In [None]:
osver_timestamps = np.load('/home/ryan/cs/datasets/microsoft/OSVersionTimestamps.npy')[()]
avsig_timestamps = np.load('/home/ryan/cs/datasets/microsoft/AvSigVersionTimestamps.npy')[()]

In [4]:
columns_to_delete = ['HasDetections', #label
                     'MachineIdentifier',
                     'DefaultBrowsersIdentifier', #nan columns
                     'PuaMode',
                     'ProductName', #imbalanced columns
                     'IsBeta',
                     'IsSxsPassiveMode',
                     'HasTpm',
                     'AutoSampleOptIn',
                     'PuaMode',
                     'UacLuaenable',
                     'Census_DeviceFamily',
                     'Census_ProcessorClass',
                     'Census_IsPortableOperatingSystem',
                     'Census_IsFlightsDisabled',
                     'Census_IsVirtualDevice',
                     'Census_OSSkuName', # overlap columns
                     'OsVer',
                     'Census_OSArchitecture',
                     'Census_OSInstallLanguageIdentifier'
                     'Census_InternalBatteryNumberOfCharges' # strange values
                    ]


columns_to_process = []
for col in train.columns:
    if col not in columns_to_delete:
        columns_to_process.append(col)

In [5]:
def trim_all(df_train, df_test, columns_to_trim, delete_time_sensitive=False, delete_nan_thresh=0.8, fill='new'):
    fe_train = pd.DataFrame()
    fe_test = pd.DataFrame()
    for i, col in enumerate(columns_to_trim):
        print(str(i) + '/' + str(len(columns_to_trim)) + ' Trimming ' + col + '...')
        if False:
#         if col == 'AvSigVersion':
            fe_train[col] = df_train[col]
            fe_test[col] = df_test[col]
        else:
            tr_col, te_col, le_name_mapping = trim(df_train, df_test, col, fill='new')
            if delete_time_sensitive and 'nan' in le_name_mapping:
                percent = (te_col.values == le_name_mapping['nan']).sum() / float(len(te_col))
                if percent < delete_nan_thresh:
                    fe_train[col], fe_test[col] =  tr_col, te_col
                    
                else:
                    print('Deleted ' + col)
            else:
                fe_train[col], fe_test[col] =  tr_col, te_col
                    
        
    print('Done!')
    return fe_train, fe_test

def trim(raw_tr, raw_te, col, fill='new', factor=4, min_obs=1000):
#     raw_tr = raw_tr[:100]
#     raw_te = raw_te[:100]
    agg_tr = raw_tr.groupby(col).size().to_frame('tr_counts').reset_index()
    agg_te = raw_te.groupby(col).size().to_frame('te_counts').reset_index()
    
    agg = pd.merge(agg_tr, agg_te, on=col, how='outer')
    try:
        agg = agg.fillna(0)
    except:
        pass
    agg = agg[agg['tr_counts'] > min_obs].reset_index(drop=True)
    agg['total_counts'] = agg['tr_counts'] + agg['te_counts']
    agg = agg[(agg['tr_counts'] / agg['total_counts'] > (1.0/factor)) & (agg['tr_counts'] / agg['total_counts'] < ((factor-1.0)/factor))]
    
    agg[col+'_copy'] = agg[col]
    
    trim_tr = pd.merge(raw_tr[[col]], agg[[col, col+'_copy']], on=col, how='left')[col+'_copy'].astype(str)
    trim_te = pd.merge(raw_te[[col]], agg[[col, col+'_copy']], on=col, how='left')[col+'_copy'].astype(str)

    
    le = LabelEncoder().fit(trim_tr.values)
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

    if fill == 'ordinal':
        # should be same uniques
        good_vals = []
        for g in le.classes_:
            if g != 'nan':
                good_vals.append(float(g))
        good_vals.sort()
        trim_tr = fill_ordinal(raw_tr[col], trim_tr, good_vals).astype(np.int32).rename(col+'_otrim')
        trim_te = fill_ordinal(raw_te[col], trim_te, good_vals).astype(np.int32).rename(col+'_otrim')
                
    elif fill == 'new':
        trim_tr = pd.Series(le.transform(trim_tr))
        trim_te = pd.Series(le.transform(trim_te))
        trim_tr = trim_tr.astype('category').rename(col+'_ztrim')
        trim_te = trim_te.astype('category').rename(col+'_ztrim')
    
    return trim_tr, trim_te, le_name_mapping


def fill_ordinal(raw_series, series, good_vals):
    curr_index = 0
    len_good_vals = len(good_vals)
    
    sorted_series = series.argsort()

    for idx in sorted_series:
        
        if series[idx] == 'nan':
            if curr_index >= len_good_vals - 1:
                series[idx] = good_vals[-1]
            elif float(raw_series.iloc[idx]) < good_vals[curr_index+1]:
                series[idx] = good_vals[curr_index]
            else:
                series[idx] = good_vals[curr_index+1]
                curr_index += 1
    return series
    

In [6]:
def condense_feature(df, col, category_map, fill_with=None):
    feature = df[col]
    reverse_map = {}
    default_val = None
    categories = feature.unique()
    categories = categories[~pd.isnull(categories)]
    
    for k, v_arr in category_map.items():
        if len(v_arr) != 0:
            for v in v_arr:       
                reverse_map[v] = k
                index = np.argwhere(categories==v)
                categories = np.delete(categories, index)         
        else:
            default_val = k
            
    for v in categories:
        reverse_map[v] = default_val
        
    condensed = df[col].map(reverse_map)
    if fill_with:
        condensed = condensed.fillna(fill_with)
    else:
        condensed = condensed.fillna(default_val)
    
    return condensed

def generate_count_feature(df, col, counts=None):
    if counts is None:
        feature = df[col]
        groups = df.groupby(col)
        counts = groups.size()
    count_feature = df[col].map(counts)
    
    return count_feature, counts

def group_battery(x):
    x = x.lower()
    if ('li' in x) or ('ion' in x):
        return 1
    else:
        return 0
    
def generate_freq_feature(df, col, on='HasDetections', frequencies=None):
#     print(frequencies)
    if frequencies is None:
        groups = df.groupby(col)
        sizes = groups.size()
        sums = groups.sum()[on]
        frequencies = sums/sizes
    freq_feature = df[col].map(frequencies)
    
    return freq_feature, frequencies

def generate_version_mapping(df, col, num_splits=4, fill_val='0.0.0.0'):
    feature = df[col].astype(str)
    feature.fillna(fill_val)
    versions = feature.unique()
    
    def version_compare(x, y):
        x_splits = x.split('.')
        y_splits = y.split('.')
        for x_val, y_val in zip(x_splits, y_splits):
            try:
                int_x = int(x_val)
            except:
                int_x = 0
            try:
                int_y = int(y_val)
            except:
                int_y = 0
            if int_x > int_y:
                return 1
            elif int_x < int_y:
                return -1
        return 0

    sorted_versions = sorted(versions, key=cmp_to_key(version_compare))
    mapping = {}
    for i, v in enumerate(sorted_versions):
        mapping[v] = i
        
    return feature.map(mapping)

def generate_split_version(df, col, num_splits=4):
    feature = df[col].astype(str)
    sv = pd.DataFrame()
    for i in range(num_splits):
        sv[col + '_' + str(i)] = feature.apply(lambda x: x.split('.')[i]).astype(np.int16)
        
    return sv

def create_age_feature(df, col, raw_timestamps):
    timestamps = {}
    for k, v in raw_timestamps.items():
        timestamps[k] = v.toordinal()
    max_val = max(list(avsig_timestamps.values()))
    for k, v in timestamps.items():
        timestamps[k] = maxval - timestamps[k]
        
    return df[col].map(timestamps)

In [7]:
def generate_count_df(df, cols, counts_map={}):
    count_df = pd.DataFrame()
    for col in cols:
        if col in counts_map:
            count_df[col + '_counts'], _ = generate_count_feature(df, col, counts=counts_map[col])
        else:
            count_df[col + '_counts'], counts_map[col] = generate_count_feature(df, col)
        
    return count_df, counts_map
        
def generate_freq_df(df, cols, freqs_map={}):
    freq_df = pd.DataFrame()
    for col in cols:
        print('Generating frequency feature for ' + col + '...')
        if col in freqs_map:
            freq_df[col + '_freqs'], _ = generate_freq_feature(df, col, frequencies=freqs_map[col])
        else:
            freq_df[col + '_freqs'],  freqs_map[col] =  generate_freq_feature(df, col)
            
    return freq_df, freqs_map

def generate_category_df(df, cols):
    cat_df = pd.DataFrame()
    for col in cols:
        cat_df[col+'_cat'] = df[col].astype('category')
        
    return cat_df
        
def generate_boolean_df(df, cols):
    bool_df = pd.DataFrame()
    for col in cols:
        bool_df[col+'_bool'] = df[col].astype(np.bool)
        
    return bool_df
        
def generate_gb_df(df, cols):
    gb_df = pd.DataFrame()
    for col in cols:
        gb_df[col+'_gb'] = df[col].fillna(df[col].median()).apply(lambda x: int(x/1000))
        
    return gb_df


In [8]:
# input raw df before trim: counts, condense, split_version, boolean, and non-categorical
def feature_engineer_1(df, counts_map={}):

    fe = pd.DataFrame()
    
    fe['MoreThanOneAV'] = condense_feature(df, 'AVProductsInstalled', {False: [0.0, 1.0], True: []}, fill_with=False).astype(bool)
    fe['SkuCondensed'] = condense_feature(df, 'SkuEdition', {'Home': ['Home'], 'Pro': ['Pro'], 'SkuEtc':[]}).astype('category')
    fe['SmartScreen_condensed'] = condense_feature(df, 'SmartScreen', 
                                            {True: ['RequireAdmin', 'On', 'Warn', 'on', 'Enabled', 'warn', 'Block', 'Prompt', 'Promp', 'requireadmin', 'prompt', 'requireAdmin'], 
                                             False: []}).astype(np.bool)
    
    fe['BatteryType'] = df['Census_InternalBatteryType'].apply(group_battery)
    
    fe['IeVerIdentifier_num'] = df['IeVerIdentifier'].fillna(0.0).astype(np.int16)
    fe['ProcessorCoreCount_num'] = df['Census_ProcessorCoreCount'].fillna(4.0).astype(np.int16)
    fe['Census_ProcessorModelIdentifier_num'] = df['Census_ProcessorModelIdentifier'].fillna(0.0).astype(np.int16)
    fe['Census_InternalPrimaryDiagonalDisplaySizeInInches_num'] = df['Census_InternalPrimaryDiagonalDisplaySizeInInches'].fillna(15.5).astype(np.int16)
    
    fe['AvSigAge'] = create_age_feature(df, 'AvSigVersion', avsig_timestamps)
    fe['OsVerAge'] = create_age_feature(df, 'Census_OSVersion', osver_timestamps)
    
    split_cols = ['Census_OSVersion', 'AppVersion', 'EngineVersion', 'AvSigVersion']
    
    count_cols = ['EngineVersion', 'AppVersion', 'AvSigVersion', 'AVProductStatesIdentifier',
                  'CountryIdentifier', 'Census_OSVersion', 'CityIdentifier', 'OrganizationIdentifier', 
                  'GeoNameIdentifier', 'Platform', 'Processor', 'OsBuild', 'OsSuite', 'OsPlatformSubRelease',
                  'OsBuildLab', 'IeVerIdentifier', 'Census_MDC2FormFactor', 'Census_OEMNameIdentifier', 
                  'Census_ProcessorCoreCount', 'Census_ProcessorManufacturerIdentifier', 
                  'Census_ProcessorModelIdentifier', 'Census_PrimaryDiskTypeName', 'Census_ChassisTypeName', 
                  'Census_PowerPlatformRoleName', 'Census_OSBranch', 'Census_OSBuildNumber', 
                  'Census_OSBuildRevision','Census_OSEdition','Census_OSInstallTypeName', 
                  'Census_OSUILocaleIdentifier', 'Census_ActivationChannel','Census_FirmwareManufacturerIdentifier',
                  'Census_FirmwareVersionIdentifier', 'Census_IsTouchEnabled']
    
    fe_count_cols = ['AppVersion_1', 'AppVersion_2', 'AppVersion_3', 'Census_OSVersion_2', 
                     'Census_OSVersion_3', 'EngineVersion_2', 'EngineVersion_3', 
                     'AvSigVersion_1', 'AvSigVersion_2', 'BatteryType', 'SkuCondensed']
    
    gb_cols = ['Census_PrimaryDiskTotalCapacity', 'Census_SystemVolumeTotalCapacity', 'Census_TotalPhysicalRAM']
    
    for split_col in split_cols:
        fe = fe.join(generate_split_version(df, split_col, num_splits=4))
    
    count_df, counts_map = generate_count_df(df, count_cols, counts_map=counts_map)
    fe = fe.join(count_df)
    count_df, counts_map = generate_count_df(fe, fe_count_cols, counts_map=counts_map)
    fe = fe.join(count_df)
    
    fe = fe.join(generate_gb_df(df, gb_cols))
    
    return fe, counts_map

# input trimmed df
def feature_engineer_2(df, frequencies_map={}):
    
    fe = pd.DataFrame()
    
    fe['EngineVersionMapped'] = generate_version_mapping(df, 'EngineVersion')
    fe['AppVersionMapped'] = generate_version_mapping(df, 'AppVersion')
    fe['AvSigVersionMapped'] = generate_version_mapping(df, 'AvSigVersion')
    
    freq_cols = ['OrganizationIdentifier', 'CountryIdentifier', 'IeVerIdentifier', 
                 'Census_ActivationChannel', 'Census_FirmwareManufacturerIdentifier', 
                 'Census_OSUILocaleIdentifier']
    
    freq_df, frequencies_map = generate_freq_df(df, freq_cols, freqs_map=frequencies_map)
    fe = fe.join(freq_df)
    
    # cost feature
    
    return fe, frequencies_map
    

In [9]:
print('Feature engineering train features...')
fe1_train, counts = feature_engineer_1(train)
print('Feature engineering test features...')
fe1_test, _ = feature_engineer_1(test, counts_map=counts)

Feature engineering train features...
Feature engineering test features...


In [35]:
non_categorical_columns = ['ProcessorCoreCount', 'Census_InternalPrimaryDiagonalDisplaySizeInInches', 
                           'Census_PrimaryDiskTotalCapacity', 'Census_SystemVolumeTotalCapacity', 
                           'Census_TotalPhysicalRAM']

columns_to_trim = []
for col in columns_to_process:
    if col not in non_categorical_columns:
        columns_to_trim.append(col)

%time trim_train, trim_test = trim_all(train, test, columns_to_trim)

0/61 Trimming EngineVersion...
1/61 Trimming AppVersion...
2/61 Trimming AvSigVersion...
3/61 Trimming RtpStateBitfield...
4/61 Trimming AVProductStatesIdentifier...
5/61 Trimming AVProductsInstalled...
6/61 Trimming AVProductsEnabled...
7/61 Trimming CountryIdentifier...
8/61 Trimming CityIdentifier...
9/61 Trimming OrganizationIdentifier...
10/61 Trimming GeoNameIdentifier...
11/61 Trimming LocaleEnglishNameIdentifier...
12/61 Trimming Platform...
13/61 Trimming Processor...
14/61 Trimming OsBuild...
15/61 Trimming OsSuite...
16/61 Trimming OsPlatformSubRelease...
17/61 Trimming OsBuildLab...
18/61 Trimming SkuEdition...
19/61 Trimming IsProtected...
20/61 Trimming SMode...
21/61 Trimming IeVerIdentifier...
22/61 Trimming SmartScreen...
23/61 Trimming Firewall...
24/61 Trimming Census_MDC2FormFactor...
25/61 Trimming Census_OEMNameIdentifier...
26/61 Trimming Census_OEMModelIdentifier...
27/61 Trimming Census_ProcessorCoreCount...
28/61 Trimming Census_ProcessorManufacturerIdentifier

In [25]:
ord_train.head()

Unnamed: 0,AppVersion_1_ord,AppVersion_2_ord,AppVersion_3_ord,Census_OSVersion_2_ord,Census_OSVersion_3_ord,EngineVersion_2_ord,EngineVersion_3_ord
0,18.0,17656.0,18052.0,17134.0,17443.0,14901.0,3
1,13.0,17134.0,1.0,17134.0,1.0,14600.0,3
2,18.0,17656.0,18052.0,17134.0,17443.0,14901.0,3
3,18.0,17656.0,18052.0,17134.0,17443.0,14901.0,3
4,18.0,17656.0,18052.0,17134.0,17443.0,14901.0,3


In [13]:
ordinal_columns = ['AppVersion_1', 'AppVersion_2', 'AppVersion_3', 
                   'Census_OSVersion_2', 
                   'Census_OSVersion_3', 'EngineVersion_2', 'EngineVersion_3']
ord_train = pd.DataFrame()
ord_test = pd.DataFrame()
for col in ordinal_columns:
    print('Generating ordinal trim column for ' + col + '...')
    ord_train_col, ord_test_col, _ = trim(fe1_train, fe1_test, col, fill='ordinal')
    trim_train[col+'_ord'] = ord_train_col
    trim_test[col+'_ord'] = ord_test_col
print('Done!')

Generating ordinal trim column for AppVersion_1...
Generating ordinal trim column for AppVersion_2...
Generating ordinal trim column for AppVersion_3...
Generating ordinal trim column for Census_OSVersion_2...
Generating ordinal trim column for Census_OSVersion_3...
Generating ordinal trim column for EngineVersion_2...
Generating ordinal trim column for EngineVersion_3...
Done!


In [37]:
trim_train.head()

Unnamed: 0,EngineVersion,AppVersion,AvSigVersion,RtpStateBitfield,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,CountryIdentifier,CityIdentifier,OrganizationIdentifier,...,Census_ThresholdOptIn,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier
0,28,46,117,4,137,0,1,115,214,3,...,1,29,1254,0,1,0,0,0,0,1
1,20,16,7,4,137,0,1,177,415,3,...,1,29,1254,0,1,0,0,0,0,12
2,28,46,117,4,137,0,1,169,454,3,...,1,6,716,0,1,0,0,0,0,7
3,28,46,117,4,137,0,1,171,566,19,...,1,11,248,0,1,0,0,0,0,7
4,28,46,117,4,137,0,1,76,710,19,...,0,11,235,0,0,0,0,0,0,0


In [38]:
trim_train['HasDetections'] = train['HasDetections']

In [39]:
print('Feature engineering train features (part 2)...')
fe2_train, freqs = feature_engineer_2(trim_train)
print('Feature engineering test features (part 2)...')
fe2_test, _ = feature_engineer_2(trim_test, frequencies_map=freqs)

Feature engineering train features (part 2)...
Generating frequency feature for OrganizationIdentifier...
Generating frequency feature for CountryIdentifier...
Generating frequency feature for IeVerIdentifier...
Generating frequency feature for Census_ActivationChannel...
Generating frequency feature for Census_FirmwareManufacturerIdentifier...
Generating frequency feature for Census_OSUILocaleIdentifier...
Feature engineering test features (part 2)...
Generating frequency feature for OrganizationIdentifier...
Generating frequency feature for CountryIdentifier...
Generating frequency feature for IeVerIdentifier...
Generating frequency feature for Census_ActivationChannel...
Generating frequency feature for Census_FirmwareManufacturerIdentifier...
Generating frequency feature for Census_OSUILocaleIdentifier...


In [None]:
trimmed_train.head()

In [40]:
trim_train = trim_train.drop('HasDetections', axis=1)

In [73]:
# join fe1, fe2, and trimmed
train_final = reduce(lambda left, right: left.join(right, rsuffix='_oops'), [trim_train, ord_train, fe1_train, fe2_train])

test_final = reduce(lambda left, right: left.join(right, rsuffix='_oops'), [trim_test, ord_test, fe1_test, fe2_test])


In [12]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=True).sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

NameError: name 'reduce_mem_usage' is not defined

In [84]:
train_final.to_csv('train_df.csv')

In [55]:
test_final.to_csv('test_df.csv')

AttributeError: to_csv not found

In [57]:
train_final = pd.read_csv('train_df.csv')

In [56]:
test_final = pd.read_csv('test_df.csv')

In [58]:
train_final = reduce_mem_usage(train_final)
test_final = reduce_mem_usage(test_final)

Mem. usage decreased to 3349.50 Mb (68.2% reduction)
Mem. usage decreased to 2963.05 Mb (68.1% reduction)


In [60]:
y_train = np.array(train['HasDetections'])
train_ids = train_final.index
test_ids  = test_final.index

gc.collect()

12

In [61]:
@jit
def fast_auc(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    nfalse = 0
    auc = 0
    n = len(y_true)
    for i in range(n):
        y_i = y_true[i]
        nfalse += (1 - y_i)
        auc += y_i * nfalse
    auc /= (nfalse * (n - nfalse))
    return auc

def eval_auc(preds, dtrain):
    labels = dtrain.get_label()
    return 'auc', fast_auc(labels, preds), True

In [62]:
for i in train_final.columns:
    if train_final[i].isnull().values.any():
        train_final[i] = train_final[i].fillna(0)


In [63]:
for i in test_final.columns:
    if test_final[i].isnull().values.any():
        test_final[i] = test_final[i].fillna(0)

In [64]:
train_final = train_final.drop('SkuCondensed', axis=1)
test_final = test_final.drop('SkuCondensed', axis=1)
    

In [65]:
train_final = train_final.drop('Unnamed: 0', axis=1)
test_final = test_final.drop('Unnamed: 0', axis=1)

In [None]:
train_final = train_final.drop(['AvSigVersion_3', 'Census_OSVersion_0', 'Census_OSVersion_1', 'EngineVersion_0',
                                'EngineVersion_1', 'AppVersion_0', 'Processor', 'AvSigVersion_0',
                                'EngineVersion_3_ord', 'SMode', 'Platform_counts', 'Platform', 
                                'Census_OSVersion_2_counts', 'Census_IsPenCapable', 'Census_IsAlwaysOnAlwaysConnectedCapable',
                                'Census_OSVersion_2', 'AvSigVersion_2_counts', 'AvSigVersion_counts', 'Census_OSVersion_counts'], axis=1)
test_final = test_final.drop(['AvSigVersion_3', 'Census_OSVersion_0', 'Census_OSVersion_1', 'EngineVersion_0',
                                'EngineVersion_1', 'AppVersion_0', 'Processor', 'AvSigVersion_0',
                                'EngineVersion_3_ord', 'SMode', 'Platform_counts', 'Platform', 
                                'Census_OSVersion_2_counts', 'Census_IsPenCapable', 'Census_IsAlwaysOnAlwaysConnectedCapable',
                                'Census_OSVersion_2', 'AvSigVersion_2_counts', 'AvSigVersion_counts', 'Census_OSVersion_counts'], axis=1)

In [66]:
# #Fit OneHotEncoder
# ohe = OneHotEncoder(categories='auto', sparse=True, dtype='uint8').fit(train_final)

# #Transform data using small groups to reduce memory usage
# m = 100000
# train = vstack([ohe.transform(train_final[i*m:(i+1)*m]) for i in range(train_final.shape[0] // m + 1)])
# test  = vstack([ohe.transform(test_final[i*m:(i+1)*m])  for i in range(test_final.shape[0] // m +  1)])



In [71]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
skf.get_n_splits(train_ids, y_train)
feature_importances = []
fold_predictions = []
lgb_test_result  = np.zeros(test_ids.shape[0])
counter = 0

print('\nLightGBM\n')

for train_index, test_index in skf.split(train_ids, y_train):
    
    print('\nFold {}\n'.format(counter + 1))
    X_fit = train_final.iloc[train_index]
    X_val = train_final.iloc[test_index]

    X_fit, X_val = csr_matrix(X_fit, dtype='float32'), csr_matrix(X_val, dtype='float32')
    y_fit, y_val = y_train[train_index], y_train[test_index]

    lgb_model = lgb.LGBMClassifier(max_depth=-1,
                                   n_estimators=30000,
                                   learning_rate=0.05,
                                   num_leaves=2**12-1,
                                   colsample_bytree=0.28,
                                   objective='binary', 
                                   n_jobs=-1)
                                   
    %time lgb_model.fit(X_fit, y_fit, eval_metric='auc', eval_set=[(X_val, y_val)], verbose=100, early_stopping_rounds=100)        

    feature_importances.append(lgb_model.feature_importances_)
    
    test_final = csr_matrix(test_final, dtype='float32')
    fold_predictions.append(lgb_model.predict_proba(test_final)[:,1])
    counter += 1
    
    gc.collect()



LightGBM

Fold 1

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.601962	valid_0's auc: 0.736793
[200]	valid_0's binary_logloss: 0.593327	valid_0's auc: 0.74309
[300]	valid_0's binary_logloss: 0.591243	valid_0's auc: 0.744699
[400]	valid_0's binary_logloss: 0.590601	valid_0's auc: 0.745163
[500]	valid_0's binary_logloss: 0.590416	valid_0's auc: 0.745245
Early stopping, best iteration is:
[471]	valid_0's binary_logloss: 0.590434	valid_0's auc: 0.74525
CPU times: user 2h 17min 11s, sys: 13.2 s, total: 2h 17min 25s
Wall time: 17min 36s
Fold 2

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.600519	valid_0's auc: 0.737576
[200]	valid_0's binary_logloss: 0.59365	valid_0's auc: 0.742854
[300]	valid_0's binary_logloss: 0.591491	valid_0's auc: 0.74455
[400]	valid_0's binary_logloss: 0.590657	valid_0's auc: 0.745159
[500]	valid_0's binary_logloss: 0.590478	valid_0's auc: 0.74522
[600]	valid_0's bi

In [72]:
n_splits=5
predictions = reduce(lambda x, y: np.add(y,x), fold_predictions)

In [73]:
submission = pd.read_csv('/home/ryan/cs/datasets/microsoft/sample_submission.csv')
submission['HasDetections'] = predictions / n_splits
submission.to_csv('lgb_submission_my_trim_3.csv', index=False)

In [74]:
submission.head()

Unnamed: 0,MachineIdentifier,HasDetections
0,0000010489e3af074adeac69c53e555e,0.482285
1,00000176ac758d54827acd545b6315a5,0.365851
2,0000019dcefc128c2d4387c1273dae1d,0.347003
3,0000055553dc51b1295785415f1a224d,0.318019
4,00000574cefffeca83ec8adf9285b2bf,0.414866


In [31]:
submission.head()

Unnamed: 0,MachineIdentifier,HasDetections
0,0000010489e3af074adeac69c53e555e,0.547239
1,00000176ac758d54827acd545b6315a5,0.444145
2,0000019dcefc128c2d4387c1273dae1d,0.455303
3,0000055553dc51b1295785415f1a224d,0.297712
4,00000574cefffeca83ec8adf9285b2bf,0.438431


In [33]:
type(feature_importances[0])

numpy.ndarray

In [75]:
train_final.head()

Unnamed: 0,EngineVersion,AppVersion,AvSigVersion,RtpStateBitfield,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,CountryIdentifier,CityIdentifier,OrganizationIdentifier,...,Census_TotalPhysicalRAM_gb,EngineVersionMapped,AppVersionMapped,AvSigVersionMapped,OrganizationIdentifier_freqs,CountryIdentifier_freqs,IeVerIdentifier_freqs,Census_ActivationChannel_freqs,Census_FirmwareManufacturerIdentifier_freqs,Census_OSUILocaleIdentifier_freqs
0,28,46,117,4,137,0,1,115,214,3,...,4,28,46,117,0.505859,0.505371,0.520996,0.491943,0.501465,0.508789
1,20,16,7,4,137,0,1,177,415,3,...,4,20,16,7,0.505859,0.492188,0.520996,0.491943,0.501465,0.506836
2,28,46,117,4,137,0,1,169,454,3,...,4,28,46,117,0.505859,0.499023,0.520996,0.519531,0.518555,0.485352
3,28,46,117,4,137,0,1,171,566,19,...,4,28,46,117,0.497803,0.530273,0.520996,0.519531,0.511719,0.543457
4,28,46,117,4,137,0,1,76,710,19,...,6,28,46,117,0.497803,0.533203,0.520996,0.491943,0.511719,0.506836


In [76]:
fi = pd.DataFrame({'col':train_final.columns, 'importance':feature_importances[0]}, )


In [77]:
fi.sort_values('importance', axis=0, ascending=False)

Unnamed: 0,col,importance
133,AvSigVersion_2_counts,56653
97,CityIdentifier_counts,55784
93,AvSigVersion_counts,55218
89,AvSigVersion_2,55067
123,Census_FirmwareVersionIdentifier_counts,54290
137,Census_SystemVolumeTotalCapacity_gb,50966
111,Census_ProcessorModelIdentifier_counts,49524
8,CityIdentifier,46825
29,Census_ProcessorModelIdentifier,46050
73,Census_ProcessorModelIdentifier_oops,44406


In [78]:
train_final['Census_OSVersion_2']

0          17134
1          17134
2          17134
3          17134
4          17134
5          17134
6          17134
7          14393
8          17134
9          16299
10         17134
11         17134
12         17134
13         17134
14         17134
15         17134
16         10586
17         17134
18         17134
19         16299
20         14393
21         10240
22         17134
23         17134
24         17134
25         16299
26         17134
27         17134
28         17134
29         16299
           ...  
8921453    17134
8921454    17134
8921455    14393
8921456    17134
8921457    17760
8921458    17134
8921459    14393
8921460    17134
8921461    15063
8921462    14393
8921463    16299
8921464    16299
8921465    17134
8921466    10240
8921467    16299
8921468    17134
8921469    14393
8921470    17134
8921471    17134
8921472    17134
8921473    16299
8921474    16299
8921475    16299
8921476    16299
8921477    15063
8921478    16299
8921479    10586
8921480    162