In [9]:
import pandas as pd
import numpy as np
import lightgbm as lgb
#import xgboost as xgb
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
#from sklearn.metrics import roc_auc_score
import gc
from functools import cmp_to_key
import time
import matplotlib.pyplot as plt
from numba import jit
from functools import reduce
import seaborn as sns
from keras.models import Model
from keras.layers import Input, Dense, Concatenate, Reshape, Dropout
from keras.layers.embeddings import Embedding
gc.enable()

dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }


Using TensorFlow backend.


In [2]:
print('Download Train and Test Data.\n')
%time train = pd.read_csv('/home/ryan/cs/datasets/microsoft/train.csv', dtype=dtypes, low_memory=True)
train['MachineIdentifier'] = train.index.astype('uint32')
%time test  = pd.read_csv('/home/ryan/cs/datasets/microsoft/test.csv',  dtype=dtypes, low_memory=True)
test['MachineIdentifier']  = test.index.astype('uint32')
osver_timestamps = np.load('/home/ryan/cs/datasets/microsoft/OSVersionTimestamps.npy')[()]
avsig_timestamps = np.load('/home/ryan/cs/datasets/microsoft/AvSigVersionTimestamps.npy')[()]
if 5244810 in train.index:
    train.loc[5244810,'AvSigVersion'] = '1.273.1144.0'
    train['AvSigVersion'].cat.remove_categories('1.2&#x17;3.1144.0',inplace=True)
    
gc.collect()

Download Train and Test Data.

CPU times: user 1min 32s, sys: 2.65 s, total: 1min 35s
Wall time: 1min 38s
CPU times: user 1min 23s, sys: 1.69 s, total: 1min 24s
Wall time: 1min 26s


201033

In [3]:
def condense_feature(df, col, category_map, fill_with=None):
    feature = df[col]
    reverse_map = {}
    default_val = None
    categories = feature.unique()
    categories = categories[~pd.isnull(categories)]
    
    for k, v_arr in category_map.items():
        if len(v_arr) != 0:
            for v in v_arr:       
                reverse_map[v] = k
                index = np.argwhere(categories==v)
                categories = np.delete(categories, index)         
        else:
            default_val = k
            
    for v in categories:
        reverse_map[v] = default_val
        
    condensed = df[col].map(reverse_map)
    if fill_with:
        condensed = condensed.fillna(fill_with)
    else:
        condensed = condensed.fillna(default_val)
    
    return condensed

def generate_count_feature(df, col, counts=None):
    if counts is None:
        feature = df[col]
        groups = df.groupby(col)
        counts = groups.size()
    count_feature = df[col].map(counts)
    
    return count_feature, counts

def group_battery(x):
    x = x.lower()
    if ('li' in x) or ('ion' in x):
        return 1
    else:
        return 0
    
def generate_freq_feature(df, col, on='HasDetections', frequencies=None):
#     print(frequencies)
    if frequencies is None:
        groups = df.groupby(col)
        sizes = groups.size()
        sums = groups.sum()[on]
        frequencies = sums/sizes
    freq_feature = df[col].map(frequencies)
    
    return freq_feature, frequencies

def generate_version_mapping(df, col, num_splits=4, fill_val='0.0.0.0'):
    feature = df[col].astype(str)
    feature.fillna(fill_val)
    versions = feature.unique()
    
    def version_compare(x, y):
        x_splits = x.split('.')
        y_splits = y.split('.')
        for x_val, y_val in zip(x_splits, y_splits):
            try:
                int_x = int(x_val)
            except:
                int_x = 0
            try:
                int_y = int(y_val)
            except:
                int_y = 0
            if int_x > int_y:
                return 1
            elif int_x < int_y:
                return -1
        return 0

    sorted_versions = sorted(versions, key=cmp_to_key(version_compare))
    mapping = {}
    for i, v in enumerate(sorted_versions):
        mapping[v] = i
        
    return feature.map(mapping)

def generate_split_version(df, col, num_splits=4):
    feature = df[col].astype(str)
    sv = pd.DataFrame()
    for i in range(num_splits):
        sv[col + '_' + str(i)] = feature.apply(lambda x: x.split('.')[i]).astype(np.int16)
        
    return sv

def create_age_feature(df, col, raw_timestamps):
    timestamps = {}
    for k, v in raw_timestamps.items():
        timestamps[k] = v.toordinal()
    maxval = max(list(timestamps.values()))
    for k, v in timestamps.items():
        timestamps[k] = maxval - timestamps[k]
        
    return df[col].map(timestamps)

In [4]:
def generate_count_df(df, cols, counts_map={}):
    count_df = pd.DataFrame()
    for col in cols:
        if col in counts_map:
            count_df[col + '_counts'], _ = generate_count_feature(df, col, counts=counts_map[col])
        else:
            count_df[col + '_counts'], counts_map[col] = generate_count_feature(df, col)
        
    return count_df, counts_map

def generate_two_count_df(df1, df2, cols, counts_map={}):
    count_df1 = pd.DataFrame()
    count_df2 = pd.DataFrame()
    features = pd.concat([df1, df2])
    for col in cols:
        counts = features[col].value_counts()
        count_df1[col + '_counts'] = df1[col].map(counts)
        count_df2[col + '_counts'] = df2[col].map(counts)
        
    count_df1 = count_df1.fillna(0).astype(np.int32)
    count_df2 = count_df2.fillna(0).astype(np.int32)
    return count_df1, count_df2

        
def generate_freq_df(df, cols, freqs_map={}):
    freq_df = pd.DataFrame()
    for col in cols:
        print('Generating frequency feature for ' + col + '...')
        if col in freqs_map:
            freq_df[col + '_freqs'], _ = generate_freq_feature(df, col, frequencies=freqs_map[col])
        else:
            freq_df[col + '_freqs'],  freqs_map[col] =  generate_freq_feature(df, col)
            
    return freq_df, freqs_map

def generate_category_df(df, cols):
    cat_df = pd.DataFrame()
    for col in cols:
        cat_df[col+'_cat'] = df[col].astype('category')
        
    return cat_df
        
def generate_boolean_df(df, cols):
    bool_df = pd.DataFrame()
    for col in cols:
        bool_df[col+'_bool'] = df[col].astype(np.bool)
        
    return bool_df
        
def generate_gb_df(df, cols):
    gb_df = pd.DataFrame()
    for col in cols:
        gb_df[col+'_gb'] = df[col].fillna(df[col].median()).apply(lambda x: int(x/1000))
        
    return gb_df


In [5]:
def trim_all(df_train, df_test, columns_to_trim, delete_time_sensitive=False, delete_nan_thresh=0.8, fill='new'):
    fe_train = pd.DataFrame()
    fe_test = pd.DataFrame()
    for i, col in enumerate(columns_to_trim):
        print(str(i+1) + '/' + str(len(columns_to_trim)) + ' Trimming ' + col + '...')
        tr_col, te_col, le_name_mapping = trim(df_train, df_test, col, fill=fill)
        if delete_time_sensitive and 'nan' in le_name_mapping:
            percent = (te_col.values == le_name_mapping['nan']).sum() / float(len(te_col))
            if percent < delete_nan_thresh:
                fe_train[col], fe_test[col] =  tr_col, te_col

            else:
                print('Deleted ' + col)
        else:
            fe_train[col], fe_test[col] =  tr_col, te_col
                    
        
    print('Done!')
    return fe_train, fe_test

def trim(raw_tr, raw_te, col, fill='new', factor=4, min_obs=1000):
    agg_tr = raw_tr.groupby(col).size().to_frame('tr_counts').reset_index()
    agg_te = raw_te.groupby(col).size().to_frame('te_counts').reset_index()
    
    agg = pd.merge(agg_tr, agg_te, on=col, how='outer')

    try:
        agg = agg.fillna(0)
    except:
        pass

    agg = agg[agg['tr_counts'] > min_obs].reset_index(drop=True)
    agg['total_counts'] = agg['tr_counts'] + agg['te_counts']
    agg = agg[(agg['tr_counts'] / agg['total_counts'] > (1.0/factor)) & (agg['tr_counts'] / agg['total_counts'] < ((factor-1.0)/factor))]
    agg[col+'_copy'] = agg[col]

    trim_tr = pd.merge(raw_tr[[col]], agg[[col, col+'_copy']], on=col, how='left')[col+'_copy'].astype(str)
    trim_te = pd.merge(raw_te[[col]], agg[[col, col+'_copy']], on=col, how='left')[col+'_copy'].astype(str)
    
    if fill == 'new':
        le = LabelEncoder().fit(trim_tr.values)
        mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        trim_tr = pd.Series(le.transform(trim_tr)) + 1
        trim_te = pd.Series(le.transform(trim_te)) + 1
        trim_tr = trim_tr.astype('category').rename(col+'_ztrim')
        trim_te = trim_te.astype('category').rename(col+'_ztrim')
    
    elif fill == 'ordinal':
        # should be same uniques
        mapping = ordinal_mapping(trim_tr.values)
        good_vals = []
        for g in mapping.keys():
            if g != 'nan':
                good_vals.append(float(g))
        good_vals.sort()

        trim_tr = fill_ordinal(raw_tr[col], trim_tr, good_vals).astype(float).rename(col+'_otrim')
        trim_te = fill_ordinal(raw_te[col], trim_te, good_vals).astype(float).rename(col+'_otrim')
        
        trim_tr = trim_tr.map(mapping)
        trim_te = trim_te.map(mapping)
                
    else:
        raise Exception('fill should be new or ordinal')
    
    return trim_tr, trim_te, mapping


def fill_ordinal(raw_series, series, good_vals):

    curr_index = 0
    len_good_vals = len(good_vals)
    
    sorted_series = series.argsort()

    for idx in sorted_series:
        
        if series[idx] == 'nan':
            if curr_index >= len_good_vals - 1:
                series[idx] = good_vals[-1]
            elif raw_series.iloc[idx] < good_vals[curr_index+1]:
                series[idx] = good_vals[curr_index]
            else:
                series[idx] = good_vals[curr_index+1]
                curr_index += 1
    return series


def ordinal_mapping(vals, fill_val=-1):
    feature = vals.astype(float)
    feature[np.isnan(feature)] = fill_val

    versions = np.unique(feature)
    
    def version_compare(x, y):

        try:
            int_x = int(x_val)
        except:
            int_x = 0
        try:
            int_y = int(y_val)
        except:
            int_y = 0
        if int_x > int_y:
            return 1
        elif int_x < int_y:
            return -1
        return 0

    sorted_versions = sorted(versions, key=cmp_to_key(version_compare))
    mapping = dict(zip(sorted_versions, range(len(sorted_versions))))
        
    return mapping

In [6]:
# input raw df before trim: counts, condense, split_version, boolean, and non-categorical
def feature_engineer_1(df):

    fe = pd.DataFrame()
    
    fe['MoreThanOneAV'] = condense_feature(df, 'AVProductsInstalled', {False: [0.0, 1.0], True: []}, fill_with=False).astype(bool)
#     fe['SkuCondensed'] = condense_feature(df, 'SkuEdition', {'Home': ['Home'], 'Pro': ['Pro'], 'SkuEtc':[]}).astype('category')
    fe['SmartScreen_condensed'] = condense_feature(df, 'SmartScreen', 
                                            {True: ['RequireAdmin', 'On', 'Warn', 'on', 'Enabled', 'warn', 'Block', 'Prompt', 'Promp', 'requireadmin', 'prompt', 'requireAdmin'], 
                                             False: []}).astype(np.bool)
    
    fe['BatteryType'] = df['Census_InternalBatteryType'].apply(group_battery)
    
    fe['IeVerIdentifier_num'] = df['IeVerIdentifier'].fillna(0.0).astype(np.int16)
    fe['Census_ProcessorModelIdentifier_num'] = df['Census_ProcessorModelIdentifier'].fillna(0.0).astype(np.int16)
       
    
    fe['MachineCost'] = df['Census_ProcessorCoreCount']**2 * 20 + df['Census_SystemVolumeTotalCapacity']/20000 \
                        + df['Census_TotalPhysicalRAM']/200
    
    fe['AvSigAge'] = create_age_feature(df, 'AvSigVersion', avsig_timestamps)
    fe['OsVerAge'] = create_age_feature(df, 'Census_OSVersion', osver_timestamps)
    
    split_cols = ['Census_OSVersion', 'AppVersion', 'EngineVersion', 'AvSigVersion']
    
    gb_cols = ['Census_PrimaryDiskTotalCapacity', 'Census_SystemVolumeTotalCapacity', 'Census_TotalPhysicalRAM']
    
    linear_columns = ['Census_ProcessorCoreCount', 'Census_InternalPrimaryDiagonalDisplaySizeInInches', 
                        'Census_PrimaryDiskTotalCapacity', 'Census_SystemVolumeTotalCapacity', 
                        'Census_TotalPhysicalRAM']
    
    for lin_col in linear_columns:
        df[lin_col] = df[lin_col].astype(np.float32)
        fe[lin_col] = df[lin_col].fillna(df[lin_col].median())
    
    for split_col in split_cols:
        fe = fe.join(generate_split_version(df, split_col, num_splits=4))
    
    fe = fe.join(generate_gb_df(df, gb_cols))
    
    return fe

# input trimmed df
def feature_engineer_2(df, frequencies_map={}):
    
    fe = pd.DataFrame()
    
    fe['EngineVersionMapped'] = generate_version_mapping(df, 'EngineVersion')
    fe['AppVersionMapped'] = generate_version_mapping(df, 'AppVersion')
    fe['AvSigVersionMapped'] = generate_version_mapping(df, 'AvSigVersion')
    
    freq_cols = ['OrganizationIdentifier', 'CountryIdentifier', 'IeVerIdentifier', 
                 'Census_ActivationChannel', 'Census_FirmwareManufacturerIdentifier', 
                 'Census_OSUILocaleIdentifier']
    
    freq_df, frequencies_map = generate_freq_df(df, freq_cols, freqs_map=frequencies_map)
    fe = fe.join(freq_df)
    
    # cost feature
    
    return fe, frequencies_map
    

In [7]:
print('Feature engineering train features...')
fe1_train = feature_engineer_1(train)
print('Feature engineering test features...')
fe1_test = feature_engineer_1(test)

Feature engineering train features...
Feature engineering test features...


In [8]:
columns_to_delete = ['HasDetections', # label
                     'MachineIdentifier', # id
                     'DefaultBrowsersIdentifier', # nan columns
                     'PuaMode',
                     'ProductName', # imbalanced columns
                     'IsBeta',
                     'IsSxsPassiveMode',
                     'HasTpm',
                     'AutoSampleOptIn',
                     'PuaMode',
                     'UacLuaenable',
                     'Census_DeviceFamily',
                     'Census_ProcessorClass',
                     'Census_IsPortableOperatingSystem',
                     'Census_IsFlightsDisabled',
                     'Census_IsVirtualDevice',
                     'Census_OSSkuName', # overlap columns
                     'OsVer',
                     'Census_OSArchitecture',
                     'Census_OSInstallLanguageIdentifier',
                     'Census_InternalBatteryNumberOfCharges' # strange values
                    ]

# from fe1
non_categorical_cols = ['Census_ProcessorCoreCount', 'Census_InternalPrimaryDiagonalDisplaySizeInInches', 
                        'Census_PrimaryDiskTotalCapacity', 'Census_SystemVolumeTotalCapacity', 
                        'Census_TotalPhysicalRAM', 'Census_PrimaryDiskTotalCapacity', ]

columns_to_process = []
for col in train.columns:
    if col not in columns_to_delete + non_categorical_cols:
        columns_to_process.append(col)

%time trim_train, trim_test = trim_all(train, test, columns_to_process)

1/58 Trimming EngineVersion...
2/58 Trimming AppVersion...
3/58 Trimming AvSigVersion...
4/58 Trimming RtpStateBitfield...
5/58 Trimming AVProductStatesIdentifier...
6/58 Trimming AVProductsInstalled...
7/58 Trimming AVProductsEnabled...
8/58 Trimming CountryIdentifier...
9/58 Trimming CityIdentifier...
10/58 Trimming OrganizationIdentifier...
11/58 Trimming GeoNameIdentifier...
12/58 Trimming LocaleEnglishNameIdentifier...
13/58 Trimming Platform...
14/58 Trimming Processor...
15/58 Trimming OsBuild...
16/58 Trimming OsSuite...
17/58 Trimming OsPlatformSubRelease...
18/58 Trimming OsBuildLab...
19/58 Trimming SkuEdition...
20/58 Trimming IsProtected...
21/58 Trimming SMode...
22/58 Trimming IeVerIdentifier...
23/58 Trimming SmartScreen...
24/58 Trimming Firewall...
25/58 Trimming Census_MDC2FormFactor...
26/58 Trimming Census_OEMNameIdentifier...
27/58 Trimming Census_OEMModelIdentifier...
28/58 Trimming Census_ProcessorManufacturerIdentifier...
29/58 Trimming Census_ProcessorModelIde

In [52]:
# ordinal_columns = ['AppVersion_1',]
#                    'AppVersion_2', 'AppVersion_3',  
#                    'Census_OSVersion_3', 'EngineVersion_2']
ordinal_columns = ['AvSigVersion_2', 'Census_OSVersion_3']
%time ord_train, ord_test = trim_all(fe1_train, fe1_test, ordinal_columns, fill='ordinal')

0/2 Trimming AvSigVersion_2...
1/2 Trimming Census_OSVersion_3...
Done!
CPU times: user 10min 3s, sys: 1.01 s, total: 10min 4s
Wall time: 10min 4s


In [53]:
trim_train['HasDetections'] = train['HasDetections']

In [67]:
print('Feature engineering train features (part 2)...')
fe2_train, freqs = feature_engineer_2(trim_train)
print('Feature engineering test features (part 2)...')
fe2_test, _ = feature_engineer_2(trim_test, frequencies_map=freqs)

Feature engineering train features (part 2)...
Generating frequency feature for OrganizationIdentifier...
Generating frequency feature for CountryIdentifier...
Generating frequency feature for IeVerIdentifier...
Generating frequency feature for Census_ActivationChannel...
Generating frequency feature for Census_FirmwareManufacturerIdentifier...
Generating frequency feature for Census_OSUILocaleIdentifier...
Feature engineering test features (part 2)...
Generating frequency feature for OrganizationIdentifier...
Generating frequency feature for CountryIdentifier...
Generating frequency feature for IeVerIdentifier...
Generating frequency feature for Census_ActivationChannel...
Generating frequency feature for Census_FirmwareManufacturerIdentifier...
Generating frequency feature for Census_OSUILocaleIdentifier...


In [54]:
# count_cols = ['EngineVersion', 'AppVersion', 'AvSigVersion', 'AVProductStatesIdentifier',
#               'CountryIdentifier', 'Census_OSVersion', 'CityIdentifier', 'OrganizationIdentifier', 
#               'GeoNameIdentifier', 'OsBuild', 
#               'OsBuildLab', 'IeVerIdentifier', 'Census_MDC2FormFactor', 'Census_OEMNameIdentifier', 
#               'Census_ProcessorCoreCount', 'Census_ProcessorManufacturerIdentifier', 
#               'Census_ProcessorModelIdentifier', 'Census_PrimaryDiskTypeName', 'Census_ChassisTypeName', 
#               'Census_PowerPlatformRoleName', 'Census_OSBranch', 'Census_OSBuildNumber', 
#               'Census_OSBuildRevision','Census_OSEdition','Census_OSInstallTypeName', 
#               'Census_OSUILocaleIdentifier', 'Census_ActivationChannel','Census_FirmwareManufacturerIdentifier',
#               'Census_FirmwareVersionIdentifier', 'Census_IsTouchEnabled']

count_cols = ['CityIdentifier', 'AvSigVersion', 'Census_FirmwareVersionIdentifier', 
              'Census_ProcessorModelIdentifier', 'CountryIdentifier', 'GeoNameIdentifier', 
              'Census_OSVersion', 'Census_OSBuildRevision', 'Census_OEMNameIdentifier',
              'AVProductStatesIdentifier', 'Census_OSUILocaleIdentifier']

fe_count_cols = ['AppVersion_1', 'AppVersion_2', 'AppVersion_3', 
                 'Census_OSVersion_3', 'EngineVersion_2', 'EngineVersion_3', 
                 'AvSigVersion_1', 'AvSigVersion_2']

count_df1_train, count_df1_test = generate_two_count_df(train, test, count_cols)
count_df2_train, count_df2_test = generate_two_count_df(fe1_train, fe1_test, fe_count_cols)

In [10]:
trim_train = trim_train.drop('HasDetections', axis=1)

ValueError: labels ['HasDetections'] not contained in axis

In [96]:
# join fe1, fe2, and trimmed
# train_final = reduce(lambda left, right: left.join(right, rsuffix='_oops'), [trim_train, ord_train, fe1_train, fe2_train, count_df1_train, count_df2_train])

# test_final = reduce(lambda left, right: left.join(right, rsuffix='_oops'), [trim_test, ord_test, fe1_test, fe2_test, count_df1_test, count_df2_test])


In [105]:
for i in train_final.columns:
    if train_final[i].isnull().values.any():
        train_final[i] = train_final[i].fillna(0)
    if test_final[i].isnull().values.any():
        test_final[i] = test_final[i].fillna(0)

In [71]:
train_final.head()

Unnamed: 0,EngineVersion,AppVersion,AvSigVersion,RtpStateBitfield,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,CountryIdentifier,CityIdentifier,OrganizationIdentifier,...,AVProductStatesIdentifier_counts,Census_OSUILocaleIdentifier_counts,AppVersion_1_counts,AppVersion_2_counts,AppVersion_3_counts,Census_OSVersion_3_counts,EngineVersion_2_counts,EngineVersion_3_counts,AvSigVersion_1_counts,AvSigVersion_2_counts
0,28,46,117,4,137,0,1,115,214,3,...,11333187,629790,11675668,5825662,5824824,959932,3833951,8049943,3887498,20517
1,20,16,7,4,137,0,1,177,415,3,...,11333187,5971350,1053108,1053048,488388,285590,299099,2695190,324819,243423
2,28,46,117,4,137,0,1,169,454,3,...,11333187,938464,11675668,5825662,5824824,959932,3833951,8049943,3887498,16282
3,28,46,117,4,137,0,1,171,566,19,...,11333187,156553,11675668,5825662,5824824,1565473,3833951,8049943,3887498,53752
4,28,46,117,4,137,0,1,76,710,19,...,11333187,5971350,11675668,5825662,5824824,252155,3833951,8049943,3887498,50449


In [None]:
# train_final = train_final.drop(['AvSigVersion_3', 'Census_OSVersion_0', 'Census_OSVersion_1', 'EngineVersion_0',
#                                 'EngineVersion_1', 'AppVersion_0', 'Processor', 'AvSigVersion_0',
#                                 'SMode', 'Platform', 
#                                 'Census_IsPenCapable', 'Census_IsAlwaysOnAlwaysConnectedCapable',
#                                  'Census_OSVersion_2', 'AvSigVersion_2_counts', 'AvSigVersion_counts', 'Census_OSVersion_counts'], axis=1)
# test_final = test_final.drop(['AvSigVersion_3', 'Census_OSVersion_0', 'Census_OSVersion_1', 'EngineVersion_0',
#                                 'EngineVersion_1', 'AppVersion_0', 'Processor', 'AvSigVersion_0',
#                                  'SMode', 'Platform', 
#                                'Census_IsPenCapable', 'Census_IsAlwaysOnAlwaysConnectedCapable',
#                                 'Census_OSVersion_2', 'AvSigVersion_2_counts', 'AvSigVersion_counts', 'Census_OSVersion_counts'], axis=1)

In [67]:
save_df = False
if save_df:
    print('Saving train dataframe...')
    %time train_final.to_csv('train_df.csv')
    print('Saving test dataframe...')
    %time test_final.to_csv('test_df.csv')
    print('Done!')
    
load_df = False
if load_df:
    print('Loading train dataframe...')
    %time train_final = pd.read_csv('train_df.csv')
    print('Loading test dataframe...')
    %time test_final = pd.read_csv('test_df.csv')

In [47]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=True).sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [98]:
train_final = reduce_mem_usage(train_final)
test_final = reduce_mem_usage(test_final)

Mem. usage decreased to 1650.85 Mb (39.7% reduction)
Mem. usage decreased to 1453.21 Mb (39.7% reduction)


In [70]:
y_trn = train['HasDetections']
gc.collect()

1347

In [57]:
@jit
def fast_auc(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    nfalse = 0
    auc = 0
    n = len(y_true)
    for i in range(n):
        y_i = y_true[i]
        nfalse += (1 - y_i)
        auc += y_i * nfalse
    auc /= (nfalse * (n - nfalse))
    return auc

def eval_auc(preds, dtrain):
    labels = dtrain.get_label()
    return 'auc', fast_auc(labels, preds), True

def predict_chunk(model, test):
    initial_idx = 0
    chunk_size = 1000000
    current_pred = np.zeros(len(test))
    while initial_idx < test.shape[0]:
        final_idx = min(initial_idx + chunk_size, test.shape[0])
        idx = range(initial_idx, final_idx)
        current_pred[idx] = model.predict(test.iloc[idx], num_iteration=model.best_iteration)
        initial_idx = final_idx
    #predictions += current_pred / min(folds.n_splits, max_iter)
    return current_pred

In [83]:
def build_embedding_network(df, cat_cols, num_cols):
    
    inputs = []
    embeddings = []
    
    for col_name in cat_cols:
        col = df[col_name]
        inp = Input(shape=(1,))
        embedding = Embedding(len(col.unique()), 3, input_length=1)(inp)
        embedding = Reshape(target_shape=(3,))(embedding)
        inputs.append(inp)
        embeddings.append(embedding)
    
    input_numeric = Input(shape=(len(num_cols),))
    embedding_numeric = Dense(16)(input_numeric) 
    inputs.append(input_numeric)
    embeddings.append(embedding_numeric)

    x = Concatenate()(embeddings)
    x = Dense(400, activation='relu')(x)
    x = Dropout(.5)(x)
    x = Dense(400, activation='relu')(x)
    x = Dropout(.5)(x)
    output = Dense(1, activation='sigmoid')(x)
    
    model = Model(inputs, output)

    model.compile(loss='binary_crossentropy', optimizer='adam')
    
    return model

def train_model(X, y, X_test, cat_cols, num_cols, fold_idx):
    predictions = []
    for idx_trn, idx_val in fold_idx:
    
        X_trn = X.iloc[idx_trn]
        y_trn = y.iloc[idx_trn]
        X_val = X.iloc[idx_val]
        y_val = y.iloc[idx_val]

        X_trn_f, X_val_f, X_test_f = format_data(X_trn, X_val, X_test, cat_cols, num_cols) 

        model = build_embedding_network(X_trn, cat_cols, num_cols)
        
        model.fit(X_trn_f, y_trn.values, epochs=10, batch_size=4096, verbose=1, validation_data=(X_val_f, y_val.values))

        predictions.append(model.predict(X_test_f))
        
    return predictions
    

def format_data(X_trn, X_val, X_test, cat_cols, num_cols):
    inp_trn = []
    inp_val = []
    inp_test = []
    
    for c in cat_cols:
        inp_trn.append(X_trn[c].values)
        inp_val.append(X_val[c].values)
        inp_test.append(X_test[c].values)
        
    other_cols = [c for c in num_cols]
    inp_trn.append(X_trn[other_cols].values)
    inp_val.append(X_val[other_cols].values)
    inp_test.append(X_test[other_cols].values)
        
    return inp_trn, inp_val, inp_test

def create_submission(result_dict, fname):
    submission = pd.read_csv('/home/ryan/cs/datasets/microsoft/sample_submission.csv')
    submission['HasDetections'] = result_dict['averaged_prediction']
    submission.to_csv('submissions/' + fname, index=False)
    return submission

In [65]:
curr_train = pd.DataFrame()
curr_test = pd.DataFrame()
gb_cols = ['Census_PrimaryDiskTotalCapacity', 'Census_SystemVolumeTotalCapacity', 'Census_TotalPhysicalRAM']

linear_columns = ['Census_ProcessorCoreCount', 'Census_InternalPrimaryDiagonalDisplaySizeInInches']
for col in trim_train.columns:
    curr_train[col] = trim_train[col]
    curr_test[col] = trim_test[col]
    
for col in gb_cols:
    curr_train[col+'_gb'] = fe1_train[col+'_gb']
    curr_test[col+'_gb'] = fe1_test[col+'_gb']
    
for col in linear_columns:
    curr_train[col] = fe1_train[col]
    curr_test[col] = fe1_test[col]

In [38]:
cat_cols = ['AVProductStatesIdentifier', 'CountryIdentifier', 
            'Census_ProcessorModelIdentifier', 'Census_OSVersion', 'GeoNameIdentifier', 'LocaleEnglishNameIdentifier',
            'AppVersion', 'Census_OSBuildRevision', 'Census_FirmwareVersionIdentifier', 'CityIdentifier', 'IeVerIdentifier',
            'Census_OEMModelIdentifier']

num_cols = ['Census_SystemVolumeTotalCapacity_gb', 'Census_InternalPrimaryDiagonalDisplaySizeInInches', 'Census_TotalPhysicalRAM_gb']

In [21]:
fold1_idx = fe1_train.index[fe1_train['AvSigAge'] > 97]
fold2_idx = fe1_train.index[fe1_train['AvSigAge'] <= 97]

fold_idx = [(fold1_idx, fold2_idx), 
            (fold2_idx, fold1_idx)]

In [84]:
preds = train_model(curr_train, y_trn, curr_test, cat_cols, num_cols, fold_idx)

Train on 4529553 samples, validate on 4391862 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 4391862 samples, validate on 4529553 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [82]:
print(preds)

[array([[ 0.6284526 ],
       [ 0.74612904],
       [ 0.46587014],
       ..., 
       [ 0.46409273],
       [ 0.57696682],
       [ 0.55440515]], dtype=float32), array([[ 0.6154893 ],
       [ 0.6154893 ],
       [ 0.43009728],
       ..., 
       [ 0.38941884],
       [ 0.56791836],
       [ 0.45565873]], dtype=float32)]


In [22]:
counts_to_add = ['CountryIdentifier_counts', 'CityIdentifier_counts', 'Census_ProcessorModelIdentifier_counts']
for c in counts_to_add:
    curr_train[c] = count_df1_train[c]
    curr_test[c] = count_df1_train[c]

NameError: name 'count_df1_train' is not defined