In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import gc

In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Notebook-wide pandas display settings.
# Fully qualified option names are used on purpose: short patterns such as
# 'max_rows' / 'max_columns' can match more than one registered option in
# recent pandas versions, in which case set_option raises OptionError.
pd.options.display.float_format = '{:,.4f}'.format
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 100)

In [None]:
# train = pd.read_csv('/kaggle/input/microsoft-malware-prediction/train.csv')
# Reading the data with default dtypes causes a memory error, so we declare a compact dtype for every column instead.

We will load the variables as follows:
* Object (string) columns are loaded as categories.
* Binary values are stored as int8.
* Binary values with missing values are stored as float16 (integer dtypes cannot represent NaN); category could be used here as well.
* 64-bit encodings are all switched to 32-bit, or to 16-bit if possible.

In [None]:
# https://www.kaggle.com/theoviel/load-the-totality-of-the-data
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float32',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int16',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float64',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float32',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float32',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float64',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float64',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float32',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float64',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

In [None]:
train_df = pd.read_csv('../input/microsoft-malware-prediction/train.csv', dtype=dtypes)

In [None]:
# train_df = train_full_df.sample(2000000)

In [None]:
# Subsample used only for the plots below. A fixed seed keeps the figures
# reproducible across kernel restarts, and the size is capped at the number of
# available rows so the cell also works on a reduced training frame.
train_viz = train_df.sample(n=min(1500000, len(train_df)), random_state=42)

In [None]:
train_df.info()

In [None]:
test_df = pd.read_csv('../input/microsoft-malware-prediction/test.csv', dtype=dtypes)

In [None]:
test_df.info()

In [None]:
# https://www.kaggle.com/faridsharaf/ashrae-project
# Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype covering their range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are overwritten with the downcast versions.
    verbose : bool, default True
        When true, print the final memory footprint (in Mb) and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    * Only columns whose dtype appears in ``numerics`` are touched; int8,
      category, object and every other dtype are left as they are.
    * The target dtype is selected from the column's min/max only. For floats
      this means precision is not considered: float16 cannot represent every
      integer above 2048 exactly, so identifier-like float columns may be rounded.
    * Float columns whose min/max are NaN or infinite fail every range check
      and end up as float64.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a value equal to a dtype's min/max still fits in it.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower spans the range (or the range is NaN/inf).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
train_df = reduce_mem_usage(train_df)

In [None]:
train_viz = reduce_mem_usage(train_viz)

In [None]:
test_df = reduce_mem_usage(test_df)

In [None]:
train_df.shape

In [None]:
test_df.shape

# Target Variable Exploration

In [None]:
sns.catplot(x='HasDetections', kind="count", palette="mako", data=train_viz)

In [None]:
train_df.HasDetections.value_counts()

# EDA

To start working with the data, we may need to do feature selection: we have 82 features of many data types, some of which may be highly correlated, skewed, or mostly null, making them useless.

# Data Exploration

In [None]:
train_viz.head(10)

* MachineIdentifier has no value here: it does not indicate anything about the malware, and there is only one record per machine.
* The data description says nothing about ProductName, EngineVersion, or AppVersion, but we may be able to extract useful information from them that indicates whether a machine was hit by malware.

## IsBeta

Next is to look for the missing values.

In [None]:
sns.catplot(x='IsBeta', kind="count", palette="mako", data=train_viz)

## DefaultBrowsersIdentifier

In [None]:
top_browsers = train_viz['DefaultBrowsersIdentifier'].value_counts().head(10).index
top_browsers
# top_browsers = train_viz[train_viz['DefaultBrowsersIdentifier'].isin(top_browsers)]['DefaultBrowsersIdentifier'].value_counts()

In [None]:
sns.countplot(data=train_viz, x='DefaultBrowsersIdentifier', 
              order=train_viz.DefaultBrowsersIdentifier.value_counts().index[:10],
             palette="mako")

## SmartScreen

In [None]:
sns.countplot(data=train_viz, x='SmartScreen', 
              order=train_viz.SmartScreen.value_counts().index[:5],
              hue='HasDetections',
             palette="mako")

## Census_OSBuildNumber

In [None]:
sns.countplot(data=train_viz, x='Census_OSBuildNumber', 
              order=train_viz.Census_OSBuildNumber.value_counts().index[:5],
              hue='HasDetections',
             palette="mako")

## AVProductsInstalled

In [None]:
sns.countplot(data=train_viz, x='AVProductsInstalled', 
              order=train_viz.AVProductsInstalled.value_counts().index[:5],
              hue='HasDetections',
             palette="mako")

## AVProductsEnabled

In [None]:
sns.countplot(data=train_viz, x='AVProductsEnabled', 
              order=train_viz.AVProductsEnabled.value_counts().index[:5],
              hue='HasDetections',
             palette="mako")

# Missing Values

In [None]:
(train_df.isnull().sum()*100/train_df.shape[0]).sort_values(ascending=False)[:10] # to get %

The two features that have 99% missing values are almost useless, so we will consider dropping them.  
We will check the features with 95% and 83% missing values using correlation or feature importance later.

### Census_ProcessorClass

In [None]:
train_df.Census_ProcessorClass.value_counts()

In [None]:
test_df.Census_ProcessorClass.isnull().sum()/test_df.shape[0]

In [None]:
test_df.Census_ProcessorClass.value_counts()

So it also has 99% of its values missing in the test data; it will not be useful in train or test.

### PuaMode

In [None]:
test_df.PuaMode.isnull().sum()/test_df.shape[0]

In [None]:
test_df.Census_ProcessorClass.isnull().sum()/test_df.shape[0]

In [None]:
dropped_features = []
dropped_features.append('PuaMode')
dropped_features.append('Census_ProcessorClass')
dropped_features

### DefaultBrowsersIdentifier

In [None]:
train_df.DefaultBrowsersIdentifier.value_counts()

## Missing Data Imputation

In [None]:
(train_df.isnull().sum()*100/train_df.shape[0]).sort_values(ascending = False)[:10]

### DefaultBrowsersIdentifier

In the DefaultBrowsersIdentifier variable, we can treat all the missing values as one category and assign a number to it.

In [None]:
train_df.DefaultBrowsersIdentifier.value_counts().head(10)

In [None]:
# Assign the filled column back explicitly. Calling fillna(inplace=True) on the
# attribute accessor operates on an intermediate Series and is not guaranteed
# to update train_df (it does nothing under pandas Copy-on-Write).
train_df['DefaultBrowsersIdentifier'] = train_df['DefaultBrowsersIdentifier'].fillna(0)

In [None]:
# Assign the filled column back explicitly. Calling fillna(inplace=True) on the
# attribute accessor operates on an intermediate Series and is not guaranteed
# to update test_df (it does nothing under pandas Copy-on-Write).
test_df['DefaultBrowsersIdentifier'] = test_df['DefaultBrowsersIdentifier'].fillna(0)

### Census_IsFlightingInternal

In [None]:
train_df.Census_IsFlightingInternal.value_counts()

Census_IsFlightingInternal has almost only one value, so it is useless.

In [None]:
dropped_features.append('Census_IsFlightingInternal')

### Census_InternalBatteryType

In [None]:
train_df.Census_InternalBatteryType.value_counts()

In [None]:
train_df.Census_InternalBatteryType.isnull().sum()/train_df.shape[0]

Around 70% of the data is null and some values are corrupted, so we will fix them and impute the missing values as one category called **unknown**.

In [None]:
trans_dict = {
    '˙˙˙': 'unknown',
    'unkn': 'unknown', 
    'l& ': 'unknown',
    'liÿÿ': 'unknown',
    'li? ': 'unknown',
    'd   ': 'unknown',
    'í-i': 'unknown',
    '÷ÿóö': 'unknown',
    '0ts0': 'unknown',
    'li-l': 'unknown',
    'lio': 'unknown',
    '˙˙˙': 'unknown',
    'l  ': 'unknown',
    '@i': 'unknown',
    'lÿÿÿ': 'unknown',
    np.nan: 'unknown'
}
train_df.replace({'Census_InternalBatteryType': trans_dict}, inplace=True)

In [None]:
test_df.replace({'Census_InternalBatteryType': trans_dict}, inplace=True)

### Census_ThresholdOptIn

### Census_IsWIMBootEnabled

In [None]:
train_df.Census_ThresholdOptIn.value_counts()   

In [None]:
train_df.Census_IsWIMBootEnabled.value_counts()

The values of both features show almost no variation, so we will drop them.

In [None]:
dropped_features.append('Census_ThresholdOptIn')
dropped_features.append('Census_IsWIMBootEnabled')

### SmartScreen

In [None]:
train_df.SmartScreen.value_counts()

In [None]:
trans_dict = {
    'off': 'Off', 
    '&#x02;': '2', 
    '&#x01;': '1', 
    'on': 'On', 
    'requireadmin': 'RequireAdmin', 
    'OFF': 'Off', 
    'Promt': 'Prompt', 
    'requireAdmin': 'RequireAdmin', 
    'prompt': 'Prompt', 
    'warn': 'Warn', 
    '00000000': '0', 
    '&#x03;': '3', 
    np.nan: 'NoExist'
}
train_df.replace({'SmartScreen': trans_dict}, inplace=True)

In [None]:
test_df.replace({'SmartScreen': trans_dict}, inplace=True)

In [None]:
train_df.SmartScreen.value_counts()

### OrganizationIdentifier

In [None]:
train_df.OrganizationIdentifier.value_counts()

Assume that the null values represent some category, so we will replace them with 0.

In [None]:
train_df.replace({'OrganizationIdentifier': {np.nan: 0}}, inplace=True)

In [None]:
test_df.replace({'OrganizationIdentifier': {np.nan: 0}}, inplace=True)

In [None]:
(train_df.isnull().sum()*100/train_df.shape[0]).sort_values(ascending = False)[:20]

now we will drop the null values and columns that we selected before.

In [None]:
dropped_features.append('MachineIdentifier')

# Skewness Check

In [None]:
sk_df = pd.DataFrame([{'column': c, 'uniq': train_df[c].nunique(), 
                       'skewness': train_df[c].value_counts(normalize=True).values[0] * 100} 
                      for c in train_df.columns])
sk_df = sk_df.sort_values('skewness', ascending=False)
sk_df

In [None]:
train_df.Census_DeviceFamily.value_counts()

In [None]:
dropped_features.append('Census_DeviceFamily')

In [None]:
dropped_features.append('Census_IsFlightsDisabled')
dropped_features.append('IsBeta')
dropped_features.append('SMode')
dropped_features.append('AutoSampleOptIn')
dropped_features.append('Census_IsPortableOperatingSystem')
dropped_features.append('Census_IsVirtualDevice')
dropped_features.append('UacLuaenable')

In [None]:
train_df.drop(dropped_features, axis = 1, inplace=True)
print(train_df.shape)

In [None]:
(train_df.isnull().sum()*100/train_df.shape[0]).sort_values(ascending = False)[:20]

In [None]:
# Assign the filled column back explicitly. Calling fillna(inplace=True) on the
# attribute accessor operates on an intermediate Series and is not guaranteed
# to update train_df (it does nothing under pandas Copy-on-Write).
train_df['Census_InternalBatteryType'] = train_df['Census_InternalBatteryType'].fillna('unknown')

In [None]:
# Assign the filled column back explicitly. Calling fillna(inplace=True) on the
# attribute accessor operates on an intermediate Series and is not guaranteed
# to update test_df (it does nothing under pandas Copy-on-Write).
test_df['Census_InternalBatteryType'] = test_df['Census_InternalBatteryType'].fillna('unknown')

In [None]:
train_df.dropna(inplace = True)
print(train_df.shape)

# Correlation Test

In [None]:
train_df['SmartScreen'] = train_df.SmartScreen.astype('category')
train_df['Census_InternalBatteryType'] = train_df.Census_InternalBatteryType.astype('category')

In [None]:
cat_cols = train_df.select_dtypes(include='category').columns.tolist()

In [None]:
cols = train_df.select_dtypes(include=np.number).columns.tolist()
len(cols)

Check the correlation between the variables to reduce the dimensionality.  
For better visualizations, we will plot the columns in separate groups of 15.

In [None]:
plt.figure(figsize=(15,15))
corr_cols = cols[:15]
corr_cols.append('HasDetections')
sns.heatmap(train_df[corr_cols].corr(), cmap='Spectral', annot=True, center=0.0)
plt.title('Correlation between 1 ~ 15th columns')
plt.show()

In [None]:
# keep the variables that has more values variation and drop the others
dropped_correlated = []

In [None]:
train_df.IsSxsPassiveMode.value_counts()

In [None]:
train_df.RtpStateBitfield.value_counts()

RtpStateBitfield has more values, so we will remove it.

In [None]:
dropped_correlated.append('RtpStateBitfield')

In [None]:
plt.figure(figsize=(15,15))
corr_cols = cols[15:30]
corr_cols.append('HasDetections')
sns.heatmap(train_df[corr_cols].corr(), cmap='Spectral', annot=True, center=0.0)
plt.title('Correlation between 15 ~ 30th columns')
plt.show()

In [None]:
# Correlation heatmap for the remaining numeric columns (position 30 onwards).
# The title now matches the slice being plotted; it previously repeated the
# "15 ~ 30th" label of the preceding figure.
plt.figure(figsize=(15,15))
corr_cols = cols[30:]
sns.heatmap(train_df[corr_cols].corr(), cmap='Spectral', annot=True, center=0.0)
plt.title('Correlation between 30th ~ last columns')
plt.show()

In [None]:
print('Census_OSUILocaleIdentifier: ', len(train_df.Census_OSUILocaleIdentifier.value_counts()))
print('Census_OSInstallLanguageIdentifier: ', len(train_df.Census_OSInstallLanguageIdentifier.value_counts()))

In [None]:
dropped_correlated.append('Census_OSUILocaleIdentifier')

Now let's check the correlation between all features.

In [None]:
corr = train_df[cols].corr()
high_corr = ((corr >= 0.95) | (corr <= -0.95)).astype('uint8')
plt.figure(figsize=(15,15))
sns.heatmap(high_corr, cmap='Spectral', annot=True, center=0.0)
plt.show()

In [None]:
print('OsBuild: ', len(train_df.OsBuild.value_counts()))
print('Census_OSBuildNumber: ', len(train_df.Census_OSBuildNumber.value_counts()))

In [None]:
dropped_correlated.append('Census_OSBuildNumber')

In [None]:
dropped_correlated

In [None]:
train_df.drop(dropped_correlated, axis = 1, inplace = True)

In [None]:
gc.collect()

In [None]:
train_df.shape

In [None]:
final_cols = train_df.columns.to_list()
final_cols.remove('HasDetections')
final_cols

In [None]:
test_df = test_df[final_cols]
test_df.shape

In [None]:
(test_df.isnull().sum()*100/test_df.shape[0]).sort_values(ascending=False)[:20] # to get %

In [None]:
test_df.dropna(inplace = True)

In [None]:
test_df.shape

# Feature Engineering

In [None]:
train_df.head(10)

In [None]:
train_df['OsBuildLab'] = train_df['OsBuildLab'].cat.add_categories(['0.0.0.0.0-0'])
train_df['OsBuildLab'] = train_df['OsBuildLab'].fillna('0.0.0.0.0-0')
test_df['OsBuildLab'] = test_df['OsBuildLab'].cat.add_categories(['0.0.0.0.0-0'])
test_df['OsBuildLab'] = test_df['OsBuildLab'].fillna('0.0.0.0.0-0')

In [None]:
# https://www.kaggle.com/artgor/is-this-malware-eda-fe-and-lgb-updated#Feature-engineering-and-transformation
def feature_engineering(df):
    """Add version-component and hardware-ratio features to ``df`` in place.

    Dotted version strings are split on '.' and selected components are stored
    as new categorical columns named ``<source>_<position>``. Five numeric
    features are then derived from the display and processor columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the version columns listed in ``version_parts`` (each value
        a string with enough '.'-separated components) and the Census display,
        RAM and core-count columns read below.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns appended.
    """
    # (source column, positions of the '.'-separated components to keep)
    version_parts = [
        ('EngineVersion', (2, 3)),
        ('AppVersion', (1, 2, 3)),
        ('AvSigVersion', (0, 1, 2)),
        ('OsBuildLab', (0, 1, 2, 3)),
        ('Census_OSVersion', (0, 1, 2, 3)),
    ]
    for source, positions in version_parts:
        for pos in positions:
            # pos is bound as a default so each lambda keeps its own position.
            component = df[source].apply(lambda text, pos=pos: text.split('.')[pos])
            df['{}_{}'.format(source, pos)] = component.astype('category')

    width = df['Census_InternalPrimaryDisplayResolutionHorizontal']
    height = df['Census_InternalPrimaryDisplayResolutionVertical']
    diagonal = df['Census_InternalPrimaryDiagonalDisplaySizeInInches']
    cores = df['Census_ProcessorCoreCount']
    ram = df['Census_TotalPhysicalRAM']

    df['aspect_ratio'] = width / height
    ratio = df['aspect_ratio']

    # Area of a rectangle with the given diagonal and width/height ratio.
    df['Screen_Area'] = (ratio * (diagonal**2)) / (ratio**2 + 1)

    df['ram_per_processor'] = ram / cores
    df['new_num_0'] = diagonal / cores
    df['new_num_1'] = cores * diagonal

    return df

In [None]:
train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

In [None]:
train_df.head()

# Encoding

In [None]:
import lightgbm as lgb
import xgboost as xgb
import time
import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error, roc_auc_score
from catboost import CatBoostClassifier
from tqdm import tqdm_notebook

import warnings
warnings.filterwarnings("ignore")

import logging

logging.basicConfig(filename='log.txt',level=logging.DEBUG, format='%(asctime)s %(message)s')

In [None]:
cat_cols = [col for col in train_df.columns if col not in ['MachineIdentifier', 'Census_SystemVolumeTotalCapacity', 'HasDetections'] and str(train_df[col].dtype) == 'category']
len(cat_cols)

In [None]:
train = reduce_mem_usage(train_df)
test = reduce_mem_usage(test_df)
gc.collect()
del train_df, test_df

In [None]:
def frequency_encoding(variable):
    """Map each value of ``variable`` to its frequency rank over train and test combined.

    Reads the notebook-level ``train`` and ``test`` frames. Values are ranked by
    descending count (0 = most frequent). Every value that occurs exactly once
    is assigned one shared label: the largest rank among the non-singleton
    values plus one.

    The ranks are computed directly from the ``value_counts`` result rather than
    from the column names produced by ``reset_index()``, because those names
    differ between pandas versions.

    Parameters
    ----------
    variable : str
        Name of a column present in both ``train`` and ``test``.

    Returns
    -------
    dict
        ``{value: label}``. Labels are ints when no value is a singleton and
        floats otherwise. If every value is a singleton they all share label 0.0.
    """
    counts = pd.concat([train[variable], test[variable]]).value_counts()
    labels = np.arange(len(counts))
    is_singleton = counts.to_numpy() == 1
    if is_singleton.any():
        frequent = labels[~is_singleton]
        # With no non-singleton value there is no rank to extend, so start at 0.
        shared_label = frequent.max() + 1 if frequent.size else 0
        labels = labels.astype('float64')
        labels[is_singleton] = shared_label
    return dict(zip(counts.index, labels.tolist()))

In [None]:
for col in tqdm_notebook(cat_cols):
    freq_enc_dict = frequency_encoding(col)
    train[col] = train[col].map(lambda x: freq_enc_dict.get(x, np.nan))
    test[col] = test[col].map(lambda x: freq_enc_dict.get(x, np.nan))

In [None]:
%%time
indexer = {}
for col in cat_cols:
    # print(col)
    _, indexer[col] = pd.factorize(train[col].astype(str), sort=True)
    
for col in tqdm_notebook(cat_cols):
    # print(col)
    train[col] = indexer[col].get_indexer(train[col].astype(str))
    test[col] = indexer[col].get_indexer(test[col].astype(str))
    
    train = reduce_mem_usage(train, verbose=False)
    test = reduce_mem_usage(test, verbose=False)

In [None]:
del indexer

In [None]:
# Training subsample. A fixed seed makes the run reproducible, and the size is
# capped at the rows actually available so the cell does not raise when fewer
# than 2,000,000 rows remain after the earlier dropna().
train = train.sample(n=min(2000000, len(train)), random_state=42)

# Model

In [None]:
y = train['HasDetections']
train = train.drop('HasDetections', axis=1)
gc.collect()

In [None]:
n_fold = 5
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=15)

In [None]:
from numba import jit
# fast roc_auc computation: https://www.kaggle.com/c/microsoft-malware-prediction/discussion/76013
@jit
def fast_auc(y_true, y_prob):
    """Compute ROC AUC in one pass over the labels ordered by predicted score.

    Labels are sorted by ascending ``y_prob``. Walking through them, each label
    adds ``label * (negatives seen so far)`` to a running total, which is then
    divided by ``negatives * (n - negatives)``.

    Assumes ``y_true`` holds 0/1 labels -- TODO confirm against callers.
    Tied scores get no special treatment; their order is whatever
    ``np.argsort`` returns. The division is by zero when every label is
    negative or none is.
    """
    ranked = np.asarray(y_true)[np.argsort(y_prob)]
    total = len(ranked)
    negatives_seen = 0
    pair_count = 0
    for pos in range(total):
        label = ranked[pos]
        negatives_seen += (1 - label)
        pair_count += label * negatives_seen
    return pair_count / (negatives_seen * (total - negatives_seen))

def eval_auc(preds, dtrain):
    """Custom evaluation callback: AUC of ``preds`` against the labels held by ``dtrain``.

    Returns the tuple ``('auc', score, True)``; the trailing ``True`` is the
    "higher is better" flag of the metric.
    """
    score = fast_auc(dtrain.get_label(), preds)
    return 'auc', score, True

# idea from this kernel: https://www.kaggle.com/fabiendaniel/detecting-malwares-with-lgbm
def predict_chunk(model, test, chunk_size=1000000):
    """Predict on ``test`` in consecutive row chunks to bound peak memory.

    Parameters
    ----------
    model
        Object exposing ``predict(data, num_iteration=...)`` and a
        ``best_iteration`` attribute.
    test : pandas.DataFrame
        Rows to score; chunks are taken positionally with ``.iloc``.
    chunk_size : int, default 1000000
        Maximum number of rows passed to a single ``predict`` call.

    Returns
    -------
    numpy.ndarray
        One prediction per row of ``test``, in row order.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive integer')
    n_rows = test.shape[0]
    current_pred = np.zeros(n_rows)
    # Slices avoid materialising a per-chunk index array for both the frame
    # lookup and the assignment into the output buffer.
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        current_pred[start:stop] = model.predict(test.iloc[start:stop], num_iteration=model.best_iteration)
    return current_pred


def train_model(X=train, X_test=test, y=y, params=None, folds=folds, model_type='lgb', plot_feature_importance=False, averaging='usual', make_oof=False):
    """Cross-validate one model type and average its per-fold test predictions.

    Parameters
    ----------
    X, y : pandas.DataFrame, pandas.Series
        Training features and target; rows are selected positionally per fold.
    X_test : pandas.DataFrame
        Features to predict on with every fold's model.
    params : dict, optional
        Passed to the underlying library (``lgb.train`` / ``xgb.train`` params,
        or keyword arguments of ``CatBoostRegressor``).
    folds : splitter
        Object providing ``split(X, y)`` and ``get_n_splits(X, y)``. Its number
        of splits is used for averaging.
    model_type : {'lgb', 'xgb', 'lcv', 'cat'}
        Which model to fit in each fold.
    plot_feature_importance : bool
        For 'lgb' only: log and plot the per-fold feature importances and add
        them to the result.
    averaging : {'usual', 'rank'}
        'usual' sums raw fold predictions; 'rank' sums their ranks. The sum is
        divided by the number of folds either way.
    make_oof : bool
        When true, also return the out-of-fold validation predictions.

    Returns
    -------
    dict
        Always contains ``'prediction'`` and ``'scores'`` (per-fold validation
        AUC). Contains ``'oof'`` when ``make_oof`` is true, and
        ``'feature_importance'`` for 'lgb' with ``plot_feature_importance``.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported values.
    """
    if model_type not in ('lgb', 'xgb', 'lcv', 'cat'):
        raise ValueError("model_type must be one of 'lgb', 'xgb', 'lcv', 'cat'; got {!r}".format(model_type))

    # Number of folds comes from the splitter actually used, not a notebook global.
    n_splits = folds.get_n_splits(X, y)

    result_dict = {}
    if make_oof:
        oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    fold_importances = []
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        gc.collect()
        print('Fold', fold_n + 1, 'started at', time.ctime())
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            train_data = lgb.Dataset(X_train, label=y_train, categorical_feature = cat_cols, free_raw_data=False)
            valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature = cat_cols, free_raw_data=False)

            model = lgb.train(params,
                    train_data,
                    num_boost_round=2000,
                    valid_sets = [train_data, valid_data],
                    verbose_eval=500,
                    early_stopping_rounds = 200,
                    feval=eval_auc)

            del train_data, valid_data

            y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
            del X_valid
            gc.collect()
            # The test frame is scored in chunks to limit peak memory.
            y_pred = predict_chunk(model, X_test)

        elif model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=500, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid), ntree_limit=model.best_ntree_limit)
            # predict_chunk relies on .iloc and the num_iteration keyword, which
            # do not apply to a DMatrix / xgboost Booster, so predict directly.
            y_pred = model.predict(xgb.DMatrix(X_test), ntree_limit=model.best_ntree_limit)

        elif model_type == 'lcv':
            # Imported here because the notebook's import cell does not provide it.
            from sklearn.linear_model import LogisticRegressionCV

            model = LogisticRegressionCV(scoring='roc_auc', cv=3)
            model.fit(X_train, y_train)

            # Positive-class probabilities, so AUC and fold averaging work on
            # scores rather than hard class labels.
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]

        elif model_type == 'cat':
            # Imported here because only CatBoostClassifier is imported by the notebook.
            from catboost import CatBoostRegressor

            model = CatBoostRegressor(iterations=20000,  eval_metric='AUC', **(params or {}))
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        if make_oof:
            oof[valid_index] = y_pred_valid.reshape(-1,)

        scores.append(fast_auc(y_valid, y_pred_valid))
        print('Fold roc_auc:', roc_auc_score(y_valid, y_pred_valid))
        print('')

        if averaging == 'usual':
            prediction += y_pred
        elif averaging == 'rank':
            prediction += pd.Series(y_pred).rank().values

        if model_type == 'lgb':
            # Per-fold feature importance, concatenated once after the loop.
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X.columns
            fold_importance["importance"] = model.feature_importance()
            fold_importance["fold"] = fold_n + 1
            fold_importances.append(fold_importance)

    prediction /= n_splits

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    if model_type == 'lgb' and plot_feature_importance:
        feature_importance = pd.concat(fold_importances, axis=0) if fold_importances else pd.DataFrame()
        feature_importance["importance"] /= n_splits
        cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
            by="importance", ascending=False).index

        best_features = feature_importance.loc[feature_importance.feature.isin(cols)]
        logging.info('Top features')
        for f in best_features.sort_values(by="importance", ascending=False)['feature'].values:
            logging.info(f)

        plt.figure(figsize=(16, 12))
        sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
        plt.title('LGB Features (avg over folds)')

        result_dict['feature_importance'] = feature_importance

    result_dict['prediction'] = prediction
    result_dict['scores'] = scores
    if make_oof:
        result_dict['oof'] = oof

    return result_dict

In [None]:
# params from https://www.kaggle.com/fabiendaniel/detecting-malwares-with-lgbm
params = {'num_leaves': 256,
         'min_data_in_leaf': 42,
         'objective': 'binary',
         'max_depth': 5,
         'learning_rate': 0.05,
         "boosting": "gbdt",
         "feature_fraction": 0.8,
         "bagging_freq": 5,
         "bagging_fraction": 0.8,
         "bagging_seed": 11,
         "lambda_l1": 0.15,
         "lambda_l2": 0.15,
         "random_state": 42,          
         "verbosity": -1}

In [None]:
del freq_enc_dict

In [None]:
result_dict1 = train_model(X=train, X_test=test, y=y, params=params, model_type='lgb', plot_feature_importance=True, averaging='rank')