In [33]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 500)

import gc
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from scipy.stats import ks_2samp
from sklearn.model_selection import KFold
import lightgbm as lgb

### read files

In [34]:
%%time
# read files
folder = '../input/ieee-fraud-detection/'
print('Loading data')

train_identity = pd.read_csv(folder + 'train_identity.csv', index_col='TransactionID')
print('\tloaded train_identity with shape:', train_identity.shape)
train_transaction = pd.read_csv(folder + 'train_transaction.csv', index_col='TransactionID')
print('\tloaded train_transaction with shape:', train_transaction.shape)
test_identity = pd.read_csv(folder + 'test_identity.csv', index_col='TransactionID')
print('\tloaded test_identity with shape:', test_identity.shape)
test_transaction = pd.read_csv(folder + 'test_transaction.csv', index_col='TransactionID')
print('\tloaded test_transaction with shape:', test_transaction.shape)
submission = pd.read_csv(folder + 'sample_submission.csv')
print('\tloaded sample_submission with shape:', submission.shape)

Loading data
	loaded train_identity with shape: (144233, 40)
	loaded train_transaction with shape: (590540, 393)
	loaded test_identity with shape: (141907, 40)
	loaded test_transaction with shape: (506691, 392)
	loaded sample_submission with shape: (506691, 2)
CPU times: user 42.2 s, sys: 13.2 s, total: 55.4 s
Wall time: 55.4 s


### feature engineering

In [35]:
# change numeric variable type a smaller type to reduce memory usage
def reduce_memory(df):
    for col in df.columns:
        col_type = df[col].dtypes
        
        start_mem = df.memory_usage().sum()/1024**2
        if col_type in ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']:
            c_min = df[col].min()
            c_max = df[col].max()
            
            # check if column min and max within a smaller numeric type
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64) 
    end_mem = df.memory_usage().sum() / 1024**2
    
    print('Memory usage reduce from {:.2f} MB to {:.2f} MB with {:.1f}% reduction'.format(start_mem, end_mem, 100*(start_mem-end_mem)/start_mem))
    
    return df

In [36]:
# get device details
def get_device(df):
    df['device_name'] = df['DeviceInfo'].str.split('/', expand=True)[0]
    df['device_version'] = df['DeviceInfo'].str.split('/', expand=True)[1]

    df['OS_id_30'] = df['id_30'].str.split(' ', expand=True)[0]
    df['version_id_30'] = df['id_30'].str.split(' ', expand=True)[1]

    df['browser_id_31'] = df['id_31'].str.split(' ', expand=True)[0]
    df['version_id_31'] = df['id_31'].str.split(' ', expand=True)[1]

    df['screen_width'] = df['id_33'].str.split('x', expand=True)[0]
    df['screen_height'] = df['id_33'].str.split('x', expand=True)[1]

    df.loc[df['device_name'].str.contains('SM', na=False), 'device_name'] = 'Samsung'
    df.loc[df['device_name'].str.contains('SAMSUNG', na=False), 'device_name'] = 'Samsung'
    df.loc[df['device_name'].str.contains('GT-', na=False), 'device_name'] = 'Samsung'
    df.loc[df['device_name'].str.contains('Moto G', na=False), 'device_name'] = 'Motorola'
    df.loc[df['device_name'].str.contains('Moto', na=False), 'device_name'] = 'Motorola'
    df.loc[df['device_name'].str.contains('moto', na=False), 'device_name'] = 'Motorola'
    df.loc[df['device_name'].str.contains('LG-', na=False), 'device_name'] = 'LG'
    df.loc[df['device_name'].str.contains('rv:', na=False), 'device_name'] = 'RV'
    df.loc[df['device_name'].str.contains('HUAWEI', na=False), 'device_name'] = 'Huawei'
    df.loc[df['device_name'].str.contains('ALE-', na=False), 'device_name'] = 'Huawei'
    df.loc[df['device_name'].str.contains('-L', na=False), 'device_name'] = 'Huawei'
    df.loc[df['device_name'].str.contains('Blade', na=False), 'device_name'] = 'ZTE'
    df.loc[df['device_name'].str.contains('BLADE', na=False), 'device_name'] = 'ZTE'
    df.loc[df['device_name'].str.contains('Linux', na=False), 'device_name'] = 'Linux'
    df.loc[df['device_name'].str.contains('XT', na=False), 'device_name'] = 'Sony'
    df.loc[df['device_name'].str.contains('HTC', na=False), 'device_name'] = 'HTC'
    df.loc[df['device_name'].str.contains('ASUS', na=False), 'device_name'] = 'Asus'

    df.loc[df.device_name.isin(df.device_name.value_counts()[df.device_name.value_counts() < 200].index), 'device_name'] = "Others"
    df['had_id'] = 1
    
    return df

In [37]:
%%time
#train_transaction = reduce_memory(train_transaction)
#test_transaction = reduce_memory(test_transaction)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 10.7 µs


In [38]:
%%time
train_identity = get_device(train_identity)
print(train_identity.groupby('device_name').size())
test_identity = get_device(test_identity)

device_name
HTC             406
Huawei         2377
LG             2331
MacOS         12573
Motorola       2935
Others         5530
RV             4385
Samsung       12092
Sony            575
Trident        7440
Windows       47722
ZTE             518
iOS Device    19782
dtype: int64
CPU times: user 16.4 s, sys: 72 ms, total: 16.5 s
Wall time: 16.5 s


In [39]:
# hour of transaction datetime
def make_hour_feature(df, tname='TransactionDT'):
    hours = df[tname]/3600
    df['encoded_hours'] = np.floor(hours) % 24
    return df

# reference: https://www.kaggle.com/fchmiel/day-and-time-powerful-predictive-feature
# DOW is not significant
# HOD is significant (some hour with low #transaction and high #fraud)

In [40]:
%%time
train_transaction = make_hour_feature(train_transaction)
test_transaction = make_hour_feature(test_transaction)

CPU times: user 36 ms, sys: 4 ms, total: 40 ms
Wall time: 30 ms


In [41]:
%%time
# For our model current TransactionAmt is a noise
# https://www.kaggle.com/kyakovlev/ieee-check-noise
# (even if features importances are telling contrariwise)
# There are many unique values and model doesn't generalize well
# https://www.kaggle.com/kyakovlev/ieee-gb-2-make-amount-useful-again

# for aggregation only
train_transaction['uid'] = train_transaction['card1'].astype(str)+'_'+train_transaction['card2'].astype(str)
test_transaction['uid'] = test_transaction['card1'].astype(str)+'_'+test_transaction['card2'].astype(str)

train_transaction['uid2'] = train_transaction['uid'].astype(str)+'_'+train_transaction['card3'].astype(str)+'_'+train_transaction['card5'].astype(str)
test_transaction['uid2'] = test_transaction['uid'].astype(str)+'_'+test_transaction['card3'].astype(str)+'_'+test_transaction['card5'].astype(str)

train_transaction['uid3'] = train_transaction['uid2'].astype(str)+'_'+train_transaction['addr1'].astype(str)+'_'+train_transaction['addr2'].astype(str)
test_transaction['uid3'] = test_transaction['uid2'].astype(str)+'_'+test_transaction['addr1'].astype(str)+'_'+test_transaction['addr2'].astype(str)

# aggregations
i_cols = ['card1','card2','card3','card5','uid','uid2','uid3']

for col in i_cols:
    for agg_type in ['mean','std']:
        new_col_name = col+'_TransactionAmt_'+agg_type
        temp_df = pd.concat([train_transaction[[col, 'TransactionAmt']], test_transaction[[col,'TransactionAmt']]])
        #temp_df['TransactionAmt'] = temp_df['TransactionAmt'].astype(int)
        temp_df = temp_df.groupby([col])['TransactionAmt'].agg([agg_type]).reset_index().rename(
                                                columns={agg_type: new_col_name})
        
        temp_df.index = list(temp_df[col])
        temp_df = temp_df[new_col_name].to_dict()   
    
        train_transaction[new_col_name] = train_transaction[col].map(temp_df)
        test_transaction[new_col_name]  = test_transaction[col].map(temp_df)

# remove uid columns
train_transaction.drop(columns=['uid', 'uid2', 'uid3'], inplace=True)
test_transaction.drop(columns=['uid', 'uid2', 'uid3'], inplace=True)
gc.collect()

CPU times: user 36.3 s, sys: 1min 6s, total: 1min 42s
Wall time: 1min 42s


28

In [42]:
%%time
# Merge transaction and identity
train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
print('shape if training set:', train.shape)
test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)
print('shape if testing set:', test.shape)

del train_transaction, train_identity, test_transaction, test_identity
gc.collect()

shape if training set: (590540, 457)
shape if testing set: (506691, 456)
CPU times: user 4.56 s, sys: 1.86 s, total: 6.42 s
Wall time: 6.41 s


49

In [43]:
%%time
# encoding for object variable
for col in train.columns:
    if train[col].dtype == 'object':
        le = LabelEncoder()
        le.fit(list(train[col].astype(str).values) + list(test[col].astype(str).values))
        train[col] = le.transform(list(train[col].astype(str).values))
        test[col] = le.transform(list(test[col].astype(str).values))

CPU times: user 1min 6s, sys: 180 ms, total: 1min 6s
Wall time: 1min 6s


In [44]:
%%time
train = reduce_memory(train)
test = reduce_memory(test)

Memory usage reduce from 575.30 MB to 571.92 MB with 0.6% reduction
Memory usage reduce from 503.70 MB to 500.80 MB with 0.6% reduction
CPU times: user 2min 16s, sys: 5min 6s, total: 7min 22s
Wall time: 7min 22s


In [45]:
X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT'], axis=1)
y = train.sort_values('TransactionDT')['isFraud']

X_test = test.drop(['TransactionDT'], axis=1)

del train, test
gc.collect()

28

### Modeling

In [46]:
params = {'num_leaves': 491,
          'min_child_weight': 0.03454472573214212,
          'feature_fraction': 0.3797454081646243,
          'bagging_fraction': 0.4181193142567742,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.006883242363721497,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.3899927210061127,
          'reg_lambda': 0.6485237330340494,
          'random_state': 47,
         }

In [None]:
%%time

NFOLDS = 5
folds = KFold(n_splits=NFOLDS)

columns = X.columns
splits = folds.split(X, y)
y_preds = np.zeros(X_test.shape[0])
y_oof = np.zeros(X.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns
  
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    clf = lgb.train(params, dtrain, 10000, valid_sets = [dtrain, dvalid], verbose_eval=200, early_stopping_rounds=100)
    
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(X_valid)
    y_oof[valid_index] = y_pred_valid
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
    
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    y_preds += clf.predict(X_test) / NFOLDS
    
    del X_train, X_valid, y_train, y_valid
    gc.collect()
    
print(f"\nMean AUC = {score}")
print(f"Out of folds AUC = {roc_auc_score(y, y_oof)}")

Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.957603	valid_1's auc: 0.889741
[400]	training's auc: 0.977055	valid_1's auc: 0.902599
[600]	training's auc: 0.987982	valid_1's auc: 0.910365
[800]	training's auc: 0.993709	valid_1's auc: 0.914483
[1000]	training's auc: 0.996602	valid_1's auc: 0.916559
[1200]	training's auc: 0.998086	valid_1's auc: 0.917402
[1400]	training's auc: 0.998939	valid_1's auc: 0.917978
Early stopping, best iteration is:
[1449]	training's auc: 0.99909	valid_1's auc: 0.918112
Fold 1 | AUC: 0.9181123968471516
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.956459	valid_1's auc: 0.913962
[400]	training's auc: 0.976255	valid_1's auc: 0.925719
[600]	training's auc: 0.988364	valid_1's auc: 0.93323
[800]	training's auc: 0.994328	valid_1's auc: 0.936753
[1000]	training's auc: 0.997163	valid_1's auc: 0.938538
[1200]	training's auc: 0.998571	valid_1's auc: 0.939171


In [None]:
submission['isFraud'] = y_preds
submission.to_csv("submission.csv", index=False)