In [1]:
import numpy as np
import pandas as pd
import os
import warnings
import math
import sys
import seaborn as sns
from pathlib import Path
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

# utils
sys.path.insert(0, '/tf/notebooks/other/kaggle')
from utils import utils_featexp
from utils import utils_features_engineering
from utils import utils_features_plots
from utils import utils_reduce_memory
from utils import utils_statistic

warnings.filterwarnings('ignore')

In [6]:
# Reference timestamp; later cells add TransactionDT (as seconds) to it.
START_DATE = datetime.strptime('2017-11-30', '%Y-%m-%d')
main_path = Path('../../input/ieee-cis-fraud-detection/')

# Read the four competition tables in a fixed order: train first, then test.
(
    train_transaction_data,
    train_identity_data,
    test_transaction_data,
    test_identity_data,
) = (
    pd.read_csv(main_path / csv_name)
    for csv_name in (
        'train_transaction.csv',
        'train_identity.csv',
        'test_transaction.csv',
        'test_identity.csv',
    )
)

# Report the row count of every table as a quick sanity check on the load.
train_rows = (len(train_transaction_data), len(train_identity_data))
print('Number of train transaction data:{}\nNumber of train identity data:{}'.format(*train_rows))

test_rows = (len(test_transaction_data), len(test_identity_data))
print('Number of test transaction data:{}\nNumber of test identity data:{}'.format(*test_rows))

Number of train transaction data:590540
Number of train identity data:144233
Number of test transaction data:506691
Number of test identity data:141907


# Feature selection - time consistency

In [7]:
from datetime import datetime, timedelta
import lightgbm as lgb

# Reference timestamp that the TransactionDT offsets (in seconds) are added to.
START_DATE = datetime.strptime('2017-11-30', '%Y-%m-%d')

# Convert the second offsets to timestamps in one vectorised step instead of a
# Python-level lambda per row, and keep the timestamps in a local so that DT_M
# only ever holds the month index.
transaction_time = START_DATE + pd.to_timedelta(train_transaction_data['TransactionDT'], unit='s')

# Month index counted from January 2017 (December 2017 -> 12, January 2018 -> 13, ...).
train_transaction_data['DT_M'] = (transaction_time.dt.year - 2017) * 12 + transaction_time.dt.month

# SPLIT DATA INTO FIRST MONTH AND LAST MONTH
# Copies are taken so later edits to either split never touch the full frame.
train = train_transaction_data[train_transaction_data['DT_M'] == 12].copy()
validate = train_transaction_data[train_transaction_data['DT_M'] == 17].copy()



In [19]:
# TRAIN AND VALIDATE
# Fit a small model on a single column using the `train` split and score it on
# both `train` and `validate`, so the two AUC values can be compared.
col = 'C3'
num_verbose_eval = 50  # print the evaluation metric every 50 boosting rounds
lgbm = lgb.LGBMClassifier(n_estimators=500, objective='binary', num_leaves=8, learning_rate=0.02, metric='auc')
h = lgbm.fit(
    train[[col]],
    train.isFraud,
    eval_metric='auc',
    eval_set=[(train[[col]], train.isFraud),
              (validate[[col]], validate.isFraud)],
    eval_names=['train', 'valid'],
    verbose=num_verbose_eval
)

# Read the scores through the public `best_score_` property rather than the
# private `_best_score` attribute, which is an implementation detail.
auc_train = np.round(h.best_score_['train']['auc'], 4)
auc_val = np.round(h.best_score_['valid']['auc'], 4)
print('Best auc score:\ntrain:{}\nvalid:{}'.format(auc_train, auc_val))

[50]	train's auc: 0.504236	valid's auc: 0.502194
[100]	train's auc: 0.504236	valid's auc: 0.502194
[150]	train's auc: 0.504236	valid's auc: 0.502194
[200]	train's auc: 0.504236	valid's auc: 0.502194
[250]	train's auc: 0.504236	valid's auc: 0.502194
[300]	train's auc: 0.504236	valid's auc: 0.502194
[350]	train's auc: 0.504236	valid's auc: 0.502194
[400]	train's auc: 0.504236	valid's auc: 0.502194
[450]	train's auc: 0.504236	valid's auc: 0.502194
[500]	train's auc: 0.504236	valid's auc: 0.502194
Best auc score:
train:0.5042
valid:0.5022


In [20]:
# TRAIN AND VALIDATE
# Fit a small model on a single column using the `train` split and score it on
# both `train` and `validate`, so the two AUC values can be compared.
col = 'C7'
num_verbose_eval = 50  # print the evaluation metric every 50 boosting rounds
lgbm = lgb.LGBMClassifier(n_estimators=500, objective='binary', num_leaves=8, learning_rate=0.02, metric='auc')
h = lgbm.fit(
    train[[col]],
    train.isFraud,
    eval_metric='auc',
    eval_set=[(train[[col]], train.isFraud),
              (validate[[col]], validate.isFraud)],
    eval_names=['train', 'valid'],
    verbose=num_verbose_eval
)

# Read the scores through the public `best_score_` property rather than the
# private `_best_score` attribute, which is an implementation detail.
auc_train = np.round(h.best_score_['train']['auc'], 4)
auc_val = np.round(h.best_score_['valid']['auc'], 4)
print('Best auc score:\ntrain:{}\nvalid:{}'.format(auc_train, auc_val))

[50]	train's auc: 0.651491	valid's auc: 0.667144
[100]	train's auc: 0.65151	valid's auc: 0.667192
[150]	train's auc: 0.651558	valid's auc: 0.667192
[200]	train's auc: 0.651607	valid's auc: 0.667192
[250]	train's auc: 0.651624	valid's auc: 0.667179
[300]	train's auc: 0.651708	valid's auc: 0.667183
[350]	train's auc: 0.651856	valid's auc: 0.667183
[400]	train's auc: 0.652105	valid's auc: 0.667183
[450]	train's auc: 0.652115	valid's auc: 0.667187
[500]	train's auc: 0.652176	valid's auc: 0.667187
Best auc score:
train:0.6522
valid:0.6672


In [25]:
# TRAIN AND VALIDATE
# Fit a small model on a single column using the `train` split and score it on
# both `train` and `validate`, so the two AUC values can be compared.
col = 'C8'
num_verbose_eval = 50  # print the evaluation metric every 50 boosting rounds
lgbm = lgb.LGBMClassifier(n_estimators=500, objective='binary', num_leaves=8, learning_rate=0.02, metric='auc')
h = lgbm.fit(
    train[[col]],
    train.isFraud,
    eval_metric='auc',
    eval_set=[(train[[col]], train.isFraud),
              (validate[[col]], validate.isFraud)],
    eval_names=['train', 'valid'],
    verbose=num_verbose_eval
)

# Read the scores through the public `best_score_` property rather than the
# private `_best_score` attribute, which is an implementation detail.
auc_train = np.round(h.best_score_['train']['auc'], 4)
auc_val = np.round(h.best_score_['valid']['auc'], 4)
print('Best auc score:\ntrain:{}\nvalid:{}'.format(auc_train, auc_val))

[50]	train's auc: 0.656968	valid's auc: 0.581784
[100]	train's auc: 0.65703	valid's auc: 0.58176
[150]	train's auc: 0.657097	valid's auc: 0.581759
[200]	train's auc: 0.657147	valid's auc: 0.581752
[250]	train's auc: 0.657204	valid's auc: 0.581873
[300]	train's auc: 0.657252	valid's auc: 0.581875
[350]	train's auc: 0.657478	valid's auc: 0.581875
[400]	train's auc: 0.657479	valid's auc: 0.581876
[450]	train's auc: 0.657503	valid's auc: 0.581875
[500]	train's auc: 0.657508	valid's auc: 0.581875
Best auc score:
train:0.6575
valid:0.5819


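The next step is to remove the weak variables from the model (those whose validation AUC stays close to 0.5 or falls well below their training AUC) and then re-evaluate the entire model with the usual local validation to see whether the overall AUC increases or decreases.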

In [None]:
# US federal holidays inside the window spanned by the date range below.
# Fixes: `pd.data_range` does not exist (the function is `pd.date_range`), and
# the result was bound to `datas_range` while the next line read `dates_range`.
dates_range = pd.date_range(start='2017-10-01', end='2019-01-01')
us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())

# Add calendar features to both transaction tables (columns are added in place).
for df in [train_transaction_data, test_transaction_data]:
    # Vectorised conversion of the second offsets into timestamps.
    df['DT'] = START_DATE + pd.to_timedelta(df['TransactionDT'], unit='s')

    # Running month / week / day counters measured from the start of 2017.
    df['DT_M'] = ((df['DT'].dt.year - 2017)*12 + df['DT'].dt.month).astype(np.int8)
    # NOTE(review): `dt.weekofyear` is deprecated in newer pandas in favour of
    # `dt.isocalendar().week`; kept as-is to match the pandas version in use.
    df['DT_W'] = ((df['DT'].dt.year - 2017)*52 + df['DT'].dt.weekofyear).astype(np.int8)
    # int16, not int8: with START_DATE at 2017-11-30 the day counter starts
    # around 334, which is already beyond the int8 maximum of 127.
    df['DT_D'] = ((df['DT'].dt.year - 2017)*365 + df['DT'].dt.dayofyear).astype(np.int16)

    df['DT_hour'] = (df['DT'].dt.hour).astype(np.int8)
    df['DT_day_week'] = (df['DT'].dt.dayofweek).astype(np.int8)
    df['DT_day_month'] = (df['DT'].dt.day).astype(np.int8)
    # Week of the month (1-5): ceil(day / 7), computed without a per-row apply.
    df['DT_week_month'] = np.ceil(df['DT'].dt.day / 7).astype(np.int8)
    


In [33]:
# Count how often each card1 value occurs, separately for the train and test
# transaction tables; reset_index turns the counts Series into a DataFrame.
train_card1_df, test_card1_df = (
    transactions['card1'].value_counts().reset_index()
    for transactions in (train_transaction_data, test_transaction_data)
)


In [39]:
# Number of rows in the largest card1 group of the train transactions.
train_transaction_data.groupby('card1').size().max()

14932

In [None]:
# Frequency encoding: replace every value of column `c` with the share of rows
# that carry that value.
# NOTE(review): `daset` and `c` are not defined in any visible cell; this line
# presumably belongs inside a loop over frames and columns - confirm.
value_share = daset.groupby(c).size().div(daset.shape[0])
daset[c + '_freq'] = daset[c].map(value_share)