In [0]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

from google.colab import files
from zipfile import ZipFile

## Step 1: Get the data from Kaggle

In [2]:
def get_data_from_kaggle():
  !pip install -U -q kaggle
  with open('kaggle.json', 'w') as f:
    f.write('{"username":"YOUR USERNAME","key":"YOUR KEY"}') # Change to your Kaggle API key
  !mkdir -p ~/.kaggle
  !cp kaggle.json ~/.kaggle/
  !kaggle competitions download -c ieee-fraud-detection
  with ZipFile('train_identity.csv.zip', 'r') as zipObj:
    zipObj.extractall()
  with ZipFile('train_transaction.csv.zip', 'r') as zipObj:
    zipObj.extractall()
  with ZipFile('test_identity.csv.zip', 'r') as zipObj:
    zipObj.extractall()
  with ZipFile('test_transaction.csv.zip', 'r') as zipObj:
    zipObj.extractall()
  with ZipFile('sample_submission.csv.zip', 'r') as zipObj:
    zipObj.extractall()
  !ls

get_data_from_kaggle()

Downloading train_transaction.csv.zip to /content
 93% 49.0M/52.5M [00:01<00:00, 19.3MB/s]
100% 52.5M/52.5M [00:01<00:00, 47.2MB/s]
Downloading train_identity.csv.zip to /content
  0% 0.00/3.02M [00:00<?, ?B/s]
100% 3.02M/3.02M [00:00<00:00, 204MB/s]
Downloading test_transaction.csv.zip to /content
 87% 41.0M/47.3M [00:01<00:00, 14.7MB/s]
100% 47.3M/47.3M [00:01<00:00, 44.5MB/s]
Downloading test_identity.csv.zip to /content
  0% 0.00/2.97M [00:00<?, ?B/s]
100% 2.97M/2.97M [00:00<00:00, 204MB/s]
Downloading sample_submission.csv.zip to /content
  0% 0.00/1.14M [00:00<?, ?B/s]
100% 1.14M/1.14M [00:00<00:00, 202MB/s]
kaggle.json		   test_identity.csv	     train_identity.csv
sample_data		   test_identity.csv.zip     train_identity.csv.zip
sample_submission.csv	   test_transaction.csv      train_transaction.csv
sample_submission.csv.zip  test_transaction.csv.zip  train_transaction.csv.zip


## Step 2: Load the data and preprocess features


In [0]:
# Columns holding strings; they are read as pandas 'category'.
str_type = (
    ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain']
    + ['M' + str(n) for n in range(1, 10)]
    + ['id_12', 'id_15', 'id_16', 'id_23']
    + ['id_' + str(n) for n in range(27, 32)]
    + ['id_' + str(n) for n in range(33, 39)]
    + ['DeviceType', 'DeviceInfo']
)

# Leading (non-"V") transaction columns, in file order.
cols = (
    ['TransactionID', 'TransactionDT', 'TransactionAmt', 'ProductCD']
    + ['card' + str(n) for n in range(1, 7)]
    + ['addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain']
    + ['C' + str(n) for n in range(1, 15)]
    + ['D' + str(n) for n in range(1, 16)]
    + ['M' + str(n) for n in range(1, 10)]
)

# "V" columns to load, chosen by correlation analysis.
# See this notebook: https://www.kaggle.com/cdeotte/eda-for-columns-v-and-id
v_groups = [
    [1, 3, 4, 6, 8, 11],
    [13, 14, 17, 20, 23, 26, 27, 30],
    [36, 37, 40, 41, 44, 47, 48],
    [54, 56, 59, 62, 65, 67, 68, 70],
    [76, 78, 80, 82, 86, 88, 89, 91],

    [107, 108, 111, 115, 117, 120, 121, 123],          # maybe group, no NaN
    [124, 127, 129, 130, 136],                         # relates to groups, no NaN

    # lots of NaNs below
    [138, 139, 142, 147, 156, 162],                    # b1
    [165, 160, 166],                                   # b1
    [178, 176, 173, 182],                              # b2
    [187, 203, 205, 207, 215],                         # b2
    [169, 171, 175, 180, 185, 188, 198, 210, 209],     # b2
    [218, 223, 224, 226, 228, 229, 235],               # b3
    [240, 258, 257, 253, 252, 260, 261],               # b3
    [264, 266, 267, 274, 277],                         # b3
    [220, 221, 234, 238, 250, 271],                    # b3

    [294, 284, 285, 286, 291, 297],                    # relates to groups, no NaN
    [303, 305, 307, 309, 310, 320],                    # relates to groups, no NaN
    [281, 283, 289, 296, 301, 314],                    # relates to groups, no NaN
]
v = [n for group in v_groups for n in group]

cols += ['V' + str(n) for n in v]

# dtype map for read_csv: float32 for every selected transaction column and
# for id_01..id_33, then 'category' overrides for the string columns.
dtypes = dict.fromkeys(cols + ['id_%02d' % n for n in range(1, 34)], 'float32')
dtypes.update(dict.fromkeys(str_type, 'category'))

In [4]:
def _read_split(prefix, usecols):
    """Read '<prefix>_transaction.csv' and left-join '<prefix>_identity.csv' on TransactionID."""
    transactions = pd.read_csv(prefix + '_transaction.csv', index_col='TransactionID',
                               dtype=dtypes, usecols=usecols)
    identity = pd.read_csv(prefix + '_identity.csv', index_col='TransactionID', dtype=dtypes)
    return transactions.merge(identity, how='left', left_index=True, right_index=True)

# Train: the label is read together with the features, then split off.
X_train = _read_split('train', cols + ['isFraud'])
y_train = X_train.pop('isFraud')

# Test: same columns, no label.
X_test = _read_split('test', cols)

print('Train shape', X_train.shape, 'test shape', X_test.shape)

Train shape (590540, 213) test shape (506691, 213)


In [0]:
# Normalize the "D" columns (all except D1, D2, D3, D5, D9): subtract the
# transaction timestamp converted from seconds to days.
# NOTE(review): presumably this turns "days since X" into a fixed point in
# time per card/client -- confirm against the referenced EDA notebook.
SECONDS_PER_DAY = np.float32(24*60*60)
for i in range(1,16):
    if i in [1,2,3,5,9]: continue
    d_col = 'D'+str(i)
    X_train[d_col] = X_train[d_col] - X_train.TransactionDT/SECONDS_PER_DAY
    X_test[d_col] = X_test[d_col] - X_test.TransactionDT/SECONDS_PER_DAY

for f in X_train.columns:
    col_dtype = X_train[f].dtype
    # Factorize categorical features.
    # `str(...)` instead of `np.str(...)`: the alias was removed from NumPy.
    if str(col_dtype) == 'category' or col_dtype == 'object':
        # Encode train and test together so both share one code book.
        # factorize() maps NaN to -1, matching the numeric branch below.
        codes, _ = pd.concat([X_train[f], X_test[f]], axis=0).factorize(sort=True)
        code_dtype = 'int16'
        if codes.max() > 32000:
            # Previously only reported; the int16 cast that followed could overflow.
            print(f,'needs int32')
            code_dtype = 'int32'
        X_train[f] = codes[:len(X_train)].astype(code_dtype)
        X_test[f] = codes[len(X_train):].astype(code_dtype)
    # Make all numerical features non-negative and set NaN to -1
    elif f not in ['TransactionAmt','TransactionDT']:
        # Shift by the joint train/test minimum so -1 is free to mean "missing".
        mn = np.float32(np.min((X_train[f].min(), X_test[f].min())))
        # Assign the result back instead of calling fillna(inplace=True) on the
        # column selection: the chained in-place form does not reliably update
        # the parent frame (and never does under pandas Copy-on-Write).
        X_train[f] = (X_train[f] - mn).fillna(-1)
        X_test[f] = (X_test[f] - mn).fillna(-1)

# Add a new feature: the fractional ("cents") part of TransactionAmt
X_train['cents'] = (X_train['TransactionAmt'] - np.floor(X_train['TransactionAmt'])).astype('float32')
X_test['cents'] = (X_test['TransactionAmt'] - np.floor(X_test['TransactionAmt'])).astype('float32')

In [0]:
# Model features: every column of X_train except the raw timestamp and a
# handful of "D" columns.
excluded_features = ['TransactionDT', 'D6', 'D7', 'D8', 'D9', 'D12', 'D13', 'D14']
cols = X_train.columns.tolist()
for excluded in excluded_features:
    cols.remove(excluded)  # raises ValueError if the column is unexpectedly absent

## Step 3: Solid local hold-out validation for hyperparameter tuning (optional)

In [0]:
# Optional local validation: train on the first 3/4 of the rows and evaluate
# on the last 1/4. Each model has its own switch.
do_local_cv_xgb = False # True or False
do_local_cv_lgb = False # True or False

if do_local_cv_xgb or do_local_cv_lgb:
  # Shared hold-out split, computed once for whichever model(s) are enabled.
  split_at = 3*len(X_train)//4
  idxT = X_train.index[:split_at] # 3/4 as training set
  idxV = X_train.index[split_at:] # 1/4 as validation set

# NOTE(review): recent xgboost / lightgbm releases moved `early_stopping_rounds`
# and `verbose` out of fit() -- confirm against the installed versions.
if do_local_cv_xgb:

  model_xgb = xgb.XGBClassifier(
              n_estimators = 200,
              max_depth = 12,
              learning_rate = 0.02,
              subsample = 0.8,
              colsample_bytree = 0.4,
              missing = -1,
              eval_metric = 'auc',
              tree_method = 'gpu_hist'
  )
  hist = model_xgb.fit(X_train.loc[idxT,cols], y_train[idxT],
                      eval_set=[(X_train.loc[idxV,cols],y_train[idxV])],
                      verbose=50, early_stopping_rounds=100)
  del model_xgb


# Was guarded by `do_local_cv_xgb`, which left `do_local_cv_lgb` without effect.
if do_local_cv_lgb:

  model_lgb = lgb.LGBMClassifier(
              num_leaves = 144,
              max_depth = 12,
              learning_rate = 0.02,
              n_estimators = 1000,
              bagging_fraction = 0.8,
              bagging_freq = 5,
              feature_fraction = 0.9,
              n_jobs = -1,
              # NOTE(review): `missing` looks like an XGBoost-only argument;
              # LightGBM probably ignores it -- confirm and drop if so.
              missing = -1,
              verbose = -1)

  hist = model_lgb.fit(X_train.loc[idxT,cols], y_train[idxT],
                      eval_set=[(X_train.loc[idxV,cols],y_train[idxV])],
                      verbose=50, early_stopping_rounds=100, eval_metric='auc')
  del model_lgb

## Step 4 Ensemble XGB and LGBM

In [8]:
# Train XGBoost and LightGBM on 5 stratified folds; each fold's model predicts
# the test set and the fold predictions are averaged per model. The final
# prediction is the plain mean of the two models.
SEED = 42  # fixes the fold assignment so reruns are comparable

pred_xgb = np.zeros(len(X_test))
pred_lgb = np.zeros(len(X_test))

model_xgb = xgb.XGBClassifier(
            n_estimators = 200,
            max_depth = 12,
            learning_rate = 0.02,
            subsample = 0.8,
            colsample_bytree = 0.4,
            eval_metric = 'auc',
            missing = -1,
            tree_method='gpu_hist')

model_lgb = lgb.LGBMClassifier(
            num_leaves = 144,
            max_depth = 12,
            learning_rate = 0.02,
            n_estimators = 200,
            bagging_fraction = 0.8,
            bagging_freq = 5,
            feature_fraction = 0.9,
            n_jobs = -1,
            # NOTE(review): `missing` looks like an XGBoost-only argument;
            # LightGBM probably ignores it -- confirm and drop if so.
            missing = -1,
            verbose = -1)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Select the feature columns once instead of re-slicing the full frames
# several times per fold.
train_features = X_train[cols]
test_features = X_test[cols]

# NOTE(review): recent xgboost / lightgbm releases moved `early_stopping_rounds`
# and `verbose` out of fit() -- confirm against the installed versions.
for i, (idxT, idxV) in enumerate(skf.split(train_features, y_train)):

    print('=== Fold ', i)
    print('samples for train = ', len(idxT), 'samples for validation = ', len(idxV))

    X_fit, y_fit = train_features.iloc[idxT], y_train.iloc[idxT]
    X_val, y_val = train_features.iloc[idxV], y_train.iloc[idxV]

    print('--- Training XGB ')
    model_xgb.fit(X_fit, y_fit,
                  eval_set=[(X_val, y_val)],
                  verbose=50, early_stopping_rounds=100)
    pred_xgb += model_xgb.predict_proba(test_features)[:,1]/skf.n_splits

    print('--- Training LGBM ')
    model_lgb.fit(X_fit, y_fit,
                  eval_set=[(X_val, y_val)],
                  verbose=50, early_stopping_rounds=100, eval_metric='auc')
    pred_lgb += model_lgb.predict_proba(test_features)[:,1]/skf.n_splits

# Release the column-selected copies; only the predictions are needed below.
del train_features, test_features

preds = 0.5 * (pred_lgb + pred_xgb)

=== Fold  0
samples for train =  472431 samples for validation =  118109
--- Traing XGB 
[0]	validation_0-auc:0.83774
Will train until validation_0-auc hasn't improved in 100 rounds.
[50]	validation_0-auc:0.893765
[100]	validation_0-auc:0.908615
[150]	validation_0-auc:0.923749
[199]	validation_0-auc:0.934384
--- Training LGBM 
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.0952255	valid_0's auc: 0.898925
[100]	valid_0's binary_logloss: 0.0826363	valid_0's auc: 0.9173
[150]	valid_0's binary_logloss: 0.076306	valid_0's auc: 0.928619
[200]	valid_0's binary_logloss: 0.0716261	valid_0's auc: 0.938581
Did not meet early stopping. Best iteration is:
[200]	valid_0's binary_logloss: 0.0716261	valid_0's auc: 0.938581
=== Fold  1
samples for train =  472431 samples for validation =  118109
--- Traing XGB 
[0]	validation_0-auc:0.85461
Will train until validation_0-auc hasn't improved in 100 rounds.
[50]	validation_0-auc:0.902123
[100]	validation_0-a

## Final Step: Submission

In [0]:
# Write the averaged predictions into the submission template.
sample_submission = pd.read_csv('sample_submission.csv')

# `preds` is a plain array aligned with X_test's rows, so it is pasted in
# positionally. Fail loudly if the template is not in the same order rather
# than writing a silently misaligned submission.
if not np.array_equal(sample_submission['TransactionID'].values, X_test.index.values):
    raise ValueError('sample_submission.csv rows do not line up with X_test; '
                     'cannot assign predictions by position')

# Bracket assignment always targets the column; attribute assignment would
# only set a Python attribute if the column were ever missing.
sample_submission['isFraud'] = preds
sample_submission.to_csv('my_submission.csv',index=False)