In [1]:
import pandas as pd
import os
# Step the working directory up one level before importing the project package;
# the relative './data/...' paths used later in the notebook are resolved from there.
# NOTE(review): this is not idempotent — re-running the cell climbs one more
# directory each time, so run it exactly once per kernel.
os.chdir('../')
from kernel import utils, data_processing, modelling

# 0. Ad Hoc Preprocessing

## 1. Load Training Data

In [None]:
# Read the raw training transactions and pass the frame through utils.detect_id_col.
# NOTE(review): the following cell repeats this exact load before down-sampling,
# so the frame produced here is overwritten straight away.
df = utils.detect_id_col(pd.read_parquet('./data/train_transaction.parquet'))

In [None]:
# Reload the training transactions and rebalance them: keep every row with
# isFraud == 1 and a 10% random sample of the rows with isFraud == 0.
df = pd.read_parquet('./data/train_transaction.parquet').pipe(utils.detect_id_col)
df = pd.concat(
    [
        df[df.isFraud == 1],
        # Seeded so the down-sampled training set (and everything fitted on it)
        # is the same after Restart & Run All; 1 matches the CatBoost random_seed.
        df[df.isFraud == 0].sample(frac=0.1, random_state=1),
    ],
    axis=0,
)

In [None]:
# Separate the 'isFraud' target from the rest of the frame, then preview the
# first five feature rows.
(df_fea, labels) = utils.split_feature_target(df, 'isFraud')
df_fea.head(n=5)

In [None]:
# Columns treated as categorical: card1..card6, M1..M9 and a handful of
# explicitly named fields.
cat_cols = [
    *('card' + str(n) for n in range(1, 7)),
    *('M' + str(n) for n in range(1, 10)),
    'P_emaildomain',
    'R_emaildomain',
    'ProductCD',
    'addr1',
    'addr2',
]

# 1. Build LR Meta Feature

In [None]:
# Build the data processor from the training features. Later cells read the
# processed matrix from `dp.data` and re-use `dp.transform` on the test set.
# NOTE(review): the positional arguments 'fraud', True and 15 are opaque from
# here — confirm their meaning against GenericDataProcessor's signature.
dp = data_processing.GenericDataProcessor(df_fea, 'fraud', True, 15, cat_cols=cat_cols)

In [None]:
# Run the project's LR training helper on the processed feature matrix with
# n_iter=30. It returns three objects; `params_lr` and `est_lr` are re-used in
# the next cell to fit the final model (est_lr is called with **params_lr).
clf, params_lr, est_lr = modelling.train_lr_classifier(
    dp.data.values,
    labels.values,
    n_iter=30,
)

In [None]:
# Instantiate the estimator with the parameters returned by the training helper
# and fit it on the full processed training matrix.
lr = est_lr(**params_lr).fit(
    dp.data.values,
    labels.values,
)

In [None]:
# Sanity-check preview of the positive-class probabilities (column 1 of
# predict_proba). Only the first five rows are scored: the full vector is
# computed in the next cell, so scoring every row here would be redundant work
# and would dump a large array into the notebook output.
lr.predict_proba(dp.data.values[:5])[:, 1]

In [None]:
# Attach the LR positive-class probability to the training data as 'lr_meta'
# and persist the result for the CatBoost stage.
# NOTE(review): `lr` was fitted on these same rows, so this meta feature is an
# in-sample prediction; consider out-of-fold predictions to avoid leakage.
df_fea['lr_meta'] = lr.predict_proba(dp.data.values)[:, 1]
lr_meta_train = df_fea.lr_meta.to_dict()
df_fea = None  # release the feature frame; only `df` is needed from here on
# Copy the score onto the full frame by index label (not by position).
df['lr_meta'] = df.index.map(lr_meta_train).tolist()
# Fix: this variable was misspelled `train_data_parth`, while the CatBoost
# section reads `train_data_path` — a NameError on a fresh kernel.
train_data_path = './data/train_transactions_lr_meta.parquet'
# reset_index() writes the current index out as an ordinary column.
df.reset_index().to_parquet(train_data_path)
df = None  # release the training frame; it is reloaded from disk later

In [None]:
# Score the test transactions with the fitted LR model and persist them, with
# the new 'lr_meta' column, for the CatBoost stage.
df_inf = pd.read_csv('./data/test_transaction.csv')
# dp.transform applies the processor built on the training features; column 1
# of predict_proba is the positive-class probability.
df_inf['lr_meta'] = lr.predict_proba(dp.transform(df_inf))[:, 1]
# The previous to_dict() / index.map() round trip mapped the column back onto
# the same frame's own index, i.e. it was a no-op that built a large Python
# dict; it has been removed.
inf_data_path = './data/test_transactions_lr_meta.parquet'
df_inf.to_parquet(inf_data_path)
df_inf = None  # release the frame; it is reloaded from disk later

# 2. CatBoost

In [None]:
# Reload the training data (including the 'lr_meta' column) written by the LR stage.
# NOTE(review): the file was saved after reset_index(), so the former index comes
# back as a regular column here, whereas the inference frame below is re-indexed
# on 'TransactionID' — confirm the ID column is not used as a training feature.
df = pd.read_parquet(train_data_path)

In [None]:
# Split the reloaded frame into features and the 'isFraud' target, then preview
# the first five feature rows.
(df_fea, labels) = utils.split_feature_target(df, 'isFraud')
df_fea.head(n=5)

In [None]:
# Run the project's CatBoost preprocessing on the training features; only the
# first of the three returned values (the processed frame) is kept.
# NOTE(review): the cell overwrites its own input, so re-running it feeds the
# already-processed frame back into the helper — confirm that is harmless.
df_fea, _, _ = data_processing.catboost_preprocessing(df_fea, cat_cols=cat_cols)

In [None]:
# Reload the scored test transactions, index them by 'TransactionID', and apply
# the same CatBoost preprocessing helper used for the training features (only
# the first returned value is kept).
df_inf = pd.read_parquet(inf_data_path).set_index('TransactionID')
df_inf, _, _ = data_processing.catboost_preprocessing(df_inf, cat_cols=cat_cols)

In [None]:
# Parameters handed to the CatBoost training helper.
params_m = dict(
    iterations=5000,
    learning_rate=0.02,
    depth=5,
    eval_metric='AUC',
    verbose=200,
    od_type="Iter",   # overfitting-detector type
    od_wait=500,      # iterations to wait after the most recent best one before stopping
    random_seed=1,
)
# Train the CatBoost model through the project helper. It returns the model and
# `cr`, which the final cell uses to select columns from the inference frame.
cat_model, cr = modelling.train_catboost_classifier(
    df_fea,
    labels,
    cat_cols,
    params=params_m,
    plot=True,
)

In [None]:
# Score the test set and write the (TransactionID, isFraud) table to disk.
# Fix: the model is tuned on AUC, which ranks continuous scores, so the output
# should be the positive-class probability; `predict` on a classifier yields
# hard class labels instead.
# NOTE(review): assumes cat_model exposes a CatBoostClassifier-style
# predict_proba whose column 1 is the positive class — confirm against
# modelling.train_catboost_classifier.
df_inf['isFraud'] = cat_model.predict_proba(df_inf[cr])[:, 1]
# 'TransactionID' is the index at this point; bring it back as a column.
df_inf = df_inf.reset_index()[['TransactionID', 'isFraud']]
df_inf.to_csv('./data/inf.csv', index=False)