In [16]:
import gc
import os

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold, TimeSeriesSplit

In [42]:
# Expected row counts of the train / test feature frames.
TRAIN_LEN, TEST_LEN = 590540, 506691

In [34]:
# Feature frames are read from pickles produced outside this notebook.
# (Only unpickle files you created yourself -- pickle can execute arbitrary code.)
test = pd.read_pickle("../processed_input/X_test_encoding_deviceinfo_email.pkl")
train = pd.read_pickle("../processed_input/X_train_encoding_deviceinfo_email.pkl")

# Labels used together with `train` further down.
# NOTE(review): the file is called "y_test.pkl" although it is aligned with the
# training frame here -- confirm this is the intended file.
y = pd.read_pickle("../input/y_test.pkl")
y.index = train.index  # key the labels by the training frame's index

# Fail fast if either pickle does not have the expected number of rows.
assert len(train) == TRAIN_LEN
assert len(test) == TEST_LEN

In [35]:
# LightGBM hyper-parameters shared by the cross-validation folds and the final fit.
#
# max_depth / min_data_in_leaf / num_leaves are truncated to int right here.
# run_model() applies the same int() truncation, but cells that hand `params`
# directly to lgb.train must not depend on run_model having been called first.
#
# NOTE(review): no "objective" is set; LightGBM documents "regression" as the
# default objective -- confirm that is intended for the isFraud target.
# NOTE(review): LightGBM documents that bagging_fraction only takes effect when
# bagging_freq is also set to a non-zero value -- confirm whether bagging is wanted.
params = {
    "bagging_fraction": 0.51138755073088926,
    "feature_fraction": 0.57393165508963395,
    "learning_rate": 0.065644111129998017,
    "max_depth": int(36.060235965987744),
    "min_child_weight": 0.012766164534423117,
    "min_data_in_leaf": int(66.262898246319878),
    "num_leaves": int(1142.2406570962667),
    "reg_alpha": 0.87937921984473566,
    "reg_lambda": 0.78503840886987675,
}


def run_model(X_train, y_train, X_val, y_val, params):
    """Fit LightGBM on one fold and score it on the held-out rows.

    Parameters
    ----------
    X_train, y_train
        Features and labels the booster is trained on.
    X_val, y_val
        Features and labels used as the early-stopping validation set and
        for the returned score.
    params : dict
        LightGBM parameters. **Modified in place**: ``max_depth``,
        ``min_data_in_leaf`` and ``num_leaves`` are truncated with ``int()``
        and ``metric`` is set to ``"auc"``. The caller's dict keeps these
        changes after the call.

    Returns
    -------
    tuple
        ``(pred_proba, score)`` -- the booster's predictions for ``X_val``
        and their ``roc_auc_score`` against ``y_val``. Whether the
        predictions are probabilities depends on the objective in ``params``.
    """
    # Same keys, same order as before: a missing key raises KeyError at the same point.
    for int_key in ("max_depth", "min_data_in_leaf", "num_leaves"):
        params[int_key] = int(params[int_key])
    params["metric"] = "auc"

    train_set = lgb.Dataset(X_train, label=y_train)
    val_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

    # Up to 1000 rounds; stop once validation AUC has not improved for 20 rounds.
    booster = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        early_stopping_rounds=20,
        valid_sets=val_set,
        verbose_eval=-1,
    )

    val_pred = booster.predict(X_val)
    return val_pred, roc_auc_score(y_val, val_pred)

In [36]:
# Time-ordered cross-validation: every fold trains on all rows before its
# validation block, so the training window grows from fold to fold.
tscv = TimeSeriesSplit(n_splits=10)
model = None
preds, scores = [], []  # per-fold validation predictions / AUC scores

for train_index, val_index in tscv.split(train):
    print(len(train_index), len(val_index))

    X_train = train.iloc[train_index, :].copy()
    X_val = train.iloc[val_index, :].copy()
    y_train = y.iloc[train_index]
    y_val = y.iloc[val_index]

    pred, score = run_model(X_train, y_train, X_val, y_val, params)
    preds.append(pred)
    scores.append(score)

53690 53685
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[87]	valid_0's auc: 0.89669
0.8966896617594806
107375 53685
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[75]	valid_0's auc: 0.898089
0.8980888376436422
161060 53685
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[63]	valid_0's auc: 0.909893
0.9098934405395407
214745 53685
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[58]	valid_0's auc: 0.918944
0.9189443434848723
268430 53685
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[50]	valid_0's auc: 0.908756
0.9087557599271008
322115 53685
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[131]	valid_0's auc: 0.911803
0.9118033196948719
375800 53685
Training until validation scores don't improve

In [40]:
# Flatten the per-fold validation predictions into one flat list, in fold order.
back_pred = [value for fold_pred in preds for value in fold_pred]

In [67]:
# Out-of-fold prediction frame: one row per validation prediction, labelled
# with the matching entry of train.index as "TransactionID".
#
# TimeSeriesSplit's validation folds are consecutive blocks that end at the
# last row, so `back_pred` (folds concatenated in order) lines up with the
# tail of `train.index`. The tail length is taken from the predictions
# themselves instead of a hard-coded row count, and the DataFrame constructor
# raises on a length mismatch rather than silently padding with NaN.
n_oof = len(back_pred)
train_ensemble_df = pd.DataFrame({
    "TransactionID": train.index[len(train) - n_oof:],
    "isFraud": back_pred,
})

In [68]:
# Sanity checks: 10 validation folds of 53685 rows each, two columns, and the
# first row carries the expected first TransactionID.
assert train_ensemble_df.shape == (10 * 53685, 2)
assert train_ensemble_df.at[0, "TransactionID"] == 3040690.0

In [70]:
# Write the out-of-fold predictions to CSV without the row index.
train_ensemble_df.to_csv("train_ensemble_ray_lightgbm.csv", index=False)

In [71]:
# Fit the final booster on the complete training set; it is used to score the
# test set. No validation set or early stopping is given, so all 1000 rounds run.
# `params` is reused exactly as the cross-validation cells left it.
model = lgb.train(
    params=params,
    train_set=lgb.Dataset(train, label=y),
    num_boost_round=1000,
    verbose_eval=-1,
)

In [72]:
# Peek at the first five rows of the test features before scoring.
test.head(n=5)

Unnamed: 0_level_0,TransactionAmt,ProductCD,card1,card2,card3,card5,card6,addr1,addr2,dist1,...,version_id_30,browser_id_31,version_id_31,screen_width,screen_height,had_id,P_emaildomain_bin,P_emaildomain_suffix,R_emaildomain_bin,R_emaildomain_suffix
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3663549.0,31.950001,W,10409.0,111.0,150.0,226.0,debit,170.0,87.0,1.0,...,,,,,,1,google,com,,
3663550.0,49.0,W,4272.0,111.0,150.0,226.0,debit,299.0,87.0,4.0,...,,,,,,1,aol,com,,
3663551.0,171.0,W,4476.0,574.0,150.0,226.0,debit,472.0,87.0,2635.0,...,,,,,,1,microsoft,com,,
3663552.0,284.950012,W,10989.0,360.0,150.0,166.0,debit,205.0,87.0,17.0,...,,,,,,1,google,com,,
3663553.0,67.949997,W,18018.0,452.0,150.0,117.0,debit,264.0,87.0,6.0,...,,,,,,1,google,com,,


In [73]:
# Score the test set with the final booster and pair each score with the
# corresponding entry of test.index.
test_pred = model.predict(test)
test_ensemble_df = pd.DataFrame(
    dict(TransactionID=test.index, isFraud=test_pred)
)

In [74]:
# Write the test-set predictions to CSV without the row index.
test_ensemble_df.to_csv(path_or_buf='test_ensemble_ray_lightgbm.csv', index=False)