<a href="https://colab.research.google.com/github/samipn/autogluon/blob/main/01_kaggle_ieee_fraud_autogluon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IEEE-CIS Fraud Detection with AutoGluon

*Colab-ready | Last prepared: 2025-10-14*

This notebook trains an AutoGluon model on the IEEE-CIS Fraud Detection competition and generates a submission file.

**What you'll do:**
1. Install dependencies
2. Authenticate Kaggle API and download data
3. Merge transaction + identity tables
4. Train AutoGluon (`roc_auc`), view leaderboard + feature importance
5. Predict on test and create `submission.csv`
6. (Optional) Submit to Kaggle from the notebook


In [1]:
# 1) Install deps
!pip -q install -U pip setuptools wheel
!pip -q install -U autogluon kaggle catboost lightgbm xgboost


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.8 MB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/1.8 MB[0m [31m10.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m67.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.34.0 requires jedi>=0.16, which is not installed.[0m[31m
[0m  Preparin

In [2]:
# 2) Kaggle auth helper (run this, then follow instructions)
import os, json, pathlib, sys
from pathlib import Path

# Location where this notebook stores the Kaggle API token.
KAGGLE_DIR = Path('~/.kaggle').expanduser()
KAGGLE_DIR.mkdir(parents=True, exist_ok=True)
kaggle_json = KAGGLE_DIR / 'kaggle.json'

if not kaggle_json.exists():
    print("🔐 No kaggle.json found. If you're in Colab:")
    print(" - Go to https://www.kaggle.com/settings/account and create a new API token.")
    print(" - Upload the downloaded kaggle.json below.")

    # Only a failed import means "not running in Colab"; keep that case separate
    # from upload errors so the hint printed to the user is accurate.
    try:
        from google.colab import files  # type: ignore
    except ImportError:
        files = None
        print("Note: If not in Colab, save kaggle.json to ~/.kaggle/kaggle.json and re-run this cell.")

    if files is not None:
        try:
            uploaded = files.upload()
        except Exception as e:
            # Best effort: report the real failure instead of hiding it, and fall
            # through to the "still missing" message below.
            uploaded = {}
            print(f"⚠️ Upload failed: {e!r}")

        # Prefer an exact 'kaggle.json'; otherwise accept any uploaded .json file,
        # since duplicate downloads/uploads are commonly renamed (e.g. 'kaggle (1).json').
        candidates = [name for name in uploaded if name == 'kaggle.json']
        if not candidates:
            candidates = [name for name in uploaded if name.lower().endswith('.json')]
        if candidates:
            with open(kaggle_json, 'wb') as f:
                f.write(uploaded[candidates[0]])

if kaggle_json.exists():
    # Restrict the token to owner read/write only.
    os.chmod(kaggle_json, 0o600)
    print("✅ kaggle.json configured.")
else:
    print("⚠️ kaggle.json still missing. Kaggle steps will fail until it's present.")


🔐 No kaggle.json found. If you're in Colab:
 - Go to https://www.kaggle.com/settings/account and create a new API token.
 - Upload the downloaded kaggle.json below.


Saving kaggle.json to kaggle.json
✅ kaggle.json configured.


In [3]:
# 3) Download competition files
!mkdir -p data/ieee
!kaggle competitions files -c ieee-fraud-detection | head -n 20
!kaggle competitions download -c ieee-fraud-detection -p data/ieee -q

# Unzip everything we just downloaded
import glob, zipfile, os
for z in glob.glob('data/ieee/*.zip'):
    with zipfile.ZipFile(z, 'r') as f:
        f.extractall('data/ieee')
    os.remove(z)
print("✅ Data ready in data/ieee")


name                         size  creationDate                
---------------------  ----------  --------------------------  
sample_submission.csv     6080314  2019-07-15 00:19:01.536000  
test_identity.csv        25797161  2019-07-15 00:19:01.536000  
test_transaction.csv    613194934  2019-07-15 00:19:01.536000  
train_identity.csv       26529680  2019-07-15 00:19:01.536000  
train_transaction.csv   683351067  2019-07-15 00:19:01.536000  
✅ Data ready in data/ieee


In [4]:
# 4) Load & merge, basic sanity checks
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

label = 'isFraud'

train_tr = pd.read_csv('data/ieee/train_transaction.csv')
train_id = pd.read_csv('data/ieee/train_identity.csv')
test_tr  = pd.read_csv('data/ieee/test_transaction.csv')
test_id  = pd.read_csv('data/ieee/test_identity.csv')

# test_identity.csv spells its identity columns `id-01`…`id-38` while
# train_identity.csv uses `id_01`…`id_38`. Normalise the test names so train and
# test share one schema; otherwise the identity features cannot be matched at
# prediction time. The rename is a no-op if the names already agree.
test_id = test_id.rename(
    columns=lambda col: col.replace('-', '_', 1) if col.startswith('id-') else col
)

# Left merges keep every transaction row, including those without an identity record.
train = train_tr.merge(train_id, on='TransactionID', how='left')
test  = test_tr.merge(test_id,  on='TransactionID', how='left')

# Train (minus the target) and test must expose exactly the same feature columns.
mismatched_cols = sorted(set(train.columns.drop(label)) ^ set(test.columns))
assert not mismatched_cols, f"train/test feature columns differ: {mismatched_cols[:10]}"

print(train.shape, test.shape)
print(train[['TransactionDT','isFraud']].head())

# Split holdout validation (stratified so both parts keep the same fraud rate)
train_df, val_df = train_test_split(train, test_size=0.2, random_state=42, stratify=train[label])
print(train_df.shape, val_df.shape)

(590540, 434) (506691, 433)
   TransactionDT  isFraud
0          86400        0
1          86401        0
2          86469        0
3          86499        0
4          86506        0
(472432, 434) (118108, 434)


In [5]:
# 5) Train AutoGluon
from autogluon.tabular import TabularPredictor

PATH = 'ag_ieee/'   # passed to the predictor as its `path`
TIME_LIMIT = 1200   # seconds (adjust if you want a stronger model)

# Build the predictor for the `label` column, scored by ROC AUC.
predictor = TabularPredictor(
    label=label,
    path=PATH,
    eval_metric='roc_auc',
)
# Fit on the training split only; the holdout split is kept for evaluation.
predictor.fit(
    train_data=train_df,
    presets='best_quality',
    time_limit=TIME_LIMIT,
)


Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Oct  2 10:42:05 UTC 2025
CPU Count:          8
Memory Avail:       39.42 GB / 50.99 GB (77.3%)
Disk Space Avail:   178.94 GB / 225.83 GB (79.2%)
Presets specified: ['best_quality']
Using hyperparameters preset: hyperparameters='zeroshot'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfit

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7a98061399a0>

In [6]:
# 6) Leaderboard on the validation holdout (feature importance follows in the next cell)
# Evaluate the predictor's models against `val_df` and show the resulting table.
lb = predictor.leaderboard(data=val_df, silent=True)
lb


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT_BAG_L2,0.966401,0.956774,roc_auc,44.868693,37.873462,793.642317,11.206511,10.982984,238.297733,2,True,4
1,WeightedEnsemble_L3,0.966149,0.956819,roc_auc,45.925398,38.587045,866.511796,0.003566,0.081893,7.788391,3,True,6
2,LightGBMXT_BAG_L1,0.957921,0.952163,roc_auc,32.206973,25.959976,476.96028,32.206973,25.959976,476.96028,1,True,1
3,WeightedEnsemble_L2,0.957921,0.952163,roc_auc,32.209575,26.039433,480.915848,0.002602,0.079457,3.955568,2,True,3
4,LightGBM_BAG_L2,0.952227,0.939379,roc_auc,34.715321,27.522169,620.425672,1.053139,0.63169,65.081088,2,True,5
5,LightGBM_BAG_L1,0.878959,0.862947,roc_auc,1.455209,0.930502,78.384304,1.455209,0.930502,78.384304,1,True,2


In [7]:
# Feature importance computed on the validation holdout; display the first 20 rows.
fi = predictor.feature_importance(data=val_df)
fi.head(n=20)


These features in provided data are not utilized by the predictor and will be ignored: ['V16', 'V28', 'V32', 'V117', 'V119', 'V241', 'V305']
Computing feature importance via permutation shuffling for 426 features using 5000 rows with 5 shuffle sets...
	5219.11s	= Expected runtime (1043.82s per shuffle set)
	4501.5s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
C13,0.01363,0.001934,4.7e-05,5,0.017611,0.009648
TransactionAmt,0.011278,0.003964,0.001565,5,0.01944,0.003116
C1,0.008388,0.003037,0.001746,5,0.014641,0.002135
P_emaildomain,0.007026,0.002978,0.003095,5,0.013158,0.000894
card6,0.005685,0.001997,0.001562,5,0.009797,0.001573
card2,0.005123,0.002774,0.007249,5,0.010833,-0.000588
card5,0.002557,0.000795,0.00099,5,0.004193,0.00092
C14,0.002476,0.002116,0.029519,5,0.006834,-0.001882
M4,0.002415,0.000963,0.002479,5,0.004397,0.000433
addr1,0.002244,0.0013,0.00908,5,0.00492,-0.000433


In [14]:
# 7) Predict on test and build submission
import pandas as pd
from pathlib import Path

# Feature columns the predictor was fit on: every column of the merged training
# frame except the target. `train` and `label` are still in memory from step 4,
# so the training CSVs do not need to be re-read and re-merged here.
train_cols = train.columns.drop(label).tolist()

# test_identity.csv names its identity columns `id-XX` while training used
# `id_XX`. Rename BEFORE reindexing: reindexing alone would drop the `id-XX`
# columns and create all-NaN `id_XX` columns, throwing away every identity
# feature at prediction time. The rename is a no-op if the names already match.
test_aligned = test.rename(
    columns=lambda col: col.replace('-', '_', 1) if col.startswith('id-') else col
)

# Surface any training feature that is still absent from test instead of
# silently filling it with NaN.
absent_cols = [col for col in train_cols if col not in test_aligned.columns]
if absent_cols:
    print(f"⚠️ {len(absent_cols)} training columns missing from test (filled with NaN): {absent_cols[:10]}")

# Put the columns in training order; missing ones become NaN, extras are dropped.
test_processed = test_aligned.reindex(columns=train_cols)


proba = predictor.predict_proba(test_processed)


# If predict_proba returned a DataFrame, select positive class
if hasattr(proba, 'columns'):
    if 1 in proba.columns:
        proba = proba[1]
    else:
        proba = proba.iloc[:, -1]

submission = pd.DataFrame({
    'TransactionID': test['TransactionID'],
    'isFraud': proba.astype(float)
})

# Sanity checks: one prediction per test transaction and no missing scores.
assert len(submission) == len(test), "submission row count differs from test"
assert submission['isFraud'].notna().all(), "submission contains missing predictions"

Path('submissions').mkdir(exist_ok=True, parents=True)
sub_path = 'submissions/ieee_autogluon_submission.csv'
submission.to_csv(sub_path, index=False)
print(f'✅ Saved {sub_path}')
display(submission.head())

✅ Saved submissions/ieee_autogluon_submission.csv


Unnamed: 0,TransactionID,isFraud
0,3663549,0.002891
1,3663550,0.002138
2,3663551,0.002151
3,3663552,0.003878
4,3663553,0.002528


In [None]:
# 8) (Optional) Submit to Kaggle directly from notebook
DO_SUBMIT = False  # <-- set True after you accept the competition rules on the Kaggle page

if DO_SUBMIT:
    !kaggle competitions submit -c ieee-fraud-detection -f submissions/ieee_autogluon_submission.csv -m "AutoGluon baseline"
else:
    print("Skipping submission. Set DO_SUBMIT=True to submit.")


All done! Make sure to **save the notebook with outputs** (`File > Save a copy in GitHub`) so graders can see your run.