## Summary
This notebook demonstrates an advanced approach to the AMEX Default Prediction competition with:

- GPU Acceleration: Uses RAPIDS (cuDF, cuPy) for fast data processing

- Memory Efficiency: Implements custom data iterators and chunking strategies

- Feature Engineering: Creates comprehensive time-series aggregations

- Custom Metrics: Implements the competition-specific AMEX metric

- Cross-Validation: Uses 5-fold CV and averages the predictions of the five fold models at inference time

- Scalable Inference: Processes large test datasets in manageable chunks

The solution achieves strong performance through careful optimization of both computation and memory usage, making it suitable for large-scale tabular data competitions.

In [None]:
# Import essential libraries for GPU-accelerated data processing and machine learning
import pandas as pd, numpy as np  # Standard data manipulation libraries
import cupy, cudf                # RAPIDS libraries for GPU acceleration
import matplotlib.pyplot as plt, gc, os  # Plotting, garbage collection, and OS utilities

# Display RAPIDS version for reproducibility
print('RAPIDS version', cudf.__version__)

RAPIDS version 21.10.01


In [None]:
# Model version tag; embedded in the saved model filenames and the submission filename
VER = 1

# Random seed passed to XGBoost (random_state) and used for the optional fold subsampling
SEED = 17

# Sentinel that read_file writes in place of every missing value
NAN_VALUE = -127

# Number of cross-validation folds; also the number of fold models averaged at inference
FOLDS = 5

In [None]:
def read_file(path = '', usecols = None):
    """
    Load an AMEX parquet file into a cuDF DataFrame and normalise its key columns.

    Args:
        path: Path to the parquet file.
        usecols: Optional list of column names to load. None loads every column.

    Returns:
        cuDF DataFrame in which all missing values are replaced by NAN_VALUE.
        When present, customer_ID is converted to int64 (from the last 16 hex
        characters of the original string) and S_2 is converted to datetime.
    """
    # columns=None tells read_parquet to load every column, so one call covers both cases
    df = cudf.read_parquet(path, columns = usecols)

    # The conversions are guarded so that a usecols selection that omits one of
    # the key columns no longer raises
    if 'customer_ID' in df.columns:
        # Keep only the last 16 hex characters and store them as int64
        df['customer_ID'] = df['customer_ID'].str[-16:].str.hex_to_int().astype('int64')

    if 'S_2' in df.columns:
        # Item assignment (not attribute assignment) so the column is always replaced
        df['S_2'] = cudf.to_datetime(df['S_2'])

    # Replace every missing value with the module-level sentinel
    df = df.fillna(NAN_VALUE)
    print('shape of data:', df.shape)

    return df

# Load the full training parquet (multiple rows per customer) through read_file,
# which hashes customer_ID, parses S_2 and fills missing values
print('Reading train data...')
TRAIN_PATH = '../input/amex-data-integer-dtypes-parquet-format/train.parquet'
train = read_file(path = TRAIN_PATH)

Reading train data...
shape of data: (5531451, 190)


In [None]:
# Preview the first rows to check the hashed customer_ID, the parsed S_2 dates
# and the NAN_VALUE sentinel in the loaded frame
train.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,-4532153018459703766,2017-03-09,0.938469,0,0.008724,1.006838,0.009228,0.124035,0.0,0.004709,...,-1,-1,-1,0,0,0.0,-127.0,0,0.00061,0
1,-4532153018459703766,2017-04-07,0.936665,0,0.004923,1.000653,0.006151,0.12675,0.0,0.002714,...,-1,-1,-1,0,0,0.0,-127.0,0,0.005492,0
2,-4532153018459703766,2017-05-28,0.95418,3,0.021655,1.009672,0.006815,0.123977,0.0,0.009423,...,-1,-1,-1,0,0,0.0,-127.0,0,0.006986,0
3,-4532153018459703766,2017-06-13,0.960384,0,0.013683,1.0027,0.001373,0.117169,0.0,0.005531,...,-1,-1,-1,0,0,0.0,-127.0,0,0.006527,0
4,-4532153018459703766,2017-07-16,0.947248,0,0.015193,1.000727,0.007605,0.117325,0.0,0.009312,...,-1,-1,-1,0,0,0.0,-127.0,0,0.008126,0


In [None]:
def process_and_feature_engineer(df):
    """
    Collapse the per-row customer history into one feature row per customer.

    Numerical columns are summarised with mean/std/min/max/last and the
    categorical columns with count/last/nunique. Output columns are named
    '<column>_<aggregation>'.

    Args:
        df: Raw data with multiple rows per customer. Must contain
            'customer_ID', 'S_2' and the categorical columns listed below.

    Returns:
        DataFrame indexed by customer_ID holding the numerical aggregations
        followed by the categorical aggregations.
    """
    # Every column except the group key and the timestamp is a feature.
    # (The key used to be misspelled here, which left customer_ID in the
    # numerical feature list.)
    all_cols = [col for col in df.columns if col not in ('customer_ID', 'S_2')]

    # Columns treated as categorical; everything else is numerical
    cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
    num_features = [col for col in all_cols if col not in cat_features]

    # NOTE(review): 'last' takes the final row of each customer in the frame's
    # current order -- presumably rows are already sorted by S_2 within each
    # customer; confirm against the input files.
    num_agg = df.groupby('customer_ID')[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
    num_agg.columns = ['_'.join(x) for x in num_agg.columns]
    print('Num_Agg Columns:', num_agg.columns)

    cat_agg = df.groupby('customer_ID')[cat_features].agg(['count', 'last', 'nunique'])
    cat_agg.columns = ['_'.join(x) for x in cat_agg.columns]
    # Report the categorical columns (this used to print the numerical ones again)
    print('Cat_Agg Columns:', cat_agg.columns)

    # Place both aggregations side by side; they share the customer_ID index
    features = cudf.concat([num_agg, cat_agg], axis = 1)

    # Drop the intermediates before returning to release memory early
    del num_agg, cat_agg
    print('Shape after engineering:', features.shape)

    return features

# Replace the raw per-row training frame with its customer-level aggregate
# (the result is indexed by customer_ID)
train = process_and_feature_engineer(train)

Num_Agg Columns: Index(['P_2_mean', 'P_2_std', 'P_2_min', 'P_2_max', 'P_2_last', 'D_39_mean',
       'D_39_std', 'D_39_min', 'D_39_max', 'D_39_last',
       ...
       'D_144_mean', 'D_144_std', 'D_144_min', 'D_144_max', 'D_144_last',
       'D_145_mean', 'D_145_std', 'D_145_min', 'D_145_max', 'D_145_last'],
      dtype='object', length=885)
Cat_Agg Columns: Index(['P_2_mean', 'P_2_std', 'P_2_min', 'P_2_max', 'P_2_last', 'D_39_mean',
       'D_39_std', 'D_39_min', 'D_39_max', 'D_39_last',
       ...
       'D_144_mean', 'D_144_std', 'D_144_min', 'D_144_max', 'D_144_last',
       'D_145_mean', 'D_145_std', 'D_145_min', 'D_145_max', 'D_145_last'],
      dtype='object', length=885)
Shape after engineering: (458913, 918)


In [None]:
# Read the training labels (one row per customer)
targets = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')

# Hash customer_ID the same way read_file does so both frames share a key,
# then make it the index for the join
targets['customer_ID'] = targets['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
targets = targets.set_index('customer_ID')

# Attach the target to the aggregated features on the customer_ID index
train = train.merge(targets, how = 'left', left_index = True, right_index = True)
del targets

# Turn customer_ID back into the first column; after the merge the target is the
# last column, so the feature names are everything in between
train = train.reset_index()
FEATURES = train.columns[1:-1]
print(f'There are {len(FEATURES)} features!')



There are 918 features!


In [None]:
# Import required libraries for modeling
from sklearn.model_selection import KFold
import xgboost as xgb
print('XGB Version', xgb.__version__)

# XGBoost hyperparameters shared by every cross-validation fold
xgb_params = {
    'max_depth': 4,           # Maximum depth of each tree
    'learning_rate': 0.05,    # Shrinkage applied to each boosting step
    'subsample': 0.8,         # Fraction of rows sampled per tree
    'colsample_bytree': 0.6,  # Fraction of columns sampled per tree
    'eval_metric': 'logloss', # Metric reported on the eval sets and used for early stopping
    'objective': 'binary:logistic',  # Binary classification, outputs probabilities
    'tree_method': 'gpu_hist',       # Histogram-based tree construction on the GPU
    'predictor': 'gpu_predictor',    # Run prediction on the GPU
    'random_state': SEED      # Seed for the row/column sampling above
}

XGB Version 1.6.1


In [None]:
class IterLoadForDMatrix(xgb.core.DataIter):
    """
    Batch iterator that feeds a DataFrame to XGBoost piece by piece.

    Each call to next() slices batch_size rows from the source frame, wraps
    them in a cuDF DataFrame and hands features and label to XGBoost.
    """
    def __init__(self, df = None, features = None, target = None, batch_size = 256 * 1024):
        self.df = df                    # Source frame, sliced positionally
        self.features = features        # Feature column names
        self.target = target            # Label column name
        self.batch_size = batch_size    # Rows handed over per call
        self.batches = int(np.ceil(len(df) / self.batch_size))  # Calls needed to cover df
        self.it = 0                     # Index of the next batch to serve
        super().__init__()

    def reset(self):
        """Rewind so the next call to next() serves the first batch again."""
        self.it = 0

    def next(self, input_data):
        """
        Serve the next batch through the input_data callback.

        Returns:
            1 after a batch was passed on, 0 once every batch has been served.
        """
        if self.it != self.batches:
            # Positional window of the current batch, clipped at the end of the frame
            start = self.it * self.batch_size
            stop = min(start + self.batch_size, len(self.df))

            # Wrap the slice in a cuDF DataFrame before passing it on
            batch = cudf.DataFrame(self.df.iloc[start:stop])
            input_data(data = batch[self.features], label = batch[self.target])

            self.it += 1
            return 1

        return 0  # Nothing left to serve

In [None]:
def amex_metric_mod(y_true, y_pred):
    """
    AMEX competition metric: mean of the normalized weighted Gini coefficient
    and the share of positives captured in the top 4% (by weight) of predictions.

    Negative rows (label 0) carry a weight of 20 and positive rows a weight of 1
    in both components.

    Args:
        y_true: 1-D array-like of binary labels (0/1).
        y_pred: 1-D array-like of predicted scores, same length as y_true.

    Returns:
        0.5 * (normalized Gini + top-4% capture rate); higher is better.
    """
    # --- Top-4% capture rate ---
    labels = np.transpose(np.array([y_true, y_pred]))
    labels = labels[labels[:, 1].argsort()[::-1]]  # Sort rows by prediction, descending
    weights = np.where(labels[:,0]==0, 20, 1)      # Negatives weigh 20x
    # Rows whose cumulative weight stays within 4% of the total weight
    cut_vals = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    # Fraction of ALL positives that fall inside that top slice
    top_four = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])

    # --- Weighted Gini ---
    # gini[1] orders rows by the prediction, gini[0] orders them by the true
    # label (i.e. a perfect ranking), which serves as the normalizer.
    gini = [0,0]
    for i in [1,0]:
        labels = np.transpose(np.array([y_true, y_pred]))
        labels = labels[labels[:, i].argsort()[::-1]]
        weight = np.where(labels[:,0]==0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(labels[:, 0] * weight)
        cum_pos_found = np.cumsum(labels[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        gini[i] = np.sum((lorentz - weight_random) * weight)

    # Normalize the model's Gini by the perfect-ranking Gini. Indexing is
    # required here: `gini` is a list, so `gini/gini` raises TypeError.
    return 0.5 * (gini[1]/gini[0] + top_four)

In [None]:
# Containers filled across folds
importances = []                    # One feature-importance frame per fold
oof = np.zeros(len(train))         # Out-of-fold predictions, aligned with train rows
train = train.to_pandas()          # Move to pandas so sklearn can index the rows
TRAIN_SUBSAMPLE = 1.0              # Fraction of each fold's training rows to use
gc.collect()                       # Free memory before training

# Plain, unshuffled KFold: folds are contiguous row blocks and are not stratified
# (the target passed to split() below is ignored by KFold)
skf = KFold(n_splits = FOLDS)

# Train one model per fold
for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.target)):

    # Optional subsampling of the training rows for faster experimentation
    if TRAIN_SUBSAMPLE < 1.0:
        np.random.seed(SEED)
        train_idx = np.random.choice(train_idx, int(len(train_idx)*TRAIN_SUBSAMPLE), replace = False)
        np.random.seed(None)

    print('#' * 25)
    print('### Fold', fold + 1)
    print('### Train size', len(train_idx), 'Valid size', len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#' * 25)

    # Training rows are streamed to XGBoost in batches through the iterator.
    # train has a default integer index here, so .loc with positional fold
    # indices selects the intended rows.
    Xy_train = IterLoadForDMatrix(train.loc[train_idx], FEATURES, 'target')

    # Validation rows are small enough to materialise directly
    X_valid = train.loc[valid_idx, FEATURES]
    y_valid = train.loc[valid_idx, 'target']

    # Build the XGBoost data structures
    dtrain = xgb.DeviceQuantileDMatrix(Xy_train, max_bin = 256)
    dvalid = xgb.DMatrix(data = X_valid, label = y_valid)

    # Train with early stopping; the last entry of evals (dvalid) drives the stopping
    model = xgb.train(xgb_params,
                      dtrain = dtrain,
                      evals = [(dtrain, 'dtrain'), (dvalid, 'dvalid')],
                      num_boost_round = 9999,      # Upper bound; early stopping ends training
                      early_stopping_rounds = 100, # Stop after 100 rounds without improvement
                      verbose_eval = 100           # Log every 100 rounds
                     )
    # Persist the fold model; the inference cell reloads it by this name
    model.save_model(f'XGB_v{VER}_fold{fold}.xgb')

    # Record how often each feature was used for a split in this fold
    dd = model.get_score(importance_type = 'weight')
    df = pd.DataFrame({'feature': dd.keys(), f'importance_{fold}': dd.values()})
    importances.append(df)

    # Score the held-out rows of this fold
    oof_preds = model.predict(dvalid)
    acc = amex_metric_mod(y_valid.values, oof_preds)
    # '\n' (a real newline) leaves a blank line after each fold's score; the
    # previous '\\n' printed the two characters backslash and n instead
    print('Kaggle Metric =', acc, '\n')

    # Keep the out-of-fold predictions for the overall CV score
    oof[valid_idx] = oof_preds

    # Release per-fold objects before the next fold
    del dtrain, Xy_train, dd, df
    del X_valid, y_valid, dvalid, model
    _ = gc.collect()

# Score all out-of-fold predictions together
print('#' * 25)
acc = amex_metric_mod(train.target.values, oof)
print('OVERALL CV Kaggle Metric =', acc)

#########################
### Fold 1
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################
[0]	dtrain-logloss:0.66203	dvalid-logloss:0.66213
[100]	dtrain-logloss:0.23650	dvalid-logloss:0.23997
[200]	dtrain-logloss:0.22231	dvalid-logloss:0.22796
[300]	dtrain-logloss:0.21624	dvalid-logloss:0.22392
[400]	dtrain-logloss:0.21226	dvalid-logloss:0.22194
[500]	dtrain-logloss:0.20908	dvalid-logloss:0.22082
[600]	dtrain-logloss:0.20638	dvalid-logloss:0.22003
[700]	dtrain-logloss:0.20386	dvalid-logloss:0.21946
[800]	dtrain-logloss:0.20155	dvalid-logloss:0.21910
[900]	dtrain-logloss:0.19940	dvalid-logloss:0.21873
[1000]	dtrain-logloss:0.19728	dvalid-logloss:0.21853
[1100]	dtrain-logloss:0.19531	dvalid-logloss:0.21835
[1200]	dtrain-logloss:0.19338	dvalid-logloss:0.21819
[1300]	dtrain-logloss:0.19148	dvalid-logloss:0.21809
[1400]	dtrain-logloss:0.18963	dvalid-logloss:0.21795
[1500]	dtrain-logloss:0.18784	dvalid-logloss:0.21783
[1600]	dtrain-logloss:0.18

In [None]:
# The fold models were saved to disk during training, so the training frame is
# dropped here and garbage-collected to free memory for inference
del train
gc.collect()

In [None]:
# CALCULATE SIZE OF EACH SEPARATE TEST PART
def get_rows(customers, test, NUM_PARTS = 4, verbose = ''):
    """
    Split the customer list into NUM_PARTS consecutive groups and count how
    many rows of `test` belong to each group.

    Args:
        customers: Sequence of unique customer IDs, sliced positionally.
        test: DataFrame with a customer_ID column (only used for row counting).
        NUM_PARTS: Number of groups to split the customers into.
        verbose: Label used in the progress messages; '' disables printing.

    Returns:
        rows: List of NUM_PARTS integers, the row count of each group.
        chunk: Number of customers in each group except the last, which also
            receives the remainder.
    """
    chunk = len(customers)//NUM_PARTS  # Customers per group (floor division)

    if verbose != '':
        print(f'We will process {verbose} data as {NUM_PARTS} separate parts.')
        print(f'There will be {chunk} customers in each part (except the last part).')
        print('Below are number of rows in each part:')

    rows = []
    for k in range(NUM_PARTS):
        start = k*chunk
        # The last group runs to the end so no customer is left out
        stop = None if k==NUM_PARTS-1 else start + chunk
        cc = customers[start:stop]

        # Store the row COUNT only: .shape is a (rows, cols) tuple, and callers
        # add these values to an integer offset
        rows.append(test.loc[test.customer_ID.isin(cc)].shape[0])

    if verbose != '':
        print( rows )
    return rows, chunk

# COMPUTE SIZE OF 4 PARTS FOR TEST DATA
NUM_PARTS = 4
TEST_PATH = '../input/amex-data-integer-dtypes-parquet-format/test.parquet'

print(f'Reading test data...')
# Load only customer_ID and timestamp columns for chunking calculation
test = read_file(path = TEST_PATH, usecols = ['customer_ID','S_2'])
# Unique customer IDs as a flat array; sort_index orders them by the row index
# at which each ID was kept by drop_duplicates
customers = test[['customer_ID']].drop_duplicates().sort_index().values.flatten()
# rows[k] is consumed by the inference loop as the number of rows in part k,
# num_cust as the number of customers per part (the last part takes the remainder)
rows, num_cust = get_rows(customers, test[['customer_ID']], NUM_PARTS = NUM_PARTS, verbose = 'test')

Reading test data...
shape of data: (11363762, 2)
We will process test data as 4 separate parts.
There will be 231155 customers in each part (except the last part).
Below are number of rows in each part:
[2841209, 2839857, 2842105, 2840591]


In [None]:
# INFER TEST DATA IN PARTS
skip_rows = 0      # Row offset of the current part within the test file
skip_cust = 0      # Offset of the current part within the customer list
test_preds = []    # One prediction array per part, in customer-list order

# NOTE(review): slicing by row offset assumes each customer's rows are contiguous
# in the test file and appear in the same order as `customers` -- confirm.
for k in range(NUM_PARTS):

    # READ PART OF TEST DATA
    # '\n' (a real newline) separates the log of each part; the previous '\\n'
    # printed the two characters backslash and n instead
    print(f'\nReading test data...')
    test = read_file(path = TEST_PATH)  # Whole file is read, then sliced to this part
    test = test.iloc[skip_rows:skip_rows+rows[k]]
    skip_rows += rows[k]
    print(f'=> Test part {k+1} has shape', test.shape )

    # PROCESS AND FEATURE ENGINEER PART OF TEST DATA
    test = process_and_feature_engineer(test)  # Same aggregation as for training

    # Reorder the aggregated rows to follow the customer list for this part
    if k==NUM_PARTS-1:
        test = test.loc[customers[skip_cust:]]  # Last part takes the remaining customers
    else:
        test = test.loc[customers[skip_cust:skip_cust+num_cust]]
    skip_cust += num_cust

    # TEST DATA FOR XGB
    X_test = test[FEATURES]
    dtest = xgb.DMatrix(data=X_test)
    test = test[['P_2_mean']]  # Shrink the frame to a single column to reduce memory usage
    del X_test
    gc.collect()

    # INFER XGB MODELS ON TEST DATA
    # Start with the first fold's model ...
    model = xgb.Booster()
    model.load_model(f'XGB_v{VER}_fold0.xgb')
    preds = model.predict(dtest)

    # ... then accumulate the predictions of the remaining folds
    for f in range(1,FOLDS):
        model.load_model(f'XGB_v{VER}_fold{f}.xgb')
        preds += model.predict(dtest)

    # Average across folds
    preds /= FOLDS
    test_preds.append(preds)

    # CLEAN MEMORY
    del dtest, model
    _ = gc.collect()


Reading test data...
shape of data: (11363762, 190)
=> Test part 1 has shape (2841209, 190)
Num_Agg Columns: Index(['P_2_mean', 'P_2_std', 'P_2_min', 'P_2_max', 'P_2_last', 'D_39_mean',
       'D_39_std', 'D_39_min', 'D_39_max', 'D_39_last',
       ...
       'D_144_mean', 'D_144_std', 'D_144_min', 'D_144_max', 'D_144_last',
       'D_145_mean', 'D_145_std', 'D_145_min', 'D_145_max', 'D_145_last'],
      dtype='object', length=885)
Cat_Agg Columns: Index(['P_2_mean', 'P_2_std', 'P_2_min', 'P_2_max', 'P_2_last', 'D_39_mean',
       'D_39_std', 'D_39_min', 'D_39_max', 'D_39_last',
       ...
       'D_144_mean', 'D_144_std', 'D_144_min', 'D_144_max', 'D_144_last',
       'D_145_mean', 'D_145_std', 'D_145_min', 'D_145_max', 'D_145_last'],
      dtype='object', length=885)
Shape after engineering: (231155, 918)

Reading test data...
shape of data: (11363762, 190)
=> Test part 2 has shape (2839857, 190)
Num_Agg Columns: Index(['P_2_mean', 'P_2_std', 'P_2_min', 'P_2_max', 'P_2_last', 'D_39_

In [None]:
# WRITE SUBMISSION FILE

# Stitch the per-part prediction arrays into one array that follows `customers`
test_preds = np.concatenate(test_preds)

# Key the predictions by the hashed customer ID
test = cudf.DataFrame(data={'prediction':test_preds}, index=customers)

# The sample submission supplies the original (string) customer IDs
sub = cudf.read_csv('../input/amex-default-prediction/sample_submission.csv')[['customer_ID']]

# Hash the string IDs the same way as the feature pipeline to obtain a join key
sub['customer_ID_hash'] = sub['customer_ID'].str[-16:].str.hex_to_int().astype('int64')

# Join on the hash, then discard it so only customer_ID and prediction remain
sub = (
    sub.set_index('customer_ID_hash')
       .merge(test[['prediction']], how='left', left_index=True, right_index=True)
       .reset_index(drop=True)
)

# Write the submission and report its shape
sub.to_csv(f'submission_xgb_v{VER}.csv', index=False)
print('Submission file shape is', sub.shape )

# Show the first few predictions
sub.head()


Submission file shape is (924621, 2)


Unnamed: 0,customer_ID,prediction
0,0359e97c244bbbbe2db7c21e891debe80e82291f2e470e...,0.002059
1,035b3479c9020483c00b7dac8f816759bb3aa6fdd8dfab...,0.000367
2,035a556cc13aae13de7bdcc71c81a1ab27f586f2ddf50e...,0.00286
3,035bca6744c2fe912b15a0bc6011f3ec679cbc7c60e049...,0.039637
4,0359f31145b54da7258ed5ff894cbe50dd4302d3d4a1e9...,0.036869
