# Hyperparameter Tuning

**Load the train and validation sets from Google Drive**

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive (only works inside Google Colab).
drive.mount('/content/drive')
# Base folder on the mounted drive; the parquet files are read relative to it.
drive_path = '/content/drive/MyDrive/Colab Notebooks/'

Mounted at /content/drive


In [2]:
import pandas as pd
import polars as pl
from datetime import timedelta
import os

In [3]:
# Read the training and validation splits from parquet into Polars DataFrames.
train_parquet = os.path.join(drive_path, "df_train.parquet")
val_parquet = os.path.join(drive_path, "df_val.parquet")
df_train = pl.read_parquet(train_parquet)
df_val = pl.read_parquet(val_parquet)

In [4]:
# Preview the first rows to sanity-check the column names and dtypes.
df_train.head()

Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,year,month,day_of_month,day_of_week,day_of_year,hour,minute,second,currency_mismatch,cross_border,high_risk_sender,high_risk_receiver,fanin_30d,fanout_30d,daily_recieve,monthly_receive,monthly_send,back_and_forth_transfers,daily_receive,amount_dispersion_std,fan_in_out_ratio,fanin_intensity_ratio,sent_to_received_ratio_monthly,daily_receiver_transaction,weekly_receiver_transaction,daily_sender_transaction,weekly_sender_transaction,circular_transaction_count
time,date,i64,i64,f64,str,str,str,str,str,i8,i16,i8,i8,i8,i16,i8,i8,i8,i8,i8,i8,i8,u32,u32,u32,f64,f64,i8,u32,f64,f64,f64,f64,i16,i16,i16,i16,i8
10:35:34,2022-10-07,5606024775,8646193759,7.122552,"""UK pounds""","""UK pounds""","""UK""","""UK""","""Cash Deposit""",0,2022,10,7,5,280,10,35,34,0,0,0,0,1,1,1,7.122552,7.122552,1,1,0.352988,1.0,1.0,1.0,1,1,1,1,0
10:36:04,2022-10-07,2004133484,3079818547,8.999083,"""UK pounds""","""UK pounds""","""UK""","""UK""","""Credit card""",0,2022,10,7,5,280,10,36,4,0,0,0,0,8,1,8,624.297626,98.937884,11,8,0.002648,8.0,1.0,6.309996,68,68,11,11,0
10:36:31,2022-10-07,4082552798,4173195094,9.39055,"""UK pounds""","""UK pounds""","""UK""","""UK""","""Cheque""",0,2022,10,7,5,280,10,36,31,0,0,0,0,9,1,8,696.723021,112.611058,12,8,0.004816,9.0,1.125,6.186986,74,74,12,12,0
10:37:15,2022-10-07,35328673,1794595555,6.480489,"""UK pounds""","""UK pounds""","""UK""","""UK""","""Debit card""",0,2022,10,7,5,280,10,37,15,0,0,0,0,1,13,1,12.887551,156.293105,1,1,1.946611,0.076923,1.0,0.082458,1,1,1,3,0
10:37:41,2022-10-07,7189128119,4344504728,9.778583,"""UK pounds""","""UK pounds""","""UK""","""UK""","""Credit card""",0,2022,10,7,5,280,10,37,41,0,0,0,0,1,6,1,9.778583,59.85713,1,1,1.567371,0.166667,1.0,0.163365,1,1,1,1,0


**XGBoost**

In [5]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, roc_auc_score, precision_score, recall_score
import itertools
import pandas as pd

In [6]:
# Columns used for modelling. The label MUST stay the last entry: the
# configuration cell takes everything but the last item as the feature list
# and the last item as the target.
opt_features = [
    "Amount",
    "year", "day_of_month", "day_of_week",
    "hour", "minute", "second",
    "currency_mismatch", "high_risk_sender", "high_risk_receiver",
    "fanin_30d", "fanin_intensity_ratio",
    "sent_to_received_ratio_monthly",
    "back_and_forth_transfers",
    "circular_transaction_count",
    "Is_laundering",  # target label -- keep last
]

In [7]:
# Configuration
# opt_features holds the model inputs followed by the label as its final entry,
# so star-unpacking yields the feature list and the target name in one step.
*features, target = opt_features
use_gpu = False             # True selects the GPU tree method in the grid search (needs a GPU-enabled xgboost build)
num_boost_round = 500       # upper bound on boosting rounds per combination
early_stopping_rounds = 50  # handed to xgb.train as early_stopping_rounds
verbose_eval = False        # False = silent; an integer k logs every k rounds

In [8]:
# 1) Convert Polars -> NumPy once (features cast to float32).
#    No imputation or categorical encoding is done here: every feature column
#    must already be castable to Float32.
def polars_to_numpy_for_xgb(df_pl: pl.DataFrame, features, target):
    """Split a Polars frame into a float32 feature matrix and a label vector.

    Args:
        df_pl: Frame containing every column named in ``features`` and ``target``.
        features: Names of the input columns; they define the column order of
            the returned matrix.
        target: Name of the label column.

    Returns:
        Tuple ``(X_arr, y_arr)`` where ``X_arr`` is the NumPy array of the
        feature columns cast to Float32 and ``y_arr`` is the flattened label
        array (the label column is not cast).
    """
    as_float32 = [pl.col(name).cast(pl.Float32).alias(name) for name in features]
    feature_matrix = df_pl.select(as_float32).to_numpy()
    labels = df_pl.select(pl.col(target)).to_numpy().ravel()
    return feature_matrix, labels

# Convert both splits with the same feature/target selection.
(X_train, y_train), (X_val, y_val) = (
    polars_to_numpy_for_xgb(split_df, features, target)
    for split_df in (df_train, df_val)
)

In [9]:
# 2) Build each DMatrix a single time up front so the grid-search loop can
#    reuse them instead of rebuilding them for every combination.
dtrain, dval = (
    xgb.DMatrix(matrix, label=labels, feature_names=features)
    for matrix, labels in ((X_train, y_train), (X_val, y_val))
)
# Evaluation sets reported during training; with several entries xgb.train
# early-stops on the last one, so the validation set is listed last.
watchlist = [(dtrain, "train"), (dval, "eval")]

In [10]:
# Search space for the grid search: the full Cartesian product of these
# values is evaluated (3 x 3 x 3 = 27 combinations).
param_grid = dict(
    max_depth=[3, 5, 10],
    learning_rate=[0.1, 0.05, 0.01],
    subsample=[0.3, 0.5, 1],
)

In [11]:
def predict_with_best_iter(bst, dmat):
    """Predict with ``bst`` using only the rounds up to its best iteration.

    Args:
        bst: Trained booster. Its ``best_iteration`` attribute (0-based) is
            used when present and not ``None``.
        dmat: Data to score, passed straight to ``bst.predict``.

    Returns:
        Whatever ``bst.predict`` returns: predictions from the full model when
        no best iteration is available, otherwise predictions restricted to
        rounds ``0 .. best_iteration`` inclusive.
    """
    best_round = getattr(bst, "best_iteration", None)
    # iteration_range is (begin, end) with an exclusive end, hence the +1.
    predict_kwargs = (
        {} if best_round is None else {"iteration_range": (0, best_round + 1)}
    )
    return bst.predict(dmat, **predict_kwargs)


In [12]:
import time

# 3) Grid search over param_grid using xgb.train.
#    Each combination is trained on dtrain with early stopping monitored on
#    dval (logloss), then scored on the validation set at a 0.5 threshold.
results = []
start_all = time.time()

for max_depth, lr, subs in itertools.product(
        param_grid['max_depth'],
        param_grid['learning_rate'],
        param_grid['subsample']):

    params = {
        "max_depth": int(max_depth),
        "eta": float(lr),                 # alias for learning_rate in xgb.train params
        "subsample": float(subs),
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "seed": 42,
        "verbosity": 0,
        "nthread": -1
    }
    if use_gpu:
        # NOTE(review): XGBoost >= 2.0 deprecates "gpu_hist" / "predictor" in
        # favour of tree_method="hist" with device="cuda" -- confirm against
        # the installed xgboost version before enabling use_gpu.
        params["tree_method"] = "gpu_hist"
        params["predictor"] = "gpu_predictor"
    else:
        params["tree_method"] = "hist"

    t0 = time.time()
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=watchlist,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=verbose_eval
    )
    t1 = time.time()

    # Validation predictions, limited to the best iteration when early
    # stopping recorded one (see predict_with_best_iter).
    preds_proba = predict_with_best_iter(bst, dval)
    preds = (preds_proba > 0.5).astype(int)

    f1 = f1_score(y_val, preds)
    prec = precision_score(y_val, preds)
    rec = recall_score(y_val, preds)
    report = classification_report(y_val, preds, digits=4)

    results.append({
        "max_depth": int(max_depth),
        "learning_rate": float(lr),
        "subsample": float(subs),
        "f1_score": f1,
        "precision_score": prec,
        "recall_score": rec,
        "report": report,
        "model": bst,
        "train_time_s": t1 - t0
    })

total_time = time.time() - start_all

# 4) Convert to a pandas DataFrame (without the model objects) and rank by F1.
df_results = pd.DataFrame([{k: v for k, v in r.items() if k != "model"} for r in results])
df_results = df_results.sort_values("f1_score", ascending=False)

# Best model and params.
# Pick the winner BEFORE resetting the index: here the index labels are still
# positions in `results`. After reset_index(drop=True) the top row's label is
# always 0, which would select the first combination tried, not the best one.
best = results[df_results.index[0]]
best_params = {"max_depth": best["max_depth"], "learning_rate": best["learning_rate"], "subsample": best["subsample"]}
best_model = best["model"]

df_results = df_results.reset_index(drop=True)

# 5) Reporting
top_n = 10  # number of rows shown; also used in the heading so the two cannot drift apart
print(f"Grid search finished in {total_time:.1f}s, tried {len(results)} combos")
print(f"Top {top_n} combos by F1:")
print(df_results.head(top_n))
print("\nBest combo:", best_params)
print("\nClassification report for best model:")
print(best["report"])

Grid search finished in 9579.2s, tried 27 combos
Top 5 combos by F1:
   max_depth  learning_rate  subsample  f1_score  precision_score  \
0         10           0.01        1.0  0.526973         0.821239   
1         10           0.01        0.5  0.524741         0.841328   
2          5           0.05        1.0  0.522952         0.857143   
3          5           0.05        0.3  0.520612         0.880478   
4          5           0.10        1.0  0.519864         0.809187   
5          5           0.05        0.5  0.519511         0.856046   
6          3           0.10        0.5  0.518913         0.885081   
7         10           0.05        0.5  0.517755         0.821818   
8         10           0.05        1.0  0.517514         0.797909   
9         10           0.01        0.3  0.516919         0.855212   

   recall_score                                             report  \
0      0.387960                precision    recall  f1-score   ...   
1      0.381271                

In [13]:
# Show the results table built by the grid-search cell (one row per
# hyper-parameter combination, ranked by validation F1).
df_results

Unnamed: 0,max_depth,learning_rate,subsample,f1_score,precision_score,recall_score,report,train_time_s
0,10,0.01,1.0,0.526973,0.821239,0.38796,precision recall f1-score ...,585.976552
1,10,0.01,0.5,0.524741,0.841328,0.381271,precision recall f1-score ...,603.601475
2,5,0.05,1.0,0.522952,0.857143,0.376254,precision recall f1-score ...,336.453116
3,5,0.05,0.3,0.520612,0.880478,0.369565,precision recall f1-score ...,364.551504
4,5,0.1,1.0,0.519864,0.809187,0.382943,precision recall f1-score ...,249.431554
5,5,0.05,0.5,0.519511,0.856046,0.37291,precision recall f1-score ...,401.940874
6,3,0.1,0.5,0.518913,0.885081,0.367057,precision recall f1-score ...,329.496894
7,10,0.05,0.5,0.517755,0.821818,0.377926,precision recall f1-score ...,241.418152
8,10,0.05,1.0,0.517514,0.797909,0.382943,precision recall f1-score ...,190.084901
9,10,0.01,0.3,0.516919,0.855212,0.370401,precision recall f1-score ...,601.456531


In [17]:
# Re-rank the same combinations by validation recall, highest first.
# sort_values returns a new frame; df_results itself keeps its F1 ordering.
df_results.sort_values('recall_score', ascending=False)

Unnamed: 0,max_depth,learning_rate,subsample,f1_score,precision_score,recall_score,report,train_time_s
0,10,0.01,1.0,0.526973,0.821239,0.38796,precision recall f1-score ...,585.976552
8,10,0.05,1.0,0.517514,0.797909,0.382943,precision recall f1-score ...,190.084901
4,5,0.1,1.0,0.519864,0.809187,0.382943,precision recall f1-score ...,249.431554
1,10,0.01,0.5,0.524741,0.841328,0.381271,precision recall f1-score ...,603.601475
7,10,0.05,0.5,0.517755,0.821818,0.377926,precision recall f1-score ...,241.418152
10,5,0.1,0.3,0.515982,0.81295,0.377926,precision recall f1-score ...,177.387627
2,5,0.05,1.0,0.522952,0.857143,0.376254,precision recall f1-score ...,336.453116
13,10,0.1,1.0,0.511708,0.807207,0.374582,precision recall f1-score ...,138.389326
11,5,0.1,0.5,0.515868,0.832402,0.373746,precision recall f1-score ...,236.129105
5,5,0.05,0.5,0.519511,0.856046,0.37291,precision recall f1-score ...,401.940874
