## **Model Building with Feature Engineering(XGBoost)**

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score,average_precision_score

In [3]:
from pathlib import Path
PROJECT_ROOT = Path.cwd().parent
DATA_PATH = PROJECT_ROOT / "data" / "processed" / "upi_features.csv"
assert DATA_PATH.exists(), f"Dataset not found at {DATA_PATH}"

df_fe = pd.read_csv(DATA_PATH, encoding="utf-8")

In [4]:
y = df_fe['fraud_flag']
X = df_fe.drop(columns=['fraud_flag'])

In [5]:
cat_cols = ['transaction type', 'merchant_category', 'transaction_status', 'sender_age_group', 'receiver_age_group',
            'sender_state', 'sender_bank', 'receiver_bank', 'device_type', 'network_type', 'is_weekend', 'year', 'month', 'day', 'minute']
cat_cols = [col for col in cat_cols if col in X.columns]
num_cols = [col for col in X.columns if col not in cat_cols]

In [6]:
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median'))])

preprocessor = ColumnTransformer([
    ('cat', cat_pipe, cat_cols),
    ('num', num_pipe, num_cols)])

In [7]:
X_processed = preprocessor.fit_transform(X)

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_processed, y)

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

In [8]:
model = XGBClassifier(
    n_estimators=400,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight=1,
    eval_metric='auc',
    use_label_encoder=False,
    random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [9]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print("PR-AUC",average_precision_score(y_test,y_proba))

[[49904     0]
 [   88 49816]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     49904
           1       1.00      1.00      1.00     49904

    accuracy                           1.00     99808
   macro avg       1.00      1.00      1.00     99808
weighted avg       1.00      1.00      1.00     99808

ROC AUC: 0.9992381270163113
PR-AUC 0.9995164598208772


The new model has fewer false positives and more true positives (88 vs. 306 false negatives previously), indicating better precision and recall balance on the fraud class. The f1-score and accuracy are near perfect in both, but this model reduces type II errors (missed frauds), which is crucial.

False negatives reduced from 306 to 88 â†’ better detection of fraud cases

Overall precision/recall slightly improved for the minority class

ROC AUC slightly lower but still excellent (~0.999)

The feature-engineered model is better at detecting fraud despite a marginally lower ROC AUC.

In [None]:
import joblib

joblib.dump(model, "../models/xgb_model.pkl")



['../models/xgb_model.pkl']

In [12]:
# Export the feature names from the original DataFrame X
model_columns = X.columns.tolist()
joblib.dump(model_columns, "../models/model_columns.pkl")
print("Feature names saved to models/model_columns.pkl")

Feature names saved to models/model_columns.pkl


In [13]:
# This saves the entire preprocessing pipeline (Scaling, Encoding, Imputation)
joblib.dump(preprocessor, "../models/preprocessor.pkl")

print("Preprocessor saved successfully to ../models/preprocessor.pkl")

Preprocessor saved successfully to ../models/preprocessor.pkl
