In [1]:
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parents[2]  
# cwd = fraud_detection/src/training
# parents[2] = Fraud-Detection-Pipeline

sys.path.insert(0, str(PROJECT_ROOT))

In [2]:
import argparse,os,json
from pathlib import Path
from typing import Tuple,Dict

import numpy as np,pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OrdinalEncoder,OneHotEncoder,FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,precision_score,recall_score,f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


import joblib
import mlflow

import xgboost as xgb
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import DataLoader,TensorDataset

from fraud_detection.src.utils.common import cast_to_str

LABEL_COL = 'label'

def read_data(path:str):
    df = pd.read_parquet(path)
    if LABEL_COL not in df.columns:
        raise ValueError(f"{LABEL_COL} not found in {path}")
    
    return df

def split_x_y(df:pd.DataFrame):
    df = df.copy()
    y = df[LABEL_COL].astype(int)
    x = df.drop(columns=[LABEL_COL])

    return x,y
# def cast_to_str(df):
#     return df.astype(str)


def Build_preprocessor(
        X: pd.DataFrame,
        endcoder_type: str = 'ordinal',
        onehot_drop_first: bool = False
):
    num_cols = X.select_dtypes(include=['number']).columns.to_list()
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.to_list()

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    if endcoder_type == 'onehot':
        drop_arg = 'first' if onehot_drop_first else None
        cat_encoder = OneHotEncoder(
            handle_unknown='ignore',
            sparse_output=False,
            drop=drop_arg
        )
    elif endcoder_type == 'ordinal':
        cat_encoder = OrdinalEncoder(
            handle_unknown='use_encoded_value',
            unknown_value=-1
        )
    else:
        raise ValueError("Encoder Type must be 'ordinal' or 'onehot'")

    cat_pipeline = Pipeline([
        ('as_str', FunctionTransformer(cast_to_str, validate=False)),
        ('encoder', cat_encoder)
    ])

    transformers = []
    if num_cols:
        transformers.append(('num', num_pipeline, num_cols))
    if cat_cols:
        transformers.append(('cat', cat_pipeline, cat_cols))

    if not transformers:
        raise ValueError("No numeric or categorical columns found")

    col_transformer = ColumnTransformer(
        transformers,
        remainder='drop',
        sparse_threshold=0.0
    )

    preprocesor_pipeline = Pipeline([
        ('preprocessor', col_transformer)
    ])

    preprocesor_pipeline.fit(X)

    try:
        feature_names = list(
            preprocesor_pipeline
            .named_steps['preprocessor']
            .get_feature_names_out()
        )
    except Exception:
        feature_names = []

    def transform(df: pd.DataFrame):
        missing = [c for c in num_cols + cat_cols if c not in df.columns]
        if missing:
            raise ValueError(f"Missing required columns: {missing}")

        arr = preprocesor_pipeline.transform(df)
        return np.asarray(arr)

    preprocessor_meta = {
        'num_cols': num_cols,
        'cat_cols': cat_cols,
        'preprocessor': preprocesor_pipeline,
        'feature_names': feature_names,
        'encoder_type': endcoder_type
    }

    return preprocessor_meta, transform









def train_xgboost(
    x_train,
    y_train,
    x_val,
    y_val,
    preprocess_meta,
    params: Dict
):
    neg = (y_train == 0).sum()
    pos = (y_train == 1).sum()
    scale_pos_weight = max(1.0, neg / max(1, pos))

    clf = xgb.XGBClassifier(
        n_estimators=int(params.get("n_estimators", 200)),
        max_depth=int(params.get("max_depth", 6)),
        learning_rate=float(params.get("learning_rate", 0.1)),
        subsample=float(params.get("subsample", 0.8)),
        colsample_bytree=float(params.get("colsample_bytree", 0.8)),
        eval_metric="auc",
        scale_pos_weight=scale_pos_weight,
        n_jobs=params.get("n_jobs", -1),
        early_stopping_rounds=30
    )

    clf.fit(
        x_train,
        y_train,
        eval_set=[(x_val, y_val)],
        verbose=False
    )

    pred_proba = clf.predict_proba(x_val)[:, 1]
    pred = (pred_proba >= params.get("threshold", 0.5)).astype(int)

    metrics = evaluate_preds(y_val, pred_proba, pred)


    joblib.dump(clf, "C:/Fraud-Dectection-Pipeline/fraud_detection/models/xgb_model.joblib")
    mlflow.log_artifact("C:/Fraud-Dectection-Pipeline/fraud_detection/models/xgb_model.joblib",
                        artifact_path="model")

    joblib.dump(preprocess_meta,"C:/Fraud-Dectection-Pipeline/fraud_detection/models/preprocess_meta.joblib")
    mlflow.log_artifact("C:/Fraud-Dectection-Pipeline/fraud_detection/models/preprocess_meta.joblib",
                        artifact_path="preprocess")
    mlflow.sklearn.log_model(clf,artifact_path="sklearn_model")

    return metrics,clf




def evaluate_preds(y_true,y_proba,y_pred):
    auc = roc_auc_score(y_true,y_proba) if len(np.unique(y_true)) > 1 else 0.5
    precision = precision_score(y_true,y_pred,zero_division=0)
    recall = recall_score(y_true,y_pred,zero_division=0)
    f1 = f1_score(y_true,y_pred,zero_division=0)
    return {'auc':float(auc),'precision':float(precision),'recall':float(recall),'f1':float(f1)}

In [3]:
import os
os.environ["MLFLOW_ENABLE_MODEL_REGISTRY"] = "false"


In [4]:
fraud_path = 'C:/Fraud-Dectection-Pipeline/fraud_detection/data/fraud.parquet'
fraud_df = pd.read_parquet(fraud_path)
features_path = 'C:/Fraud-Dectection-Pipeline/fraud_detection/data/features.parquet'
features_df = pd.read_parquet(features_path)

In [5]:
fraud_df.columns.to_list()

['tx_id',
 'user_id',
 'amount',
 'device',
 'ip_hash',
 'metadata',
 'confirmed_fraud',
 'label_ts']

In [6]:
features_df = features_df.rename(columns={'ts':'label_ts'})
features_df.columns.to_list()   


['tx_id', 'user_id', 'amount', 'device', 'ip_hash', 'label_ts', 'metadata']

In [7]:
fraud_df.head()

Unnamed: 0,tx_id,user_id,amount,device,ip_hash,metadata,confirmed_fraud,label_ts
0,000e4d33-6ae1-4f48-a19d-5265a1f8ee61,3398,62.43,web,783765714,{'country': 'US'},True,2025-12-14T06:58:14.684174
1,001ab64c-9068-45fd-bc82-ff6bbe629d21,846,144.9,web,1557965436,{'country': 'BR'},False,2025-12-14T07:22:11.739529
2,0044d47f-4bdd-4fcb-aa74-7797c88b7388,9104,69.16,android,1867529584,{'country': 'NZ'},False,2025-12-14T07:23:22.094546
3,0053cd47-7650-43f0-b50b-374c008c6f41,6018,35.42,ios,3924366280,{'country': 'BR'},True,2025-12-14T06:52:00.372607
4,0061bbe4-73d0-4e8a-aff8-c4088d1575f2,2591,46.47,android,2192537053,{'country': 'CN'},True,2025-12-14T07:18:41.170647


In [8]:
features_df.head()

Unnamed: 0,tx_id,user_id,amount,device,ip_hash,label_ts,metadata
0,40744d74-31fc-4a30-b713-32eac779c8cc,8208,5.4,android,3062789086,2025-12-14T05:14:49.960692,{'country': 'CN'}
1,1686db2b-1191-4332-8cdb-5e0f8331b480,439,144.21,ios,1804060879,2025-12-14T05:14:50.113386,{'country': 'NZ'}
2,996bc6aa-9566-4998-84b2-98133565151e,3177,101.25,ios,3769892912,2025-12-14T05:14:50.218727,{'country': 'US'}
3,d98b3da2-2446-46a9-ab86-9bef6b59c345,4047,250.68,web,3036506070,2025-12-14T05:14:50.240189,{'country': 'CN'}
4,94545382-0d4d-4dc7-bccc-35ab66929472,2274,30.66,ios,2061003328,2025-12-14T05:14:50.261374,{'country': 'IN'}


In [9]:
features_df['confirmed_fraud'] = bool(False)

In [10]:
common_cols = [
    "tx_id",
    "user_id",
    "amount",
    "device",
    "ip_hash",
    "metadata",
    "confirmed_fraud",
    "label_ts"
]

In [11]:
fraud_df = fraud_df[common_cols]
features_df = features_df[common_cols]

In [12]:
final_df = pd.concat([features_df,fraud_df],ignore_index=True,axis=0)
final_df.head()

Unnamed: 0,tx_id,user_id,amount,device,ip_hash,metadata,confirmed_fraud,label_ts
0,40744d74-31fc-4a30-b713-32eac779c8cc,8208,5.4,android,3062789086,{'country': 'CN'},False,2025-12-14T05:14:49.960692
1,1686db2b-1191-4332-8cdb-5e0f8331b480,439,144.21,ios,1804060879,{'country': 'NZ'},False,2025-12-14T05:14:50.113386
2,996bc6aa-9566-4998-84b2-98133565151e,3177,101.25,ios,3769892912,{'country': 'US'},False,2025-12-14T05:14:50.218727
3,d98b3da2-2446-46a9-ab86-9bef6b59c345,4047,250.68,web,3036506070,{'country': 'CN'},False,2025-12-14T05:14:50.240189
4,94545382-0d4d-4dc7-bccc-35ab66929472,2274,30.66,ios,2061003328,{'country': 'IN'},False,2025-12-14T05:14:50.261374


In [13]:
final_df = final_df.sample(frac=1.0,random_state=42).reset_index(drop=True)
final_df.head()

Unnamed: 0,tx_id,user_id,amount,device,ip_hash,metadata,confirmed_fraud,label_ts
0,5e32c2a5-a46c-4c11-a7fe-2544aee87009,4765,0.66,android,3062904636,{'country': 'NZ'},False,2025-12-14T06:41:58.397235
1,1964e16c-4a41-439f-b959-6b5ab1629185,3909,53.55,web,552005629,{'country': 'US'},False,2025-12-14T05:15:12.692797
2,2387f020-6394-45e8-a9a9-4a8e1ea23154,3382,80.58,ios,1470284525,{'country': 'CN'},True,2025-12-14T07:19:30.646934
3,afdbe756-a367-4525-a91f-6ae7d8ba5bc2,6845,86.55,android,490418122,{'country': 'IN'},False,2025-12-14T05:15:48.932342
4,1e7d2261-fca4-429a-a1b0-6eaf1b4d431f,8584,27.88,android,3268987045,{'country': 'BR'},False,2025-12-14T06:51:28.750113


In [14]:
x = final_df.drop(columns=['tx_id','user_id','confirmed_fraud'])
y = final_df['confirmed_fraud'].astype(int)

In [15]:
from sklearn.model_selection import train_test_split
x_train,x_val,y_train,y_val = train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)

In [16]:
amount_val = x_val['amount'].values

In [17]:
preprocess_meta,preprocess_func = Build_preprocessor(x_train,endcoder_type='ordinal',onehot_drop_first=False)
x_train = preprocess_func(x_train)
x_val = preprocess_func(x_val)

In [18]:
params = {
        "model": 'xgboost',
        "threshold": 0.5,
        "n_estimators": 200,
        "max_depth": 6,
        "learning_rate": 0.1,
        "epochs": 20 ,
        "batch_size": 1024,
        "use_focal": 'store_true'
    }



In [19]:
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("fraud_detection_experiment")


<Experiment: artifact_location='s3://mlflow-artifacts/1', creation_time=1765703541698, experiment_id='1', last_update_time=1765703541698, lifecycle_stage='active', name='fraud_detection_experiment', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [20]:
import os

os.environ["AWS_ACCESS_KEY_ID"] = "minioadmin"
os.environ["AWS_SECRET_ACCESS_KEY"] = "minioadmin"
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://localhost:9000"


In [21]:
# import sys
# from pathlib import Path

# PROJECT_ROOT = Path.cwd().parents[2]  
# # cwd = fraud_detection/src/training
# # parents[2] = Fraud-Detection-Pipeline

# sys.path.insert(0, str(PROJECT_ROOT))



In [22]:
from fraud_detection.src.training.evaluator import business_loss,evaluate_model,find_best_threshold

In [23]:
with mlflow.start_run(run_name='xgboost_baseline') as run:
    metrics,clf = train_xgboost(x_train,y_train,x_val,y_val,preprocess_meta,params)
    for k,v in metrics.items():
        mlflow.log_metric(k,v)
    eval_df = pd.DataFrame([metrics])
    eval_df.to_csv('C:/Fraud-Dectection-Pipeline/fraud_detection/data/eval_metrics.csv',index=False)
    mlflow.log_artifact('C:/Fraud-Dectection-Pipeline/fraud_detection/data/eval_metrics.csv',
                        artifact_path='evaluation')
    y_proba = clf.predict_proba(x_val)[:,1]
    best_t,best_loss = find_best_threshold(y_val,y_proba,amount_val)
    metrics_eco = evaluate_model(y_val,y_proba,amount_val,best_t)
    mlflow.log_metric('val_auc',metrics_eco['auc'])
    mlflow.log_metric('business_loss',metrics_eco['business_loss'])
    mlflow.log_metric('best_threshold',best_t)
print("Training complete.")
print("Metrics:",metrics)




üèÉ View run xgboost_baseline at: http://localhost:5000/#/experiments/1/runs/e5bfc7096bd74aa9b5c1d82134db2613
üß™ View experiment at: http://localhost:5000/#/experiments/1
Training complete.
Metrics: {'auc': 0.5748500372170411, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}


In [24]:
print(mlflow.version)


<module 'mlflow.version' from 'c:\\Fraud-Dectection-Pipeline\\pyvenv\\Lib\\site-packages\\mlflow\\version.py'>
