# Building and Running the Fraud Detection Pipeline

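In this notebook, we'll generate a synthetic dataset, compile a Kubeflow Pipeline that trains our fraud detection model, submit it as a pipeline run, and then monitor the run and inspect its artifacts.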

In [1]:
import sys
import os

# Put the project root on the import path so project packages can be imported.
# The root is taken to be the parent of the current working directory; change
# the relative hop below if the notebook is started from somewhere else.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    # Appended (not prepended), so already-importable packages keep priority.
    sys.path.append(project_root)
    print(f"Added {project_root} to Python path")

Added /Users/prashanth.chaitanya/git-workspaces/kubeflow/kserve-example to Python path


In [2]:
# Import required libraries
import kfp
import pandas as pd
import os
import numpy as np
from sklearn.datasets import make_classification



## Generate Synthetic Data

For this example, we'll create a synthetic, heavily imbalanced credit card fraud dataset (roughly 3% of rows are labelled as fraud) and save it to a CSV file.

In [3]:
# Generate synthetic data for credit card fraud detection
def generate_synthetic_data(n_samples=10000, output_path="data/credit_card_data.csv", random_state=42):
    """Generate an imbalanced synthetic fraud dataset and write it to a CSV file.

    The features come from ``make_classification``; the column names
    (``Amount``, ``Time``, ``V1``..``V28``) are only labels chosen to resemble
    a credit card fraud dataset, so the values are not real amounts or times.

    Args:
        n_samples: Number of rows to generate.
        output_path: Destination CSV path. Missing parent directories are
            created; a bare filename (no directory part) is written to the
            current working directory.
        random_state: Seed forwarded to ``make_classification`` so repeated
            calls produce the same data.

    Returns:
        ``output_path``, unchanged, for convenient chaining.
    """
    # Two named columns followed by PCA-like features V1..V28 (30 in total).
    feature_names = ['Amount', 'Time'] + [f'V{i}' for i in range(1, 29)]

    # Class weights make the positive (fraud) class rare: roughly 3% of rows.
    X, y = make_classification(
        n_samples=n_samples,
        n_features=len(feature_names),  # keeps the matrix width in sync with the names
        n_informative=15,
        n_redundant=5,
        n_classes=2,
        weights=[0.97, 0.03],
        random_state=random_state,
    )

    data = pd.DataFrame(X, columns=feature_names)
    data['Class'] = y  # 0 for legitimate, 1 for fraud

    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    data.to_csv(output_path, index=False)

    return output_path

# Generate the data with the function's default arguments; `data_path` is the
# CSV location returned by the call and is reused when submitting the pipeline run.
data_path = generate_synthetic_data()
print(f"Synthetic data generated at: {data_path}")

# Show a sample of the data. The CSV is read back from disk, so the preview
# reflects what was actually written rather than the in-memory frame.
pd.read_csv(data_path).head()

Synthetic data generated at: data/credit_card_data.csv


Unnamed: 0,Amount,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,3.105771,3.056555,1.089509,1.770512,3.371395,0.509769,-0.330443,0.591382,-0.61827,-3.610516,...,-0.912571,2.266941,-0.377578,0.882072,-2.578074,0.05406,0.741488,3.092538,0.836992,0
1,0.785868,3.693099,-0.493906,-0.380848,-2.589695,0.018084,0.458995,-0.464895,-4.143324,4.663926,...,0.585884,-3.021272,2.270142,0.155664,9.388747,0.563799,-0.570881,-0.803469,2.261648,0
2,-1.13168,-0.049499,0.288242,0.707894,-1.290201,2.069087,0.663621,2.446199,-2.300579,0.282125,...,-1.036953,-1.15396,-2.679653,0.684857,5.929246,-1.268497,0.124055,0.573843,1.250873,0
3,0.095961,-1.151803,-0.53578,0.921567,-2.14629,0.954526,2.45992,-0.476551,1.803361,0.086362,...,1.57904,-2.735469,-3.521399,-0.087106,-3.243044,1.082989,0.275934,2.677531,-0.686334,0
4,3.198517,-0.234672,-0.357451,-1.576827,0.497948,1.862552,-1.592175,-0.4039,2.435102,1.489884,...,0.664606,-0.73689,0.484542,-0.456373,-0.63069,0.722676,-0.361606,0.2785,-1.959922,0


## Compile and Run the Pipeline

In [5]:
# Import our pipeline
# (a project-local module; the import relies on the project root being on sys.path)
from src.pipeline.pipeline import fraud_detection_pipeline

# Compile the pipeline
# The compiler writes the pipeline definition to `pipeline_filename`, which is
# the package file later handed to the Kubeflow Pipelines client.
# NOTE(review): the saved output of this cell is `KeyError: 'x_train_path'`, so
# the compile step did not succeed when the notebook was last run. The cause is
# presumably inside src.pipeline.pipeline (not shown here) - confirm and fix
# before relying on the generated YAML.
pipeline_func = fraud_detection_pipeline
pipeline_filename = "fraud_detection_pipeline.yaml"
kfp.compiler.Compiler().compile(pipeline_func, pipeline_filename)

print(f"Pipeline compiled to {pipeline_filename}")

KeyError: 'x_train_path'

In [None]:
# Connect to the Kubeflow Pipelines API
# NOTE(review): no host is passed, so the client presumably falls back to its
# default endpoint discovery - confirm this matches how the notebook is deployed.
client = kfp.Client()

# Create an experiment
# Runs submitted below are grouped under this experiment name.
experiment_name = "fraud-detection-kserve"
experiment = client.create_experiment(name=experiment_name)

# Submit the pipeline run
# `pipeline_filename` is the compiled package produced by the compile cell;
# `client`, `experiment` and `run` are reused by the monitoring cells below.
run = client.run_pipeline(
    experiment_id=experiment.id,
    job_name="fraud-detection-training",
    pipeline_package_path=pipeline_filename,
    params={
        # NOTE(review): these keys presumably have to match the parameters of
        # fraud_detection_pipeline (not shown here) - verify against its signature.
        # NOTE(review): data_path is the path returned by generate_synthetic_data()
        # on the notebook's filesystem; confirm the pipeline steps can read it.
        "data_path": data_path,
        "model_name": "fraud-detection",
        "model_version": "v1"
    }
)

print(f"Pipeline run submitted with ID: {run.id}")

## Monitor the Pipeline Run

In [None]:
# Get pipeline run status
# One-off status lookup for the run submitted above; it does not wait for the
# run to finish, so re-run this cell to refresh the status.
run_details = client.get_run(run.id)
print(f"Pipeline status: {run_details.run.status}")

In [None]:
# Get pipeline run results
# Waits for the submitted run to complete, then reports its final status.
# NOTE(review): timeout=1800 is presumably in seconds (30 minutes) - confirm
# against the installed kfp client and raise it if training takes longer.
run_result = client.wait_for_run_completion(run.id, timeout=1800)
print(f"Pipeline completed with status: {run_result.run.status}")

## Examine Pipeline Artifacts

In [None]:
# List pipeline artifacts
import kubernetes.client as k8s_client
from kubernetes import config

# Load Kubernetes credentials: prefer the in-cluster configuration and fall
# back to the local kubeconfig when that is not available.
try:
    config.load_incluster_config()
except config.ConfigException:
    # Only a configuration failure should trigger the fallback. A bare
    # `except:` would also swallow KeyboardInterrupt/SystemExit and hide
    # unrelated errors behind a second config attempt.
    config.load_kube_config()

# Create a Kubernetes API client
api_client = k8s_client.ApiClient()
pod_api = k8s_client.CoreV1Api(api_client)  # not used in this cell
custom_api = k8s_client.CustomObjectsApi(api_client)

# Get the model registry entry
# Looks up a `trainedmodels` custom resource named "fraud-detection-v1" in the
# "kubeflow" namespace.
# NOTE(review): the name presumably corresponds to the model_name/model_version
# pair used for the pipeline run - verify against what the pipeline registers.
# NOTE(review): the API group here is serving.kubeflow.org; newer KServe
# installations may serve this resource under a different group - confirm
# which one the target cluster exposes.
try:
    model_registry = custom_api.get_namespaced_custom_object(
        group="serving.kubeflow.org",
        version="v1alpha1",
        namespace="kubeflow",
        plural="trainedmodels",
        name="fraud-detection-v1"
    )
    print("Model registry entry:")
    print(model_registry)
except Exception as e:
    # Deliberately best-effort: report the problem and let the notebook continue.
    print(f"Error getting model registry: {e}")

## Visualize Model Performance

In [None]:
# Load model metrics (assuming we have access to the PVC)
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Read the metrics artifacts from the mounted artifact volume and plot them.
# The cell is best-effort: any failure is reported instead of raised, so the
# notebook still runs end-to-end when the volume is not available.
try:
    # This assumes we're running within the cluster or have access to the metrics file
    with open("/mnt/artifacts/fraud-detection/v1/metrics/metrics.json", "r") as f:
        metrics = json.load(f)

    print("Model Metrics:")
    for metric, value in metrics.items():
        # The format spec requires numeric values.
        # NOTE(review): assumes metrics.json is a flat {name: number} mapping - confirm.
        print(f"{metric}: {value:.4f}")

    # Plot metrics
    plt.figure(figsize=(10, 6))
    sns.barplot(x=list(metrics.keys()), y=list(metrics.values()))
    plt.title("Fraud Detection Model Performance")
    plt.ylim(0, 1)  # y-axis fixed to [0, 1]; values outside this range are clipped from view
    plt.show()

    # Load confusion matrix (the first CSV column holds the row labels)
    confusion_matrix = pd.read_csv("/mnt/artifacts/fraud-detection/v1/metrics/confusion_matrix.csv", index_col=0)

    # Plot confusion matrix; fmt="d" annotates the cells as integers
    plt.figure(figsize=(8, 6))
    sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

except OSError as e:
    # Missing or unreadable files: the expected case when the artifact volume
    # is not mounted (FileNotFoundError and PermissionError are OSError subclasses).
    print(f"Could not access metrics files: {e}")
    print("This is expected if running outside the Kubernetes cluster.")
    print("You can access metrics through the Kubeflow UI or by mounting the PVC.")
except Exception as e:
    # The files were reachable but parsing, formatting or plotting failed.
    # Report that separately so a real problem is not mistaken for a missing volume.
    print(f"Metrics files were found but could not be parsed or plotted: {e}")

## Prepare for Model Deployment

Now that our model is trained and registered, we can proceed to deploy it with KServe.
Continue to the next notebook `02_deploy_model.ipynb` for deployment steps.