
# Machine Learning Pipeline for Fraud Detection

This notebook demonstrates a complete machine learning pipeline for fraud detection
using the credit fraud dataset from skrub. The dataset contains two tables:
- `baskets`: Contains basket IDs and fraud flags (target variable)
- `products`: Contains product information linked to baskets via `basket_ID` (which matches the `ID` column of `baskets`)

In [None]:
import skrub
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
from skrub import TableVectorizer
import warnings
warnings.filterwarnings('ignore')

# Load the credit-fraud dataset through skrub's dataset fetcher
dataset = skrub.datasets.fetch_credit_fraud()

# Quick sanity check: what the bundle exposes and how large each table is
print("Dataset keys:", dataset.keys())
for label, table in (("Baskets", dataset.baskets), ("Products", dataset.products)):
    print(f"{label} shape:", table.shape)

In [None]:
# Look at the schema first, then at a few sample rows, for both tables
tables = (("Baskets", dataset.baskets), ("Products", dataset.products))

# Column names of each table
for label, table in tables:
    print(f"{label} columns:", table.columns.tolist())

# First rows of each table
for label, table in tables:
    print(f"\n{label} head:")
    print(table.head())

In [None]:
# Inspect the class balance of the target and count missing values per column
fraud_flags = dataset.baskets['fraud_flag']

print("Fraud distribution:")
print(fraud_flags.value_counts())
# The mean of the flag column is reported as the fraud rate
print(f"Fraud rate: {fraud_flags.mean():.3f}")

# Per-column NaN counts, reported for each of the two tables in turn
for table_name in ('baskets', 'products'):
    print(f"\nMissing values in {table_name}:")
    print(getattr(dataset, table_name).isnull().sum())

In [None]:
# Build one feature row per basket by aggregating its product lines.
# Named aggregation yields flat `<column>_<statistic>` names directly, so no
# MultiIndex flattening step is needed afterwards.
_EPS = 1e-8  # keeps ratio features finite when a denominator is zero

basket_features = (
    dataset.products
    .groupby('basket_ID')
    .agg(
        # Price statistics over the basket's product lines
        cash_price_sum=('cash_price', 'sum'),
        cash_price_mean=('cash_price', 'mean'),
        cash_price_std=('cash_price', 'std'),
        cash_price_min=('cash_price', 'min'),
        cash_price_max=('cash_price', 'max'),
        cash_price_count=('cash_price', 'count'),
        # Purchased-quantity statistics
        Nbr_of_prod_purchas_sum=('Nbr_of_prod_purchas', 'sum'),
        Nbr_of_prod_purchas_mean=('Nbr_of_prod_purchas', 'mean'),
        Nbr_of_prod_purchas_std=('Nbr_of_prod_purchas', 'std'),
        # Distinct-value counts of the categorical descriptors
        item_nunique=('item', 'nunique'),
        make_nunique=('make', 'nunique'),
        model_nunique=('model', 'nunique'),
        goods_code_nunique=('goods_code', 'nunique'),
    )
    .reset_index()
)

# Ratio features. Both denominators are guarded with the same epsilon used by
# the other ratio features in this notebook, so a zero denominator produces a
# large finite value instead of `inf` (which `fillna` would not remove).
basket_features['avg_price_per_item'] = (
    basket_features['cash_price_sum']
    / (basket_features['Nbr_of_prod_purchas_sum'] + _EPS)
)
basket_features['price_std_norm'] = (
    basket_features['cash_price_std']
    / (basket_features['cash_price_mean'] + _EPS)
)

print("Basket features shape:", basket_features.shape)
print("Basket features columns:", basket_features.columns.tolist())

In [None]:
# Attach the fraud label to each basket's aggregated features.
# The join key is named `basket_ID` on the feature side and `ID` in `baskets`;
# an inner join keeps only baskets that appear in both tables.
df = pd.merge(
    basket_features,
    dataset.baskets,
    how='inner',
    left_on='basket_ID',
    right_on='ID',
)

print("Merged dataset shape:", df.shape)
print("Merged dataset columns:", df.columns.tolist())

In [None]:
# Engineer extra, fraud-oriented signals on top of the basket aggregates.
# NOTE(review): the quantile cut-offs below are computed on the full merged
# table, i.e. before the train/test split that happens later in the notebook,
# so the thresholds are informed by rows that end up in the test set.

basket_total = df['cash_price_sum']
units_bought = df['Nbr_of_prod_purchas_sum']

# 1. Price anomaly flags: basket total above the 95th / below the 5th percentile
high_price_cutoff = basket_total.quantile(0.95)
low_price_cutoff = basket_total.quantile(0.05)
df['price_anomaly'] = basket_total.gt(high_price_cutoff).astype(int)
df['low_price_anomaly'] = basket_total.lt(low_price_cutoff).astype(int)

# 2. Quantity anomaly flag: purchased quantity above the 95th percentile
quantity_cutoff = units_bought.quantile(0.95)
df['quantity_anomaly'] = units_bought.gt(quantity_cutoff).astype(int)

# 3. Diversity: distinct items / makes relative to the quantity purchased
#    (epsilon keeps the ratio finite when the quantity is zero)
safe_units = units_bought + 1e-8
df['item_diversity'] = df['item_nunique'] / safe_units
df['make_diversity'] = df['make_nunique'] / safe_units

# 4. Price consistency: inverse of the price spread, so larger means the
#    prices within the basket are closer together
price_spread = df['cash_price_std'] + 1e-8
df['price_consistency'] = 1 / price_spread

engineered_columns = [
    'price_anomaly', 'low_price_anomaly', 'quantity_anomaly',
    'item_diversity', 'make_diversity', 'price_consistency',
]
print("Enhanced dataset shape:", df.shape)
print("New features added:", engineered_columns)

In [None]:
# Separate predictors from the label. The two ID columns are join keys and the
# flag is the target, so none of the three belongs in the feature matrix.
non_feature_columns = ['basket_ID', 'ID', 'fraud_flag']
y = df['fraud_flag']
X = df.drop(columns=non_feature_columns)

print("Feature matrix shape:", X.shape)
print("Target distribution:", y.value_counts())
print("Features:", X.columns.tolist())

In [None]:
# Hold out 20% of the baskets for testing. Stratifying on the label preserves
# the class ratio in both partitions; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report partition sizes, then the fraud rate inside each partition
for part_name, features in (("Training", X_train), ("Test", X_test)):
    print(f"{part_name} set shape:", features.shape)
for part_name, labels in (("Training", y_train), ("Test", y_test)):
    print(f"{part_name} fraud rate:", labels.mean())

In [None]:
# Report, then impute, missing feature values before any preprocessing
print("Missing values in features:")
print(X_train.isna().sum())

# Constant fill: every NaN becomes 0, applied identically to both partitions
X_train, X_test = (partition.fillna(0) for partition in (X_train, X_test))

# Total NaN count left in the training features after filling
remaining_missing = X_train.isna().sum().sum()
print("Missing values after filling:")
print(remaining_missing)

In [None]:
# Configure skrub's TableVectorizer as the preprocessing step
text_encoder = skrub.TextEncoder()      # handed to the high-cardinality slot
string_encoder = skrub.StringEncoder()  # handed to the low-cardinality slot

preprocessor = TableVectorizer(
    low_cardinality=string_encoder,
    high_cardinality=text_encoder,
    drop_if_constant=True,
)

# Smoke test: fit on the training features and check the output shape
X_train_processed = preprocessor.fit_transform(X_train)
print("Processed training set shape:", X_train_processed.shape)

In [None]:
# Train several classifiers on the same features and compare their test ROC AUC.
# `results` maps each model name to its fitted pipeline, its hard predictions,
# its positive-class probabilities and its AUC score on the held-out test set.
from sklearn.base import clone

models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42, n_estimators=100)
}

results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Each pipeline gets its own unfitted copy of the preprocessor.
    # `Pipeline` does not clone its steps, so reusing the single `preprocessor`
    # object would make every stored pipeline share (and refit) one transformer.
    pipeline = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('classifier', model)
    ])

    # Fit preprocessing and classifier together on the training partition
    pipeline.fit(X_train, y_train)

    # Hard labels plus the probability of the positive (fraud) class
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

    # ROC AUC is computed from the probabilities, not the thresholded labels
    auc_score = roc_auc_score(y_test, y_pred_proba)

    results[name] = {
        'pipeline': pipeline,
        'predictions': y_pred,
        'probabilities': y_pred_proba,
        'auc_score': auc_score
    }

    print(f"{name} - AUC Score: {auc_score:.4f}")

In [None]:
# Evaluate the model treated as "best". The choice is fixed up front (Random
# Forest typically performs well on tabular data).
# NOTE(review): the name is hard-coded, not derived from the AUC scores stored
# in `results` - confirm this is intended.
best_model_name = 'Random Forest'
best_run = results[best_model_name]
best_pipeline = best_run['pipeline']
y_pred, y_pred_proba = best_run['predictions'], best_run['probabilities']

print(f"Best Model: {best_model_name}")
print(f"AUC Score: {best_run['auc_score']:.4f}")

# Per-class metrics on the held-out test set
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Counts of actual vs. predicted labels; reused by the plotting cell as `cm`
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

In [None]:
# Draw a 2x2 dashboard: confusion matrix, ROC curve, feature importances and
# a per-model AUC comparison
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax_confusion, ax_roc), (ax_importance, ax_compare) = axes
fig.suptitle('Fraud Detection Model Evaluation', fontsize=16, fontweight='bold')

best_auc = results[best_model_name]["auc_score"]
best_classifier = best_pipeline.named_steps['classifier']

# Panel 1: confusion matrix as an annotated heatmap
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_confusion)
ax_confusion.set(title='Confusion Matrix', xlabel='Predicted', ylabel='Actual')

# Panel 2: ROC curve of the chosen model, with the diagonal as a reference line
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
ax_roc.plot(fpr, tpr, linewidth=2, label=f'ROC Curve (AUC = {best_auc:.3f})')
ax_roc.plot([0, 1], [0, 1], 'k--', linewidth=1)
ax_roc.set(xlabel='False Positive Rate', ylabel='True Positive Rate', title='ROC Curve')
ax_roc.legend()
ax_roc.grid(True, alpha=0.3)

# Panel 3: feature importances, drawn only if the classifier exposes them
if hasattr(best_classifier, 'feature_importances_'):
    feature_names = best_pipeline.named_steps['preprocessor'].get_feature_names_out()
    importances = best_classifier.feature_importances_

    # argsort is ascending, so the last ten indices are the ten largest values
    top_indices = np.argsort(importances)[-10:]
    top_features = [feature_names[idx] for idx in top_indices]
    top_importances = importances[top_indices]

    bar_positions = range(len(top_features))
    ax_importance.barh(bar_positions, top_importances)
    ax_importance.set_yticks(bar_positions)
    ax_importance.set_yticklabels(top_features)
    ax_importance.set_xlabel('Feature Importance')
    ax_importance.set_title('Top 10 Most Important Features')

# Panel 4: test AUC of every trained model, each bar labelled with its value
model_names = list(results)
auc_scores = [run['auc_score'] for run in results.values()]
ax_compare.bar(model_names, auc_scores, color=['skyblue', 'lightgreen', 'lightcoral'])
ax_compare.set_ylabel('AUC Score')
ax_compare.set_title('Model Comparison')
ax_compare.tick_params(axis='x', rotation=45)
for bar_index, score in enumerate(auc_scores):
    # Place the label just above the top of the bar
    ax_compare.text(bar_index, score + 0.001, f'{score:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# Score the chosen pipeline with stratified, shuffled 5-fold cross-validation
# on the training partition, using ROC AUC as the metric
print("Performing 5-fold cross-validation...")
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    best_pipeline,
    X_train,
    y_train,
    scoring='roc_auc',
    cv=cv_splitter,
)

# Report the per-fold scores and the mean with a two-standard-deviation band
cv_mean = cv_scores.mean()
cv_band = cv_scores.std() * 2
print(f"Cross-validation AUC scores: {cv_scores}")
print(f"Mean CV AUC: {cv_mean:.4f} (+/- {cv_band:.4f})")

In [None]:
# Recap: chosen model, its test-set AUC, and the cross-validated AUC
summary_lines = [
    "\nFinal Model Performance Summary:",
    f"Model: {best_model_name}",
    f"Test AUC: {results[best_model_name]['auc_score']:.4f}",
    f"Mean CV AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})",
]
print("\n".join(summary_lines))

In [None]:
# Persist the fitted pipeline (preprocessing + classifier) to disk with joblib
import joblib

model_path = 'fraud_detection_model.pkl'
joblib.dump(best_pipeline, model_path)
print(f"Model saved as '{model_path}'")

In [None]:
# Print step-by-step instructions for reusing the saved model on new data
usage_instructions = (
    "\nExample: Making predictions on new data",
    "To use this model on new data, you would:",
    "1. Load the model: model = joblib.load('fraud_detection_model.pkl')",
    "2. Prepare your data in the same format as X_train",
    "3. Make predictions: predictions = model.predict(new_data)",
    "4. Get probabilities: probabilities = model.predict_proba(new_data)",
)
for instruction in usage_instructions:
    print(instruction)