## Real-World Case Studies

### Finance - Fraud Detection Models:
**Description**: Analyze a financial dataset, define SLAs for data accuracy and
completeness, and ensure high data quality for fraud detection models.

In [None]:
# write your code from here

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import unittest

# --- Sample Data Creation (simulates a financial dataset) ---
def generate_sample_data():
    np.random.seed(42)
    num_samples = 1000
    # 990 non-fraud, 10 fraud cases (1% fraud)
    fraud = pd.DataFrame({
        'Amount': np.random.normal(200, 50, 10),
        'Time': np.random.uniform(0, 100000, 10),
        'Feature1': np.random.normal(0, 1, 10),
        'Feature2': np.random.normal(0, 1, 10),
        'Class': 1
    })
    non_fraud = pd.DataFrame({
        'Amount': np.random.normal(50, 10, 990),
        'Time': np.random.uniform(0, 100000, 990),
        'Feature1': np.random.normal(0, 1, 990),
        'Feature2': np.random.normal(0, 1, 990),
        'Class': 0
    })
    return pd.concat([fraud, non_fraud], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)

# --- SLA Configuration ---
SLA_MISSING_THRESHOLD = 0.01
SLA_ACCURACY_THRESHOLD = 0.95

# --- Data Quality Check ---
def check_data_quality(df):
    missing_ratio = df.isnull().mean()
    if missing_ratio.max() > SLA_MISSING_THRESHOLD:
        raise ValueError("Completeness SLA failed")
    return True

# --- Preprocessing ---
def preprocess_data(df):
    df = df.dropna()
    X = df.drop('Class', axis=1)
    y = df['Class']
    return X, y

# --- Data Splitting ---
def split_data(X, y):
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)
    return X_train, X_val, X_test, y_train, y_val, y_test

# --- Model Training ---
def train_model(X_train, y_train):
    model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
    model.fit(X_train, y_train)
    return model

# --- Model Evaluation ---
def evaluate_model(model, X_val, y_val):
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    print(classification_report(y_val, y_pred))
    if acc < SLA_ACCURACY_THRESHOLD:
        raise ValueError("Accuracy SLA failed")
    return acc

# --- Pipeline Execution ---
def run_pipeline():
    df = generate_sample_data()
    check_data_quality(df)
    X, y = preprocess_data(df)
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)
    model = train_model(X_train, y_train)
    acc = evaluate_model(model, X_val, y_val)
    print(f"✅ Model passed with accuracy: {acc:.2f}")
    return model

# --- Basic Unit Tests ---
class TestFraudDetection(unittest.TestCase):

    def test_data_quality_pass(self):
        df = pd.DataFrame({
            'Amount': [10, 20, 30],
            'Time': [1, 2, 3],
            'Feature1': [0.1, 0.2, 0.3],
            'Feature2': [0.4, 0.5, 0.6],
            'Class': [0, 1, 0]
        })
        self.assertTrue(check_data_quality(df))

    def test_preprocessing(self):
        df = pd.DataFrame({
            'Amount': [10, 20],
            'Time': [1, 2],
            'Feature1': [0.1, 0.2],
            'Feature2': [0.3, 0.4],
            'Class': [0, 1]
        })
        X, y = preprocess_data(df)
        self.assertEqual(X.shape[1], 4)
        self.assertEqual(len(y), 2)

# --- Run Script ---
if __name__ == "__main__":
    print("🔍 Running Fraud Detection Pipeline...\n")
    run_pipeline()

    print("\n🧪 Running Unit Tests...")
    unittest.main(argv=[''], exit=False)


..
----------------------------------------------------------------------
Ran 2 tests in 0.007s

OK


🔍 Running Fraud Detection Pipeline...

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       148
           1       1.00      1.00      1.00         2

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150

✅ Model passed with accuracy: 1.00

🧪 Running Unit Tests...
