# Overview:
Model CI/CD pipeline designed to automatically trigger model re-training upon detection of data drift.  
Methodology:  
- Develop a baseline regression model using synthetic data, incorporating some random variables to simulate potential future drift  
- Initiate re-training when a performance drop is observed (e.g., increased RMSE)  
- Consider model versioning  
- Consider data versioning

In [1]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

## Baseline model with synthetic data

In [None]:
# --- Function Definitions and Coefficient Setup ---

def dynamic_true_function(features, coeffs, noise_level=0.0):
    """Calculates the target value based on dynamic coefficients."""
    # np.dot is a clean way to do: c1*v1 + c2*v2 + ... + intercept
    base_value = np.dot(features, coeffs[:-1]) + coeffs[-1]
    if noise_level > 0:
        noise = np.random.normal(0, noise_level, size=base_value.shape)
        return base_value + noise
    return base_value

In [46]:
base_coeffs

array([ 1.5, -2. ,  1. ,  3. , -4. ,  2. ,  5. ])

In [59]:
# 2. Drifted coefficients for simulating a change in production
drift_scale = 0.1 # The magnitude of the drift
#np.random.seed(0) # Make the drift predictable for this example run
drift = np.random.normal(0, drift_scale, size=base_coeffs.shape)
drifted_coeffs = base_coeffs + drift

In [None]:
# 1. Original coefficients for training
base_coeffs = np.array([1.5, -2.0, 1.0, 3.0, -4.0, 2.0, 5.0])

# 2. Drifted coefficients for simulating a change in production
drift_scale = 0.1 # The magnitude of the drift
np.random.seed(0) # Make the drift predictable for this example run
drift = np.random.normal(0, drift_scale, size=base_coeffs.shape)
drifted_coeffs = base_coeffs + drift

print("--- Coefficient Setup ---")
print(f"Base Coefficients:    {np.round(base_coeffs, 2)}")
print(f"Drifted Coefficients: {np.round(drifted_coeffs, 2)}\n")

# --- Helper function for data generation ---
def generate_data(n_samples, coeffs, noise_level):
    """Generates a feature matrix (X) and target vector (y)"""
    x = np.random.uniform(-3, 3, size=n_samples)
    y = np.random.uniform(-4, 4, size=n_samples)
    a = np.random.normal(0, 2, size=n_samples)
    b = np.random.normal(5, 1.5, size=n_samples)
    c = np.random.uniform(0, 10, size=n_samples)
    d = np.random.normal(-2, 1, size=n_samples)
    e = np.random.uniform(-1, 1, size=n_samples)

    features_df = pd.DataFrame({
        'v1': x**4, 'v2': y**3, 'v3': a**2 * b,
        'v4': c**2, 'v5': d, 'v6': e
    })
    
    X = features_df.values
    y_target = dynamic_true_function(X, coeffs, noise_level)
    return X, y_target

--- Coefficient Setup ---
Base Coefficients:    [ 1.5 -2.   1.   3.  -4.   2.   5. ]
Drifted Coefficients: [ 1.68 -1.96  1.1   3.22 -3.81  1.9   5.1 ]



In [11]:
# =============================================================================
# PHASE 1: INITIAL MODEL TRAINING (using BASE coefficients)
# =============================================================================
print("--- PHASE 1: Initial Model Training ---")
n_train_samples = 500000
output_noise_level = 25.0

# Generate training data using the ORIGINAL function
X_train_full, y_train_full = generate_data(n_train_samples, base_coeffs, output_noise_level)

# Split and scale
X_train, X_test, y_train, y_test = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
print("Training model on original data...")
model = MLPRegressor(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', max_iter=500,
                     early_stopping=True, n_iter_no_change=15, verbose=False, random_state=42)
model.fit(X_train_scaled, y_train)

# Evaluate on its own test set to confirm it learned well
y_pred_initial = model.predict(X_test_scaled)
r2_initial = r2_score(y_test, y_pred_initial)
print(f"Initial model performance on non-drifted test data (R2): {r2_initial:.4f}\n")

# Save the artifacts
joblib.dump(model, "multivariate_model_v1.joblib")
joblib.dump(scaler, "scaler_v1.joblib")
print("Initial model and scaler saved.\n")


--- PHASE 1: Initial Model Training ---
Training model on original data...
Initial model performance on non-drifted test data (R2): 0.9515

Initial model and scaler saved.



In [12]:
# =============================================================================
# PHASE 2: SIMULATING PRODUCTION WITH CONCEPT DRIFT
# =============================================================================
print("--- PHASE 2: Simulating Production with Concept Drift ---")
print("Generating new 'production' data using the DRIFTED function...")
n_prod_samples = 10000

# Generate new data using the DRIFTED coefficients
X_prod, y_prod_ground_truth = generate_data(n_prod_samples, drifted_coeffs, output_noise_level)

# IMPORTANT: We must use the OLD scaler from training
X_prod_scaled = scaler.transform(X_prod)

# Use the OLD model to make predictions on the NEW, drifted data
y_pred_drifted = model.predict(X_prod_scaled)

# Evaluate how well the old model did on the new data
r2_drifted = r2_score(y_prod_ground_truth, y_pred_drifted)
mse_drifted = mean_squared_error(y_prod_ground_truth, y_pred_drifted)

print("\n--- Performance Evaluation on Drifted Data ---")
print(f"The original model's performance on the new, drifted data (R2): {r2_drifted:.4f}")
print(f"The original model's performance on non-drifted data (R2):     {r2_initial:.4f}")
print("\nCONCLUSION: The R2 score has dropped significantly. This performance degradation")
print("is exactly what your CI/CD monitoring system should detect to trigger a retrain!")

--- PHASE 2: Simulating Production with Concept Drift ---
Generating new 'production' data using the DRIFTED function...

--- Performance Evaluation on Drifted Data ---
The original model's performance on the new, drifted data (R2): 0.9406
The original model's performance on non-drifted data (R2):     0.9515

CONCLUSION: The R2 score has dropped significantly. This performance degradation
is exactly what your CI/CD monitoring system should detect to trigger a retrain!
