In [None]:
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow
import mlflow.sklearn

# Confirm the imports above succeeded and record the wall-clock start time.
print(
    "‚úÖ Libraries imported successfully",
    f"üìÖ Training started at: {datetime.now()}",
    sep="\n",
)

In [None]:
# Read the customer feature CSV from the Lakehouse and pull it into pandas.
# NOTE(review): `spark` is not defined in this notebook; presumably the
# runtime injects a SparkSession under that name - confirm.
print("üìÇ Loading data from Lakehouse...")

data = (
    spark.read.format("csv")
    .option("header", "true")        # treat the first row as column names
    .option("inferSchema", "true")   # let Spark infer column types
    .load("Files/silver/customer_features/sample_customer_data.csv")
    .toPandas()
)

# Quick sanity report: row count, shape and label balance.
print(f"‚úÖ Loaded {data.shape[0]} customer records from Lakehouse")
print(f"\nüìä Dataset shape: {data.shape}")
print(f"\nüéØ Churn distribution:\n{data['churn'].value_counts()}")

In [None]:
# Feature engineering
#
# Adds three derived columns to `data` and builds the model inputs:
#   purchase_frequency - total_purchases per 30 days of customer age
#   engagement_score   - total spend (purchases * avg value) per day of customer age
#   recency_score      - 1 / (days_since_last_purchase + 1)
# Outputs: `feature_columns` (list of column names), `X` (feature frame),
# `y` (the 'churn' label column).
print("üîß Engineering features...")

# Two of the ratios divide by customer_age_days. When that value is 0, pandas
# does not raise: it yields inf (or NaN for 0/0), and those non-finite values
# then flow into X. Mask the zero-age rows out of the denominator and give
# them a neutral 0.0 instead; every other row is computed exactly as before.
age_days = data['customer_age_days']
zero_age = age_days == 0
safe_age_days = age_days.mask(zero_age)  # NaN wherever the age is 0

data['purchase_frequency'] = (
    data['total_purchases'] / (safe_age_days / 30)
).mask(zero_age, 0.0)
data['engagement_score'] = (
    (data['total_purchases'] * data['avg_purchase_value']) / safe_age_days
).mask(zero_age, 0.0)
# The +1 keeps the denominator non-zero when days_since_last_purchase is 0.
data['recency_score'] = 1 / (data['days_since_last_purchase'] + 1)

feature_columns = [
    'total_purchases', 'avg_purchase_value', 'days_since_last_purchase',
    'customer_age_days', 'support_tickets', 'purchase_frequency',
    'engagement_score', 'recency_score'
]

X = data[feature_columns]
y = data['churn']

print(f"‚úÖ Features prepared: {len(feature_columns)} features")

In [None]:
# Hold out 20% of the rows for evaluation. The split is seeded so it is
# repeatable, and stratified on the label so both parts keep the same
# churn ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"üìä Training set: {X_train.shape[0]} samples")
print(f"üìä Test set: {X_test.shape[0]} samples")

In [None]:
# Train the Random Forest, score it on the held-out split, and record the
# parameters, metrics and fitted model in an MLflow run.
print("\nü§ñ Training Random Forest model...")

mlflow.set_experiment("NVR_Customer_Churn_Prediction")

# The run name carries a timestamp so runs can be told apart by name.
with mlflow.start_run(run_name=f"rf_model_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):
    # Hyperparameters: logged first, then passed to the estimator
    params = dict(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        random_state=42,
    )
    mlflow.log_params(params)

    # Fit on the training split
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)

    print("‚úÖ Model training completed")

    # Predict on the test split and compute the four metrics in one pass
    y_pred = model.predict(X_test)
    accuracy, precision, recall, f1 = (
        score(y_test, y_pred)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # Record each metric under the name used in the experiment
    for metric_name, metric_value in zip(
        ("accuracy", "precision", "recall", "f1_score"),
        (accuracy, precision, recall, f1),
    ):
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted estimator as a run artifact
    mlflow.sklearn.log_model(model, "random_forest_model")

    # Human-readable report of the same metrics
    print("\nüìä Model Performance Metrics:")
    print("=" * 40)
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1 Score:  {f1:.4f}")
    print("=" * 40)

In [None]:
# Rank the features by the fitted forest's importances and print them,
# most important first.
print("\nüîç Feature Importance:")
print("=" * 40)

feature_importance = (
    pd.DataFrame({
        'feature': feature_columns,
        'importance': model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

# One line per feature: name left-aligned in a 30-character column, then the score.
for feature_name, importance in zip(
    feature_importance['feature'], feature_importance['importance']
):
    print(f"  {feature_name:30s} {importance:.4f}")

print("=" * 40)

In [None]:
# Closing summary: split sizes, feature count, test accuracy and finish time.
print(
    "\n‚úÖ Model Training Pipeline Completed",
    "=" * 50,
    "  Model Type: Random Forest Classifier",
    f"  Training Samples: {len(X_train)}",
    f"  Test Samples: {len(X_test)}",
    f"  Features: {len(feature_columns)}",
    f"  Model Accuracy: {accuracy:.2%}",
    f"  Completed at: {datetime.now()}",
    "=" * 50,
    "\nüöÄ Model ready for deployment!",
    sep="\n",
)

In [None]:
import os
from datetime import datetime
# Banner for production monitoring: timestamp, deployment environment
# (from the ENVIRONMENT variable, 'dev' when unset) and model version.
print(
    "=" * 50,
    "üöÄ Model Training v2.0 - Enhanced Logging",
    "=" * 50,
    f"Training started: {datetime.now()}",
    f"Environment: {os.getenv('ENVIRONMENT', 'dev')}",
    "Model version: 2.0",
    "=" * 50,
    sep="\n",
)