# BK Pulse - Customer Churn Prediction Demo

This notebook demonstrates the BK Pulse churn prediction system for Bank of Kigali.

## Mission Capstone Project - ALU Rwanda

## 1. Setup and Imports

In [None]:
import sys
import os

# Add the parent directory to the import path so the `src` package resolves.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data.data_loader import DataLoader
from src.data.preprocessor import DataPreprocessor
from src.models.churn_model import ChurnPredictor
from src.utils.config_loader import load_config
from src.utils.visualization import (
    plot_confusion_matrix, plot_roc_curve, 
    plot_feature_importance, plot_churn_distribution
)

# Notebook-wide presentation settings.
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None
# Use the seaborn dark-grid look for all matplotlib figures.
plt.style.use('seaborn-v0_8-darkgrid')

print("✅ Imports successful!")

## 2. Load Configuration

In [None]:
# Read the project settings from the YAML file one level above the notebook.
config = load_config('../config.yaml')
print("Configuration loaded:")

# Echo the model settings that the later cells read from config['model'].
for setting_label, setting_key in (
    ("Algorithm", 'algorithm'),
    ("Test size", 'test_size'),
    ("Handle imbalance", 'handle_imbalance'),
):
    print(f"{setting_label}: {config['model'][setting_key]}")

## 3. Generate Sample Data

Since we may not have real customer data, we'll generate synthetic data that mimics Bank of Kigali customer characteristics.

In [None]:
# Build the loader and synthesise a 1,000-row customer table.
# NOTE(review): no seed is passed here, so whether the sample is reproducible
# depends on DataLoader.generate_sample_data — confirm.
loader = DataLoader('../data/raw/customer_data.csv')
df = loader.generate_sample_data(n_samples=1000)

record_count = len(df)
print(f"Generated {record_count} customer records")

# The mean of the 'churn' column is reported as the churn rate.
churn_rate = df['churn'].mean()
print(f"\nChurn rate: {churn_rate:.2%}")

# Preview the first rows (last expression of the cell, so it is displayed).
df.head()

## 4. Exploratory Data Analysis

In [None]:
# Summary statistics of the generated data.
# The frame is left as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain-text dump produced by print().
print("Data Summary:")
df.describe()

In [None]:
# Visualize the churn distribution of the generated sample data.
# `plot_churn_distribution` is the project helper imported from
# src.utils.visualization; it receives the whole frame, including the 'churn' column.
# NOTE(review): the helper presumably creates and shows the figure itself — confirm.
plot_churn_distribution(df)

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap.
numeric_frame = df.select_dtypes(include=[np.number])
correlation = numeric_frame.corr()

# Explicit figure/axes so every drawing call targets the same axes.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation, annot=False, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
plt.show()

## 5. Data Preprocessing

In [None]:
# Build the preprocessor from the loaded configuration and run it on the raw
# frame with fit=True, i.e. in fitting mode.
preprocessor = DataPreprocessor(config)
df_processed = preprocessor.preprocess(df, fit=True)

# Report the resulting shape and the feature names the preprocessor exposes.
processed_shape = df_processed.shape
print(f"Processed data shape: {processed_shape}")

created_features = preprocessor.feature_names
print(f"Features created: {created_features}")

In [None]:
# Train/test split; the test fraction and the seed both come from the model configuration.
split_settings = {
    'test_size': config['model']['test_size'],
    'random_state': config['model']['random_state'],
}
X_train, X_test, y_train, y_test = preprocessor.split_data(df_processed, **split_settings)

# Sizes of both partitions.
for split_name, split_X in (("Training", X_train), ("Test", X_test)):
    print(f"{split_name} set size: {len(split_X)}")

# Share of churners in each partition (mean of the target).
for split_name, split_y in (("Train", y_train), ("Test", y_test)):
    print(f"{split_name} churn rate: {split_y.mean():.2%}")

## 6. Model Training

In [None]:
# Create the predictor from the configuration.
model = ChurnPredictor(config)

# Fit the model; train() receives the test split in addition to the training split.
# NOTE(review): presumably the test split is used for reporting only — confirm
# it does not influence fitting.
print("Training model...")
train_metrics = model.train(X_train, y_train, X_test, y_test)

# Cross-validation summary returned by train().
cv_mean = train_metrics['cv_mean_score']
cv_std = train_metrics['cv_std_score']
print(f"\nCross-validation score: {cv_mean:.4f} (+/- {cv_std:.4f})")

In [None]:
# Ask the model for its preferred classification threshold and report it.
# NOTE(review): the threshold is tuned on X_test / y_test, the same split that
# is used for evaluation afterwards, so the reported test metrics are likely
# optimistic — consider tuning on a separate validation split.
# NOTE(review): presumably optimize_threshold also stores the threshold on the
# model so that later predict() calls use it — confirm in ChurnPredictor.
best_threshold = model.optimize_threshold(X_test, y_test)
print(f"Optimal classification threshold: {best_threshold:.2f}")

## 7. Model Evaluation

In [None]:
# Score the trained model on the test split.
metrics = model.evaluate(X_test, y_test)

print("Model Performance:")
# Each label is padded so the values line up in one column.
metric_rows = (
    ("Accuracy:  ", 'accuracy'),
    ("Precision: ", 'precision'),
    ("Recall:    ", 'recall'),
    ("F1 Score:  ", 'f1_score'),
    ("ROC AUC:   ", 'roc_auc'),
)
for metric_label, metric_key in metric_rows:
    print(f"{metric_label}{metrics[metric_key]:.4f}")

In [None]:
# Confusion matrix on the test split: the model's predictions for X_test are
# compared with the true labels y_test by the project plotting helper.
# NOTE(review): predict() presumably returns 0/1 class labels — confirm.
y_pred = model.predict(X_test)
plot_confusion_matrix(y_test, y_pred)

In [None]:
# ROC curve on the test split, drawn from the model's predicted probabilities.
# NOTE(review): predict_proba appears to return one churn probability per row
# (other cells index it with [0] and format the result as a percentage) rather
# than a two-column array — confirm in ChurnPredictor.
y_proba = model.predict_proba(X_test)
plot_roc_curve(y_test, y_proba)

## 8. Feature Importance Analysis

In [None]:
# Ask the model for its feature-importance table.
importance_df = model.get_feature_importance()

# An empty table means the model offers no importances; otherwise print the
# first ten rows and plot the top fifteen.
if importance_df.empty:
    print("Feature importance not available for this model type.")
else:
    print("Top 10 Most Important Features:")
    print(importance_df.head(10))

    plot_feature_importance(importance_df, top_n=15)

## 9. Making Predictions

### Single Customer Prediction

In [None]:
# Example: a hand-written customer profile that is expected to look risky.
high_risk_profile = {
    'age': 45,
    'gender': 'M',
    'location': 'Kigali',
    'account_balance': 30000,          # low balance
    'account_age_months': 36,
    'number_of_products': 1,           # a single product
    'has_credit_card': 0,              # no credit card
    'has_mobile_banking': 0,           # no mobile banking
    'avg_monthly_transactions': 3,     # low activity
    'total_transaction_amount': 100000,
    'transaction_frequency': 5,
    'days_since_last_login': 250,      # long time since last login
    'customer_service_calls': 8,       # many calls
    'complaints_filed': 4              # several complaints
}
high_risk_customer = pd.DataFrame([high_risk_profile])

# Reuse the already-fitted preprocessor (fit=False), then keep every column
# except the label and the identifier as model input.
customer_processed = preprocessor.preprocess(high_risk_customer, fit=False)
non_feature_cols = ('churn', 'customer_id')
feature_cols = [name for name in customer_processed.columns if name not in non_feature_cols]
X_customer = customer_processed[feature_cols]

# Score the single row: first element of the probability and label outputs.
churn_prob = model.predict_proba(X_customer)[0]
churn_pred = model.predict(X_customer)[0]

print("High-Risk Customer Profile:")
prediction_label = 'Yes' if churn_pred else 'No'
print(f"Churn Prediction: {prediction_label}")
print(f"Churn Probability: {churn_prob:.2%}")

# Risk bands: above 0.7 is High, above 0.4 is Medium, everything else is Low.
if churn_prob > 0.7:
    risk_label = 'High'
elif churn_prob > 0.4:
    risk_label = 'Medium'
else:
    risk_label = 'Low'
print(f"Risk Level: {risk_label}")

In [None]:
# Example: a hand-written customer profile that is expected to look safe.
low_risk_profile = {
    'age': 32,
    'gender': 'F',
    'location': 'Kigali',
    'account_balance': 800000,         # high balance
    'account_age_months': 48,
    'number_of_products': 4,           # several products
    'has_credit_card': 1,              # has a credit card
    'has_mobile_banking': 1,           # uses mobile banking
    'avg_monthly_transactions': 25,    # active
    'total_transaction_amount': 2000000,
    'transaction_frequency': 35,
    'days_since_last_login': 2,        # recent login
    'customer_service_calls': 1,       # few calls
    'complaints_filed': 0              # no complaints
}
low_risk_customer = pd.DataFrame([low_risk_profile])

# Same pipeline as for the high-risk example; `feature_cols` is the column
# list built in the previous cell.
customer_processed = preprocessor.preprocess(low_risk_customer, fit=False)
X_customer = customer_processed[feature_cols]

# Score the single row: first element of the probability and label outputs.
churn_prob = model.predict_proba(X_customer)[0]
churn_pred = model.predict(X_customer)[0]

print("Low-Risk Customer Profile:")
prediction_label = 'Yes' if churn_pred else 'No'
print(f"Churn Prediction: {prediction_label}")
print(f"Churn Probability: {churn_prob:.2%}")

# Start from the lowest band and upgrade when a cut-off is exceeded
# (above 0.4 is Medium, above 0.7 is High).
risk_label = 'Low'
if churn_prob > 0.4:
    risk_label = 'Medium'
if churn_prob > 0.7:
    risk_label = 'High'
print(f"Risk Level: {risk_label}")

## 10. Batch Predictions - Risk Segmentation

In [None]:
def _risk_bucket(probability):
    """Map a churn probability to 'High' (> 0.7), 'Medium' (> 0.4) or 'Low'."""
    if probability > 0.7:
        return 'High'
    if probability > 0.4:
        return 'Medium'
    return 'Low'


# Score every customer in the test split.
test_probabilities = model.predict_proba(X_test)
test_predictions = model.predict(X_test)

# One row per test customer: true label, predicted label, probability, risk band.
results_df = pd.DataFrame({
    'actual_churn': y_test.values,
    'predicted_churn': test_predictions,
    'churn_probability': test_probabilities,
    'risk_level': [_risk_bucket(probability) for probability in test_probabilities]
})

# Risk distribution: the value_counts table first, then one line per band.
print("Customer Risk Distribution:")
print(results_df['risk_level'].value_counts())

band_sizes = {
    band: (results_df['risk_level'] == band).sum()
    for band in ('High', 'Medium', 'Low')
}
print(f"\nHigh Risk: {band_sizes['High']} customers")
print(f"Medium Risk: {band_sizes['Medium']} customers")
print(f"Low Risk: {band_sizes['Low']} customers")

In [None]:
# Bar chart of how many test customers fall into each risk level.
risk_counts = results_df['risk_level'].value_counts()
colors = {'High': 'red', 'Medium': 'orange', 'Low': 'green'}
bar_colors = [colors[level] for level in risk_counts.index]

# Explicit figure/axes so every drawing call targets the same axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(risk_counts.index, risk_counts.values, color=bar_colors)
ax.set_title('Customer Risk Level Distribution')
ax.set_xlabel('Risk Level')
ax.set_ylabel('Number of Customers')
ax.grid(axis='y', alpha=0.3)
plt.show()

## 11. Actionable Insights for Bank of Kigali

### Retention Strategy Recommendations

In [None]:
# Split the scored test customers by risk level and print a retention
# recommendation for each segment.

# High-risk customers requiring immediate action
high_risk_customers = results_df[results_df['risk_level'] == 'High']
high_risk_actual_churn = high_risk_customers['actual_churn'].mean()

# The mean of an empty selection is NaN, which would be printed as "nan%".
# Report an explicit placeholder when no customer falls into the high-risk band.
if high_risk_customers.empty:
    high_risk_churn_text = "n/a (no high-risk customers)"
else:
    high_risk_churn_text = f"{high_risk_actual_churn:.2%}"

print("=" * 60)
print("ACTIONABLE INSIGHTS FOR BANK OF KIGALI")
print("=" * 60)
print(f"\n1. HIGH-RISK CUSTOMERS: {len(high_risk_customers)}")
print(f"   - Actual churn rate: {high_risk_churn_text}")
print("   - Action: Immediate intervention required")
print("   - Strategy: Personal call from relationship manager")
print("   - Offer: Premium service upgrade or special retention bonus")

medium_risk_customers = results_df[results_df['risk_level'] == 'Medium']
print(f"\n2. MEDIUM-RISK CUSTOMERS: {len(medium_risk_customers)}")
print("   - Action: Proactive engagement within 2 weeks")
print("   - Strategy: Personalized email with service improvements")
print("   - Offer: Fee waiver or loyalty rewards")

low_risk_customers = results_df[results_df['risk_level'] == 'Low']
print(f"\n3. LOW-RISK CUSTOMERS: {len(low_risk_customers)}")
print("   - Action: Standard engagement and satisfaction monitoring")
print("   - Strategy: Continue excellent service")
print("   - Opportunity: Cross-sell additional products")

print("\n" + "=" * 60)

## 12. Save Model and Artifacts

In [None]:
# Persist the trained model, the fitted preprocessor and the test-set predictions.
# The target folders are created first: DataFrame.to_csv does not create missing
# parent directories, and whether save_model / save_preprocessor do is not
# visible from this notebook, so a fresh checkout could otherwise fail here.
# (`os` is imported in the setup cell.)
model_path = '../data/models/churn_model.joblib'
preprocessor_path = '../data/models/scaler.joblib'
predictions_path = '../data/processed/test_predictions.csv'

for artifact_path in (model_path, preprocessor_path, predictions_path):
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)

# Save model
model.save_model(model_path)
print("✅ Model saved successfully!")

# Save preprocessor
preprocessor.save_preprocessor(preprocessor_path)
print("✅ Preprocessor saved successfully!")

# Save predictions
results_df.to_csv(predictions_path, index=False)
print("✅ Predictions saved successfully!")

## Conclusion

This notebook demonstrated the BK Pulse customer churn prediction system:

1. ✅ Generated synthetic customer data
2. ✅ Performed exploratory data analysis
3. ✅ Preprocessed and engineered features
4. ✅ Trained a machine learning model
5. ✅ Evaluated model performance
6. ✅ Analyzed feature importance
7. ✅ Made predictions and segmented customers by risk
8. ✅ Provided actionable retention strategies

The system can now be deployed as an API or integrated into Bank of Kigali's systems for real-time churn prediction and proactive customer retention.

---

**Mission Capstone Project - ALU Rwanda**  
**Bank of Kigali Customer Retention Initiative**