# Lab 3: End-to-End ML Project — Customer Churn Prediction - SOLUTIONS
**Introduction to Data Science & Engineering - Day 3**

## Setup

In [None]:
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, classification_report, roc_auc_score, roc_curve)
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Suppress every warning category to keep the notebook output uncluttered.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the plotting and modelling libraries — consider narrowing it.
warnings.filterwarnings('ignore')

# Apply matplotlib's 'seaborn-v0_8-whitegrid' style sheet to all figures below.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic: render figures inline in the notebook output.
%matplotlib inline

print("Libraries loaded successfully!")

## Part 1: Generate and Explore the Dataset

In [None]:
# Seed NumPy's global RNG so the synthetic dataset is identical on every run.
# NOTE: every np.random call in this cell draws from that same global stream,
# so reordering, adding, or removing a draw changes all data generated after it.
np.random.seed(42)
n_customers = 2000

# Base features
tenure = np.random.randint(1, 72, n_customers)  # months; upper bound is exclusive, so values are 1..71
monthly_charges = np.round(np.random.uniform(20, 120, n_customers), 2)
# Lifetime charges = tenure x monthly charge, perturbed by a per-customer factor drawn from [0.8, 1.1)
total_charges = np.round(tenure * monthly_charges * np.random.uniform(0.8, 1.1, n_customers), 2)

# Categorical attributes. `contract` and `internet` use explicit class weights;
# `payment` passes no `p=`, so its four options are equally likely.
contract = np.random.choice(['Month-to-month', 'One year', 'Two year'], n_customers, p=[0.5, 0.3, 0.2])
payment = np.random.choice(['Electronic check', 'Mailed check', 'Bank transfer', 'Credit card'], n_customers)
internet = np.random.choice(['DSL', 'Fiber optic', 'No'], n_customers, p=[0.35, 0.45, 0.20])

support_tickets = np.random.poisson(2, n_customers)  # Poisson counts with mean 2
num_products = np.random.randint(1, 6, n_customers)  # upper bound is exclusive, so values are 1..5

# Churn based on realistic factors
# Start from zero and add (+=) or subtract (-=) a fixed amount for each
# condition a customer satisfies; boolean masks act as 0/1 multipliers.
churn_prob = np.zeros(n_customers)
churn_prob += (contract == 'Month-to-month') * 0.25
churn_prob += (tenure < 12) * 0.15
churn_prob += (monthly_charges > 80) * 0.1
churn_prob += (support_tickets > 3) * 0.15
churn_prob += (internet == 'Fiber optic') * 0.05
churn_prob -= (contract == 'Two year') * 0.2
churn_prob -= (num_products > 3) * 0.1
# Bound the probability to [0.05, 0.85] so no customer is a certain stay or a certain churn.
churn_prob = np.clip(churn_prob, 0.05, 0.85)

# One Bernoulli draw per customer using that customer's own churn probability.
churn = np.random.binomial(1, churn_prob)

# Assemble the final table. `age`, `has_partner` and `has_dependents` are drawn
# here, after `churn`, and (like `payment_method`) do not enter `churn_prob`,
# so they carry no designed relationship with the target.
df = pd.DataFrame({
    'customer_id': range(1, n_customers + 1),
    'tenure_months': tenure,
    'monthly_charges': monthly_charges,
    'total_charges': total_charges,
    'contract_type': contract,
    'payment_method': payment,
    'internet_service': internet,
    'support_tickets': support_tickets,
    'num_products': num_products,
    'age': np.random.randint(18, 75, n_customers),
    'has_partner': np.random.choice([0, 1], n_customers, p=[0.45, 0.55]),
    'has_dependents': np.random.choice([0, 1], n_customers, p=[0.6, 0.4]),
    'churn': churn
})

print(f"Dataset shape: {df.shape}")
print(f"\nChurn distribution:")
print(df['churn'].value_counts(normalize=True).round(3))
# Last expression of the cell: displayed as the cell's rich output.
df.head()

### Exercise 1.1: Explore the Data - SOLUTION

In [None]:
# Headline figures first (row/column count and overall churn rate), then the
# per-column numeric summary as the cell's rich output.
print(
    "Dataset Info:",
    f"  Shape: {df.shape}",
    f"  Churn rate: {df['churn'].mean():.1%}",
    "\nNumeric summary:",
    sep="\n",
)
df.describe().round(2)

In [None]:
# Visualize churn by key features on a 2x3 grid:
#   row 0: contract-type rates | tenure histogram      | monthly-charges box plot
#   row 1: support-ticket box  | internet-service rates | product-count rates
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

churn_palette = ['#10b981', '#ef4444']  # green = stayed (0), red = churned (1)
churn_labels = ['Stayed', 'Churned']

# The mean of the 0/1 churn flag within a group is that group's churn rate.
ct = df.groupby('contract_type')['churn'].mean().sort_values(ascending=False)
it = df.groupby('internet_service')['churn'].mean().sort_values(ascending=False)
pt = df.groupby('num_products')['churn'].mean()

# Churn-rate bar charts. Every panel gets the same y-axis label so each one
# can be read on its own (previously only the contract-type panel had one).
bar_panels = [
    (ct, axes[0, 0], '#3b82f6', 'Churn Rate by Contract Type', 45),
    (it, axes[1, 1], '#8b5cf6', 'Churn Rate by Internet Service', 45),
    (pt, axes[1, 2], '#f59e0b', 'Churn Rate by Number of Products', None),
]
for rates, ax, color, title, rotation in bar_panels:
    rates.plot(kind='bar', ax=ax, color=color)
    ax.set_title(title)
    ax.set_ylabel('Churn Rate')
    if rotation is not None:
        # Long category names are tilted; the numeric product counts keep pandas' default.
        ax.tick_params(axis='x', rotation=rotation)

# Tenure distribution by churn
sns.histplot(data=df, x='tenure_months', hue='churn', multiple='stack', bins=30,
             ax=axes[0, 1], palette=churn_palette)
axes[0, 1].set_title('Tenure Distribution by Churn')

# Numeric features split by churn outcome
box_panels = [
    ('monthly_charges', axes[0, 2], 'Monthly Charges by Churn'),
    ('support_tickets', axes[1, 0], 'Support Tickets by Churn'),
]
for column, ax, title in box_panels:
    sns.boxplot(data=df, x='churn', y=column, ax=ax, palette=churn_palette)
    ax.set_title(title)
    ax.set_xticklabels(churn_labels)  # replace the raw 0/1 tick labels

plt.tight_layout()
plt.show()

### Exercise 2.1: Create New Features - SOLUTION

In [None]:
# Build every engineered column in a single `assign` call. `assign` returns a
# new frame, so `df` itself is left untouched, and the keyword order fixes the
# order in which the new columns are appended.

# Shared denominator for the per-month ratios; a tenure of 0 is mapped to 1 to
# avoid dividing by zero.
tenure_safe = df['tenure_months'].replace(0, 1)

df_ml = df.assign(
    # Tenure-based features
    tenure_years=df['tenure_months'] / 12,
    is_new_customer=(df['tenure_months'] <= 6).astype(int),
    # Charges-based features
    avg_monthly_charge=df['total_charges'] / tenure_safe,
    charge_per_product=df['monthly_charges'] / df['num_products'],
    # Support ratio
    support_per_tenure=df['support_tickets'] / tenure_safe,
    # Engagement score (composite): weighted sum of product count, tenure
    # relative to 72 months, and the two household flags.
    engagement_score=(
        0.3 * df['num_products']
        + 0.3 * (df['tenure_months'] / 72)
        + 0.2 * df['has_partner']
        + 0.2 * df['has_dependents']
    ),
)

engineered_cols = ['tenure_years', 'is_new_customer', 'avg_monthly_charge',
                   'charge_per_product', 'support_per_tenure', 'engagement_score']

print("New features created:")
print(df_ml[engineered_cols].describe().round(3))

### Exercise 2.2: Encode Categorical Variables - SOLUTION

In [None]:
# One-hot encode the three categorical columns (the first level of each is
# dropped and acts as the reference category), then remove the identifier
# column, which is not a feature.
categorical_cols = ['contract_type', 'payment_method', 'internet_service']
df_encoded = (
    pd.get_dummies(df_ml, columns=categorical_cols, drop_first=True)
    .drop(columns='customer_id')
)

print(f"Shape after encoding: {df_encoded.shape}")
print("\nFeatures:")
for col, dtype in df_encoded.dtypes.items():
    print(f"  {col}: {dtype}")

### Exercise 2.3: Prepare Train/Test Split - SOLUTION

In [None]:
# Separate the target column from the predictors.
y = df_encoded['churn']
X = df_encoded.drop(columns='churn')

# 80/20 split, stratified on the target so both partitions keep a similar churn rate.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Scale numeric features
# (the 0/1 flags and one-hot dummy columns are not in this list and stay unscaled)
numeric_features = [
    'tenure_months', 'monthly_charges', 'total_charges', 'support_tickets',
    'num_products', 'age', 'tenure_years', 'avg_monthly_charge',
    'charge_per_product', 'support_per_tenure', 'engagement_score',
]

# Learn the scaling statistics from the training partition only, then apply
# the same fitted transform to the test partition.
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

for label, features, target in (("Training set", X_train, y_train),
                                ("Test set", X_test, y_test)):
    print(f"{label}: {features.shape} ({target.mean():.1%} churn)")

### Exercise 3.1: Train Three Models - SOLUTION

In [None]:
# Candidate classifiers, each with a fixed random_state.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
}

# Metrics that are computed from hard class predictions, keyed by the name
# under which they are stored in `results`.
label_metrics = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}

results = {}

for name, model in models.items():
    # Fit on the training partition, then score the test partition.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]  # probability of the positive (churn) class

    # Keep the fitted model and its predictions next to the metrics so later
    # cells can reuse them without refitting.
    entry = {'model': model, 'y_pred': y_pred, 'y_prob': y_prob}
    for metric_name, metric_fn in label_metrics.items():
        entry[metric_name] = metric_fn(y_test, y_pred)
    entry['auc'] = roc_auc_score(y_test, y_prob)  # AUC needs probabilities, not labels
    results[name] = entry

    print(f"\n{name}:")
    print(f"  Accuracy:  {results[name]['accuracy']:.4f}")
    print(f"  Precision: {results[name]['precision']:.4f}")
    print(f"  Recall:    {results[name]['recall']:.4f}")
    print(f"  F1 Score:  {results[name]['f1']:.4f}")
    print(f"  AUC:       {results[name]['auc']:.4f}")

### Exercise 3.2: Cross-Validation - SOLUTION

In [None]:
# Shuffled, stratified 5-fold splitter with a fixed seed, shared by all models.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("5-Fold Cross-Validation Results (F1 Score):")
print("-" * 55)

# NOTE(review): X_train was standardized using statistics from the whole
# training partition before these folds are formed, so each validation fold has
# influenced the scaling — a small optimistic bias is possible; confirm whether
# that matters for this lab.
for name, model_info in results.items():
    estimator = model_info['model']
    cv_scores = cross_val_score(estimator, X_train, y_train, scoring='f1', cv=cv)
    fold_mean, fold_std = cv_scores.mean(), cv_scores.std()
    print(f"  {name:25s}: {fold_mean:.4f} (+/- {fold_std:.4f})")

### Exercise 3.3: Confusion Matrices - SOLUTION

In [None]:
# One heatmap per model, side by side; rows are actual classes, columns are
# predicted classes.
class_labels = ['Stayed', 'Churned']
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, name in zip(axes, results):
    res = results[name]
    cm = confusion_matrix(y_test, res['y_pred'])
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=ax,
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    ax.set(
        title=f'{name}\n(F1: {res["f1"]:.3f})',
        ylabel='Actual',
        xlabel='Predicted',
    )

plt.tight_layout()
plt.show()

### Exercise 3.4: ROC Curves - SOLUTION

In [None]:
# Overlay the ROC curve of every model on one set of axes, with the diagonal
# as the random-classifier reference.
fig, ax = plt.subplots(figsize=(10, 8))

for name, res in results.items():
    fpr, tpr, _ = roc_curve(y_test, res['y_prob'])
    ax.plot(fpr, tpr, linewidth=2, label=f"{name} (AUC={res['auc']:.3f})")

ax.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random (AUC=0.500)')
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curves \u2014 Model Comparison', fontsize=14)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

### Exercise 4.1: Analyze Feature Importance - SOLUTION

In [None]:
rf_model = results['Random Forest']['model']

# Pair each training column with the forest's importance score for it and
# order the table from most to least important.
feature_importance = (
    pd.DataFrame({
        'feature': X_train.columns,
        'importance': rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=feature_importance.head(15), x='importance', y='feature',
            palette='viridis', ax=ax)
ax.set_title('Top 15 Features \u2014 Random Forest Importance', fontsize=14)
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
fig.tight_layout()
plt.show()

print("\nTop 10 Features:")
top_ten = feature_importance.head(10)
for feature, importance in zip(top_ten['feature'], top_ten['importance']):
    print(f"  {feature:30s}: {importance:.4f}")

In [None]:
gb_model = results['Gradient Boosting']['model']

# Same importance table as for the random forest, built from the boosted model.
gb_importance = (
    pd.DataFrame({
        'feature': X_train.columns,
        'importance': gb_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

# Side-by-side comparison
fig, axes = plt.subplots(1, 2, figsize=(18, 8))

comparison_panels = [
    (feature_importance, 'viridis', 'Random Forest \u2014 Top 10 Features'),
    (gb_importance, 'magma', 'Gradient Boosting \u2014 Top 10 Features'),
]
for ax, (importance_table, palette, title) in zip(axes, comparison_panels):
    sns.barplot(data=importance_table.head(10), x='importance', y='feature',
                ax=ax, palette=palette)
    ax.set_title(title)

plt.tight_layout()
plt.show()

### Exercise 5.1: Save the Best Model - SOLUTION

In [None]:
# Pick the entry with the highest test-set F1; on a tie the first model in
# `results` wins.
best_model_name, best_entry = max(results.items(), key=lambda item: item[1]['f1'])
best_model = best_entry['model']

print(f"Best model: {best_model_name} (F1: {results[best_model_name]['f1']:.4f})")

# Save model and scaler
# (both are needed later: the scaler must transform new inputs exactly as it
# transformed the training data)
for artifact, path in ((best_model, 'churn_model.pkl'), (scaler, 'churn_scaler.pkl')):
    joblib.dump(artifact, path)

print("Model and scaler saved!")

### Exercise 5.2: Create Prediction Function - SOLUTION

In [None]:
def predict_churn(customer_data, model_path='churn_model.pkl', scaler_path='churn_scaler.pkl'):
    """Predict churn probability for a new customer.

    Parameters
    ----------
    customer_data : dict
        Raw attributes of one customer. The function reads the keys
        ``tenure_months``, ``monthly_charges``, ``total_charges``,
        ``support_tickets``, ``num_products``, ``has_partner``,
        ``has_dependents``, ``contract_type``, ``payment_method`` and
        ``internet_service``; any other model feature (e.g. ``age``) must be
        present too, otherwise it is filled with 0.
    model_path : str
        Path of the joblib file holding the fitted classifier.
    scaler_path : str
        Path of the joblib file holding the fitted scaler.

    Returns
    -------
    dict
        ``{'prediction': 'CHURN RISK' | 'LIKELY TO STAY',
        'churn_probability': float rounded to 4 decimals}``. The label is
        ``'CHURN RISK'`` when the probability exceeds 0.5.
    """
    # NOTE: joblib files are pickles; only load artifacts you created yourself.
    model = joblib.load(model_path)
    scaler = joblib.load(scaler_path)

    df_input = pd.DataFrame([customer_data])

    # Engineer features (same formulas as used on the training data)
    tenure_safe = df_input['tenure_months'].replace(0, 1)  # avoid division by zero
    df_input['tenure_years'] = df_input['tenure_months'] / 12
    df_input['is_new_customer'] = (df_input['tenure_months'] <= 6).astype(int)
    df_input['avg_monthly_charge'] = df_input['total_charges'] / tenure_safe
    df_input['charge_per_product'] = df_input['monthly_charges'] / df_input['num_products']
    df_input['support_per_tenure'] = df_input['support_tickets'] / tenure_safe
    df_input['engagement_score'] = (
        df_input['num_products'] * 0.3 +
        (df_input['tenure_months'] / 72) * 0.3 +
        df_input['has_partner'] * 0.2 +
        df_input['has_dependents'] * 0.2
    )

    # Encode categoricals.
    # drop_first must NOT be used here: a one-row frame has exactly one level
    # per categorical column, so drop_first would remove that single dummy and
    # every customer would be encoded as the reference category regardless of
    # the value passed in. Instead, create the full dummy for the observed
    # level and let the alignment step below discard it if it is the reference
    # level that training dropped.
    categorical_cols = ['contract_type', 'payment_method', 'internet_service']
    df_encoded = pd.get_dummies(df_input, columns=categorical_cols)

    # Align columns with the layout the model was fitted on. Prefer the schema
    # stored on the loaded model; fall back to the notebook's training frame
    # for estimators that do not record feature names.
    feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = X_train.columns
    # Missing dummies become 0, unexpected columns are dropped, order is fixed.
    df_encoded = df_encoded.reindex(columns=list(feature_names), fill_value=0)

    # Scale exactly the columns the scaler was fitted on, falling back to the
    # known numeric feature list when the scaler does not record them.
    numeric_features_list = getattr(scaler, 'feature_names_in_', None)
    if numeric_features_list is None:
        numeric_features_list = ['tenure_months', 'monthly_charges', 'total_charges', 'support_tickets',
                                 'num_products', 'age', 'tenure_years', 'avg_monthly_charge',
                                 'charge_per_product', 'support_per_tenure', 'engagement_score']
    numeric_features_list = list(numeric_features_list)
    df_encoded[numeric_features_list] = scaler.transform(df_encoded[numeric_features_list])

    # Predict: column 1 of predict_proba is the positive (churn) class.
    prob = float(model.predict_proba(df_encoded)[0][1])
    prediction = "CHURN RISK" if prob > 0.5 else "LIKELY TO STAY"

    return {'prediction': prediction, 'churn_probability': round(prob, 4)}

# Test prediction
# Profile: 3 months of tenure, month-to-month contract, high monthly charge,
# fiber optic, five support tickets, a single product.
test_customer = dict(
    tenure_months=3,
    monthly_charges=95.00,
    total_charges=285.00,
    contract_type='Month-to-month',
    payment_method='Electronic check',
    internet_service='Fiber optic',
    support_tickets=5,
    num_products=1,
    age=32,
    has_partner=0,
    has_dependents=0,
)

result = predict_churn(test_customer)
print(
    f"Prediction: {result['prediction']}",
    f"Churn Probability: {result['churn_probability']:.1%}",
    sep="\n",
)

In [None]:
# Test with a loyal customer
# Profile: 48 months of tenure, two-year contract, moderate monthly charge,
# DSL, one support ticket, four products.
loyal_customer = dict(
    tenure_months=48,
    monthly_charges=55.00,
    total_charges=2640.00,
    contract_type='Two year',
    payment_method='Bank transfer',
    internet_service='DSL',
    support_tickets=1,
    num_products=4,
    age=45,
    has_partner=1,
    has_dependents=1,
)

result = predict_churn(loyal_customer)
print(
    f"Prediction: {result['prediction']}",
    f"Churn Probability: {result['churn_probability']:.1%}",
    sep="\n",
)

In [None]:
# Clean up saved files
# Removes the model and scaler artifacts written earlier so the working
# directory is left as it was found; files that are already gone are skipped.
# `os` is imported with the other modules in the setup cell at the top.
for artifact_path in ('churn_model.pkl', 'churn_scaler.pkl'):
    if os.path.exists(artifact_path):
        os.remove(artifact_path)
print("Cleanup complete!")

## Summary

In this lab, you learned how to:

1. **Generate and explore** a synthetic but realistic churn dataset
2. **Engineer features** from raw data (tenure-based, charge-based, engagement)
3. **Encode and scale** features for ML
4. **Train and compare** three models (Logistic Regression, Random Forest, Gradient Boosting)
5. **Evaluate models** with cross-validation, confusion matrices, and ROC curves
6. **Analyze feature importance** to understand model decisions
7. **Save and deploy** models with joblib and build prediction functions

---

*Introduction to Data Science & Engineering | AI Elevate*