# Customer Analysis: Segmentation and Churn Prediction

This notebook performs customer analysis on the Brazilian E-commerce (Olist) dataset. It covers:
1. RFM (Recency, Frequency, Monetary) Analysis
2. Customer Segmentation using K-means Clustering
3. Churn Prediction Model Development
4. Customer Lifetime Value Analysis

In [None]:
import sys
from pathlib import Path

# Add project root to Python path
project_root = Path.cwd().parent.parent
sys.path.append(str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from src.data_processing.data_loader import OlistDataLoader
from src.visualization.plot_utils import (
    set_plotting_style,
    plot_correlation_matrix
)
from src.models.model_evaluation import evaluate_classifier
from utils.logging_utils import setup_logger
from config.settings import RFM_QUANTILES, CUSTOMER_SEGMENTS

# Set up logging
logger = setup_logger(__name__)

## 1. Data Loading and Preparation

In [None]:
# Point the project loader at the raw data directory and fetch its
# preprocessed output
data_dir = project_root.joinpath('data', 'raw')
loader = OlistDataLoader(data_dir)
processed_data = loader.get_preprocessed_data()

# Pull out the two tables this notebook works with
customer_features, orders = (
    processed_data[table] for table in ('customer_features', 'orders')
)

# Apply the shared plotting style before any figure is drawn
set_plotting_style()

## 2. RFM Analysis

In [None]:
def calculate_rfm_scores(df, n_bins=4):
    """Calculate RFM scores for each customer.

    Each of the three input columns is split into ``n_bins`` quantile
    buckets and the bucket label is used as the score.

    Args:
        df: DataFrame with the columns ``days_since_last_purchase``,
            ``order_count`` and ``total_spend``.
        n_bins: Number of quantile buckets (and therefore the highest
            score). Defaults to 4, i.e. quartiles.

    Returns:
        DataFrame indexed like ``df`` with categorical columns ``R``, ``F``
        and ``M``. ``R`` is reversed (smallest number of days -> highest
        score); ``F`` and ``M`` increase with the underlying value.
    """
    def _quantile_score(values, labels):
        # Bucket one column into n_bins quantile groups carrying `labels`.
        try:
            return pd.qcut(values, q=n_bins, labels=labels)
        except ValueError:
            # Heavily tied columns (e.g. most customers having exactly one
            # order) produce duplicate quantile edges, which qcut rejects.
            # Ranking with method='first' makes every value distinct so the
            # buckets can still be formed; ties are split by row order.
            return pd.qcut(
                values.rank(method='first'), q=n_bins, labels=labels
            )

    ascending = list(range(1, n_bins + 1))
    descending = ascending[::-1]

    return pd.DataFrame({
        # Recent buyers (few days since last purchase) get the top score
        'R': _quantile_score(df['days_since_last_purchase'], descending),
        'F': _quantile_score(df['order_count'], ascending),
        'M': _quantile_score(df['total_spend'], ascending),
    })

# Calculate RFM scores
rfm_scores = calculate_rfm_scores(customer_features)

# Drop any score columns left over from a previous run of this cell before
# attaching the fresh ones; otherwise every re-run would append duplicate
# R/F/M columns and break later column selection and plotting.
customer_features = pd.concat(
    [
        customer_features.drop(
            columns=list(rfm_scores.columns), errors='ignore'
        ),
        rfm_scores,
    ],
    axis=1
)

# Plot RFM score distributions, one panel per score
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for idx, metric in enumerate(['R', 'F', 'M']):
    sns.countplot(data=customer_features, x=metric, ax=axes[idx])
    axes[idx].set_title(f'{metric} Score Distribution')
plt.tight_layout()
plt.show()

## 3. Customer Segmentation

In [None]:
# Prepare features for clustering
clustering_features = [
    'days_since_last_purchase',
    'order_count',
    'total_spend',
    'avg_order_value'
]

X = customer_features[clustering_features].copy()

# Scale features so that no single column dominates the Euclidean
# distances K-means relies on
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform K-means clustering.
# n_init is pinned explicitly: the library default changed from 10 to 'auto'
# in scikit-learn 1.4, so relying on it yields different segments (and a
# FutureWarning on some versions) depending on the installed release.
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
customer_features['Cluster'] = kmeans.fit_predict(X_scaled)

# Analyze clusters: per-cluster means in the original (unscaled) units
cluster_stats = customer_features.groupby('Cluster')[clustering_features].mean()
print("\nCluster Statistics:")
print(cluster_stats)

# Visualize clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=customer_features,
    x='total_spend',
    y='order_count',
    hue='Cluster',
    palette='deep'
)
plt.title('Customer Segments based on Spend and Order Frequency')
plt.show()

## 4. Churn Prediction

In [None]:
def define_churn(df, inactive_days=90):
    """Flag customers whose inactivity exceeds a threshold.

    Args:
        df: DataFrame with a ``days_since_last_purchase`` column.
        inactive_days: Inactivity threshold in days; a customer counts as
            churned only when strictly above it.

    Returns:
        Integer Series aligned with ``df``: 1 for churned, 0 otherwise.
    """
    days_inactive = df['days_since_last_purchase']
    is_churned = days_inactive.gt(inactive_days)
    return is_churned.astype(int)

# Prepare features for churn prediction.
# The recency score 'R' is deliberately excluded: the churn label is derived
# from days_since_last_purchase, and 'R' is a quantile bucket of that very
# column, so including it would leak the target into the features and make
# the reported metrics meaningless.
churn_features = [
    'order_count',
    'total_spend',
    'avg_order_value',
    'F',
    'M'
]

X = customer_features[churn_features].copy()
y = define_churn(customer_features)

# Split data, stratifying on the label so the train and test sets keep the
# same churn rate even when the classes are imbalanced
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate model
metrics = evaluate_classifier(
    y_test,
    y_pred,
    ['Not Churned', 'Churned']
)

print("\nChurn Prediction Results:")
print(metrics['classification_report'])

# Feature importance, largest first
feature_importance = pd.DataFrame({
    'feature': churn_features,
    'importance': clf.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('Feature Importance for Churn Prediction')
plt.show()

## 5. Customer Lifetime Value Analysis

In [None]:
def calculate_clv(df, min_days=1):
    """Calculate Customer Lifetime Value.

    CLV is computed as average order value multiplied by an annualised
    purchase frequency, where the frequency is ``order_count`` divided by
    the years elapsed since the last purchase.

    Args:
        df: DataFrame with the columns ``total_spend``, ``order_count`` and
            ``days_since_last_purchase``.
        min_days: Lower bound applied to ``days_since_last_purchase`` before
            it is used as a denominator. Defaults to 1 day.

    Returns:
        Series aligned with ``df`` holding the CLV estimate. Rows with an
        ``order_count`` of zero evaluate to NaN.
    """
    avg_order_value = df['total_spend'] / df['order_count']

    # A customer who bought on the reference date has 0 days since the last
    # purchase; dividing by that yields an infinite CLV, which in turn
    # breaks histogram binning. Floor the recency to keep values finite.
    years_since_last_purchase = (
        df['days_since_last_purchase'].clip(lower=min_days) / 365
    )
    purchase_frequency = df['order_count'] / years_since_last_purchase

    return avg_order_value * purchase_frequency

# Attach the CLV estimate to every customer
customer_features['CLV'] = calculate_clv(customer_features)

# Histogram of the overall CLV distribution
clv_hist_fig, clv_hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=customer_features, x='CLV', bins=50, ax=clv_hist_ax)
clv_hist_ax.set_title('Customer Lifetime Value Distribution')
clv_hist_ax.set_xlabel('CLV')
plt.show()

# Box plot comparing CLV across the K-means segments
clv_box_fig, clv_box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=customer_features, x='Cluster', y='CLV', ax=clv_box_ax)
clv_box_ax.set_title('Customer Lifetime Value by Segment')
plt.show()

## 6. Key Findings and Recommendations

### Key Findings
1. Segment Characteristics:
   - [Document the characteristics of each identified segment]
   - [Include average order value, frequency, etc.]

2. Churn Risk:
   - [Document churn prediction accuracy]
   - [List main predictors of churn]
   - [Recommend retention strategies]

3. CLV Insights:
   - [Document CLV distribution]
   - [Identify high-value customer characteristics]
   - [Suggest strategies for value optimization]

### Recommendations
1. Segment-Specific Strategies:
   - [List targeted marketing approaches]
   - [Suggest personalization opportunities]

2. Churn Prevention:
   - [List early intervention strategies]
   - [Suggest retention campaigns]

3. Value Optimization:
   - [Recommend cross-selling opportunities]
   - [Suggest loyalty program improvements]

### Next Steps
1. [List follow-up analyses]
2. [Suggest A/B tests]
3. [Recommend implementation priorities]