## 1. Customer Churn Prediction

We aim to predict customer churn using features such as recency, frequency, monetary value, and delivery performance.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Additional imports: plotting, date handling, and preprocessing utilities
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Set random seed for reproducibility (seeds NumPy's global RNG)
np.random.seed(42)

# Load the datasets. If the CSV files cannot be found, fall back to DataFrames
# that an earlier cell already loaded under the same names.
# pd.read_csv raises FileNotFoundError (not NameError) for a missing path, so
# that is the exception handled here.
try:
    # Adjust these paths to match your local file structure
    orders = pd.read_csv('data/olist_orders_dataset.csv')
    order_items = pd.read_csv('data/olist_order_items_dataset.csv')
    customers = pd.read_csv('data/olist_customers_dataset.csv')
    payments = pd.read_csv('data/olist_order_payments_dataset.csv')
    reviews = pd.read_csv('data/olist_order_reviews_dataset.csv')
except FileNotFoundError as err:
    # The fallback is only valid when every frame already exists; otherwise
    # fail here with a clear message instead of a NameError further down.
    _required_frames = ('orders', 'order_items', 'customers', 'payments', 'reviews')
    _missing_frames = [name for name in _required_frames if name not in globals()]
    if _missing_frames:
        raise FileNotFoundError(
            f"Could not read the dataset CSV files ({err}) and no previously "
            f"loaded DataFrame exists for: {', '.join(_missing_frames)}"
        ) from err
    # NOTE: if only some files were missing, frames read before the failure
    # are fresh while the remaining ones come from the earlier load.
    print("Using previously loaded data")
else:
    print("Data loaded successfully!")

# Timestamp columns of `orders` that must be parsed to datetime
date_columns = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]

# errors='coerce' turns values that cannot be parsed into NaT instead of raising
for timestamp_col in date_columns:
    parsed = pd.to_datetime(orders[timestamp_col], errors='coerce')
    orders[timestamp_col] = parsed

# Churn definition: no purchase within this many days of the analysis date
churn_threshold = 90  # days

# The analysis date is placed 30 days after the most recent purchase in the data
last_order_date = orders['order_purchase_timestamp'].max()
analysis_date = last_order_date + timedelta(days=30)
print(f"Analysis date: {analysis_date}")

# Create customer purchase history features: one row per customer_id with the
# number of orders and the first / last purchase timestamps.
# Named aggregation yields the final column names directly, so no positional
# flattening of a MultiIndex is needed.
# NOTE(review): in the public Olist schema `customer_id` appears to be issued
# per order, while `customer_unique_id` (customers table) identifies the
# person -- confirm which key should define a "customer" for churn purposes.
customer_orders = (
    orders.groupby('customer_id')
    .agg(
        order_count=('order_id', 'count'),
        first_purchase_date=('order_purchase_timestamp', 'min'),
        last_purchase_date=('order_purchase_timestamp', 'max'),
    )
    .reset_index()
)

# Recency: whole days between the analysis date and the last purchase.
# Computed once on the aggregated column instead of via a Python lambda that
# would be invoked separately for every customer group.
customer_orders['days_since_last_purchase'] = (
    analysis_date - customer_orders['last_purchase_date']
).dt.days

# Define churn label (1 = churned, 0 = active)
customer_orders['churned'] = (
    customer_orders['days_since_last_purchase'] > churn_threshold
).astype(int)

# Calculate time between first and last purchase (in days)
customer_orders['customer_lifetime'] = (
    customer_orders['last_purchase_date'] - customer_orders['first_purchase_date']
).dt.days

# Order value features.
# Step 1: sum payment_value per order_id.
order_values = payments.groupby('order_id', as_index=False)['payment_value'].sum()

# Step 2: attach the order totals to orders (default inner join, so orders
# with no payment row are excluded) and summarise spend per customer.
customer_values = (
    orders.merge(order_values, on='order_id')
    .groupby('customer_id')['payment_value']
    .agg(
        total_spend='sum',
        avg_order_value='mean',
        std_order_value='std',
        min_order_value='min',
        max_order_value='max',
    )
    .reset_index()
)

# Review features: mean score, lowest score and number of scores per customer.
# Only orders that have a matching review row survive the default inner join.
order_scores = reviews[['order_id', 'review_score']]
customer_reviews = (
    orders.merge(order_scores, on='order_id')
    .groupby('customer_id')['review_score']
    .agg(
        avg_review_score='mean',
        min_review_score='min',
        review_count='count',
    )
    .reset_index()
)

# Delivery experience features.
# delivery_delay = actual delivery date minus estimated delivery date, in days
# (positive means the order arrived after the estimate). The column is added
# to `orders` itself.
delivery_gap = (
    orders['order_delivered_customer_date'] - orders['order_estimated_delivery_date']
)
orders['delivery_delay'] = delivery_gap.dt.days

# Average and worst-case delay per customer
customer_delivery = (
    orders.groupby('customer_id')['delivery_delay']
    .agg(avg_delivery_delay='mean', max_delivery_delay='max')
    .reset_index()
)

# Assemble the modelling table: start from the purchase-history features and
# left-join each additional feature table on customer_id, so every customer in
# customer_orders is kept even when a table has no row for them.
customer_features = customer_orders
for feature_table in (
    customer_values,
    customer_reviews,
    customer_delivery,
    customers[['customer_id', 'customer_state']],
):
    customer_features = customer_features.merge(
        feature_table, on='customer_id', how='left'
    )

# Fill gaps left by the left joins and by undefined aggregates:
#   - review scores  -> column mean (computed before any filling)
#   - review count, delivery delays, order-value std -> 0
fill_values = {
    'avg_review_score': customer_features['avg_review_score'].mean(),
    'min_review_score': customer_features['min_review_score'].mean(),
    'review_count': 0,
    'avg_delivery_delay': 0,
    'max_delivery_delay': 0,
    'std_order_value': 0,
}
customer_features = customer_features.fillna(value=fill_values)

# Average number of days between consecutive purchases: lifetime divided by
# the number of gaps between orders. Customers with a single order get 0.
has_repeat_purchase = customer_features['order_count'] > 1
customer_features['purchase_frequency'] = np.where(
    has_repeat_purchase,
    customer_features['customer_lifetime'] / (customer_features['order_count'] - 1),
    0,
)

# Display the first few rows of the features
print("\nCustomer features for churn prediction:")
display(customer_features.head())

# Check class balance (percentages of each label).
# .get() is used because value_counts() omits a class that never occurs;
# direct indexing would then raise KeyError instead of reporting 0%.
churn_distribution = customer_features['churned'].value_counts(normalize=True) * 100
churned_pct = churn_distribution.get(1, 0.0)
active_pct = churn_distribution.get(0, 0.0)
print(f"\nChurn distribution: {churned_pct:.2f}% churned, {active_pct:.2f}% active")

# Prepare features and target.
# Dropped from X:
#   - customer_id: identifier, not a predictor
#   - churned: the target itself
#   - first_purchase_date / last_purchase_date: raw timestamps
#   - days_since_last_purchase: the label is defined as
#     `days_since_last_purchase > churn_threshold`, so keeping it would leak
#     the target and let the model reproduce the label trivially.
non_feature_columns = [
    'customer_id',
    'churned',
    'first_purchase_date',
    'last_purchase_date',
    'days_since_last_purchase',
]
X = customer_features.drop(columns=non_feature_columns)
y = customer_features['churned']

# Split categorical and numerical features
categorical_features = ['customer_state']
numerical_features = [col for col in X.columns if col not in categorical_features]

# Preprocessing for numerical columns: median imputation, then standardisation
numeric_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Preprocessing for categorical columns: most-frequent imputation, then one-hot
# encoding; categories unseen during fit are ignored at transform time
categorical_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own sub-pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_steps, numerical_features),
        ('cat', categorical_steps, categorical_features),
    ]
)

# Hold out 20% of the customers for evaluation; stratify on the label so both
# splits keep the same churn ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Full model: preprocessing followed by a gradient-boosting classifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42)),
])

# Fit preprocessing and classifier on the training split only
model.fit(X_train, y_train)

# Predict churn labels for the held-out customers
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 on the test split
print("\nModel Evaluation:")
print(classification_report(y_test, y_pred))

# Confusion matrix (rows = actual class, columns = predicted class) as a heatmap
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()