# Task 4: Proxy Target Variable Engineering

**Objective:** Create a proxy credit risk target variable (`is_high_risk`) by identifying "disengaged" customers using RFM (Recency, Frequency, Monetary) analysis and K-Means clustering.

Since the provided `insurance.csv` does not contain transaction history, we will **generate synthetic transaction data** to demonstrate this process. In a real-world scenario, you would replace the synthetic data loading step with loading your actual transaction logs.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import sys
import os

# Add src to path to import custom modules.
# '../src' is resolved against the current working directory by
# os.path.abspath, so the result depends on where the notebook is run from.
src_path = os.path.abspath('../src')
# Guard the append: without it, every re-run of this cell would add another
# duplicate entry to sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)
from proxy_labeling import calculate_rfm, assign_risk_label

# Set plot style
sns.set(style="whitegrid")

In [None]:
# 1. Load and Inspect Transaction Data (Synthetic Generation)

# Seed NumPy's global RNG so the synthetic data is identical on every run.
np.random.seed(42)
n_customers = 1000
n_transactions = 5000

# The draw order below (customer ids, day offsets, amounts) determines the
# values produced for a given seed, so it must not be reordered.
customer_ids = np.random.randint(1, n_customers + 1, n_transactions)

# Each transaction lands on a day offset in [0, 365) counted from 2023-01-01.
day_offsets = np.random.randint(0, 365, n_transactions)
transaction_dates = pd.to_datetime('2023-01-01') + pd.to_timedelta(day_offsets, unit='D')

# Exponentially distributed amounts (scale 100), rounded to two decimals.
amounts = np.random.exponential(scale=100, size=n_transactions).round(2)

# One row per transaction; column order is CustomerId, TransactionDate, Amount.
transactions_df = pd.DataFrame(
    dict(
        CustomerId=customer_ids,
        TransactionDate=transaction_dates,
        Amount=amounts,
    )
)

print("Synthetic Transaction Data:")
display(transactions_df.head())
print(f"\nShape: {transactions_df.shape}")

In [None]:
# 2. Calculate RFM Metrics
# calculate_rfm is the helper imported from src/proxy_labeling.py; it is given
# the transaction frame plus the names of its id, date and amount columns.
rfm_df = calculate_rfm(transactions_df, 'CustomerId', 'TransactionDate', 'Amount')

print("RFM Metrics:")
display(rfm_df.head())
print(f"\nShape: {rfm_df.shape}")

# Visualize RFM distributions: one histogram per metric, side by side.
plt.figure(figsize=(15, 5))
for panel, metric in enumerate(('Recency', 'Frequency', 'Monetary'), start=1):
    plt.subplot(1, 3, panel)
    sns.histplot(rfm_df[metric])
    plt.title(f'{metric} Distribution')
plt.show()

In [None]:
# 3. Pre-process and Scale RFM Features
# 4. Apply K-Means Clustering
# 5. Analyze Clusters and Define High-Risk Label

# Steps 3-5 are delegated to assign_risk_label (imported from proxy_labeling).
# The plots below read the 'Cluster' and 'is_high_risk' columns from its result.
rfm_labeled = assign_risk_label(rfm_df, n_clusters=3, random_state=42)

print("\nLabeled Data:")
display(rfm_labeled.head())

# Visualize Clusters: colour encodes the cluster, marker style the risk flag.
cluster_fig, cluster_ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=rfm_labeled,
    x='Frequency',
    y='Monetary',
    hue='Cluster',
    style='is_high_risk',
    palette='viridis',
    s=100,
    ax=cluster_ax,
)
cluster_ax.set_title('Customer Segments (RFM Clusters)')
plt.show()

# Visualize Risk Distribution: number of customers per is_high_risk value.
risk_fig, risk_ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='is_high_risk', data=rfm_labeled, ax=risk_ax)
risk_ax.set_title('High Risk Label Distribution')
plt.show()

In [None]:
# 6. Integrate Target Variable into Dataset
# A real pipeline would join the label back onto the main feature set.
# The snippet below only illustrates that join and stays commented out.

# Assuming 'df_features' is your main dataset (e.g., from Task 3)
# NOTE(review): right_index=True presumes rfm_labeled is indexed by customer id - confirm.
# df_features = pd.read_csv('../data/processed/features.csv')
# df_final = df_features.merge(rfm_labeled[['is_high_risk']], left_on='CustomerId', right_index=True, how='left')

print(
    "Merge logic demonstrated (commented out as we lack the main feature set with CustomerId).",
    "The 'is_high_risk' column is now ready for model training.",
    sep="\n",
)

# Preview only the target column that a downstream model would train on.
risk_target = rfm_labeled[['is_high_risk']]
display(risk_target.head())