In [23]:
import os
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [24]:
# Load the processed, feature-engineered dataset (path is relative to the
# notebook's working directory).
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests the
# CSV was written with its index — consider index_col=0 if that column is not wanted.
df = pd.read_csv('features_engineered.csv')

In [25]:
# Preview the first rows to sanity-check the loaded columns.
df.head()


Unnamed: 0,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_movies,ProductCategory_other,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_2,ChannelId_ChannelId_3,...,customer_Value_sum,customer_Value_mean,customer_Value_std,customer_FraudResult_sum,customer_FraudResult_mean,transaction_dayofweek,is_business_hours,is_weekend,is_refund,amount_category
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,203847,1713.0,2675.218372,0,0.0,3,0,0,0,medium
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,203847,1713.0,2675.218372,0,0.0,3,0,0,1,small
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1000,500.0,0.0,0,0.0,3,0,0,0,medium
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,286623,7542.710526,17691.401706,0,0.0,3,0,0,0,xlarge
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,286623,7542.710526,17691.401706,0,0.0,3,0,0,1,medium


In [26]:

# Step 1: Calculate RFM Metrics
def calculate_rfm(df, snapshot_date=None):
    """
    Calculate RFM (Recency, Frequency, Monetary) metrics for each customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction-level data. Must contain the columns 'CustomerId',
        'TransactionId', 'Amount' and 'TransactionStartTime'. The frame is
        NOT modified.
    snapshot_date : datetime-like or str, optional
        Reference date from which recency is measured. Anything accepted by
        ``pd.to_datetime`` works. Defaults to the latest transaction time
        found in ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per customer with columns
        ['CustomerId', 'Recency', 'Frequency', 'Monetary'], where
        Recency   = whole days between snapshot_date and the customer's last
                    transaction,
        Frequency = count of non-null TransactionId values,
        Monetary  = sum of Amount.

    Raises
    ------
    KeyError
        If any required column is missing from ``df``.
    """
    required = ['CustomerId', 'TransactionId', 'Amount', 'TransactionStartTime']
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"calculate_rfm: missing required column(s): {missing}")

    # Parse the timestamps once, into a separate Series, so the caller's frame
    # keeps its original column (no hidden in-place dtype change).
    tx_time = pd.to_datetime(df['TransactionStartTime'])

    # Coerce an explicit snapshot (e.g. a 'YYYY-MM-DD' string) to a Timestamp so
    # the subtraction below is well defined.
    if snapshot_date is None:
        snapshot_date = tx_time.max()
    else:
        snapshot_date = pd.to_datetime(snapshot_date)

    # Work on a narrow frame holding only what the aggregation needs.
    tx = df[['CustomerId', 'TransactionId', 'Amount']].assign(TransactionStartTime=tx_time)

    # Named aggregation ties each output column to its source explicitly instead
    # of relying on positional renaming afterwards.
    per_customer = tx.groupby('CustomerId').agg(
        last_transaction=('TransactionStartTime', 'max'),
        Frequency=('TransactionId', 'count'),
        Monetary=('Amount', 'sum'),
    )
    per_customer['Recency'] = (snapshot_date - per_customer['last_transaction']).dt.days

    rfm = per_customer.reset_index()[['CustomerId', 'Recency', 'Frequency', 'Monetary']]

    return rfm

In [27]:
# Step 2: Cluster Customers
def cluster_customers(rfm_df, n_clusters=3, random_state=42):
    """
    Cluster customers using K-Means on scaled RFM features.

    Extreme outliers (outside Q1 - 3*IQR .. Q3 + 3*IQR on any RFM feature) are
    excluded from *fitting* the scaler and K-Means so they do not distort the
    centroids. They are not dropped from the result: each outlier is scaled
    with the fitted scaler and assigned to its nearest fitted centroid.

    Parameters
    ----------
    rfm_df : pandas.DataFrame
        Must contain the columns 'Recency', 'Frequency' and 'Monetary'.
    n_clusters : int, default 3
        Number of K-Means clusters.
    random_state : int, default 42
        Seed passed to KMeans.

    Returns
    -------
    (pandas.DataFrame, StandardScaler, KMeans)
        A copy of ``rfm_df`` rows (original index order) with an added
        'Cluster' column, the fitted scaler, and the fitted K-Means model.
    """
    features = ['Recency', 'Frequency', 'Monetary']
    feature_values = rfm_df[features]

    # Remove extreme outliers before clustering
    print("Removing extreme outliers...")
    Q1 = feature_values.quantile(0.25)
    Q3 = feature_values.quantile(0.75)
    IQR = Q3 - Q1

    # Define outlier bounds
    lower_bound = Q1 - 3 * IQR  # More lenient than 1.5*IQR
    upper_bound = Q3 + 3 * IQR

    # A row is "clean" only if every RFM feature lies within its bounds
    mask = ((feature_values >= lower_bound) & (feature_values <= upper_bound)).all(axis=1)

    rfm_clean = rfm_df[mask].copy()
    rfm_outliers = rfm_df[~mask].copy()

    print(f"Outliers removed: {len(rfm_outliers)} customers")
    print(f"Clean data: {len(rfm_clean)} customers")

    # Scale the features (fit on clean rows only)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(rfm_clean[features])

    # Apply K-Means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    rfm_clean['Cluster'] = kmeans.fit_predict(X_scaled)

    # No outliers: return the clean frame directly. Concatenating an empty frame
    # that lacks the 'Cluster' column can trigger pandas dtype-inference warnings
    # and may upcast 'Cluster' away from integer.
    if rfm_outliers.empty:
        return rfm_clean.sort_index(), scaler, kmeans

    # Assign each outlier to the nearest fitted centroid
    X_outliers_scaled = scaler.transform(rfm_outliers[features])
    rfm_outliers['Cluster'] = kmeans.predict(X_outliers_scaled)

    # Combine back, restoring the original row order
    rfm_final = pd.concat([rfm_clean, rfm_outliers]).sort_index()

    return rfm_final, scaler, kmeans


In [28]:
# Step 3: Analyze clusters and identify high-risk segment
def analyze_clusters(rfm_df):
    """
    Analyze cluster characteristics to identify the high-risk segment.

    Prints a per-cluster summary, then scores every sufficiently large cluster
    (at least 1% of all customers) as

        Risk_Score = 0.6 * (mean Recency / max mean Recency)
                   + 0.4 * (1 - mean Frequency / max mean Frequency)

    so that long-inactive, infrequent clusters score highest.

    Parameters
    ----------
    rfm_df : pandas.DataFrame
        Must contain 'CustomerId', 'Recency', 'Frequency', 'Monetary' and
        'Cluster'.

    Returns
    -------
    The label of the cluster with the highest risk score.

    Raises
    ------
    ValueError
        If no cluster reaches the minimum size (e.g. ``rfm_df`` is empty).
    """
    def _share_of_max(values):
        # Normalise by the largest value. When that maximum is 0 or missing the
        # ratio is undefined (0/0 -> NaN), so treat every cluster as tied at 0
        # rather than letting NaN propagate into the risk score.
        peak = values.max()
        if pd.isna(peak) or peak == 0:
            return values * 0.0
        return values / peak

    cluster_summary = rfm_df.groupby('Cluster').agg({
        'Recency': ['mean', 'median'],
        'Frequency': ['mean', 'median'],
        'Monetary': ['mean', 'median'],
        'CustomerId': 'count'
    }).round(2)

    print("Cluster Analysis:")
    print(cluster_summary)

    # Only consider clusters with reasonable size (>=1% of total customers)
    min_cluster_size = len(rfm_df) * 0.01
    cluster_sizes = rfm_df.groupby('Cluster').size()
    valid_clusters = cluster_sizes[cluster_sizes >= min_cluster_size].index

    print(f"\nClusters with sufficient size (>={min_cluster_size:.0f} customers): {list(valid_clusters)}")

    if len(valid_clusters) == 0:
        raise ValueError("analyze_clusters: no cluster meets the minimum size threshold")

    # Calculate cluster means for valid clusters only
    cluster_means = (
        rfm_df[rfm_df['Cluster'].isin(valid_clusters)]
        .groupby('Cluster')[['Recency', 'Frequency', 'Monetary']]
        .mean()
    )

    # Risk scoring: high recency + low frequency = high risk.
    # Monetary is deliberately left out of the score.
    cluster_means['Risk_Score'] = (
        _share_of_max(cluster_means['Recency']) * 0.6 +  # 60% weight on recency
        (1 - _share_of_max(cluster_means['Frequency'])) * 0.4  # 40% weight on frequency
    )

    print("\nRisk Scores for valid clusters:")
    for cluster in cluster_means.index:
        size = cluster_sizes[cluster]
        score = cluster_means.loc[cluster, 'Risk_Score']
        print(f"Cluster {cluster}: Risk Score = {score:.3f}, Size = {size}")

    high_risk_cluster = cluster_means['Risk_Score'].idxmax()
    print(f"\nHigh-risk cluster identified: {high_risk_cluster}")
    print(f"Characteristics: High Recency ({cluster_means.loc[high_risk_cluster, 'Recency']:.1f}), Low Frequency ({cluster_means.loc[high_risk_cluster, 'Frequency']:.1f})")

    return high_risk_cluster


In [29]:
# Step 4: Create target variable
def create_target_variable(rfm_df, high_risk_cluster):
    """
    Add a binary 'is_high_risk' column derived from the cluster assignment.

    Rows whose 'Cluster' equals ``high_risk_cluster`` get 1, all others 0.
    The column is written onto ``rfm_df`` itself, and that same frame is
    returned after printing the class balance.
    """
    in_risky_cluster = rfm_df['Cluster'].eq(high_risk_cluster)
    rfm_df['is_high_risk'] = in_risky_cluster.astype(int)

    target = rfm_df['is_high_risk']
    print("Target variable distribution:")
    print(target.value_counts())
    print(f"High-risk percentage: {target.mean():.2%}")

    return rfm_df

In [33]:
# Main execution function
def engineer_proxy_target(df_transactions, snapshot_date=None, n_clusters=3, random_state=42):
    """
    Complete pipeline to engineer the proxy target variable.

    Runs, in order: RFM calculation, K-Means clustering, cluster analysis to
    pick the high-risk cluster, and creation of the binary 'is_high_risk' flag.

    Parameters
    ----------
    df_transactions : pandas.DataFrame
        Transaction-level data passed to ``calculate_rfm``.
    snapshot_date : optional
        Reference date for recency, forwarded to ``calculate_rfm``.
    n_clusters : int, default 3
        Number of clusters, forwarded to ``cluster_customers``.
    random_state : int, default 42
        Clustering seed, forwarded to ``cluster_customers``.

    Returns
    -------
    (pandas.DataFrame, scaler, kmeans)
        The per-customer RFM frame with 'Cluster' and 'is_high_risk' columns,
        plus the fitted scaler and K-Means model returned by
        ``cluster_customers``. Note this is a 3-tuple, not a single frame.
    """
    print("Step 1: Calculating RFM metrics...")
    rfm_df = calculate_rfm(df_transactions, snapshot_date)

    print("\nStep 2: Clustering customers...")
    rfm_df, scaler, kmeans = cluster_customers(
        rfm_df, n_clusters=n_clusters, random_state=random_state
    )

    print("\nStep 3: Analyzing clusters...")
    high_risk_cluster = analyze_clusters(rfm_df)

    print("\nStep 4: Creating target variable...")
    rfm_df = create_target_variable(rfm_df, high_risk_cluster)

    return rfm_df, scaler, kmeans

In [31]:
# Visualization function
def plot_clusters(rfm_df):
    """
    Draw a 2x2 overview of the clustering result.

    The first three panels are pairwise RFM scatter plots coloured by
    'Cluster'; the bottom-right panel is a bar chart of cluster sizes.
    """
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))

    cluster_labels = rfm_df['Cluster']
    feature_pairs = (
        ('Recency', 'Frequency'),
        ('Recency', 'Monetary'),
        ('Frequency', 'Monetary'),
    )

    # axes.flat walks the grid row by row, so the three pairs land on
    # [0,0], [0,1] and [1,0]; zip stops before the fourth panel.
    for panel, (x_name, y_name) in zip(axes.flat, feature_pairs):
        points = panel.scatter(
            rfm_df[x_name], rfm_df[y_name],
            c=cluster_labels, cmap='viridis', alpha=0.6,
        )
        panel.set_xlabel(x_name)
        panel.set_ylabel(y_name)
        panel.set_title(f'{x_name} vs {y_name}')
        plt.colorbar(points, ax=panel)

    # Remaining panel: how many customers fall in each cluster
    size_panel = axes[1, 1]
    cluster_counts = cluster_labels.value_counts()
    size_panel.bar(cluster_counts.index, cluster_counts.values)
    size_panel.set_xlabel('Cluster')
    size_panel.set_ylabel('Count')
    size_panel.set_title('Cluster Distribution')

    plt.tight_layout()
    plt.show()

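# Example usage (engineer_proxy_target returns a 3-tuple: RFM frame, scaler, K-Means model):
# rfm_with_target, rfm_scaler, rfm_kmeans = engineer_proxy_target(df_transactions)
# plot_clusters(rfm_with_target)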

# To merge back to main dataset:
# df_main = df_main.merge(rfm_with_target[['CustomerId', 'is_high_risk']], on='CustomerId', how='left')

In [34]:
# --- Step 6: Execute and Save Outputs ---
# Run the full proxy-target pipeline on the loaded frame. It returns the
# per-customer RFM table (with 'Cluster' and 'is_high_risk') together with the
# fitted scaler and K-Means model; the cells below persist all three.
rfm_with_target, rfm_scaler, rfm_kmeans = engineer_proxy_target(df)

Step 1: Calculating RFM metrics...

Step 2: Clustering customers...
Removing extreme outliers...
Outliers removed: 522 customers
Clean data: 3220 customers

Step 3: Analyzing clusters...
Cluster Analysis:
        Recency        Frequency        Monetary        CustomerId
           mean median      mean median     mean median      count
Cluster                                                           
0         14.71   11.0      8.67    6.0    -0.04  -0.14       1542
1         60.93   60.0      5.84    3.0     2.64  -0.07       1335
2         11.51    4.0     86.12   47.0    -3.99  -1.51        865

Clusters with sufficient size (>37 customers): [0, 1, 2]

Risk Scores for valid clusters:
Cluster 0: Risk Score = 0.505, Size = 1542
Cluster 1: Risk Score = 0.973, Size = 1335
Cluster 2: Risk Score = 0.113, Size = 865

High-risk cluster identified: 1
Characteristics: High Recency (60.9), Low Frequency (5.8)

Step 4: Creating target variable...
Target variable distribution:
is_high_risk
0  

In [38]:
# `os` is imported once in the notebook's top import cell.

# Create the directory if it doesn't exist
os.makedirs('data/processed', exist_ok=True)

# Save RFM + is_high_risk data (one row per customer; the index is not written)
rfm_with_target.to_csv('data/processed/rfm_with_target.csv', index=False)

In [36]:
# Merge is_high_risk into feature engineered dataset.
# rfm_with_target has one row per CustomerId, so this is a many-to-one join;
# validate= makes pandas raise MergeError instead of silently duplicating
# transaction rows if that ever stops being true.
merged = df.merge(
    rfm_with_target[['CustomerId', 'is_high_risk']],
    on='CustomerId',
    how='left',
    validate='many_to_one',
)

# Rows with no match in the RFM table get NaN from the left join; they are
# labelled 0 (not high risk).
merged['is_high_risk'] = merged['is_high_risk'].fillna(0).astype(int)

# Make this cell runnable on its own rather than relying on another cell
# having created the output directory.
os.makedirs('data/processed', exist_ok=True)
merged.to_csv('data/processed/features_engineered_with_target.csv', index=False)

In [37]:
# `os` is imported once in the notebook's top import cell.

# Create the directory if it doesn't exist
os.makedirs('models', exist_ok=True)

# Save the fitted scaler and K-Means model so the same segmentation can be
# re-applied later without refitting.
joblib.dump(rfm_scaler, 'models/rfm_scaler.pkl')
joblib.dump(rfm_kmeans, 'models/rfm_kmeans.pkl')

print("✅ Proxy target engineering completed and saved.")

✅ Proxy target engineering completed and saved.
