In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import pdist, squareform

In [20]:
class CustomerSegmentation:
    def __init__(self):
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='mean')
        self.best_model = None
        self.best_n_clusters = None
        self.feature_matrix = None
        self.feature_names = None
        
    def prepare_features(self, customers_df, transactions_df):
        # Create copies to avoid modifying original data
        customers_df = customers_df.copy()
        transactions_df = transactions_df.copy()
        
        # Customer profile features
        customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
        customers_df['account_age'] = (
            pd.Timestamp.now() - customers_df['SignupDate']
        ).dt.days
        
        # One-hot encode region
        region_dummies = pd.get_dummies(customers_df['Region'], prefix='region')
        
        # Transaction features
        transaction_features = transactions_df.groupby('CustomerID').agg({
            'TransactionID': 'count',
            'TotalValue': ['sum', 'mean', 'std'],
            'Quantity': ['sum', 'mean'],
            'Price': ['mean', 'max']
        }).round(2)
        
        # Flatten column names
        transaction_features.columns = [
            'transaction_count',
            'total_spend',
            'avg_transaction_value',
            'std_transaction_value',
            'total_quantity',
            'avg_quantity',
            'avg_price',
            'max_price'
        ]
        
        # Reset index to make CustomerID a column
        transaction_features = transaction_features.reset_index()
        
        # Calculate purchase frequency (transactions per month)
        transaction_features['purchase_frequency'] = (
            transaction_features['transaction_count'] / 
            (customers_df['account_age'] / 30)
        ).round(2)
        
        # Merge all features
        features_df = customers_df[['CustomerID', 'account_age']].merge(
            region_dummies,
            left_index=True,
            right_index=True
        ).merge(
            transaction_features,
            on='CustomerID',
            how='left'
        )
        
        # Replace infinite values with NaN
        features_df = features_df.replace([np.inf, -np.inf], np.nan)
        
        # Store feature names
        self.feature_names = [col for col in features_df.columns if col != 'CustomerID']
        
        return features_df
    
    def preprocess_data(self, features_df):
        # Extract features excluding CustomerID
        X = features_df[self.feature_names].values
        X = self.imputer.fit_transform(X)
        X = self.scaler.fit_transform(X)
        
        return X
    
    def fit(self, customers_df, transactions_df, n_clusters=None):
        """Fit the clustering model"""
        # Prepare features
        features_df = self.prepare_features(customers_df, transactions_df)
        
        # Preprocess data
        X = self.preprocess_data(features_df)
        self.feature_matrix = X
        
        # Find optimal number of clusters if not specified
        if n_clusters is None:
            metrics_df = self.find_optimal_clusters(X)
            self.best_n_clusters = metrics_df.loc[
                metrics_df['db_index'].idxmin(), 'n_clusters'
            ]
        else:
            self.best_n_clusters = n_clusters
        
        # Fit final model
        self.best_model = KMeans(
            n_clusters=self.best_n_clusters,
            random_state=42
        )
        self.best_model.fit(X)
        
        # Add cluster assignments to features DataFrame
        features_df['Cluster'] = self.best_model.labels_
        
        return features_df
    
    def find_optimal_clusters(self, X, max_clusters=10):
        """Find optimal number of clusters using multiple metrics"""
        metrics = {
            'n_clusters': [],
            'db_index': [],
            'silhouette': [],
            'calinski': []
        }
        
        for n_clusters in range(2, max_clusters + 1):
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            labels = kmeans.fit_predict(X)
            
            metrics['n_clusters'].append(n_clusters)
            metrics['db_index'].append(davies_bouldin_index(X, labels))
            metrics['silhouette'].append(silhouette_score(X, labels))
            metrics['calinski'].append(calinski_harabasz_score(X, labels))
        
        return pd.DataFrame(metrics)
    
    def get_cluster_metrics(self):
        """Calculate clustering metrics"""
        labels = self.best_model.labels_
        
        return {
            'n_clusters': self.best_n_clusters,
            'db_index': davies_bouldin_index(self.feature_matrix, labels),
            'silhouette_score': silhouette_score(self.feature_matrix, labels),
            'calinski_score': calinski_harabasz_score(self.feature_matrix, labels)
        }
    
    def plot_results(self):
        """Create visualizations of the clustering results"""
        # Reduce dimensionality for visualization
        from sklearn.decomposition import PCA
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(self.feature_matrix)
        
        # Create figure with subplots
        fig = plt.figure(figsize=(20, 10))
        
        # Plot 1: Cluster visualization
        plt.subplot(2, 2, 1)
        scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], 
                            c=self.best_model.labels_, cmap='viridis')
        plt.title('Customer Segments (PCA)')
        plt.xlabel('First Principal Component')
        plt.ylabel('Second Principal Component')
        plt.colorbar(scatter)
        
        # Plot 2: Feature importance per cluster
        cluster_centers = pd.DataFrame(
            self.scaler.inverse_transform(self.best_model.cluster_centers_),
            columns=self.feature_names
        )
        
        plt.subplot(2, 2, 2)
        sns.heatmap(cluster_centers, cmap='RdYlBu', center=0)
        plt.title('Feature Importance by Cluster')
        plt.xticks(rotation=45)
        
        # Plot 3: Cluster sizes
        plt.subplot(2, 2, 3)
        cluster_sizes = pd.Series(self.best_model.labels_).value_counts()
        cluster_sizes.plot(kind='bar')
        plt.title('Cluster Sizes')
        plt.xlabel('Cluster')
        plt.ylabel('Number of Customers')
        
        # Plot 4: Feature distributions by cluster
        plt.subplot(2, 2, 4)
        feature_importance = np.abs(cluster_centers - cluster_centers.mean()).mean()
        feature_importance.sort_values(ascending=True).plot(kind='barh')
        plt.title('Overall Feature Importance')
        
        plt.tight_layout()
        return fig

In [6]:
def davies_bouldin_index(X, labels):
    n_clusters = len(np.unique(labels))
    cluster_centers = np.array([X[labels == i].mean(axis=0) for i in range(n_clusters)])
    
    # Calculate cluster dispersions
    dispersions = np.zeros(n_clusters)
    for i in range(n_clusters):
        if sum(labels == i) > 0:
            cluster_points = X[labels == i]
            dispersions[i] = np.mean(np.linalg.norm(cluster_points - cluster_centers[i], axis=1))
    
    # Calculate distances between cluster centers
    center_distances = squareform(pdist(cluster_centers))
    
    # Calculate Davies-Bouldin Index
    db_index = 0
    for i in range(n_clusters):
        if sum(labels == i) > 0:
            ratios = np.zeros(n_clusters)
            for j in range(n_clusters):
                if i != j and sum(labels == j) > 0:
                    ratios[j] = (dispersions[i] + dispersions[j]) / center_distances[i, j]
            db_index += np.max(ratios[ratios != 0])
    
    return db_index / n_clusters

In [13]:
def main():
    # Load data
    try:
        customers_df = pd.read_csv("Customers.csv")
        transactions_df = pd.read_csv("Transactions.csv")
        
        # Initialize and fit segmentation model
        segmentation = CustomerSegmentation()
        clustered_data = segmentation.fit(customers_df, transactions_df)
        
        # Get metrics
        metrics = segmentation.get_cluster_metrics()
        print("\nClustering Metrics:")
        for metric, value in metrics.items():
            print(f"{metric}: {value:.4f}")
        
        # Create visualizations
        plots = segmentation.plot_results()
        plots.savefig('clustering_results.png')
        plt.close()
        
        # Save cluster assignments
        cluster_results = clustered_data[['CustomerID', 'Cluster']]
        cluster_results.to_csv('cluster_assignments.csv', index=False)
        
        print("\nSegmentation completed successfully!")
        print("- Cluster assignments saved to 'cluster_assignments.csv'")
        print("- Visualizations saved to 'clustering_results.png'")
        
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        print("Please check your input data and file paths.")

In [22]:
if __name__ == "__main__":
    main()




Clustering Metrics:
n_clusters: 10.0000
db_index: 1.4345
silhouette_score: 0.1988
calinski_score: 25.8122
Figure(2000x1000)

Segmentation completed successfully!
- Cluster assignments saved to 'cluster_assignments.csv'
- Visualizations saved to 'clustering_results.png'
