In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score
import warnings
from datetime import datetime
warnings.filterwarnings('ignore')

class CustomerSegmentation:
    def __init__(self, customer_file='Customers.csv',
                 product_file='Products.csv',
                 transaction_file='Transactions.csv'):
        """Initialize the CustomerSegmentation class with data files"""
        try:
            self.customers_df = pd.read_csv(customer_file)
            self.products_df = pd.read_csv(product_file)
            self.transactions_df = pd.read_csv(transaction_file)

            # Initialize other attributes
            self.feature_matrix = None
            self.scaler = StandardScaler()
            self.kmeans = None
            self.n_clusters = None
            self.features = None
            self.cluster_labels = None

            print("Data loaded successfully")
        except Exception as e:
            print(f"Error loading data: {str(e)}")
            raise

    def _validate_data(self):
        """Validate the loaded data"""
        required_columns = {
            'customers': ['CustomerID', 'SignupDate', 'Region'],
            'products': ['ProductID', 'Category'],
            'transactions': ['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate', 'TotalValue']
        }

        try:
            # Check required columns
            for col in required_columns['customers']:
                assert col in self.customers_df.columns
            for col in required_columns['products']:
                assert col in self.products_df.columns
            for col in required_columns['transactions']:
                assert col in self.transactions_df.columns

            # Check for null values
            assert not self.customers_df['CustomerID'].isnull().any()
            assert not self.products_df['ProductID'].isnull().any()
            assert not self.transactions_df['TransactionID'].isnull().any()

            return True
        except AssertionError:
            print("Data validation failed. Please check the required columns and null values.")
            return False

    def prepare_features(self):
        """Prepare features for clustering with error handling"""
        try:
            if not self._validate_data():
                raise ValueError("Data validation failed")

            # Convert dates safely
            self.customers_df['SignupDate'] = pd.to_datetime(self.customers_df['SignupDate'], errors='coerce')
            self.transactions_df['TransactionDate'] = pd.to_datetime(self.transactions_df['TransactionDate'], errors='coerce')

            # Create customer level features
            customer_features = []
            current_date = datetime.now()

            for customer_id in self.customers_df['CustomerID'].unique():
                try:
                    # Get customer transactions
                    customer_transactions = self.transactions_df[
                        self.transactions_df['CustomerID'] == customer_id
                    ]
                    customer_data = self.customers_df[
                        self.customers_df['CustomerID'] == customer_id
                    ].iloc[0]

                    # Calculate basic features
                    features = {
                        'customer_id': customer_id,
                        'total_spend': customer_transactions['TotalValue'].sum(),
                        'avg_transaction_value': customer_transactions['TotalValue'].mean(),
                        'num_transactions': len(customer_transactions),
                        'num_unique_products': customer_transactions['ProductID'].nunique(),
                        'days_since_signup': (current_date - pd.Timestamp(customer_data['SignupDate'])).days,
                        'region': customer_data['Region']
                    }

                    # Add category preferences
                    customer_products = customer_transactions.merge(
                        self.products_df, on='ProductID', how='left'
                    )
                    category_spend = customer_products.groupby('Category')['TotalValue'].sum()

                    for category in self.products_df['Category'].unique():
                        features[f'spend_{category}'] = category_spend.get(category, 0)

                    customer_features.append(features)

                except Exception as e:
                    print(f"Error processing customer {customer_id}: {str(e)}")
                    continue

            # Convert to DataFrame
            self.features = pd.DataFrame(customer_features)

            # Prepare feature matrix for clustering
            feature_cols = [col for col in self.features.columns
                          if col not in ['customer_id', 'region']
                          and not self.features[col].isnull().any()]

            # Handle any remaining null values
            self.features[feature_cols] = self.features[feature_cols].fillna(0)

            # Scale features
            self.feature_matrix = self.scaler.fit_transform(self.features[feature_cols])

            print(f"Features prepared successfully. Shape: {self.feature_matrix.shape}")
            return self.features

        except Exception as e:
            print(f"Error in prepare_features: {str(e)}")
            raise

    def find_optimal_clusters(self, min_clusters=2, max_clusters=10):
        """Find optimal number of clusters with error handling"""
        try:
            if self.feature_matrix is None:
                raise ValueError("Features not prepared. Run prepare_features first.")

            db_scores = []
            silhouette_scores = []

            for n in range(min_clusters, max_clusters + 1):
                try:
                    kmeans = KMeans(n_clusters=n, random_state=42, n_init=10)
                    clusters = kmeans.fit_predict(self.feature_matrix)

                    db_score = davies_bouldin_score(self.feature_matrix, clusters)
                    silhouette_avg = silhouette_score(self.feature_matrix, clusters)

                    db_scores.append(db_score)
                    silhouette_scores.append(silhouette_avg)

                    print(f"Clusters {n}: DB Index = {db_score:.4f}, Silhouette = {silhouette_avg:.4f}")

                except Exception as e:
                    print(f"Error calculating metrics for {n} clusters: {str(e)}")
                    continue

            # Plot evaluation metrics
            plt.figure(figsize=(15, 6))

            plt.subplot(1, 2, 1)
            plt.plot(range(min_clusters, max_clusters + 1), db_scores, marker='o')
            plt.title('Davies-Bouldin Index by Number of Clusters')
            plt.xlabel('Number of Clusters')
            plt.ylabel('DB Index')
            plt.grid(True)

            plt.subplot(1, 2, 2)
            plt.plot(range(min_clusters, max_clusters + 1), silhouette_scores, marker='o')
            plt.title('Silhouette Score by Number of Clusters')
            plt.xlabel('Number of Clusters')
            plt.ylabel('Silhouette Score')
            plt.grid(True)

            plt.tight_layout()
            plt.savefig('FirstName_LastName_Clustering_Metrics.png')
            plt.close()

            # Select optimal number of clusters
            self.n_clusters = db_scores.index(min(db_scores)) + min_clusters
            print(f"Optimal number of clusters: {self.n_clusters}")

            return self.n_clusters, min(db_scores)

        except Exception as e:
            print(f"Error in find_optimal_clusters: {str(e)}")
            raise

    def perform_clustering(self):
        """Perform clustering with the optimal number of clusters"""
        try:
            if self.n_clusters is None:
                raise ValueError("Optimal number of clusters not determined. Run find_optimal_clusters first.")

            # Perform final clustering
            self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=42, n_init=10)
            self.cluster_labels = self.kmeans.fit_predict(self.feature_matrix)

            # Add cluster labels to features
            self.features['Cluster'] = self.cluster_labels

            # Generate cluster insights
            cluster_insights = []
            for cluster in range(self.n_clusters):
                cluster_data = self.features[self.features['Cluster'] == cluster]
                insight = {
                    'Cluster': cluster,
                    'Size': len(cluster_data),
                    'Avg_Spend': cluster_data['total_spend'].mean(),
                    'Avg_Transactions': cluster_data['num_transactions'].mean(),
                    'Dominant_Region': cluster_data['region'].mode().iloc[0]
                }
                cluster_insights.append(insight)

            # Save results
            cluster_insights_df = pd.DataFrame(cluster_insights)
            cluster_insights_df.to_csv('FirstName_LastName_Cluster_Insights.csv', index=False)

            # Visualize clusters
            self._visualize_clusters()

            return self.features, cluster_insights_df

        except Exception as e:
            print(f"Error in perform_clustering: {str(e)}")
            raise

    def _visualize_clusters(self):
        """Create visualization of clusters"""
        try:
            # Create PCA visualization
            from sklearn.decomposition import PCA
            pca = PCA(n_components=2)
            pca_result = pca.fit_transform(self.feature_matrix)

            plt.figure(figsize=(10, 8))
            scatter = plt.scatter(pca_result[:, 0], pca_result[:, 1],
                                c=self.cluster_labels, cmap='viridis')
            plt.title('Customer Segments Visualization (PCA)')
            plt.xlabel('First Principal Component')
            plt.ylabel('Second Principal Component')
            plt.colorbar(scatter)
            plt.savefig('FirstName_LastName_Clusters_Visual.png')
            plt.close()

        except Exception as e:
            print(f"Error in cluster visualization: {str(e)}")

def main():
    try:
        # Initialize segmentation
        segmentation = CustomerSegmentation()

        # Prepare features
        segmentation.prepare_features()

        # Find optimal number of clusters
        n_clusters, db_score = segmentation.find_optimal_clusters()

        # Perform clustering
        features, insights = segmentation.perform_clustering()

        # Save final report
        with open('FirstName_LastName_Clustering.pdf', 'w') as f:
            f.write("Customer Segmentation Analysis Report\n\n")
            f.write(f"Number of Clusters: {n_clusters}\n")
            f.write(f"Davies-Bouldin Index: {db_score:.4f}\n\n")
            f.write("Cluster Insights:\n")
            f.write(insights.to_string())

        print("Clustering analysis completed successfully!")

    except Exception as e:
        print(f"Error in main execution: {str(e)}")

if __name__ == "__main__":
    main()

Data loaded successfully
Features prepared successfully. Shape: (200, 8)
Clusters 2: DB Index = 1.5050, Silhouette = 0.2547
Clusters 3: DB Index = 1.8440, Silhouette = 0.1741
Clusters 4: DB Index = 1.7322, Silhouette = 0.1775
Clusters 5: DB Index = 1.5760, Silhouette = 0.1843
Clusters 6: DB Index = 1.5633, Silhouette = 0.1772
Clusters 7: DB Index = 1.4489, Silhouette = 0.1769
Clusters 8: DB Index = 1.4643, Silhouette = 0.1863
Clusters 9: DB Index = 1.4352, Silhouette = 0.1862
Clusters 10: DB Index = 1.4468, Silhouette = 0.1822
Optimal number of clusters: 9
Clustering analysis completed successfully!
