# Customer Segmentation

Generated by Auto-Analysis App

## Feature Engineering
Derive new, meaningful features from the raw data to enrich the customer profile for segmentation. This step creates attributes that capture customer demographics, engagement, and spending habits more effectively: 'customer_age', 'total_dependents', 'total_lifetime_spend', 'avg_spend_per_product', and 'customer_tenure'. It also one-hot encodes 'customer_gender' and drops irrelevant columns.

In [None]:
# Individual spend categories that are summed into 'total_lifetime_spend'.
spend_columns = [
    'lifetime_spend_groceries', 'lifetime_spend_electronics', 'lifetime_spend_vegetables',
    'lifetime_spend_nonalcohol_drinks', 'lifetime_spend_alcohol_drinks', 'lifetime_spend_meat',
    'lifetime_spend_fish', 'lifetime_spend_hygiene', 'lifetime_spend_videogames',
    'lifetime_spend_petfood',
]


def engineer_customer_features(df, current_year=None):
    """Return a copy of ``df`` with engineered segmentation features.

    The input frame is never modified, so a failure part-way through cannot
    leave the caller with a half-transformed DataFrame.

    Args:
        df: DataFrame holding 'customer_birthdate', 'kids_home', 'teens_home',
            every column in ``spend_columns``, 'lifetime_total_distinct_products',
            'year_first_transaction' and 'customer_gender'.
        current_year: Reference year for age and tenure. Defaults to the
            current calendar year.

    Returns:
        A new DataFrame with 'customer_age', 'total_dependents',
        'total_lifetime_spend', 'avg_spend_per_product', 'customer_tenure' and
        integer 'gender_*' dummy columns added, and identifier, location and
        raw source columns removed.

    Raises:
        KeyError: If a required column is missing.
    """
    if current_year is None:
        current_year = pd.Timestamp.now().year
    df = df.copy()

    # Step 1.1: Calculate 'customer_age'; unparseable birthdates get the median age.
    birth_year = pd.to_datetime(df['customer_birthdate'], errors='coerce').dt.year
    age = current_year - birth_year
    median_age = age.median()
    df['customer_age'] = age.fillna(median_age).astype(int)
    print(f"'customer_age' calculated. Median age used for imputation: {median_age:.0f}")

    # Step 1.2: Create 'total_dependents'; a missing count is treated as 0.
    df['total_dependents'] = df['kids_home'].fillna(0) + df['teens_home'].fillna(0)
    print("'total_dependents' calculated.")

    # Step 1.3: Calculate 'total_lifetime_spend'; non-numeric entries count as 0.
    spend = df[spend_columns].apply(pd.to_numeric, errors='coerce').fillna(0)
    df['total_lifetime_spend'] = spend.sum(axis=1)
    print("'total_lifetime_spend' calculated.")

    # Step 1.4: Derive 'avg_spend_per_product'.
    # Zero products become NaN to avoid dividing by zero; the resulting NaN
    # average is then reported as 0.
    distinct_products = pd.to_numeric(
        df['lifetime_total_distinct_products'], errors='coerce'
    ).fillna(0)
    df['lifetime_total_distinct_products'] = distinct_products
    df['avg_spend_per_product'] = (
        df['total_lifetime_spend'] / distinct_products.replace(0, np.nan)
    ).fillna(0)
    print("'avg_spend_per_product' calculated.")

    # Step 1.5: Calculate 'customer_tenure'; tenure cannot be negative.
    first_year = pd.to_numeric(df['year_first_transaction'], errors='coerce')
    median_year_first_transaction = first_year.median()
    first_year = first_year.fillna(median_year_first_transaction).astype(int)
    df['customer_tenure'] = (current_year - first_year).clip(lower=0)
    print(f"'customer_tenure' calculated. Median year first transaction used for imputation: {median_year_first_transaction:.0f}")

    # Step 1.6: One-hot encode 'customer_gender'.
    # Missing values take the most frequent gender; if every value is missing
    # there is no mode, so fall back to a placeholder instead of crashing.
    gender_mode = df['customer_gender'].mode()
    fill_value = gender_mode.iloc[0] if not gender_mode.empty else 'unknown'
    gender = df['customer_gender'].fillna(fill_value)
    # dtype=int keeps the dummies numeric: boolean dummies are not matched by
    # select_dtypes(include=np.number) and would be left out of scaling.
    gender_dummies = pd.get_dummies(gender, prefix='gender', drop_first=True, dtype=int)
    df = pd.concat([df, gender_dummies], axis=1)
    print("'customer_gender' one-hot encoded.")

    # Step 1.7: Drop identifiers, coordinates and raw columns replaced by
    # engineered features. Only columns that are actually present are dropped.
    candidates = [
        'customer_id', 'customer_name', 'customer_birthdate', 'loyalty_card_number',
        'latitude', 'longitude', 'customer_gender',
        'kids_home', 'teens_home', 'year_first_transaction',
    ] + spend_columns
    columns_to_drop = [col for col in candidates if col in df.columns]
    df = df.drop(columns=columns_to_drop)
    print(f"Dropped irrelevant columns: {columns_to_drop}")

    return df


try:
    current_year = pd.Timestamp.now().year
    # Rebind only after every step succeeded.
    df_processed = engineer_customer_features(df_processed, current_year=current_year)

    global features_for_clustering
    features_for_clustering = df_processed.columns.tolist()
    print("Feature Engineering complete. Remaining features for clustering:")
    print(features_for_clustering)
    print(f"DataFrame shape after feature engineering: {df_processed.shape}")
except Exception as e:
    print(f"Error during Feature Engineering: {e}")

## Feature Scaling
Scale numerical features using StandardScaler to ensure that no single feature dominates the clustering process due to its magnitude. This transforms the data to have a mean of 0 and a variance of 1.

In [None]:
def clean_features_for_scaling(df):
    """Return a copy of ``df`` whose usable columns are numeric.

    * Boolean columns are cast to 0/1 integers so that
      ``select_dtypes(include=np.number)`` keeps them.
    * Object/string columns are coerced with ``pd.to_numeric``; values that
      cannot be parsed are replaced by the column median.
    * Object/string columns with no parseable value at all are dropped, as are
      columns that contain only NaN.

    Args:
        df: DataFrame of engineered features. It is not modified.

    Returns:
        A cleaned copy of ``df``.
    """
    cleaned = df.copy()
    unusable = []
    for col in cleaned.columns:
        series = cleaned[col]
        if pd.api.types.is_bool_dtype(series):
            cleaned[col] = series.astype(int)
        elif pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            print(f"Warning: Non-numeric column '{col}' found. Attempting to convert to numeric or drop.")
            converted = pd.to_numeric(series, errors='coerce')
            if converted.isnull().all():
                # Nothing parseable: filling with 0 would only create a
                # constant column that carries no information.
                unusable.append(col)
            else:
                cleaned[col] = converted.fillna(converted.median())
    if unusable:
        print(f"Dropping columns with no numeric content: {unusable}")
        cleaned = cleaned.drop(columns=unusable)
    # Drop any remaining columns that are entirely NaN.
    return cleaned.dropna(axis=1, how='all')


try:
    scaling_method = "StandardScaler"

    # Ensure all features are numeric before scaling.
    df_processed = clean_features_for_scaling(df_processed)

    # Identify numerical features for scaling (all remaining numeric columns).
    numerical_features = df_processed.select_dtypes(include=np.number).columns.tolist()
    if not numerical_features:
        raise ValueError("No numerical features found for scaling after preprocessing.")

    remaining_nan = int(df_processed[numerical_features].isnull().sum().sum())
    if remaining_nan:
        print(f"Warning: {remaining_nan} missing values remain in the numerical features and will be carried into the scaled data.")

    print(f"Applying {scaling_method} to {len(numerical_features)} numerical features.")
    scaler = StandardScaler()
    global df_scaled
    df_scaled = pd.DataFrame(
        scaler.fit_transform(df_processed[numerical_features]),
        columns=numerical_features,
        index=df_processed.index,
    )
    print(f"Feature scaling complete using {scaling_method}. Scaled DataFrame shape: {df_scaled.shape}")
except Exception as e:
    print(f"Error during Feature Scaling: {e}")

## Determine Optimal Number of Clusters
Evaluate k from 1 to 10 using two complementary diagnostics: the Elbow Method, based on the within-cluster sum of squares (WCSS), and the Silhouette Score, which is only defined for k ≥ 2. Both are plotted to aid visual inspection, and the k with the highest Silhouette Score is reported as a suggestion.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def _show_metric_curve(k_axis, values, title, y_label):
    """Draw one metric-versus-K line chart and display it."""
    plt.figure(figsize=(10, 6))
    plt.plot(k_axis, values, marker='o', linestyle='--')
    plt.title(title)
    plt.xlabel('Number of Clusters (K)')
    plt.ylabel(y_label)
    plt.grid(True)
    plt.xticks(k_axis)
    plt.tight_layout()
    plt.show()


try:
    max_k_for_evaluation = 10

    if df_scaled is None or df_scaled.empty:
        raise ValueError("Scaled DataFrame is empty or not available. Please run Feature Scaling first.")

    print(f"Evaluating optimal number of clusters (K) from 1 to {max_k_for_evaluation}...")

    # Metrics collected per candidate K. The silhouette lists are parallel:
    # silhouette_scores[i] belongs to k_values_for_silhouette[i].
    wcss = []
    silhouette_scores = []
    k_values_for_silhouette = []

    candidate_ks = range(1, max_k_for_evaluation + 1)
    for k in candidate_ks:
        kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42, n_init=10)
        kmeans.fit(df_scaled)
        wcss.append(kmeans.inertia_)

        # The silhouette score needs at least two clusters.
        if k > 1:
            score = silhouette_score(df_scaled, kmeans.labels_)
            silhouette_scores.append(score)
            k_values_for_silhouette.append(k)
        print(f"  Processed K={k}...")

    # Step 3.1: Elbow Method (WCSS)
    _show_metric_curve(
        candidate_ks,
        wcss,
        'Elbow Method for Optimal K (WCSS)',
        'Within-Cluster Sum of Squares (WCSS)',
    )
    print("\nElbow Method plot displayed. Look for the 'elbow point' where the decrease in WCSS starts to slow down significantly.")

    # Step 3.2: Silhouette Score
    if not k_values_for_silhouette:
        print("\nSilhouette scores could not be calculated (e.g., max_k_for_evaluation was 1).")
    else:
        _show_metric_curve(
            k_values_for_silhouette,
            silhouette_scores,
            'Silhouette Score for Optimal K',
            'Silhouette Score',
        )
        print("Silhouette Score plot displayed. Higher scores indicate better-defined and more separated clusters.")

        # Suggest the K with the highest score (first one wins on ties).
        peak_score = max(silhouette_scores)
        optimal_k_silhouette = k_values_for_silhouette[silhouette_scores.index(peak_score)]
        print(f"\nSuggestion based on Silhouette Score: K = {optimal_k_silhouette} yields the highest score.")

    # Step 3.3: Guidance for the final, manual choice
    print("\n--- Determining Optimal K ---")
    print("Based on the Elbow Method and Silhouette Score plots, visually inspect to determine the optimal number of clusters (k).")
    print("The 'elbow point' in the WCSS plot and the peak in the Silhouette Score plot suggest good candidates for k.")
    print("Consider domain knowledge and interpretability when making your final decision.")
except Exception as e:
    print(f"Error during Optimal Number of Clusters determination: {e}")

## Apply Clustering Algorithm (K-Means)
Apply the K-Means clustering algorithm with 4 clusters, a random state of 42 for reproducibility, and 10 initializations to segment customers into distinct groups based on their scaled features. Cluster labels are then assigned back to the original dataset.

In [None]:
def align_cluster_labels(labels, scaled_index, original_index):
    """Map cluster labels onto the original rows by index rather than position.

    Args:
        labels: One label per row of the scaled data, in row order.
        scaled_index: pandas Index of the scaled DataFrame.
        original_index: pandas Index of the DataFrame receiving the labels.

    Returns:
        A Series named 'cluster' indexed like ``original_index``. When both
        indexes are identical the labels are returned unchanged. Otherwise
        rows of the original frame that were not clustered hold ``<NA>`` and
        the Series uses the nullable 'Int64' dtype.

    Raises:
        ValueError: If the number of labels does not match ``scaled_index``,
            if the indexes differ and either contains duplicates, or if some
            clustered rows do not exist in ``original_index``.
    """
    label_series = pd.Series(labels, index=scaled_index, name='cluster')
    if scaled_index.equals(original_index):
        return label_series
    if not (scaled_index.is_unique and original_index.is_unique):
        raise ValueError("Cannot align cluster labels: the scaled and original indexes differ and contain duplicates.")
    unknown_rows = scaled_index.difference(original_index)
    if len(unknown_rows) > 0:
        raise ValueError(f"Cannot align cluster labels: {len(unknown_rows)} clustered rows are missing from the original DataFrame.")
    return label_series.reindex(original_index).astype('Int64')


try:
    n_clusters = 4
    random_state = 42
    n_init = 10

    if df_scaled is None or df_scaled.empty:
        raise ValueError("Scaled DataFrame is empty or not available. Please run Feature Scaling first.")
    if df_original is None or df_original.empty:
        raise ValueError("Original DataFrame (df_original) is empty or not available. It's needed for adding cluster labels.")

    print(f"Initializing and training K-Means model with n_clusters={n_clusters}, random_state={random_state}, n_init={n_init}.")

    # Step 4.1: Initialize and train the K-Means model
    global kmeans_model
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=n_init)
    kmeans_model.fit(df_scaled)
    print("K-Means model training complete.")

    # Step 4.2: Assign each customer to a cluster.
    # Labels follow the row order of df_scaled, so they are matched to
    # df_original through the index instead of assuming identical row order.
    df_original['cluster'] = align_cluster_labels(
        kmeans_model.labels_, df_scaled.index, df_original.index
    )
    unassigned = int(df_original['cluster'].isna().sum())
    if unassigned:
        print(f"Warning: {unassigned} rows of the original DataFrame were not part of the scaled data and have no cluster label.")
    print(f"Cluster labels assigned to the original DataFrame. Found {df_original['cluster'].nunique()} clusters.")

    print("First 5 rows with cluster labels:")
    # Only preview columns that exist, so a missing 'customer_id' is not
    # reported as a clustering failure.
    preview_columns = [col for col in ('customer_id', 'cluster') if col in df_original.columns]
    print(df_original[preview_columns].head())
except Exception as e:
    print(f"Error during K-Means Clustering: {e}")

## Cluster Profiling and Interpretation
Analyze the characteristics of each identified cluster by calculating descriptive statistics (mean, median, standard deviation) for all original and engineered features. Visualizations are created to compare feature distributions across clusters, aiding in assigning meaningful names and summarizing key attributes for each segment.

In [None]:
# Individual spend categories, profiled one by one and summed into
# 'total_lifetime_spend'.
spend_columns_orig = [
    'lifetime_spend_groceries', 'lifetime_spend_electronics', 'lifetime_spend_vegetables',
    'lifetime_spend_nonalcohol_drinks', 'lifetime_spend_alcohol_drinks', 'lifetime_spend_meat',
    'lifetime_spend_fish', 'lifetime_spend_hygiene', 'lifetime_spend_videogames',
    'lifetime_spend_petfood',
]


def build_profiling_frame(df, current_year=None):
    """Return a copy of ``df`` with the engineered features added, unscaled.

    Original columns are kept (cleaned where needed) so that clusters can be
    described both by raw and by engineered attributes.

    Args:
        df: DataFrame holding 'customer_birthdate', 'kids_home', 'teens_home',
            every column in ``spend_columns_orig``,
            'lifetime_total_distinct_products', 'year_first_transaction' and
            'customer_gender'. It is not modified.
        current_year: Reference year for age and tenure. Defaults to the
            current calendar year.

    Returns:
        A new DataFrame with 'customer_age', 'total_dependents',
        'total_lifetime_spend', 'avg_spend_per_product' and 'customer_tenure'
        added and missing values imputed in the columns used to build them.

    Raises:
        KeyError: If a required column is missing.
    """
    if current_year is None:
        current_year = pd.Timestamp.now().year
    profiled = df.copy()

    # Age: unparseable birthdates get the median age.
    profiled['customer_birthdate'] = pd.to_datetime(profiled['customer_birthdate'], errors='coerce')
    age = current_year - profiled['customer_birthdate'].dt.year
    profiled['customer_age'] = age.fillna(age.median()).astype(int)

    # Dependents: a missing count is treated as 0.
    for col in ('kids_home', 'teens_home'):
        profiled[col] = profiled[col].fillna(0)
    profiled['total_dependents'] = profiled['kids_home'] + profiled['teens_home']

    # Spend: non-numeric entries count as 0.
    for col in spend_columns_orig:
        profiled[col] = pd.to_numeric(profiled[col], errors='coerce').fillna(0)
    profiled['total_lifetime_spend'] = profiled[spend_columns_orig].sum(axis=1)

    # Average spend per product; zero products yields 0 instead of a division error.
    distinct_products = pd.to_numeric(
        profiled['lifetime_total_distinct_products'], errors='coerce'
    ).fillna(0)
    profiled['lifetime_total_distinct_products'] = distinct_products
    profiled['avg_spend_per_product'] = (
        profiled['total_lifetime_spend'] / distinct_products.replace(0, np.nan)
    ).fillna(0)

    # Tenure: missing first-transaction years get the median; never negative.
    first_year = pd.to_numeric(profiled['year_first_transaction'], errors='coerce')
    first_year = first_year.fillna(first_year.median())
    profiled['year_first_transaction'] = first_year
    profiled['customer_tenure'] = (current_year - first_year.astype(int)).clip(lower=0)

    # Gender: missing values take the most frequent gender; when every value
    # is missing there is no mode, so the column is left as it is.
    gender_mode = profiled['customer_gender'].mode()
    if not gender_mode.empty:
        profiled['customer_gender'] = profiled['customer_gender'].fillna(gender_mode.iloc[0])

    return profiled


def summarize_cluster_profiles(df, features, cluster_col='cluster'):
    """Return mean, median and standard deviation per cluster.

    Only features that exist in ``df`` and are numeric (and not boolean) are
    aggregated, because these statistics are not defined for text columns such
    as 'customer_gender'.

    Args:
        df: DataFrame containing ``cluster_col`` and the features.
        features: Candidate feature names.
        cluster_col: Name of the column holding the cluster labels.

    Returns:
        A DataFrame indexed by (feature, statistic) with one column per cluster.
    """
    numeric_features = [
        name for name in features
        if name in df.columns
        and pd.api.types.is_numeric_dtype(df[name])
        and not pd.api.types.is_bool_dtype(df[name])
    ]
    return df.groupby(cluster_col)[numeric_features].agg(['mean', 'median', 'std']).T


try:
    if 'cluster' not in df_original.columns:
        raise ValueError("Cluster labels not found in df_original. Please run K-Means Clustering first.")

    print("Starting Cluster Profiling and Interpretation...")

    # Step 5.1: Calculate descriptive statistics within each cluster.
    # Profiling uses unscaled values: the engineered features are recomputed
    # on a copy of df_original, which already carries the 'cluster' column.
    current_year = pd.Timestamp.now().year
    df_for_profiling = build_profiling_frame(df_original, current_year=current_year)

    # Relevant features for profiling (original + engineered, excluding IDs and coordinates).
    profiling_features = [
        'customer_age', 'total_dependents', 'number_complaints', 'distinct_stores_visited',
        'total_lifetime_spend', 'avg_spend_per_product', 'customer_tenure',
        'percentage_of_products_bought_promotion', 'typical_hour', 'customer_gender'
    ] + spend_columns_orig
    # Keep only the columns that actually exist in df_for_profiling.
    profiling_features = [f for f in profiling_features if f in df_for_profiling.columns]

    # 'customer_gender' is categorical; it is left out of the numeric summary
    # and shown through the gender distribution plot below.
    cluster_profiles = summarize_cluster_profiles(df_for_profiling, profiling_features)
    print("\nCluster Profiles (Mean, Median, Std Dev):\n")
    print(cluster_profiles)

    # Step 5.2: Create visualizations
    print("\nGenerating visualizations for cluster comparison...")

    # Box plots for key spending features
    key_spend_features = ['total_lifetime_spend', 'lifetime_spend_groceries', 'lifetime_spend_electronics', 'lifetime_spend_alcohol_drinks']
    for feature in key_spend_features:
        if feature in df_for_profiling.columns:
            plt.figure(figsize=(10, 6))
            sns.boxplot(x='cluster', y=feature, data=df_for_profiling)
            plt.title(f'Distribution of {feature} Across Clusters')
            plt.xlabel('Cluster')
            plt.ylabel(feature)
            plt.grid(axis='y', linestyle='--', alpha=0.7)
            plt.tight_layout()
            plt.show()

    # Stacked bar plot of the gender share within each cluster
    if 'customer_gender' in df_for_profiling.columns:
        gender_cluster_dist = df_for_profiling.groupby(['cluster', 'customer_gender']).size().unstack(fill_value=0)
        # Divide each row by its total so every cluster sums to 1.
        gender_cluster_dist_norm = gender_cluster_dist.div(gender_cluster_dist.sum(axis=1), axis=0)
        gender_cluster_dist_norm.plot(kind='bar', stacked=True, figsize=(10, 6))
        plt.title('Gender Distribution Across Clusters')
        plt.xlabel('Cluster')
        plt.ylabel('Proportion')
        plt.xticks(rotation=0)
        plt.legend(title='Gender')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()

    # Bar plot of the average 'total_dependents' per cluster
    if 'total_dependents' in df_for_profiling.columns:
        plt.figure(figsize=(10, 6))
        sns.barplot(x='cluster', y='total_dependents', data=df_for_profiling, estimator=np.mean, errorbar='sd')
        plt.title('Average Total Dependents Across Clusters')
        plt.xlabel('Cluster')
        plt.ylabel('Average Total Dependents')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()

    # Step 5.3 & 5.4: Assign meaningful names and summarize characteristics
    print("\nBased on the statistical analysis and visualizations, assign meaningful, descriptive names to each cluster.")
    print("Example: 'High-Value Families', 'Budget-Conscious Singles', 'Electronics Enthusiasts'.")
    print("Summarize the key characteristics and behaviors of each customer segment, highlighting their preferences, spending patterns, and demographic profiles.")
    print("Review the 'cluster_profiles' DataFrame and the generated plots to formulate these descriptions.")
except Exception as e:
    print(f"Error during Cluster Profiling and Interpretation: {e}")

## Evaluation and Validation
Assess the quality and robustness of the generated clusters using internal validation metrics. Calculate the final Silhouette Score and Davies-Bouldin Index for the chosen clustering solution. A qualitative review of cluster profiles is also recommended to ensure business interpretability and actionability.

In [None]:
# Both metrics are imported here so the cell does not depend on an earlier
# cell having brought them into scope.
from sklearn.metrics import davies_bouldin_score, silhouette_score

try:
    if df_scaled is None or df_scaled.empty:
        raise ValueError("Scaled DataFrame is empty or not available. Please run Feature Scaling first.")
    if kmeans_model is None or 'cluster' not in df_original.columns:
        raise ValueError("K-Means model not trained or cluster labels not assigned. Please run K-Means Clustering first.")

    print("Starting Evaluation and Validation...")

    # Step 6.1: Calculate the final Silhouette Score and Davies-Bouldin Index
    # for the labels produced by the fitted model on the scaled features.
    final_silhouette_score = silhouette_score(df_scaled, kmeans_model.labels_)
    final_davies_bouldin_score = davies_bouldin_score(df_scaled, kmeans_model.labels_)
    print(f"\nFinal Silhouette Score: {final_silhouette_score:.4f} (Higher is better, range -1 to 1)")
    print(f"Final Davies-Bouldin Index: {final_davies_bouldin_score:.4f} (Lower is better, minimum 0)")

    # Step 6.2: Perform a qualitative review of the cluster profiles
    print("\nQualitative Review: Review the cluster profiles (from Step 5) to ensure they are distinct, interpretable, and actionable from a business perspective. Verify that the segments make intuitive sense.")
    print("Consider if the identified segments align with known business strategies or reveal new opportunities.")

    # Step 6.3: Guidance only; no additional models are fitted here.
    print("\nStability Assessment: To assess the stability of cluster assignments, consider running the K-Means algorithm multiple times with slightly different random states (e.g., 42, 1, 100).")
    print("Compare the resulting cluster assignments (e.g., using Adjusted Rand Index or Jaccard similarity) to see how consistent the segments are.")
    print("For example, you could run:")
    print("  kmeans_model_run2 = KMeans(n_clusters=n_clusters, random_state=1, n_init=n_init)")
    print("  kmeans_model_run2.fit(df_scaled)")
    print("  # Then compare kmeans_model.labels_ with kmeans_model_run2.labels_")
except Exception as e:
    print(f"Error during Evaluation and Validation: {e}")