# In [None]:
# ==============================================================================
# ADVANCED CUSTOMER SEGMENTATION ANALYTICS SUITE - COLAB OPTIMIZED
# Enterprise-Grade ML Pipeline for Strategic Customer Intelligence
# ==============================================================================

# STEP 1: Install required packages (run this first in Colab)
"""
!pip install umap-learn hdbscan sentence-transformers xgboost lightgbm plotly --quiet
"""

print("🚀 ADVANCED CUSTOMER SEGMENTATION ANALYTICS SUITE")
print("=" * 80)

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings("ignore")

# Core ML & Statistical Libraries
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer
from sklearn.decomposition import PCA, TruncatedSVD, FactorAnalysis
from sklearn.ensemble import IsolationForest, RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.model_selection import cross_val_score
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture

# Advanced ML Libraries
import xgboost as xgb
import lightgbm as lgb
from scipy import stats
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import dendrogram, linkage
import networkx as nx

# NLP & Deep Learning
from sentence_transformers import SentenceTransformer
try:
    import tensorflow as tf
    from tensorflow.keras.models import Model, Sequential
    from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, LSTM, Embedding
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
    from tensorflow.keras.regularizers import l2
    from tensorflow.keras import backend as K
    TF_AVAILABLE = True
except ImportError:
    print("⚠️ TensorFlow not available - some features will be skipped")
    TF_AVAILABLE = False

# Advanced Clustering & Visualization
import hdbscan
import umap
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

# Statistical Testing
from scipy.stats import chi2_contingency, kruskal
from statsmodels.stats.multitest import multipletests

# Plotly notebook configuration.
# The Colab-specific renderer and widget manager are applied only when the
# google.colab package can be imported, so the script also runs in a plain
# Jupyter/IPython session instead of failing at import time.
import plotly.offline as pyo
from IPython.display import display, HTML
pyo.init_notebook_mode(connected=True)

try:
    from google.colab import output
except ImportError:
    print("⚠️ google.colab not available - using default Plotly renderer")
    IN_COLAB = False
else:
    IN_COLAB = True
    pio.renderers.default = "colab"
    # Enable interactive plots in Colab
    output.enable_custom_widget_manager()

# ==============================================================================
# SECTION 1: ENHANCED DATA LOADING & EXPLORATION
# ==============================================================================

class DataLoader:
    """Load transaction data from a CSV file and keep a summary profile of it.

    When the CSV cannot be found, a seeded synthetic e-commerce dataset is
    generated instead so the remaining pipeline steps still have input.
    """

    def __init__(self, file_path='int_online_tx.csv'):
        self.file_path = file_path
        self.df_raw = None
        self.data_profile = {}

    def load_data(self):
        """Return the frame read from ``self.file_path``.

        Falls back to :meth:`create_sample_data` when the file does not
        exist. In both cases the frame is stored on ``self.df_raw`` and
        profiled.
        """
        try:
            self.df_raw = pd.read_csv(self.file_path)
        except FileNotFoundError:
            print(f"❌ Error: '{self.file_path}' not found.")
            print("📁 Creating sample data for demonstration...")
            return self.create_sample_data()

        n_rows, n_cols = self.df_raw.shape
        print(f"✅ Data loaded: {n_rows:,} rows × {n_cols} columns")
        self.profile_data()
        return self.df_raw

    def create_sample_data(self):
        """Generate 5,000 random transactions spread over 500 customer ids.

        The global NumPy RNG is seeded with 42, so repeated calls produce the
        same frame.
        """
        np.random.seed(42)
        n_customers = 500
        n_transactions = 5000
        categories = ['Electronics', 'Clothing', 'Home', 'Books', 'Sports']
        countries = ['UK', 'USA', 'Germany', 'France']

        # Pool of customer ids the transactions are drawn from
        customer_pool = np.random.randint(10000, 20000, n_customers)

        def random_transaction():
            # The order of the random draws below determines the generated
            # values; keep it stable.
            customer_id = np.random.choice(customer_pool)
            invoice_date = pd.Timestamp('2023-01-01') + pd.Timedelta(days=np.random.randint(0, 365))
            quantity = np.random.randint(1, 50)
            unit_price = np.random.uniform(1.0, 100.0)
            stock_code = f"ITEM{np.random.randint(1000, 9999)}"
            description = f"Product {np.random.choice(categories)} Item"
            invoice_no = f"INV{np.random.randint(10000, 99999)}"
            country = np.random.choice(countries)
            return {
                'InvoiceNo': invoice_no,
                'StockCode': stock_code,
                'Description': description,
                'Quantity': quantity,
                'InvoiceDate': invoice_date,
                'UnitPrice': unit_price,
                'CustomerID': customer_id,
                'Country': country,
            }

        rows = [random_transaction() for _ in range(n_transactions)]

        self.df_raw = pd.DataFrame(rows)
        n_rows, n_cols = self.df_raw.shape
        print(f"✅ Sample data created: {n_rows:,} rows × {n_cols} columns")
        self.profile_data()
        return self.df_raw

    def profile_data(self):
        """Fill ``self.data_profile`` from ``self.df_raw`` and print a short summary."""
        frame = self.df_raw

        if 'CustomerID' in frame:
            customer_count = frame['CustomerID'].nunique()
        else:
            customer_count = 0

        memory_mb = frame.memory_usage(deep=True).sum() / 1024**2

        self.data_profile = {
            'shape': frame.shape,
            'memory_usage': memory_mb,
            'missing_values': frame.isnull().sum(),
            'duplicates': frame.duplicated().sum(),
            'unique_customers': customer_count,
            'date_range': self.get_date_range(),
            'numeric_summary': frame.select_dtypes(include=[np.number]).describe(),
        }

        profile = self.data_profile
        print(f"📊 Data Profile:")
        print(f"    Memory Usage: {profile['memory_usage']:.2f} MB")
        print(f"    Unique Customers: {profile['unique_customers']:,}")
        print(f"    Duplicates: {profile['duplicates']:,}")

    def get_date_range(self):
        """Return first/last ``InvoiceDate`` and the day span, or None without that column."""
        if 'InvoiceDate' not in self.df_raw.columns:
            return None

        dates = pd.to_datetime(self.df_raw['InvoiceDate'])
        first, last = dates.min(), dates.max()
        return {'start': first, 'end': last, 'days': (last - first).days}

# ==============================================================================
# SECTION 2: ADVANCED FEATURE ENGINEERING FRAMEWORK
# ==============================================================================

class AdvancedFeatureEngineer:
    """Build customer-level feature tables from transaction rows.

    Every ``create_*`` method groups ``self.df`` by ``CustomerID``, returns a
    DataFrame indexed by customer and stores it in ``self.features`` under a
    fixed key. The methods are independent and can be called in any order:
    each one prepares the derived columns it needs itself.
    """

    def __init__(self, df):
        self.df = df.copy()  # work on a copy so the caller's frame is untouched
        self.features = {}
        self.feature_importance = {}

    def _prepare_transactions(self):
        """Ensure ``InvoiceDate`` is datetime and ``Sales`` (Quantity * UnitPrice) exists.

        Idempotent; called by every method that reads these columns so that
        no method depends on another having run first.
        """
        self.df['InvoiceDate'] = pd.to_datetime(self.df['InvoiceDate'])
        self.df['Sales'] = self.df['Quantity'] * self.df['UnitPrice']

    @staticmethod
    def _flatten_columns(frame):
        """Collapse the (source column, feature name) MultiIndex into ``source_feature``."""
        frame.columns = ['_'.join(col).strip() for col in frame.columns.values]
        return frame

    @staticmethod
    def _most_common(default):
        """Return an aggregator yielding the first mode of a group, or ``default`` if none."""
        def pick(values):
            modes = values.mode()
            return modes.iloc[0] if not modes.empty else default
        return pick

    @staticmethod
    def _product_concentration(codes):
        """Return 1 - (std / mean) of the per-product purchase counts in a group."""
        counts = codes.value_counts()
        mean_count = counts.mean()
        return 1 - (counts.std() / mean_count) if mean_count > 0 else 0

    def create_temporal_features(self):
        """Compute recency, frequency, monetary and seasonal features per customer.

        Returns:
            DataFrame indexed by ``CustomerID``; also stored in
            ``self.features['temporal']``. Missing values are filled with 0.
        """
        print("🕐 Engineering temporal features...")

        self._prepare_transactions()

        # Reference date for recency calculations: the day after the last invoice
        reference_date = self.df['InvoiceDate'].max() + timedelta(days=1)

        # Customer-level temporal aggregations
        temporal_features = self.df.groupby('CustomerID').agg({
            'InvoiceDate': [
                ('recency_days', lambda x: (reference_date - x.max()).days),
                ('first_purchase', 'min'),
                ('last_purchase', 'max'),
                ('purchase_span_days', lambda x: (x.max() - x.min()).days if len(x) > 1 else 0),
                ('purchase_frequency', 'count')
            ],
            'InvoiceNo': [('total_orders', 'nunique')],
            'Sales': [
                ('total_revenue', 'sum'),
                ('avg_order_value', 'mean'),
                ('revenue_volatility', 'std'),
                ('max_order_value', 'max'),
                ('min_order_value', 'min')
            ],
            'Quantity': [
                ('total_quantity', 'sum'),
                ('avg_basket_size', 'mean'),
                ('quantity_volatility', 'std')
            ]
        })
        self._flatten_columns(temporal_features)

        # Derived metrics. The aggregation above always produces the columns
        # read here, so no existence checks are needed.
        # A customer whose invoice numbers are all missing has zero orders;
        # turn the resulting inf/NaN into 0 rather than leaking it downstream.
        temporal_features['purchase_regularity'] = (
            temporal_features['InvoiceDate_purchase_span_days']
            / temporal_features['InvoiceNo_total_orders']
        ).replace([np.inf, -np.inf], np.nan).fillna(0)

        temporal_features['customer_lifetime_days'] = (
            temporal_features['InvoiceDate_last_purchase']
            - temporal_features['InvoiceDate_first_purchase']
        ).dt.days

        # +1 avoids division by zero for single-day customers
        temporal_features['revenue_per_day'] = (
            temporal_features['Sales_total_revenue']
            / (temporal_features['customer_lifetime_days'] + 1)
        )

        # Seasonal patterns
        temporal_features = temporal_features.join(self.create_seasonal_features())

        # The raw first/last timestamps are only intermediates
        temporal_features.drop(
            columns=['InvoiceDate_first_purchase', 'InvoiceDate_last_purchase'],
            inplace=True,
            errors='ignore',
        )
        temporal_features.fillna(0, inplace=True)

        self.features['temporal'] = temporal_features
        print(f"    ✅ Created {temporal_features.shape[1]} temporal features")
        return temporal_features

    def create_seasonal_features(self):
        """Compute favourite month/weekday, weekend share and quarter spread per customer.

        Parses ``InvoiceDate`` itself, so it works whether or not
        :meth:`create_temporal_features` has been called. ``self.df`` is not
        modified.
        """
        invoice_dates = pd.to_datetime(self.df['InvoiceDate'])
        day_of_week = invoice_dates.dt.dayofweek
        df_seasonal = pd.DataFrame({
            'CustomerID': self.df['CustomerID'],
            'month': invoice_dates.dt.month,
            'day_of_week': day_of_week,
            'quarter': invoice_dates.dt.quarter,
            'is_weekend': day_of_week.isin([5, 6]).astype(int),
        })

        seasonal_agg = df_seasonal.groupby('CustomerID').agg({
            'month': [('favorite_month', self._most_common(1))],
            'day_of_week': [('favorite_day', self._most_common(0))],
            'is_weekend': [('weekend_shopper', 'mean')],
            'quarter': [('seasonal_spread', 'nunique')]
        })
        return self._flatten_columns(seasonal_agg)

    def create_product_diversity_features(self):
        """Compute product-variety and description-length features per customer.

        Adds a ``description_words`` column to ``self.df`` as a side effect.

        Returns:
            DataFrame indexed by ``CustomerID``; also stored in
            ``self.features['products']``. Missing values are filled with 0.
        """
        print("📦 Engineering product diversity features...")

        product_features = self.df.groupby('CustomerID').agg({
            'StockCode': [
                ('unique_products', 'nunique'),
                ('product_concentration', self._product_concentration)
            ],
            'Description': [
                ('avg_description_length', lambda x: x.str.len().mean())
            ]
        })
        self._flatten_columns(product_features)

        self.df['description_words'] = self.df['Description'].str.split().str.len()
        category_features = self.df.groupby('CustomerID').agg({
            'description_words': [('avg_product_complexity', 'mean')]
        })
        self._flatten_columns(category_features)

        product_features = product_features.join(category_features)
        product_features.fillna(0, inplace=True)

        self.features['products'] = product_features
        print(f"    ✅ Created {product_features.shape[1]} product diversity features")
        return product_features

    def create_behavioral_sequences(self):
        """Fit per-customer trends over chronologically ordered purchases.

        Linear and quadratic ``np.polyfit`` coefficients are computed over
        each customer's ``Sales`` and ``Quantity`` sequence, plus the mean of
        a rolling standard deviation of ``Sales``.

        Returns:
            DataFrame indexed by ``CustomerID``; also stored in
            ``self.features['sequences']``. Missing values are filled with 0.
        """
        print("🔄 Engineering behavioral sequence features...")

        # Needs datetime ordering and the Sales column even when called first
        self._prepare_transactions()

        df_sorted = self.df.sort_values(['CustomerID', 'InvoiceDate'])
        sequences = df_sorted.groupby('CustomerID').agg({
            'Sales': [
                ('purchase_trend', lambda x: np.polyfit(range(len(x)), x, 1)[0] if len(x) > 1 else 0),
                ('purchase_acceleration', lambda x: np.polyfit(range(len(x)), x, 2)[0] if len(x) > 2 else 0),
                ('purchase_volatility_trend', lambda x: x.rolling(window=min(3, len(x))).std().mean())
            ],
            'Quantity': [
                ('quantity_trend', lambda x: np.polyfit(range(len(x)), x, 1)[0] if len(x) > 1 else 0)
            ]
        })
        self._flatten_columns(sequences)
        sequences.fillna(0, inplace=True)

        self.features['sequences'] = sequences
        print(f"    ✅ Created {sequences.shape[1]} behavioral sequence features")
        return sequences

class EnhancedNLPAnalyzer:
    """Derive simple text features from product descriptions.

    Despite the name, no embedding model is used at present: the features are
    description length and word count, averaged per customer.
    """

    def __init__(self, df):
        self.df = df
        self.models = {}

    def create_multi_embedding_features(self):
        """Return per-customer mean description length and word count.

        Each transaction row is scored from its own ``Description`` (missing
        descriptions count as ``'unknown'``). Scoring row by row, rather than
        merging a per-product lookup back on ``StockCode``, keeps the row
        count unchanged even when one stock code appears with several
        different descriptions, and makes repeated calls give the same result.

        ``self.df`` is replaced by a copy carrying ``desc_length`` and
        ``desc_words`` columns; the frame passed to ``__init__`` is not
        modified.

        Returns:
            ``{'text': DataFrame}`` indexed by ``CustomerID`` with columns
            ``text_desc_length`` and ``text_desc_words``. If the features
            cannot be computed, the DataFrame is empty.
        """
        print("🧠 Creating advanced NLP embeddings...")

        try:
            # Use a simpler approach without sentence transformers for now
            # This can be enhanced when sentence-transformers is available
            print("    📝 Using basic text features (Sentence Transformers not available)")

            descriptions = self.df['Description'].fillna('unknown')

            # assign() returns a new frame, so the caller's DataFrame is left alone
            self.df = self.df.assign(
                desc_length=descriptions.str.len(),
                desc_words=descriptions.str.split().str.len(),
            )

            # Customer-level aggregation of text features
            customer_text_features = self.df.groupby('CustomerID').agg({
                'desc_length': 'mean',
                'desc_words': 'mean'
            }).add_prefix('text_')

            return {'text': customer_text_features}

        except Exception as e:
            # Deliberate best-effort: text features are optional for the pipeline
            print(f"    ⚠️ NLP features skipped: {e}")
            return {'text': pd.DataFrame()}

# ==============================================================================
# SECTION 3: ADVANCED CLUSTERING FRAMEWORK
# ==============================================================================

class AdvancedClusteringFramework:
    """Tune several clustering algorithms, pick the best one and validate it.

    ``self.results`` maps an algorithm name (``'kmeans'``, ``'hdbscan'``,
    ``'gmm'``) to its best parameters, silhouette score and labels.
    The label ``-1`` is treated as noise (the convention HDBSCAN uses) and is
    never counted as a cluster.
    """

    NOISE_LABEL = -1

    def __init__(self, feature_matrix):
        self.X = feature_matrix
        self.results = {}
        self.ensemble_labels = None

    @classmethod
    def _n_real_clusters(cls, labels):
        """Return the number of distinct labels, not counting the noise label."""
        labels = np.asarray(labels)
        return len(np.unique(labels[labels != cls.NOISE_LABEL]))

    def optimize_clustering_parameters(self):
        """Grid-search K-Means, HDBSCAN and GMM, scoring each fit by silhouette.

        A candidate is kept only when it yields at least two real (non-noise)
        clusters. The best candidate per algorithm is stored in
        ``self.results`` and the same dict is returned. With fewer than two
        samples an empty dict is returned and ``self.results`` is left as is.
        """
        print("🎯 Optimizing clustering parameters...")
        param_results = {}
        n_samples = self.X.shape[0]
        if n_samples < 2:
            print("    ❌ Not enough samples for clustering.")
            return param_results

        # K-Means (more stable than HDBSCAN for small datasets)
        kmeans_scores = []
        for k in range(2, min(10, n_samples // 2)):
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            labels = kmeans.fit_predict(self.X)
            if self._n_real_clusters(labels) > 1:
                score = silhouette_score(self.X, labels)
                kmeans_scores.append((k, score, labels, kmeans))

        if kmeans_scores:
            best_kmeans = max(kmeans_scores, key=lambda x: x[1])
            param_results['kmeans'] = {
                'best_params': {'n_clusters': best_kmeans[0]},
                'score': best_kmeans[1],
                'labels': best_kmeans[2],
                'model': best_kmeans[3]
            }

        # HDBSCAN (if available and suitable)
        try:
            hdbscan_scores = []
            candidate_sizes = [
                max(5, n_samples // 20),
                max(10, n_samples // 15),
                max(15, n_samples // 10),
            ]
            for min_size in candidate_sizes:
                if min_size >= n_samples:
                    continue
                clusterer = hdbscan.HDBSCAN(min_cluster_size=min_size, min_samples=3, metric='euclidean')
                labels = clusterer.fit_predict(self.X)
                # "One cluster plus noise" has two distinct labels but is not
                # a segmentation; require two genuine clusters.
                if self._n_real_clusters(labels) > 1:
                    # Noise points stay in the score, which penalises
                    # solutions that leave many samples unassigned.
                    score = silhouette_score(self.X, labels)
                    hdbscan_scores.append((min_size, score, labels))

            if hdbscan_scores:
                best_hdbscan = max(hdbscan_scores, key=lambda x: x[1])
                param_results['hdbscan'] = {
                    'best_params': {'min_cluster_size': best_hdbscan[0]},
                    'score': best_hdbscan[1],
                    'labels': best_hdbscan[2]
                }
        except Exception as e:
            print(f"    ⚠️ HDBSCAN skipped: {e}")

        # GMM
        try:
            gmm_scores = []
            for n in range(2, min(8, n_samples // 3)):
                gmm = GaussianMixture(n_components=n, random_state=42)
                labels = gmm.fit_predict(self.X)
                # A fit can collapse every sample into one component; skip it
                # instead of letting silhouette_score raise and throw away the
                # valid candidates collected so far.
                if self._n_real_clusters(labels) < 2:
                    continue
                score = silhouette_score(self.X, labels)
                gmm_scores.append((n, score, labels, gmm))

            if gmm_scores:
                best_gmm = max(gmm_scores, key=lambda x: x[1])
                param_results['gmm'] = {
                    'best_params': {'n_components': best_gmm[0]},
                    'score': best_gmm[1],
                    'labels': best_gmm[2],
                    'model': best_gmm[3]
                }
        except Exception as e:
            print(f"    ⚠️ GMM skipped: {e}")

        self.results = param_results
        print(f"    ✅ Best K-Means score: {param_results.get('kmeans', {}).get('score', 0):.3f}")
        print(f"    ✅ Best HDBSCAN score: {param_results.get('hdbscan', {}).get('score', 0):.3f}")
        print(f"    ✅ Best GMM score: {param_results.get('gmm', {}).get('score', 0):.3f}")
        return param_results

    def ensemble_clustering(self):
        """Select the labels of the highest-scoring algorithm in ``self.results``.

        Returns:
            The chosen label array, also stored in ``self.ensemble_labels``.
            When no results are available, an all-zero array (one entry per
            sample) is stored and returned.
        """
        print("🤝 Creating ensemble clustering...")
        if not self.results:
            print("    ❌ No clustering results to ensemble.")
            self.ensemble_labels = np.zeros(self.X.shape[0])
            return self.ensemble_labels

        # Use the best performing algorithm (first one wins on ties)
        best_algorithm = max(self.results, key=lambda name: self.results[name]['score'])
        self.ensemble_labels = self.results[best_algorithm]['labels']
        print(f"    ✅ Using {best_algorithm} with {len(np.unique(self.ensemble_labels))} clusters")

        return self.ensemble_labels

    def validate_clustering_quality(self, labels):
        """Compute silhouette, Calinski-Harabasz and Davies-Bouldin scores.

        Args:
            labels: One cluster label per row of ``self.X``; ``-1`` marks noise.

        Returns:
            A metrics dict, or ``{'error': message}`` when there are fewer
            than two real clusters, the label count does not match the
            feature matrix, or a metric raises.
        """
        print("📊 Validating clustering quality...")
        labels = np.asarray(labels)
        n_clusters = self._n_real_clusters(labels)
        if n_clusters < 2:
            print("    ❌ Less than 2 clusters found for validation.")
            return {'error': 'Less than 2 clusters found'}

        if self.X.shape[0] != len(labels):
            print("    ❌ Mismatch between feature matrix and labels.")
            return {'error': 'Feature matrix and labels mismatch'}

        try:
            silhouette = silhouette_score(self.X, labels)
            calinski = calinski_harabasz_score(self.X, labels)
            davies_bouldin = davies_bouldin_score(self.X, labels)

            metrics = {
                'silhouette_score': silhouette,
                'calinski_harabasz_score': calinski,
                'davies_bouldin_score': davies_bouldin,
                'n_clusters': n_clusters,
                'n_outliers': int(np.sum(labels == self.NOISE_LABEL)),
            }

            print(f"    ✅ Silhouette Score: {metrics['silhouette_score']:.3f}")
            print(f"    ✅ Calinski-Harabasz Score: {metrics['calinski_harabasz_score']:.3f}")
            print(f"    ✅ Davies-Bouldin Score: {metrics['davies_bouldin_score']:.3f}")
            print(f"    ✅ Number of Clusters: {metrics['n_clusters']}")
            print(f"    ✅ Number of Outliers: {metrics['n_outliers']}")

        except Exception as e:
            print(f"    ⚠️ Validation error: {e}")
            metrics = {'error': str(e)}

        return metrics

# ==============================================================================
# SECTION 4: STATISTICAL ANALYSIS & BUSINESS INSIGHTS
# ==============================================================================

class BusinessIntelligenceAnalyzer:
    """Turn cluster assignments into per-segment statistics, CLV and strategy.

    The three analysis methods build on each other: statistics feed the CLV
    analysis, which feeds the recommendations. Results accumulate in
    ``self.insights``. Rows labelled ``-1`` are excluded from every analysis.
    """

    def __init__(self, feature_matrix, cluster_labels, original_features):
        self.X = feature_matrix
        self.labels = cluster_labels
        self.features = original_features
        self.insights = {}

    def statistical_cluster_analysis(self):
        """Return ``{cluster_id: {'size', 'percentage'}}`` over the clustered rows.

        Returns an empty dict when features are empty, labels are missing or
        the two differ in length, or when every row is labelled ``-1``.
        """
        print("📈 Performing statistical cluster analysis...")
        if self.features.empty or self.labels is None or len(self.features) != len(self.labels):
            print("    ❌ Skipping analysis: Data missing or mismatched.")
            return {}

        labelled = self.features.copy()
        labelled['cluster'] = self.labels
        assigned = labelled[labelled['cluster'] != -1]
        if assigned.empty:
            print("    ❌ Skipping analysis: No clustered samples found.")
            return {}

        total_assigned = len(assigned)
        cluster_stats = {
            cluster_id: {
                'size': len(members),
                'percentage': len(members) / total_assigned * 100
            }
            for cluster_id, members in assigned.groupby('cluster')
        }

        self.insights['cluster_statistics'] = cluster_stats
        print(f"    ✅ Analyzed {len(cluster_stats)} clusters")
        return cluster_stats

    def customer_lifetime_value_analysis(self):
        """Estimate a CLV figure for each cluster from its mean feature values.

        Requires :meth:`statistical_cluster_analysis` to have produced
        results; otherwise an empty dict is returned. Feature columns that
        are absent are treated as 0.
        """
        print("💰 Analyzing Customer Lifetime Value by segment...")
        if not self.insights.get('cluster_statistics'):
            print("    ❌ Skipping CLV analysis: Cluster statistics not available.")
            return {}

        labelled = self.features.copy()
        labelled['cluster'] = self.labels
        assigned = labelled[labelled['cluster'] != -1]

        def column_mean(frame, column):
            # Fall back to a single zero when the feature column is missing
            return frame.get(column, pd.Series([0])).mean()

        clv_analysis = {}
        for cluster_id, segment_stats in self.insights['cluster_statistics'].items():
            members = assigned[assigned['cluster'] == cluster_id]

            avg_order_value = column_mean(members, 'Sales_avg_order_value')
            purchase_frequency = column_mean(members, 'InvoiceDate_purchase_frequency')
            customer_lifetime = column_mean(members, 'customer_lifetime_days')
            total_revenue = column_mean(members, 'Sales_total_revenue')

            # Without a positive lifetime and frequency the formula is not
            # applicable; use the observed total revenue instead.
            clv = total_revenue
            if customer_lifetime > 0 and purchase_frequency > 0:
                clv = (avg_order_value * purchase_frequency / 365) * customer_lifetime

            clv_analysis[cluster_id] = {
                'predicted_clv': clv,
                'avg_order_value': avg_order_value,
                'purchase_frequency': purchase_frequency,
                'customer_lifetime_days': customer_lifetime,
                'total_revenue': total_revenue,
                'size': segment_stats['size']
            }

        self.insights['clv_analysis'] = clv_analysis
        print("    ✅ CLV analysis completed")
        return clv_analysis

    def generate_business_recommendations(self):
        """Assign a segment type, strategy list and priority to each cluster.

        Clusters are ranked by ``predicted_clv``: the top one becomes
        "Premium Champions", the bottom one "At-Risk/Low Value", and those in
        between are split by comparing purchase frequency with average order
        value. Requires :meth:`customer_lifetime_value_analysis` results.
        """
        print("💡 Generating business recommendations...")
        if not self.insights.get('clv_analysis'):
            print("    ❌ Skipping recommendations: CLV analysis not available.")
            return {}

        ranked = sorted(
            self.insights['clv_analysis'].items(),
            key=lambda item: item[1]['predicted_clv'],
            reverse=True
        )
        n_segments = len(ranked)
        last_rank = n_segments - 1

        recommendations = {}
        for rank, (cluster_id, data) in enumerate(ranked):
            if rank == 0:  # Highest CLV
                segment_type = "Premium Champions"
                strategy = [
                    "Implement VIP program with exclusive benefits",
                    "Personalized high-value recommendations",
                    "Priority customer service"
                ]
            elif rank == last_rank:  # Lowest CLV
                segment_type = "At-Risk/Low Value"
                strategy = [
                    "Re-engagement campaigns with special offers",
                    "Win-back email sequences",
                    "Customer feedback surveys"
                ]
            elif data['purchase_frequency'] > data['avg_order_value']:
                segment_type = f"Frequent Buyers (Segment {cluster_id})"
                strategy = [
                    "Upselling campaigns for higher-value items",
                    "Bundle deals to increase order value",
                    "Loyalty program enrollment"
                ]
            else:
                segment_type = f"High-Value Infrequent (Segment {cluster_id})"
                strategy = [
                    "Targeted marketing to increase purchase frequency",
                    "Product education and recommendations",
                    "Seasonal campaign targeting"
                ]

            recommendations[cluster_id] = {
                'segment_type': segment_type,
                'strategy': strategy,
                'priority': n_segments - rank,
                'metrics': data
            }

        self.insights['recommendations'] = recommendations
        print("    ✅ Business recommendations generated")
        return recommendations

# ==============================================================================
# SECTION 5: ADVANCED VISUALIZATION SUITE - COLAB OPTIMIZED
# ==============================================================================

class AdvancedVisualizationSuite:
    """Comprehensive visualization for business stakeholders - Colab optimized"""

    def __init__(self, features_df, cluster_labels, insights):
        self.features_df = features_df
        self.cluster_labels = cluster_labels
        self.insights = insights

    def create_executive_dashboard(self):
        """Create comprehensive executive dashboard"""
        print("📊 Creating executive dashboard...")

        if self.features_df.empty or self.cluster_labels is None or len(self.features_df) != len(self.cluster_labels):
            print("    ❌ Skipping dashboard: Data missing or mismatched.")
            return

        viz_df = self.features_df.copy()
        viz_df['cluster'] = self.cluster_labels
        viz_df = viz_df[viz_df['cluster'] != -1]
        if viz_df.empty:
            print("    ❌ Skipping dashboard: No clustered samples found.")
            return

        # Create subplots
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=[
                'Customer Segments by Revenue & Size',
                'CLV Distribution by Segment',
                'Purchase Behavior Patterns',
                'Segment Performance Matrix'
            ],
            specs=[[{"type": "scatter"}, {"type": "bar"}],
                   [{"type": "scatter"}, {"type": "scatter"}]]
        )

        # Plot 1: Bubble chart - Revenue vs CLV
        if 'clv_analysis' in self.insights and self.insights['clv_analysis']:
            for cluster_id, data in self.insights['clv_analysis'].items():
                fig.add_trace(go.Scatter(
                    x=[data['avg_order_value']], 
                    y=[data['predicted_clv']], 
                    mode='markers+text',
                    marker=dict(
                        size=max(10, np.sqrt(data['size']) * 3),
                        opacity=0.7,
                        color=cluster_id,
                        colorscale='viridis'
                    ),
                    text=[f"S{cluster_id}"], 
                    textposition='middle center',
                    name=f"Segment {cluster_id}",
                    hovertemplate=f"<b>Segment {cluster_id}</b><br>" +
                                f"Avg Order Value: ${data['avg_order_value']:.2f}<br>" +
                                f"Predicted CLV: ${data['predicted_clv']:.2f}<br>" +
                                f"Size: {data['size']} customers<extra></extra>"
                ), row=1, col=1)

        # Plot 2: CLV bar chart
        if 'clv_analysis' in self.insights and self.insights['clv_analysis']:
            clv_df = pd.DataFrame(self.insights['clv_analysis']).T.sort_values('predicted_clv', ascending=False)
            fig.add_trace(go.Bar(
                x=[f"Segment {idx}" for idx in clv_df.index], 
                y=clv_df['predicted_clv'],
                name='Predicted CLV',
                marker_color='lightblue',
                text=[f"${val:.0f}" for val in clv_df['predicted_clv']],
                textposition='outside'
            ), row=1, col=2)

        # Plot 3: Purchase patterns
        revenue_col = 'Sales_total_revenue'
        frequency_col = 'InvoiceDate_purchase_frequency'
        
        if revenue_col in viz_df.columns and frequency_col in viz_df.columns:
            colors = px.colors.qualitative.Set3
            for i, cluster_id in enumerate(sorted(viz_df['cluster'].unique())):
                cluster_data = viz_df[viz_df['cluster'] == cluster_id]
                fig.add_trace(go.Scatter(
                    x=cluster_data[frequency_col], 
                    y=cluster_data[revenue_col],
                    mode='markers', 
                    name=f"Segment {cluster_id}",
                    marker=dict(
                        color=colors[i % len(colors)],
                        opacity=0.6,
                        size=8
                    ),
                    hovertemplate=f"<b>Segment {cluster_id}</b><br>" +
                                f"Purchase Frequency: %{{x}}<br>" +
                                f"Total Revenue: $%{{y:.2f}}<extra></extra>"
                ), row=2, col=1)

        # Plot 4: Segment Performance Matrix
        if revenue_col in viz_df.columns:
            recency_col = 'InvoiceDate_recency_days'
            if recency_col not in viz_df.columns:
                # Use a fallback column
                available_cols = [col for col in viz_df.columns if 'recency' in col.lower()]
                if available_cols:
                    recency_col = available_cols[0]
                else:
                    recency_col = viz_df.select_dtypes(include=[np.number]).columns[0]
            
            cluster_agg = viz_df.groupby('cluster').agg({
                revenue_col: 'mean',
                recency_col: 'mean'
            }).reset_index()
            
            # Add size information
            cluster_sizes = viz_df.groupby('cluster').size().reset_index(name='size')
            cluster_agg = cluster_agg.merge(cluster_sizes, on='cluster')
            
            fig.add_trace(go.Scatter(
                x=cluster_agg[recency_col], 
                y=cluster_agg[revenue_col],
                mode='markers+text',
                marker=dict(
                    size=cluster_agg['size'] / 2,
                    opacity=0.7,
                    color=cluster_agg['cluster'],
                    colorscale='plasma',
                    showscale=True
                ),
                text=[f'S{c}' for c in cluster_agg['cluster']],
                textposition='middle center',
                name='Segments',
                hovertemplate="<b>Segment %{text}</b><br>" +
                            f"Avg {recency_col}: %{{x:.1f}}<br>" +
                            f"Avg {revenue_col}: $%{{y:.2f}}<br>" +
                            "Size: %{marker.size} customers<extra></extra>"
            ), row=2, col=2)

        # Update layout
        fig.update_layout(
            title={
                'text': "<b>🎯 Customer Segmentation Executive Dashboard</b>",
                'x': 0.5,
                'xanchor': 'center',
                'font': {'size': 20}
            },
            height=800,
            showlegend=True,
            template='plotly_white'
        )
        
        # Update axis labels
        fig.update_xaxes(title_text="Average Order Value ($)", row=1, col=1)
        fig.update_yaxes(title_text="Predicted CLV ($)", row=1, col=1)
        fig.update_yaxes(title_text="Predicted CLV ($)", row=1, col=2)
        fig.update_xaxes(title_text="Purchase Frequency", row=2, col=1)
        fig.update_yaxes(title_text="Total Revenue ($)", row=2, col=1)
        fig.update_xaxes(title_text="Recency Metric", row=2, col=2)
        fig.update_yaxes(title_text="Average Revenue ($)", row=2, col=2)
        
        # CRITICAL: Use display() to show in Colab
        display(fig)
        print("    ✅ Executive dashboard created and displayed")

    def create_3d_cluster_visualization(self, X_reduced):
        """Advanced 3D cluster visualization"""
        print("🎨 Creating 3D cluster visualization...")
        
        if X_reduced is None or X_reduced.shape[1] < 2 or self.cluster_labels is None:
            print("    ❌ Skipping 3D visualization: Insufficient data or dimensions.")
            return

        try:
            # Use UMAP for 3D embedding
            if X_reduced.shape[1] >= 3:
                embedding_3d = X_reduced[:, :3]  # Use first 3 dimensions
            else:
                # Create 3D embedding from 2D
                reducer_3d = umap.UMAP(n_components=3, random_state=42, n_neighbors=15)
                embedding_3d = reducer_3d.fit_transform(X_reduced)
            
            # Create color mapping for clusters
            unique_labels = np.unique(self.cluster_labels)
            colors = px.colors.qualitative.Set3
            color_map = {label: colors[i % len(colors)] for i, label in enumerate(unique_labels)}
            
            # Create 3D scatter plot
            fig = go.Figure()
            
            for label in unique_labels:
                if label == -1:  # Outliers
                    mask = self.cluster_labels == label
                    fig.add_trace(go.Scatter3d(
                        x=embedding_3d[mask, 0],
                        y=embedding_3d[mask, 1],
                        z=embedding_3d[mask, 2],
                        mode='markers',
                        marker=dict(
                            size=4,
                            color='lightgray',
                            opacity=0.6
                        ),
                        name='Outliers',
                        hovertemplate="<b>Outlier</b><br>" +
                                    "X: %{x:.2f}<br>" +
                                    "Y: %{y:.2f}<br>" +
                                    "Z: %{z:.2f}<extra></extra>"
                    ))
                else:  # Regular clusters
                    mask = self.cluster_labels == label
                    fig.add_trace(go.Scatter3d(
                        x=embedding_3d[mask, 0],
                        y=embedding_3d[mask, 1],
                        z=embedding_3d[mask, 2],
                        mode='markers',
                        marker=dict(
                            size=5,
                            color=color_map[label],
                            opacity=0.8
                        ),
                        name=f'Segment {label}',
                        hovertemplate=f"<b>Segment {label}</b><br>" +
                                    "X: %{x:.2f}<br>" +
                                    "Y: %{y:.2f}<br>" +
                                    "Z: %{z:.2f}<extra></extra>"
                    ))
            
            fig.update_layout(
                title={
                    'text': "<b>🎨 3D Customer Segmentation Visualization</b>",
                    'x': 0.5,
                    'xanchor': 'center',
                    'font': {'size': 18}
                },
                scene=dict(
                    xaxis_title='Dimension 1',
                    yaxis_title='Dimension 2',
                    zaxis_title='Dimension 3',
                    bgcolor='white'
                ),
                template='plotly_white',
                height=700
            )
            
            # CRITICAL: Use display() to show in Colab
            display(fig)
            print("    ✅ 3D visualization created and displayed")
            
        except Exception as e:
            print(f"    ⚠️ 3D visualization error: {e}")
            self.create_2d_fallback_visualization(X_reduced)

    def create_2d_fallback_visualization(self, X_reduced):
        """Fallback 2D visualization"""
        print("    📊 Creating 2D fallback visualization...")
        
        try:
            # Use UMAP for 2D embedding
            if X_reduced.shape[1] > 2:
                reducer_2d = umap.UMAP(n_components=2, random_state=42, n_neighbors=15)
                embedding_2d = reducer_2d.fit_transform(X_reduced)
            else:
                embedding_2d = X_reduced
            
            fig = go.Figure()
            
            unique_labels = np.unique(self.cluster_labels)
            colors = px.colors.qualitative.Set3
            
            for i, label in enumerate(unique_labels):
                mask = self.cluster_labels == label
                if label == -1:
                    name = 'Outliers'
                    color = 'lightgray'
                else:
                    name = f'Segment {label}'
                    color = colors[i % len(colors)]
                
                fig.add_trace(go.Scatter(
                    x=embedding_2d[mask, 0],
                    y=embedding_2d[mask, 1],
                    mode='markers',
                    marker=dict(
                        size=8,
                        color=color,
                        opacity=0.7
                    ),
                    name=name,
                    hovertemplate=f"<b>{name}</b><br>" +
                                "X: %{x:.2f}<br>" +
                                "Y: %{y:.2f}<extra></extra>"
                ))
            
            fig.update_layout(
                title="🎨 2D Customer Segmentation Visualization",
                xaxis_title='Dimension 1',
                yaxis_title='Dimension 2',
                template='plotly_white',
                height=600
            )
            
            display(fig)
            print("    ✅ 2D visualization created and displayed")
            
        except Exception as e:
            print(f"    ❌ 2D visualization failed: {e}")

    def create_summary_charts(self):
        """Create additional summary charts"""
        print("📈 Creating summary charts...")
        
        if 'clv_analysis' not in self.insights or not self.insights['clv_analysis']:
            print("    ❌ No CLV data for summary charts")
            return
        
        # Segment size pie chart
        fig1 = go.Figure(data=[go.Pie(
            labels=[f"Segment {cid}" for cid in self.insights['clv_analysis'].keys()],
            values=[data['size'] for data in self.insights['clv_analysis'].values()],
            hole=.3
        )])
        
        fig1.update_layout(
            title="🥧 Customer Distribution by Segment",
            template='plotly_white'
        )
        
        display(fig1)
        
        # CLV comparison
        clv_data = self.insights['clv_analysis']
        segments = list(clv_data.keys())
        clv_values = [clv_data[seg]['predicted_clv'] for seg in segments]
        sizes = [clv_data[seg]['size'] for seg in segments]
        
        fig2 = go.Figure()
        fig2.add_trace(go.Bar(
            x=[f"Segment {seg}" for seg in segments],
            y=clv_values,
            name='CLV',
            yaxis='y',
            marker_color='lightblue'
        ))
        
        fig2.add_trace(go.Scatter(
            x=[f"Segment {seg}" for seg in segments],
            y=sizes,
            mode='lines+markers',
            name='Customer Count',
            yaxis='y2',
            line=dict(color='orange', width=3)
        ))
        
        fig2.update_layout(
            title='📊 CLV vs Customer Count by Segment',
            xaxis_title='Segments',
            yaxis=dict(title='Predicted CLV ($)', side='left'),
            yaxis2=dict(title='Customer Count', side='right', overlaying='y'),
            template='plotly_white'
        )
        
        display(fig2)
        print("    ✅ Summary charts created")

# ==============================================================================
# SECTION 6: PREDICTIVE ANALYTICS MODULE
# ==============================================================================

class PredictiveAnalyticsEngine:
    """Advanced predictive modeling for customer behavior"""

    def __init__(self, features_df, cluster_labels):
        self.features_df = features_df
        self.cluster_labels = cluster_labels
        self.models = {}
        self.predictions = {}

    def build_churn_prediction_model(self):
        """Advanced churn prediction using ensemble methods"""
        print("🔮 Building churn prediction model...")
        if self.features_df.empty or self.cluster_labels is None: 
            return None

        try:
            df = self.features_df.copy()
            df['cluster'] = self.cluster_labels
            
            # Find recency column
            recency_cols = [col for col in df.columns if 'recency' in col.lower()]
            if not recency_cols:
                print("    ⚠️ No recency column found for churn prediction")
                return None
                
            recency_col = recency_cols[0]
            
            if df[recency_col].nunique() < 2: 
                return None

            churn_threshold = df[recency_col].quantile(0.75)  # Top 25% are considered at risk
            df['is_churn'] = (df[recency_col] > churn_threshold).astype(int)

            # Prepare features
            feature_cols = [col for col in df.columns if col not in ['cluster', 'is_churn']]
            X = df[feature_cols].fillna(df[feature_cols].median())
            y = df['is_churn']
            
            if X.empty or len(y.unique()) < 2: 
                return None

            # Use LightGBM for robustness
            model = lgb.LGBMClassifier(random_state=42, verbose=-1, n_estimators=100)
            
            try:
                score = np.mean(cross_val_score(model, X, y, cv=min(5, len(y)), scoring='roc_auc'))
                model.fit(X, y)
                
                self.models['churn_prediction'] = {
                    'model': model, 
                    'features': X.columns.tolist(), 
                    'score': score,
                    'threshold': churn_threshold
                }
                print(f"    ✅ Churn model AUC: {score:.3f}")
                
            except Exception as e:
                print(f"    ⚠️ Churn model training failed: {e}")
                return None
                
        except Exception as e:
            print(f"    ⚠️ Churn prediction setup failed: {e}")
            return None
            
        return self.models.get('churn_prediction')

    def build_clv_prediction_model(self):
        """Advanced Customer Lifetime Value prediction"""
        print("💎 Building CLV prediction model...")
        if self.features_df.empty: 
            return None

        try:
            # Find revenue column
            revenue_cols = [col for col in self.features_df.columns if 'revenue' in col.lower() or 'sales' in col.lower()]
            if not revenue_cols:
                print("    ⚠️ No revenue column found for CLV prediction")
                return None
                
            target_col = revenue_cols[0]
            
            feature_cols = [col for col in self.features_df.columns if col != target_col]
            X = self.features_df[feature_cols].fillna(self.features_df[feature_cols].median())
            y = self.features_df[target_col]
            
            if X.empty or y.nunique() < 2: 
                return None

            model = lgb.LGBMRegressor(random_state=42, verbose=-1, n_estimators=100)
            
            try:
                score = np.mean(cross_val_score(model, X, y, cv=min(5, len(y)), scoring='r2'))
                model.fit(X, y)

                self.models['clv_prediction'] = {
                    'model': model, 
                    'features': X.columns.tolist(), 
                    'score': score,
                    'target_col': target_col
                }
                print(f"    ✅ CLV model R²: {score:.3f}")
                
            except Exception as e:
                print(f"    ⚠️ CLV model training failed: {e}")
                return None
                
        except Exception as e:
            print(f"    ⚠️ CLV prediction setup failed: {e}")
            return None
            
        return self.models.get('clv_prediction')

    def generate_predictions(self):
        """Generate predictions for all customers"""
        print("🎯 Generating customer predictions...")
        
        try:
            predictions_df = self.features_df.copy()
            predictions_df['cluster'] = self.cluster_labels

            if 'churn_prediction' in self.models:
                model_info = self.models['churn_prediction']
                X_churn = predictions_df[model_info['features']].fillna(predictions_df.median())
                predictions_df['churn_probability'] = model_info['model'].predict_proba(X_churn)[:, 1]

            if 'clv_prediction' in self.models:
                model_info = self.models['clv_prediction']
                X_clv = predictions_df[model_info['features']].fillna(predictions_df.median())
                predictions_df['predicted_future_clv'] = model_info['model'].predict(X_clv)

            self.predictions = predictions_df
            print("    ✅ Predictions generated")
            
        except Exception as e:
            print(f"    ⚠️ Prediction generation failed: {e}")
            self.predictions = self.features_df.copy()
            self.predictions['cluster'] = self.cluster_labels
            
        return self.predictions

# ==============================================================================
# SECTION 7: MAIN EXECUTION PIPELINE - COLAB OPTIMIZED
# ==============================================================================

class CustomerSegmentationPipeline:
    """Main orchestration class for the entire analytics pipeline - Colab optimized"""

    def __init__(self, data_path='int_online_tx.csv'):
        self.data_path = data_path
        self.results = {}

    def execute_pipeline(self):
        """Run every pipeline stage in order and collect the outputs.

        Stages: load, preprocess, feature engineering, NLP embeddings,
        feature combination, scaling, clustering, business intelligence,
        visualisation, predictive modelling.

        Returns:
            dict | None: the results dictionary (also stored on
            ``self.results``), or ``None`` when loading fails or fewer than
            five customers remain after feature combination.
        """
        print("🚀 EXECUTING ADVANCED CUSTOMER SEGMENTATION PIPELINE")
        print("=" * 80)

        # Step 1: Load Data
        loader = DataLoader(self.data_path)
        df_raw = loader.load_data()
        if df_raw is None:
            return None

        # Step 2: Preprocess Data
        df_clean = self.preprocess_data(df_raw)

        # Step 3: Feature Engineering
        print("\n🔧 FEATURE ENGINEERING PHASE")
        print("-" * 50)

        engineer = AdvancedFeatureEngineer(df_clean)
        temporal_features = engineer.create_temporal_features()
        product_features = engineer.create_product_diversity_features()
        sequence_features = engineer.create_behavioral_sequences()

        # Step 4: NLP Analysis
        nlp_embeddings = EnhancedNLPAnalyzer(df_clean).create_multi_embedding_features()

        # Step 5: Combine Features
        print("\n🔗 FEATURE COMBINATION PHASE")
        print("-" * 50)

        # Temporal features are the base table; every other non-empty block
        # is inner-joined onto it on the shared index.
        extra_blocks = [product_features, sequence_features, *nlp_embeddings.values()]
        all_features = temporal_features
        for block in extra_blocks:
            if block.empty:
                continue
            all_features = all_features.join(block, how='inner')

        all_features = all_features.fillna(0)

        # Remove non-variant features
        constant_cols = all_features.columns[all_features.nunique() <= 1]
        if len(constant_cols) > 0:
            all_features.drop(constant_cols, axis=1, inplace=True)
            print(f"🗑️ Removed {len(constant_cols)} non-variant features.")

        n_customers, n_features = all_features.shape
        print(f"\n📊 FINAL FEATURE MATRIX: {n_customers:,} customers × {n_features} features")

        if all_features.empty or n_customers < 5:
            print("    ❌ Insufficient data for analysis")
            return None

        # Step 6: Feature Scaling
        print("\n⚖️ FEATURE SCALING PHASE")
        print("-" * 50)

        scaler = RobustScaler()
        features_scaled = scaler.fit_transform(all_features)
        print("    ✅ Features scaled using RobustScaler")

        # Step 7: Clustering
        print("\n🎯 CLUSTERING PHASE")
        print("-" * 50)

        clusterer = AdvancedClusteringFramework(features_scaled)
        clusterer.optimize_clustering_parameters()
        ensemble_labels = clusterer.ensemble_clustering()
        cluster_metrics = clusterer.validate_clustering_quality(ensemble_labels)

        # Step 8: Business Intelligence
        print("\n💼 BUSINESS INTELLIGENCE PHASE")
        print("-" * 50)

        bi_analyzer = BusinessIntelligenceAnalyzer(features_scaled, ensemble_labels, all_features)
        cluster_stats = bi_analyzer.statistical_cluster_analysis()
        clv_analysis = bi_analyzer.customer_lifetime_value_analysis()
        recommendations = bi_analyzer.generate_business_recommendations()

        # Step 9: Visualizations
        print("\n📊 VISUALIZATION PHASE")
        print("-" * 50)

        charts = AdvancedVisualizationSuite(all_features, ensemble_labels, bi_analyzer.insights)
        charts.create_executive_dashboard()
        if features_scaled.shape[1] >= 2:
            charts.create_3d_cluster_visualization(features_scaled)
        charts.create_summary_charts()

        # Step 10: Predictive Analytics
        print("\n🔮 PREDICTIVE ANALYTICS PHASE")
        print("-" * 50)

        predictor = PredictiveAnalyticsEngine(all_features, ensemble_labels)
        churn_model = predictor.build_churn_prediction_model()
        clv_model = predictor.build_clv_prediction_model()
        predictions = predictor.generate_predictions()

        # Store results
        self.results = dict(
            raw_data=df_raw,
            clean_data=df_clean,
            feature_matrix=all_features,
            scaled_features=features_scaled,
            cluster_labels=ensemble_labels,
            cluster_metrics=cluster_metrics,
            cluster_statistics=cluster_stats,
            clv_analysis=clv_analysis,
            business_recommendations=recommendations,
            churn_model=churn_model,
            clv_model=clv_model,
            customer_predictions=predictions,
            data_profile=loader.data_profile,
        )

        return self.results

    def preprocess_data(self, df_raw):
        """Enhanced data preprocessing"""
        print("\n🧹 DATA PREPROCESSING PHASE")
        print("-" * 50)
        
        initial_rows = len(df_raw)
        
        # Basic cleaning
        df = df_raw.dropna(subset=['CustomerID']).copy()
        df['CustomerID'] = df['CustomerID'].astype(str)  # More flexible
        
        # Remove negative quantities and prices
        df = df[df['Quantity'] > 0]
        df = df[df['UnitPrice'] > 0]
        
        # Create sales column
        df['Sales'] = df['Quantity'] * df['UnitPrice']
        
        # Remove extreme outliers using IQR method
        Q1 = df['Sales'].quantile(0.25)
        Q3 = df['Sales'].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Keep only data within bounds
        df = df[~((df['Sales'] < lower_bound) | (df['Sales'] > upper_bound))]
        
        final_rows = len(df)
        removed_rows = initial_rows - final_rows
        
        print(f"    ✅ Cleaned data: {df.shape[0]:,} transactions, {df['CustomerID'].nunique():,} customers")
        print(f"    📉 Removed {removed_rows:,} rows ({removed_rows/initial_rows*100:.1f}% of original data)")
        
        return df

    def generate_executive_summary(self):
        """Generate comprehensive executive summary"""
        print("\n" + "="*80)
        print("📋 EXECUTIVE SUMMARY")
        print("="*80)
        
        if not self.results: 
            print("    ❌ No results to summarize.")
            return

        # Data Overview
        print(f"\n📊 DATA OVERVIEW:")
        print(f"    • Total Customers Analyzed: {self.results['feature_matrix'].shape[0]:,}")
        print(f"    • Total Features Created: {self.results['feature_matrix'].shape[1]:,}")
        print(f"    • Data Quality Score: {(1 - self.results['data_profile']['missing_values'].sum() / len(self.results['clean_data'])) * 100:.1f}%")

        # Clustering Results
        print(f"\n🎯 SEGMENTATION RESULTS:")
        metrics = self.results['cluster_metrics']
        if 'error' not in metrics:
            print(f"    • Clusters Identified: {metrics.get('n_clusters', 'N/A')}")
            print(f"    • Clustering Quality (Silhouette): {metrics.get('silhouette_score', 0):.3f}")
            print(f"    • Outliers Detected: {metrics.get('n_outliers', 0):,}")

        # Segment Overview
        print(f"\n💰 SEGMENT PERFORMANCE:")
        if 'clv_analysis' in self.results and self.results['clv_analysis']:
            clv_data = self.results['clv_analysis']
            total_clv = sum(data['predicted_clv'] * data['size'] for data in clv_data.values())
            total_customers = sum(data['size'] for data in clv_data.values())
            
            print(f"    • Average CLV Across All Segments: ${total_clv/total_customers if total_customers > 0 else 0:.2f}")
            
            # Sort segments by CLV
            sorted_segments = sorted(clv_data.items(), key=lambda x: x[1]['predicted_clv'], reverse=True)
            
            for i, (seg_id, data) in enumerate(sorted_segments[:3]):  # Top 3 segments
                print(f"    • Segment {seg_id}: {data['size']:,} customers, Avg CLV: ${data['predicted_clv']:.2f}")

        # Model Performance
        print(f"\n🤖 PREDICTIVE MODEL PERFORMANCE:")
        if 'churn_model' in self.results and self.results['churn_model']:
            print(f"    • Churn Prediction AUC: {self.results['churn_model']['score']:.3f}")
        if 'clv_model' in self.results and self.results['clv_model']:
            print(f"    • CLV Prediction R²: {self.results['clv_model']['score']:.3f}")

        # Key Recommendations
        print(f"\n💡 KEY BUSINESS RECOMMENDATIONS:")
        if 'business_recommendations' in self.results and self.results['business_recommendations']:
            recs = sorted(
                self.results['business_recommendations'].values(), 
                key=lambda x: x['priority'], 
                reverse=True
            )
            for i, rec in enumerate(recs[:3], 1):
                print(f"    {i}. {rec['segment_type']}: {rec['strategy'][0]}")
        
        print("\n🎉 PIPELINE EXECUTION COMPLETED SUCCESSFULLY!")
        print("="*80)

    def display_sample_predictions(self, n_samples=10):
        """Display sample customer predictions"""
        if 'customer_predictions' not in self.results or self.results['customer_predictions'].empty:
            print("    ❌ No customer predictions available")
            return
            
        predictions = self.results['customer_predictions']
        sample_predictions = predictions.head(n_samples)
        
        print(f"\n📋 SAMPLE CUSTOMER PREDICTIONS (Top {n_samples}):")
        print("-" * 80)
        
        # Display columns that exist
        display_cols = ['cluster']
        if 'churn_probability' in predictions.columns:
            display_cols.append('churn_probability')
        if 'predicted_future_clv' in predictions.columns:
            display_cols.append('predicted_future_clv')
            
        # Add some key features for context
        feature_cols = [col for col in predictions.columns if 'revenue' in col.lower() or 'frequency' in col.lower()][:3]
        display_cols.extend(feature_cols)
        
        print(sample_predictions[display_cols].round(2).to_string())
        print("-" * 80)

    def export_results(self, filename='customer_segmentation_results.csv'):
        """Export results to CSV"""
        if 'customer_predictions' not in self.results:
            print("    ❌ No results to export")
            return
            
        try:
            self.results['customer_predictions'].to_csv(filename)
            print(f"    ✅ Results exported to {filename}")
        except Exception as e:
            print(f"    ❌ Export failed: {e}")

# ==============================================================================
# SECTION 8: EXECUTION - COLAB OPTIMIZED
# ==============================================================================

def run_customer_segmentation_analysis(data_path='int_online_tx.csv'):
    """
    Build a CustomerSegmentationPipeline for ``data_path`` and run it end to end.

    On success, calls the pipeline's generate_executive_summary() and
    display_sample_predictions() and returns the pipeline object. If the
    pipeline yields no results, or any exception is raised along the way,
    a message is printed and None is returned.
    """
    try:
        segmentation = CustomerSegmentationPipeline(data_path)
        outcome = segmentation.execute_pipeline()

        # Guard clause: the pipeline produced nothing usable
        if not outcome:
            print("❌ Analysis failed - no results generated")
            return None

        print("\n✅ Analysis completed successfully!")
        print("📊 Results stored in pipeline.results dictionary")

        # Reporting: executive summary first, then sample predictions
        segmentation.generate_executive_summary()
        segmentation.display_sample_predictions()

        # Optional: Export results
        # segmentation.export_results()

        return segmentation

    except Exception as error:
        # Top-level boundary: report the failure with a traceback instead of raising
        print(f"❌ Pipeline execution failed: {error}")
        import traceback
        traceback.print_exc()
        return None

# ==============================================================================
# COLAB SETUP INSTRUCTIONS AND EXECUTION
# ==============================================================================

def setup_colab_environment(packages=None):
    """Install any missing third-party packages needed in Google Colab.

    Each package is probed by importing its top-level module; a package whose
    module fails to import is installed with ``pip`` through the current
    interpreter.

    Args:
        packages: Optional iterable of pip distribution names to check.
            When omitted, the suite's standard dependency list is used.

    Raises:
        subprocess.CalledProcessError: If a ``pip install`` command exits
            with a non-zero status.
    """
    print("🔧 Setting up Colab environment...")

    # Install required packages
    import importlib
    import subprocess
    import sys

    if packages is None:
        packages = [
            'umap-learn',
            'hdbscan',
            'sentence-transformers',
            'xgboost',
            'lightgbm',
            'plotly',
            'networkx',
        ]

    # Distributions whose importable module is NOT the distribution name with
    # '-' replaced by '_'. Without this, umap-learn (imported as ``umap``)
    # would always look missing and be reinstalled on every run.
    import_names = {
        'umap-learn': 'umap',
    }

    for package in packages:
        module_name = import_names.get(package, package.replace('-', '_'))
        try:
            importlib.import_module(module_name)
        except ImportError:
            print(f"    📦 Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package, "--quiet"])
        else:
            print(f"    ✅ {package} already installed")

    print("    ✅ Environment setup complete!")

# Main execution
if __name__ == "__main__":
    # Script entry point: print the banner, run the analysis with its default
    # data path, then summarise where the results can be found.
    print("🚀 ADVANCED CUSTOMER SEGMENTATION ANALYTICS SUITE")
    print("=" * 80)
    print("📋 GOOGLE COLAB OPTIMIZED VERSION")
    print("=" * 80)

    # Uncomment the next line if running for the first time in Colab
    # setup_colab_environment()

    # Run the analysis
    pipeline = run_customer_segmentation_analysis()

    if pipeline and pipeline.results:
        print("\n🎯 QUICK ACCESS TO RESULTS:")
        print("=" * 50)
        print("• pipeline.results['feature_matrix'] - Customer features")
        print("• pipeline.results['cluster_labels'] - Segment assignments")
        print("• pipeline.results['clv_analysis'] - CLV by segment")
        print("• pipeline.results['business_recommendations'] - Action items")
        print("• pipeline.results['customer_predictions'] - Individual predictions")
        print("=" * 50)

        # Display key insights: first three segments, two strategies each
        if 'business_recommendations' in pipeline.results:
            print("\n💡 TOP BUSINESS INSIGHTS:")
            recs = pipeline.results['business_recommendations']
            for cluster_id, rec_data in list(recs.items())[:3]:
                print(f"\n🎯 {rec_data['segment_type']} (Segment {cluster_id}):")
                for strategy in rec_data['strategy'][:2]:
                    print(f"   • {strategy}")

        print("\n🎉 Analysis Complete! All visualizations should be displayed above.")
    else:
        # The run produced no pipeline/results, so do not announce success.
        print("\n❌ Analysis did not complete - see the errors reported above.")

    print("=" * 80)