In [2]:
# Print current working directory to verify location
import os
print(os.getcwd())

# Check if directory exists
import os
os.makedirs('path/to/directory', exist_ok=True)

# Full path example
full_path = os.path.join(os.getcwd(), 'customer_hyper_personalization_dataset.csv')

/content


In [4]:
pip install faker

Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/1.9 MB[0m [31m22.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.1.0


In [5]:
import pandas as pd
import numpy as np
import faker
import random
from datetime import datetime, timedelta

class CustomerDataGenerator:
    def __init__(self, num_customers=1000):
        self.fake = faker.Faker()
        self.num_customers = num_customers

    def generate_demographics(self):
        demographics = []
        for _ in range(self.num_customers):
            demographics.append({
                'customer_id': self.fake.uuid4(),
                'age': np.random.randint(18, 65),
                'gender': np.random.choice(['Male', 'Female', 'Other']),
                'income_level': np.random.choice([
                    'Low', 'Medium', 'High', 'Very High'
                ]),
                'education': np.random.choice([
                    'High School', 'Bachelors', 'Masters', 'PhD'
                ]),
                'marital_status': np.random.choice([
                    'Single', 'Married', 'Divorced', 'Widowed'
                ]),
                'location': self.fake.city()
            })
        return pd.DataFrame(demographics)

    def generate_purchase_history(self, demographics_df):
        purchase_categories = [
            'Electronics', 'Fashion', 'Home & Kitchen',
            'Books', 'Sports', 'Beauty', 'Groceries'
        ]

        purchases = []
        for _, customer in demographics_df.iterrows():
            num_purchases = np.random.randint(1, 20)
            for _ in range(num_purchases):
                purchases.append({
                    'customer_id': customer['customer_id'],
                    'product_category': np.random.choice(purchase_categories),
                    'purchase_amount': round(np.random.uniform(10, 500), 2),
                    'purchase_date': self.fake.date_between(
                        start_date='-2y', end_date='today'
                    ),
                    'purchase_platform': np.random.choice([
                        'Mobile App', 'Website', 'In-Store'
                    ])
                })
        return pd.DataFrame(purchases)

    def generate_social_media_activity(self, demographics_df):
        platforms = ['Instagram', 'Facebook', 'Twitter', 'LinkedIn']
        interests = [
            'Technology', 'Fashion', 'Travel', 'Fitness',
            'Cooking', 'Gaming', 'Music', 'Sports'
        ]

        social_data = []
        for _, customer in demographics_df.iterrows():
            social_data.append({
                'customer_id': customer['customer_id'],
                'primary_platform': np.random.choice(platforms),
                'followers_count': np.random.randint(50, 5000),
                'engagement_rate': round(np.random.uniform(0.01, 0.1), 3),
                'top_interests': ', '.join(
                    np.random.choice(interests, size=3, replace=False)
                ),
                'average_daily_usage_hours': round(np.random.uniform(0.5, 4), 2)
            })
        return pd.DataFrame(social_data)

    def generate_sentiment_data(self, demographics_df):
        sentiment_keywords = {
            'positive': ['amazing', 'great', 'excellent', 'love'],
            'negative': ['poor', 'bad', 'terrible', 'disappointed'],
            'neutral': ['okay', 'average', 'standard']
        }

        sentiment_data = []
        for _, customer in demographics_df.iterrows():
            sentiment_data.append({
                'customer_id': customer['customer_id'],
                'product_review_sentiment': np.random.choice([
                    'positive', 'negative', 'neutral'
                ]),
                'customer_service_sentiment': np.random.choice([
                    'positive', 'negative', 'neutral'
                ]),
                'brand_perception': np.random.choice([
                    'loyal', 'neutral', 'critical'
                ]),
                'complaint_frequency': np.random.randint(0, 5)
            })
        return pd.DataFrame(sentiment_data)

    def generate_complete_dataset(self):
        # Generate base demographic data
        demographics_df = self.generate_demographics()

        # Generate related datasets
        purchase_history_df = self.generate_purchase_history(demographics_df)
        social_media_df = self.generate_social_media_activity(demographics_df)
        sentiment_df = self.generate_sentiment_data(demographics_df)

        # Merge all datasets
        merged_df = demographics_df.merge(
            purchase_history_df, on='customer_id'
        ).merge(
            social_media_df, on='customer_id'
        ).merge(
            sentiment_df, on='customer_id'
        )

        return merged_df

# Generate the dataset
generator = CustomerDataGenerator(num_customers=5000)
hyper_personalization_dataset = generator.generate_complete_dataset()

# Save to CSV for further analysis
hyper_personalization_dataset.to_csv('customer_hyper_personalization_dataset.csv', index=False)

print(hyper_personalization_dataset.head())
print("\nDataset Shape:", hyper_personalization_dataset.shape)

                            customer_id  age  gender income_level  education  \
0  f082f9e3-46c0-43d6-a627-6e90083726ae   53  Female    Very High  Bachelors   
1  f082f9e3-46c0-43d6-a627-6e90083726ae   53  Female    Very High  Bachelors   
2  f082f9e3-46c0-43d6-a627-6e90083726ae   53  Female    Very High  Bachelors   
3  f082f9e3-46c0-43d6-a627-6e90083726ae   53  Female    Very High  Bachelors   
4  f082f9e3-46c0-43d6-a627-6e90083726ae   53  Female    Very High  Bachelors   

  marital_status    location product_category  purchase_amount purchase_date  \
0        Widowed  Brownmouth           Sports           293.85    2023-05-01   
1        Widowed  Brownmouth           Sports           402.08    2024-10-03   
2        Widowed  Brownmouth   Home & Kitchen           117.82    2023-08-07   
3        Widowed  Brownmouth        Groceries           461.59    2023-08-27   
4        Widowed  Brownmouth      Electronics           261.41    2025-02-21   

  purchase_platform primary_platform  

In [7]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m72.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [8]:
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import CLIPProcessor, CLIPModel
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import faiss
import torch.nn as nn
import torch.optim as optim
from typing import Dict, List, Any

class HyperPersonalizationSystem:
    def __init__(self):
        # Text Embedding Model
        self.text_embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Vision Embedding Model
        self.vision_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.vision_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

        # Numerical Feature Scaler
        self.numerical_scaler = StandardScaler()

        # Label Encoders
        self.label_encoders = {}

        # Vector Database
        self.vector_index = None
        self.metadata = None

    def preprocess_data(self, data: pd.DataFrame) -> Dict[str, Any]:
        """
        Comprehensive data preprocessing

        Args:
            data (pd.DataFrame): Raw customer data

        Returns:
            Dict containing processed features
        """
        processed_data = {}

        # Numerical Features Scaling
        numerical_columns = ['age', 'income', 'purchase_frequency']
        processed_data['numerical_features'] = self.numerical_scaler.fit_transform(
            data[numerical_columns]
        )

        # Categorical Feature Encoding
        categorical_columns = ['gender', 'location', 'education']
        for col in categorical_columns:
            le = LabelEncoder()
            processed_data[f'{col}_encoded'] = le.fit_transform(data[col])
            self.label_encoders[col] = le

        # Text Embeddings
        processed_data['text_embeddings'] = self.text_embedding_model.encode(
            data['interests'].tolist()
        )

        return processed_data

    def create_composite_embedding(self, processed_data: Dict[str, Any]) -> np.ndarray:
        """
        Create a unified embedding across different modalities

        Args:
            processed_data (Dict): Preprocessed data features

        Returns:
            Composite customer embeddings
        """
        composite_embedding = np.hstack([
            processed_data['numerical_features'],
            processed_data['text_embeddings'],
            np.array([
                processed_data['gender_encoded'],
                processed_data['location_encoded'],
                processed_data['education_encoded']
            ]).T
        ])

        return composite_embedding

    def build_vector_database(self, embeddings: np.ndarray, metadata: pd.DataFrame):
        """
        Construct vector database for similarity search

        Args:
            embeddings (np.ndarray): Customer embeddings
            metadata (pd.DataFrame): Customer metadata
        """
        # Normalize embeddings
        normalized_embeddings = embeddings / np.linalg.norm(embeddings, axis=1)[:, np.newaxis]

        # Create FAISS index
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(normalized_embeddings.astype('float32'))

        self.vector_index = index
        self.metadata = metadata

    def find_similar_customers(self, query_embedding: np.ndarray, top_k: int = 5) -> pd.DataFrame:
        """
        Find similar customers using vector similarity

        Args:
            query_embedding (np.ndarray): Embedding to search
            top_k (int): Number of similar customers to retrieve

        Returns:
            DataFrame of similar customers
        """
        normalized_query = query_embedding / np.linalg.norm(query_embedding)
        distances, indices = self.vector_index.search(
            normalized_query.reshape(1, -1).astype('float32'),
            top_k
        )

        return self.metadata.iloc[indices[0]], distances[0]

    class RecommendationModel(nn.Module):
        def __init__(self, input_dim, hidden_dims=[128, 64], output_dim=10):
            super().__init__()
            layers = []
            prev_dim = input_dim

            for hidden_dim in hidden_dims:
                layers.append(nn.Linear(prev_dim, hidden_dim))
                layers.append(nn.ReLU())
                layers.append(nn.Dropout(0.3))
                prev_dim = hidden_dim

            layers.append(nn.Linear(prev_dim, output_dim))

            self.model = nn.Sequential(*layers)

        def forward(self, x):
            return self.model(x)

    def train_recommendation_model(self, embeddings, targets):
        """
        Train personalized recommendation model

        Args:
            embeddings (np.ndarray): Customer embeddings
            targets (np.ndarray): Target recommendations

        Returns:
            Trained recommendation model
        """
        # Convert to PyTorch tensors
        X = torch.FloatTensor(embeddings)
        y = torch.FloatTensor(targets)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        # Initialize model
        model = self.RecommendationModel(X.shape[1])
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters())

        # Training loop
        for epoch in range(50):
            model.train()
            optimizer.zero_grad()
            outputs = model(X_train)
            loss = criterion(outputs, y_train)
            loss.backward()
            optimizer.step()

        return model

    def generate_personalized_recommendations(self, customer_profile, recommendation_model):
        """
        Generate personalized recommendations

        Args:
            customer_profile (np.ndarray): Customer embedding
            recommendation_model (nn.Module): Trained recommendation model

        Returns:
            Personalized recommendations
        """
        customer_tensor = torch.FloatTensor(customer_profile)
        recommendation_model.eval()

        with torch.no_grad():
            recommendations = recommendation_model(customer_tensor)

        return recommendations.numpy()

# Example Usage Workflow
def main():
    # Load Dataset (Simulated)
    data = pd.DataFrame({
        'age': np.random.randint(18, 65, 1000),
        'income': np.random.uniform(20000, 200000, 1000),
        'purchase_frequency': np.random.randint(1, 50, 1000),
        'gender': np.random.choice(['M', 'F'], 1000),
        'location': np.random.choice(['Urban', 'Suburban', 'Rural'], 1000),
        'education': np.random.choice(['Bachelors', 'Masters', 'PhD'], 1000),
        'interests': [' '.join(np.random.choice(['tech', 'fashion', 'sports', 'travel'],
                                                np.random.randint(1, 4))) for _ in range(1000)]
    })

    # Initialize Hyper-Personalization System
    hyper_system = HyperPersonalizationSystem()

    # Preprocess Data
    processed_data = hyper_system.preprocess_data(data)

    # Create Composite Embeddings
    composite_embeddings = hyper_system.create_composite_embedding(processed_data)

    # Build Vector Database
    hyper_system.build_vector_database(composite_embeddings, data)

    # Train Recommendation Model
    recommendation_targets = np.random.rand(len(data), 10)  # Simulated recommendation targets
    recommendation_model = hyper_system.train_recommendation_model(
        composite_embeddings,
        recommendation_targets
    )

    # Find Similar Customers
    sample_customer = composite_embeddings[0]
    similar_customers, distances = hyper_system.find_similar_customers(sample_customer)
    print("Similar Customers:\n", similar_customers)

    # Generate Personalized Recommendations
    personalized_rec = hyper_system.generate_personalized_recommendations(
        sample_customer,
        recommendation_model
    )
    print("\nPersonalized Recommendations:", personalized_rec)

if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Similar Customers:
      age        income  purchase_frequency gender  location  education  \
0     39  64538.190003                   8      F  Suburban  Bachelors   
297   43  67981.970072                   6      F  Suburban  Bachelors   
249   31  23689.309692                   7      F  Suburban    Masters   
772   45  45796.155608                   8      F     Urban  Bachelors   
444   44  42805.207395                   9      F  Suburban  Bachelors   

                  interests  
0                    sports  
297          fashion sports  
249                  sports  
772  fashion sports fashion  
444      sports tech travel  

Personalized Recommendations: [0.43681926 0.40052223 0.4328537  0.4435931  0.44331324 0.49059835
 0.45861742 0.44230026 0.40400496 0.4491257 ]


In [11]:
import pandas as pd
import numpy as np
import faker
import random
from datetime import datetime, timedelta
from typing import Dict, Any

# Mock LLM Model (since we can't use actual API in this example)
class MockLLMModel:
    def generate(self, prompt: str) -> str:
        """
        Simulate LLM response generation

        Args:
            prompt (str): Input prompt for generation

        Returns:
            str: Simulated generated response
        """
        # Simulate different response types based on prompt content
        if 'generate insights' in prompt.lower():
            return self._generate_profile_insights(prompt)
        elif 'personalized recommendations' in prompt.lower():
            return self._generate_recommendations(prompt)
        elif 'segmentation' in prompt.lower():
            return self._generate_segmentation_insights(prompt)
        else:
            return "Generic LLM response based on input prompt."

    def _generate_profile_insights(self, prompt: str) -> str:
        """Generate simulated profile insights"""
        insights = [
            "Potential Lifestyle: Tech-savvy professional with urban lifestyle",
            "Potential Hobbies: Travel and fitness enthusiast",
            "Personalization Strategy: Recommend tech and wellness products",
            "Marketing Approach: Targeted digital campaigns with personalized content"
        ]
        return "\n".join(insights)

    def _generate_recommendations(self, prompt: str) -> str:
        """Generate simulated personalized recommendations"""
        recommendations = [
            "1. Smart Fitness Tracker - Aligns with fitness interests",
            "2. Noise-Cancelling Headphones - Tech preferences",
            "3. Travel Backpack - Matches travel interests",
            "4. Online Learning Subscription - Supports professional growth",
            "5. Sustainable Fashion Item - Reflects modern lifestyle choices"
        ]
        return "\n".join(recommendations)

    def _generate_segmentation_insights(self, prompt: str) -> str:
        """Generate simulated customer segmentation insights"""
        return """
Customer Segmentation Strategy:
1. Digital Natives Segment
   - Age: 25-35
   - High digital engagement
   - Tech and experience-driven purchases

2. Professional Achievers Segment
   - Age: 35-45
   - High income
   - Focus on quality and convenience

3. Value-Conscious Consumers
   - Age: 45-55
   - Pragmatic purchasing decisions
   - Strong brand loyalty
"""

class CustomerDataGenerator:
    def __init__(self, num_customers=1000):
        self.fake = faker.Faker()
        self.num_customers = num_customers

    def generate_demographics(self):
        demographics = []
        for _ in range(self.num_customers):
            demographics.append({
                'customer_id': self.fake.uuid4(),
                'age': np.random.randint(18, 65),
                'gender': np.random.choice(['Male', 'Female', 'Other']),
                'income_level': np.random.choice([
                    'Low', 'Medium', 'High', 'Very High'
                ]),
                'education': np.random.choice([
                    'High School', 'Bachelors', 'Masters', 'PhD'
                ]),
                'marital_status': np.random.choice([
                    'Single', 'Married', 'Divorced', 'Widowed'
                ]),
                'location': self.fake.city()
            })
        return pd.DataFrame(demographics)

    def generate_purchase_history(self, demographics_df):
        purchase_categories = [
            'Electronics', 'Fashion', 'Home & Kitchen',
            'Books', 'Sports', 'Beauty', 'Groceries'
        ]

        purchases = []
        for _, customer in demographics_df.iterrows():
            num_purchases = np.random.randint(1, 20)
            for _ in range(num_purchases):
                purchases.append({
                    'customer_id': customer['customer_id'],
                    'product_category': np.random.choice(purchase_categories),
                    'purchase_amount': round(np.random.uniform(10, 500), 2),
                    'purchase_date': self.fake.date_between(
                        start_date='-2y', end_date='today'
                    ),
                    'purchase_platform': np.random.choice([
                        'Mobile App', 'Website', 'In-Store'
                    ])
                })
        return pd.DataFrame(purchases)

    def generate_social_media_activity(self, demographics_df):
        platforms = ['Instagram', 'Facebook', 'Twitter', 'LinkedIn']
        interests = [
            'Technology', 'Fashion', 'Travel', 'Fitness',
            'Cooking', 'Gaming', 'Music', 'Sports'
        ]

        social_data = []
        for _, customer in demographics_df.iterrows():
            social_data.append({
                'customer_id': customer['customer_id'],
                'primary_platform': np.random.choice(platforms),
                'followers_count': np.random.randint(50, 5000),
                'engagement_rate': round(np.random.uniform(0.01, 0.1), 3),
                'top_interests': ', '.join(
                    np.random.choice(interests, size=3, replace=False)
                ),
                'average_daily_usage_hours': round(np.random.uniform(0.5, 4), 2)
            })
        return pd.DataFrame(social_data)

    def generate_sentiment_data(self, demographics_df):
        sentiment_data = []
        for _, customer in demographics_df.iterrows():
            sentiment_data.append({
                'customer_id': customer['customer_id'],
                'product_review_sentiment': np.random.choice([
                    'positive', 'negative', 'neutral'
                ]),
                'customer_service_sentiment': np.random.choice([
                    'positive', 'negative', 'neutral'
                ]),
                'brand_perception': np.random.choice([
                    'loyal', 'neutral', 'critical'
                ]),
                'complaint_frequency': np.random.randint(0, 5)
            })
        return pd.DataFrame(sentiment_data)

    def generate_complete_dataset(self):
        # Generate base demographic data
        demographics_df = self.generate_demographics()

        # Generate related datasets
        purchase_history_df = self.generate_purchase_history(demographics_df)
        social_media_df = self.generate_social_media_activity(demographics_df)
        sentiment_df = self.generate_sentiment_data(demographics_df)

        # Merge all datasets
        merged_df = demographics_df.merge(
            purchase_history_df, on='customer_id'
        ).merge(
            social_media_df, on='customer_id'
        ).merge(
            sentiment_df, on='customer_id'
        )

        return merged_df

def enrich_customer_profiles(customer_data: pd.DataFrame, llm_model: MockLLMModel) -> pd.DataFrame:
    """
    Enrich customer profiles using LLM

    Args:
        customer_data (pd.DataFrame): Customer dataset
        llm_model (MockLLMModel): LLM model for generating insights

    Returns:
        pd.DataFrame: Enriched customer profiles
    """
    enriched_profiles = []

    # Sample a few customers for demonstration
    sample_customers = customer_data.sample(min(10, len(customer_data)))

    for _, customer in sample_customers.iterrows():
        # Construct comprehensive profile prompt
        profile_prompt = f"""
        Generate insights for a customer with the following profile:

        Demographics:
        - Age: {customer['age']}
        - Gender: {customer['gender']}
        - Income Level: {customer['income_level']}
        - Education: {customer['education']}
        - Location: {customer['location']}

        Purchase Behavior:
        - Purchase Category: {customer['product_category']}
        - Purchase Amount: ${customer['purchase_amount']:.2f}
        - Purchase Platform: {customer['purchase_platform']}

        Social Media:
        - Primary Platform: {customer['primary_platform']}
        - Top Interests: {customer['top_interests']}

        Brand Perception: {customer['brand_perception']}
        """

        # Generate enriched profile
        enriched_profile = llm_model.generate(profile_prompt)

        enriched_profiles.append({
            'customer_id': customer['customer_id'],
            'llm_generated_insights': enriched_profile
        })

    return pd.DataFrame(enriched_profiles)

def generate_personalized_recommendations(customer_data: pd.DataFrame, llm_model: MockLLMModel) -> pd.DataFrame:
    """
    Generate personalized recommendations using LLM

    Args:
        customer_data (pd.DataFrame): Customer dataset
        llm_model (MockLLMModel): LLM model for generating recommendations

    Returns:
        pd.DataFrame: Personalized recommendations
    """
    personalized_recommendations = []

    # Sample a few customers for demonstration
    sample_customers = customer_data.sample(min(10, len(customer_data)))

    for _, customer in sample_customers.iterrows():
        recommendation_prompt = f"""
        Generate 5 personalized product recommendations for a customer:

        Profile:
        - Age: {customer['age']}
        - Income Level: {customer['income_level']}
        - Top Interests: {customer['top_interests']}
        - Purchase History: {customer['product_category']}
        - Brand Perception: {customer['brand_perception']}
        """

        personalized_rec = llm_model.generate(recommendation_prompt)

        personalized_recommendations.append({
            'customer_id': customer['customer_id'],
            'personalized_recommendations': personalized_rec
        })

    return pd.DataFrame(personalized_recommendations)

def llm_powered_customer_segmentation(customer_data: pd.DataFrame, llm_model: MockLLMModel) -> Dict[str, Any]:
    """
    Perform LLM-powered customer segmentation

    Args:
        customer_data (pd.DataFrame): Customer dataset
        llm_model (MockLLMModel): LLM model for generating segmentation insights

    Returns:
        Dict containing segmentation insights
    """
    # Aggregate dataset characteristics
    segment_analysis_prompt = f"""
    Analyze customer dataset characteristics:

    Demographics:
    - Age Range: {customer_data['age'].min()} - {customer_data['age'].max()}
    - Income Levels: {customer_data['income_level'].value_counts().to_dict()}

    Purchase Behavior:
    - Top Purchase Categories: {customer_data['product_category'].value_counts().head().to_dict()}
    - Average Purchase Amount: ${customer_data['purchase_amount'].mean():.2f}

    Social Media:
    - Primary Platforms: {customer_data['primary_platform'].value_counts().to_dict()}
    - Top Interests: {customer_data['top_interests'].str.split(', ', expand=True).stack().value_counts().head().to_dict()}
    """

    # Generate segmentation insights
    segmentation_insights = llm_model.generate(segment_analysis_prompt)

    return {
        'segmentation_strategy': segmentation_insights,
        'dataset_summary': {
            'total_customers': len(customer_data),
            'avg_age': customer_data['age'].mean(),
            'avg_purchase_amount': customer_data['purchase_amount'].mean()
        }
    }

def main():
    # Initialize components
    generator = CustomerDataGenerator(num_customers=5000)
    llm_model = MockLLMModel()

    # Generate complete dataset
    hyper_personalization_dataset = generator.generate_complete_dataset()

    print("=== Dataset Overview ===")
    print(hyper_personalization_dataset.head())
    print("\nDataset Shape:", hyper_personalization_dataset.shape)

    # Enrich Customer Profiles
    print("\n\n=== Customer Profile Enrichment ===")
    enriched_profiles = enrich_customer_profiles(hyper_personalization_dataset, llm_model)
    print(enriched_profiles)

    # Generate Personalized Recommendations
    print("\n\n=== Personalized Recommendations ===")
    personalized_recommendations = generate_personalized_recommendations(hyper_personalization_dataset, llm_model)
    print(personalized_recommendations)

    # Perform Customer Segmentation
    print("\n\n=== Customer Segmentation Insights ===")
    segmentation_results = llm_powered_customer_segmentation(hyper_personalization_dataset, llm_model)

    print("Segmentation Strategy:")
    print(segmentation_results['segmentation_strategy'])

    print("\nDataset Summary:")
    for key, value in segmentation_results['dataset_summary'].items():
        print(f"{key}: {value}")

if __name__ == "__main__":
    main()

=== Dataset Overview ===
                            customer_id  age  gender income_level  \
0  4038061b-547e-40b7-a8fa-1a637a1be3ff   31  Female         High   
1  4038061b-547e-40b7-a8fa-1a637a1be3ff   31  Female         High   
2  4038061b-547e-40b7-a8fa-1a637a1be3ff   31  Female         High   
3  4038061b-547e-40b7-a8fa-1a637a1be3ff   31  Female         High   
4  a0b12ed5-831f-4992-bacf-db30ac8b8283   24   Other       Medium   

     education marital_status        location product_category  \
0  High School       Divorced      Bentonbury           Sports   
1  High School       Divorced      Bentonbury      Electronics   
2  High School       Divorced      Bentonbury      Electronics   
3  High School       Divorced      Bentonbury      Electronics   
4          PhD       Divorced  East Dianeport            Books   

   purchase_amount purchase_date purchase_platform primary_platform  \
0            82.09    2024-06-25          In-Store         LinkedIn   
1           184.33   