Data Generation (Mock Data)

In [14]:
# CELL 1: Environment Setup & Imports
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import asyncio
import os
import json
import nest_asyncio
from typing import Dict, List, Any, Optional
from dotenv import load_dotenv

load_dotenv()

# Patch for Jupyter/Colab
nest_asyncio.apply()

# Cerebras SDK Setup
CEREBRAS_AVAILABLE = False
try:
    from cerebras.cloud.sdk import Cerebras
    CEREBRAS_AVAILABLE = True
    print("✅ Cerebras SDK available")
except ImportError:
    print("⚠️  Cerebras SDK not installed — install with: pip install cerebras-cloud-sdk")

CEREBRAS_API_KEY = os.getenv("CEREBRAS_API_KEY")
client = None
if CEREBRAS_AVAILABLE and CEREBRAS_API_KEY:
    try:
        client = Cerebras(api_key=CEREBRAS_API_KEY)
        print("✅ Cerebras client initialized")
    except Exception as e:
        print(f"❌ Failed to initialize Cerebras client: {e}")

✅ Cerebras SDK available
✅ Cerebras client initialized


In [15]:
# CELL 2: Mock Data Generator
def generate_cart_abandonment_dataset(n_users: int = 25000) -> pd.DataFrame:
    """Generate realistic cart abandonment dataset"""
    print(f"📊 Generating {n_users:,} cart abandonment records...")

    np.random.seed(42)
    base_date = datetime.now() - timedelta(days=7)
    
    data = {
        'user_id': [f"user_{i:06d}" for i in range(n_users)],
        'cart_abandoned_date': [
            (base_date + timedelta(days=np.random.randint(0, 8))).strftime('%Y-%m-%d')
            for _ in range(n_users)
        ],
        'last_order_date': [
            None if np.random.random() < 0.3 else
            (base_date - timedelta(days=np.random.randint(1, 365))).strftime('%Y-%m-%d')
            for _ in range(n_users)
        ],
        'avg_order_value': np.round(np.random.lognormal(mean=6.5, sigma=0.8, size=n_users) * 10, 2),
        'sessions_last_30d': np.random.poisson(lam=8, size=n_users),
        'num_cart_items': np.random.choice(range(1, 11), size=n_users, p=[0.3,0.2,0.15,0.1,0.08,0.07,0.05,0.03,0.01,0.01]),
        'engagement_score': np.random.beta(2, 5, size=n_users),
        'profitability_score': np.random.beta(3, 3, size=n_users)
    }
    
    df = pd.DataFrame(data)
    print(f"✅ Dataset generated: {len(df):,} records")
    return df

In [16]:
# CELL 3: Define Universe
def define_universe(df: pd.DataFrame) -> pd.DataFrame:
    """Select users who abandoned carts in last 7 days"""
    df['cart_abandoned_date'] = pd.to_datetime(df['cart_abandoned_date'])
    now = pd.Timestamp.now()
    universe = df[df['cart_abandoned_date'] >= (now - timedelta(days=7))].copy()
    print(f"🎯 Universe defined: {len(universe):,} users abandoned cart in last 7 days")
    return universe

In [17]:
# CELL 4: Rule-Based Bucketing
def create_bucket_combinations(universe_df: pd.DataFrame) -> pd.DataFrame:
    """Create MECE bucket combinations"""
    print("🔧 Creating MECE bucket combinations...")

    aov_high = round(universe_df['avg_order_value'].quantile(0.8), -2)
    aov_mid = round(universe_df['avg_order_value'].quantile(0.4), -2)
    engagement_high = round(universe_df['engagement_score'].quantile(0.6), 2)
    profitability_high = round(universe_df['profitability_score'].quantile(0.6), 2)

    print(f"📊 Thresholds: AOV_high=${aov_high:,.0f}, AOV_mid=${aov_mid:,.0f}, Eng>={engagement_high}, Profit>={profitability_high}")

    def get_aov_bucket(aov):
        if pd.isna(aov): return "Other"
        elif aov >= aov_high: return "High"
        elif aov >= aov_mid: return "Medium"
        else: return "Low"

    def get_engagement_bucket(eng):
        if pd.isna(eng): return "Other"
        return "High" if eng >= engagement_high else "Low"

    def get_profitability_bucket(prof):
        if pd.isna(prof): return "Other"
        return "High" if prof >= profitability_high else "Low"

    universe_df['aov_bucket'] = universe_df['avg_order_value'].apply(get_aov_bucket)
    universe_df['engagement_bucket'] = universe_df['engagement_score'].apply(get_engagement_bucket)
    universe_df['profitability_bucket'] = universe_df['profitability_score'].apply(get_profitability_bucket)
    universe_df['bucket_combo'] = universe_df['aov_bucket'] + "_" + universe_df['engagement_bucket'] + "_" + universe_df['profitability_bucket']

    print(f"✅ MECE buckets created — {universe_df['bucket_combo'].nunique()} unique combinations")
    return universe_df

In [36]:
# CELL 1: Pydantic Models for Structured LLM Outputs
from pydantic import BaseModel, Field
from typing import List, Dict, Optional

class SegmentResponse(BaseModel):
    """Structured output for jury and judge"""
    segment_name: str = Field(..., description="Marketer-friendly segment name, e.g., 'Premium_Engaged_Profitable'")
    rule: str = Field(..., description="MECE rule in format: 'AOV_High & Engagement_High & Profitability_High'")

class JurySubmission(BaseModel):
    """Single jury member's submission"""
    role: str
    segment: SegmentResponse

class JudgeDecision(BaseModel):
    """Judge's final decision"""
    chosen_segment: SegmentResponse
    reasoning: Optional[str] = Field(None, description="Brief reason for choice (optional)")

In [38]:
# CELL 2: LangChain + Groq Setup
import os
from groq import Groq
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from dotenv import load_dotenv
import nest_asyncio

nest_asyncio.apply()
load_dotenv()

GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise ValueError("GROQ_API_KEY not set — required for LangChain + Groq")

# Initialize LangChain LLMs (sync for simplicity)
llm_creative = ChatGroq(model="qwen/qwen3-32b", temperature=0.3, groq_api_key=GROQ_API_KEY)
llm_strategist = ChatGroq(model="meta-llama/llama-4-maverick-17b-128e-instruct", temperature=0.3, groq_api_key=GROQ_API_KEY)
llm_data_sci = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.3, groq_api_key=GROQ_API_KEY)
llm_risk = ChatGroq(model="moonshotai/kimi-k2-instruct-0905", temperature=0.2, groq_api_key=GROQ_API_KEY)
llm_judge = ChatGroq(model="openai/gpt-oss-120b", temperature=0.1, groq_api_key=GROQ_API_KEY)  # Use strongest for judge

In [39]:
# CELL 3: Jury Prompt & Output Parser
jury_parser = PydanticOutputParser(pydantic_object=SegmentResponse)

JURY_PROMPT_TEMPLATE = ChatPromptTemplate.from_template("""
You are a {persona}. Generate a SINGLE, MARKETER-FRIENDLY segment name AND rule for users with this profile:

Bucket Combo: {bucket_combo}

Context:
- High/Medium/Low AOV = Average Order Value tiers
- High/Low Engagement = User engagement with brand  
- High/Low Profitability = Customer profitability score

Rules:
- Must be MECE-compliant (no overlaps, fully exhaustive)
- Segment Name: Use clear, business-friendly naming (e.g., "Premium_Engaged_Profitable")
- Rule: Describe logic in format: "AOV_[Tier] & Engagement_[Tier] & Profitability_[Tier]"
- Do NOT include thresholds or technical terms in segment name
- Keep segment name under 4 words, use underscores
- {format_instructions}

Output ONLY valid JSON. No explanation.
""")

# Define jury configs
JURY_CONFIGS = {
    "creative_marketer": {
        "llm": llm_creative,
        "persona": "creative marketer focused on catchy, memorable segment names"
    },
    "growth_strategist": {
        "llm": llm_strategist,
        "persona": "growth strategist focused on scalable, actionable segments"
    },
    "data_scientist": {
        "llm": llm_data_sci,
        "persona": "data scientist focused on precise, technical segment definitions"
    },
    "risk_manager": {
        "llm": llm_risk,
        "persona": "risk-averse manager focused on safe, conservative segment names"
    }
}

In [50]:
# CELL 4: Judge Prompt & Output Parser
judge_parser = PydanticOutputParser(pydantic_object=JudgeDecision)

JUDGE_PROMPT_TEMPLATE = ChatPromptTemplate.from_template("""
You are the Chief Segmentation Officer. Pick the BEST segment name and rule for: {bucket_combo}

CRITERIA:
1. MECE Compliance (30%) — Rule must be mutually exclusive, collectively exhaustive
2. Marketing Clarity (25%) — Segment name must be instantly understandable
3. Business Structure (20%) — Segment name should follow "Tier_Attribute_Quality" pattern
4. Rule Precision (15%) — Rule must exactly match bucket combo logic
5. Memorability (10%) — Segment name should be short, catchy, easy to remember

JURY SUBMISSIONS (JSON format):
{jury_submissions_json}

{format_instructions}

Return EXACTLY this JSON format with no other text:
{
  \"segment_name\": \"Premium_Engaged_Profitable\",
  \"rule\": \"AOV=High & Engagement=High & Profitability=High\"
}
No explanation, no thinking, just the JSON.

""")

In [41]:
# CELL 5: LLM Jury + Judge Functions
import json
from typing import Dict, List

def call_jury_member(role: str, bucket_combo: str) -> JurySubmission:
    """Call one jury member via LangChain"""
    config = JURY_CONFIGS[role]
    
    # Create chain
    chain = JURY_PROMPT_TEMPLATE | config["llm"] | jury_parser
    
    try:
        result = chain.invoke({
            "persona": config["persona"],
            "bucket_combo": bucket_combo,
            "format_instructions": jury_parser.get_format_instructions()
        })
        print(f"✅ {role}: {result.segment_name} | Rule: {result.rule}")
        return JurySubmission(role=role, segment=result)
    except Exception as e:
        print(f"⚠️  Error in {role}: {e} — using fallback")
        fallback_name = f"Fallback_{role}_{bucket_combo}"
        fallback_rule = f"AOV_{bucket_combo.split('_')[0]} & Engagement_{bucket_combo.split('_')[1]} & Profitability_{bucket_combo.split('_')[2]}"
        return JurySubmission(
            role=role,
            segment=SegmentResponse(segment_name=fallback_name, rule=fallback_rule)
        )

def call_judge(bucket_combo: str, jury_submissions: List[JurySubmission]) -> SegmentResponse:
    """Call judge via LangChain"""
    # Convert jury submissions to JSON
    jury_json = json.dumps([{
        "role": js.role,
        "segment_name": js.segment.segment_name,
        "rule": js.segment.rule
    } for js in jury_submissions], indent=2)
    
    # Create chain
    chain = JUDGE_PROMPT_TEMPLATE | llm_judge | judge_parser
    
    try:
        result = chain.invoke({
            "bucket_combo": bucket_combo,
            "jury_submissions_json": jury_json,
            "format_instructions": judge_parser.get_format_instructions()
        })
        print(f"⚖️  Judge Selected: {result.chosen_segment.segment_name} | Rule: {result.chosen_segment.rule}")
        return result.chosen_segment
    except Exception as e:
        print(f"⚠️  Judge Error: {e} — using highest scored fallback")
        # Fallback: pick first submission
        return jury_submissions[0].segment if jury_submissions else SegmentResponse(
            segment_name=f"Segment_{bucket_combo}",
            rule=f"AOV_{bucket_combo.split('_')[0]} & Engagement_{bucket_combo.split('_')[1]} & Profitability_{bucket_combo.split('_')[2]}"
        )

In [42]:
# CELL 6: Generate Segment Names + Rules
def generate_segment_names_with_langchain_jury(universe_df: pd.DataFrame) -> tuple:
    """Generate segment names and rules using LangChain + Groq"""
    print("🧠 INITIALIZING LANGCHAIN LLM JURY + JUDGE SYSTEM")
    
    unique_buckets = universe_df['bucket_combo'].unique()
    segment_name_map = {}
    segment_rule_map = {}

    for i, bucket in enumerate(unique_buckets, 1):
        print(f"\n--- Processing Bucket {i}/{len(unique_buckets)}: {bucket} ---")

        try:
            # Call jury members
            jury_submissions = [
                call_jury_member(role, bucket)
                for role in JURY_CONFIGS.keys()
            ]
            
            # Call judge
            judge_decision = call_judge(bucket, jury_submissions)
            
            segment_name_map[bucket] = judge_decision.segment_name
            segment_rule_map[bucket] = judge_decision.rule
            
        except Exception as e:
            print(f"⚠️  Error processing bucket {bucket}: {e}")
            segment_name_map[bucket] = f"Segment_{bucket.replace('_', '')}"
            segment_rule_map[bucket] = f"AOV_{bucket.split('_')[0]} & Engagement_{bucket.split('_')[1]} & Profitability_{bucket.split('_')[2]}"

    print(f"✅ Generated {len(segment_name_map)} segment names and rules")
    return segment_name_map, segment_rule_map

In [47]:
def compute_segment_scores(universe_df: pd.DataFrame, bucket_combo_to_rule_map: Dict[str, str]) -> pd.DataFrame:
    """
    Compute multi-dimensional scores for each segment.
    Uses bucket_combo to look up rules — NOT final_segment.
    """
    print("📈 Computing segment scores...")

    results = []
    total_universe = len(universe_df)
    min_size = 500   # ✅ Fixed: Define min_size
    max_size = 20000 # ✅ Fixed: Define max_size

    # Get unique final segments
    for segment in universe_df['final_segment'].unique():
        # Get all users in this segment
        seg_df = universe_df[universe_df['final_segment'] == segment].copy()  # ✅ Avoid SettingWithCopyWarning
        size = len(seg_df)

        if size == 0:
            continue

        # ✅ Use .loc for safe assignment — as per pandas docs
        seg_df.loc[:, 'days_since_abandon'] = (pd.Timestamp.now() - seg_df['cart_abandoned_date']).dt.days + 1

        # 1. conversion_potential = engagement_score * (1 / days_since_abandonment)
        conv_potential = (seg_df['engagement_score'] * (1 / seg_df['days_since_abandon'])).mean()

        # 2. lift_vs_control = mock random between 0.5 and 1.0 (seeded for reproducibility)
        np.random.seed(hash(segment) % 2**32)
        lift_vs_control = np.random.uniform(0.5, 1.0)

        # 3. size (normalized)
        norm_size = size / total_universe

        # 4. profitability = mean of profitability_score
        profitability = seg_df['profitability_score'].mean()

        # 5. strategic_fit = (conversion_potential + profitability) / 2
        strategic_fit = (conv_potential + profitability) / 2

        # 6. overall_score = weighted sum
        weights = [0.3, 0.2, 0.1, 0.3, 0.1]  # Conv, Lift, Size, Profit, Strat
        overall_score = (
            weights[0] * conv_potential +
            weights[1] * lift_vs_control +
            weights[2] * norm_size +
            weights[3] * profitability +
            weights[4] * strategic_fit
        )

        # ✅ CRITICAL FIX: Get rule from bucket_combo, not final_segment
        # Sample one user to get their bucket_combo
        sample_user = seg_df.iloc[0] if len(seg_df) > 0 else None
        if sample_user is not None and 'bucket_combo' in seg_df.columns:
            bucket_combo = sample_user['bucket_combo']
            rules = bucket_combo_to_rule_map.get(bucket_combo, "Catch_All_Other")
        else:
            rules = "Catch_All_Other"

        # Append to results
        results.append({
            'Segment Name': segment,
            'Rules Applied': rules, 
            'Size': size,
            'Conv_Pot': round(conv_potential, 2),
            'Lift_vs_Control': round(lift_vs_control, 2),
            'Profitability': round(profitability, 2),
            'Strategic_Fit': round(strategic_fit, 2),
            'Overall_Score': round(overall_score, 2),
            'Valid': 'Yes' if min_size <= size <= max_size else 'No'
        })

    # Create results DataFrame
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('Overall_Score', ascending=False).reset_index(drop=True)

    print(f"✅ Computed scores for {len(results_df)} segments")
    return results_df

In [48]:
# CELL 7: Main Workflow (Updated)
def main():
    print("="*70)
    print("🚀 STARTING MECE CART ABANDONER SEGMENTATION — LANGCHAIN + GROQ")
    print("="*70)

    # 1. Generate data
    df = generate_cart_abandonment_dataset(25000)
    universe = define_universe(df)

    # 2. Create buckets
    universe = create_bucket_combinations(universe)

    # 3. LLM Jury + Judge — LANGCHAIN
    try:
        segment_name_map, segment_rule_map = generate_segment_names_with_langchain_jury(universe)
    except Exception as e:
        print(f"❌ LLM system failed: {e}")
        return None, None, None

    # 4. Apply constraints
    universe = apply_size_constraints(universe, segment_name_map)

    # 5. Compute scores (pass segment_rule_map)
    results_df = compute_segment_scores(universe, segment_rule_map)

    # 6. Generate outputs
    generate_outputs(universe, results_df)

    # 7. Display sample output
    print("\n📋 SAMPLE OUTPUT (TOP 5 SEGMENTS):")
    print(results_df.head().to_string(index=False))

    print(f"\n🎉 COMPLETED — {len(results_df)} segments ready for marketing campaigns!")
    return universe, results_df, segment_rule_map

In [49]:
# CELL 9: Run End-to-End
print("🚀 Starting workflow...")
try:
    universe_df, results_df, segment_rule_map = main()
    if results_df is not None:
        print("\n✨ SUCCESS! Your MECE segmentation system is complete.")
        print("\n📄 Final CSV Columns:")
        print(results_df.columns.tolist())
except Exception as e:
    print(f"\n❌ WORKFLOW FAILED: {e}")

🚀 Starting workflow...
🚀 STARTING MECE CART ABANDONER SEGMENTATION — LANGCHAIN + GROQ
📊 Generating 25,000 cart abandonment records...
✅ Dataset generated: 25,000 records
🎯 Universe defined: 21,858 users abandoned cart in last 7 days
🔧 Creating MECE bucket combinations...
📊 Thresholds: AOV_high=$13,200, AOV_mid=$5,500, Eng>=0.31, Profit>=0.56
✅ MECE buckets created — 12 unique combinations
🧠 INITIALIZING LANGCHAIN LLM JURY + JUDGE SYSTEM

--- Processing Bucket 1/12: Low_Low_High ---
✅ creative_marketer: Budget_Disengaged_Profitable | Rule: AOV_Low & Engagement_Low & Profitability_High
✅ growth_strategist: Low_Value_Low_Engaged_High_Profit | Rule: AOV_Low & Engagement_Low & Profitability_High
✅ data_scientist: Low_Value_Loyal | Rule: AOV_Low & Engagement_Low & Profitability_High
✅ risk_manager: Budget_Watchers | Rule: AOV_Low & Engagement_Low & Profitability_High
⚖️  Judge Selected: Budget_Disengaged_Profitable | Rule: AOV_Low & Engagement_Low & Profitability_High

--- Processing Bucket 

✅ growth_strategist: Mid_Value_Low_Engaged_High_Profit | Rule: AOV_Medium & Engagement_Low & Profitability_High
✅ data_scientist: Medium_Engaged_Profitable | Rule: AOV_Medium & Engagement_Medium & Profitability_High
✅ risk_manager: Steady_Core | Rule: AOV_Medium & Engagement_Low & Profitability_High
⚖️  Judge Selected: Medium_Low_High | Rule: AOV_Medium & Engagement_Low & Profitability_High

--- Processing Bucket 3/12: High_Low_High ---
⚠️  Error in creative_marketer: Invalid json output: <think>
Okay, let's tackle this problem. The user wants a segment name and rule for the High_Low_High bucket. First, I need to break down what each tier means.

High_Low_High refers to AOV being High, Engagement being Low, and Profitability being High. The segment name needs to be marketer-friendly and under four words. Let's think about each part. High AOV could be "HighSpending" or "Premium". Low engagement is tricky because it's a negative, maybe "Inactive" or "LowEngagement"? But since we need pos

✅ growth_strategist: Low_Value_Low_Profit | Rule: AOV_Medium & Engagement_Low & Profitability_Low
✅ data_scientist: Medium_Engaged_Unprofitable | Rule: AOV_Medium & Engagement_Low & Profitability_Low
✅ risk_manager: Steady_Core | Rule: AOV_Medium & Engagement_Low & Profitability_Low
⚖️  Judge Selected: Medium_LowEngagement_LowProfit | Rule: AOV_Medium & Engagement_Low & Profitability_Low

--- Processing Bucket 7/12: Low_High_Low ---
⚠️  Error in creative_marketer: Invalid json output: <think>
Okay, let's tackle this. The user wants a segment name and rule for the Low_High_Low bucket combo. First, I need to figure out which part corresponds to AOV, Engagement, and Profitability. The context says that AOV has tiers High, Medium, Low. Engagement is High/Low, Profitability is High/Low.

Wait, the Bucket Combo is Low_High_Low. So each part corresponds to AOV, Engagement, Profitability? Let me check the rules. The rule format is AOV_[Tier] & Engagement_[Tier] & Profitability_[Tier]. So the t

⚖️  Judge Selected: Low_Value_Customers | Rule: AOV_Low & Engagement_Low & Profitability_Low

--- Processing Bucket 9/12: Low_High_High ---
⚠️  Error in creative_marketer: Invalid json output: <think>
Okay, let's tackle this. The user wants a segment name and rule based on the combo Low_High_High. First, I need to decode what each of those tiers represents. The context says that the first part is AOV (Low, Medium, High), the second is Engagement (High or Low), and the third is Profitability (High or Low). So for Low_High_High, that's AOV Low, Engagement High, Profitability High.

Now, the segment name should be marketer-friendly and under four words with underscores. Let's break it down. AOV is Low, but using "Low" in a segment name might not be as appealing. Maybe "Budget" as a synonym for Low AOV? Then Engagement is High, so something like "Engaged". Profitability is High, so "Profitable". Putting it together: "Budget_Engaged_Profitable". That's three words, under four, uses undersco

✅ growth_strategist: Mid_Value_Low_Active | Rule: AOV_Medium & Engagement_Low & Profitability_High
✅ data_scientist: Medium_Engaged_Low | Rule: AOV_Medium & Engagement_High & Profitability_Low
✅ risk_manager: Core_Supporters | Rule: AOV_Medium & Engagement_High & Profitability_Low
⚖️  Judge Selected: Medium_Engaged_Low | Rule: AOV_Medium & Engagement_High & Profitability_Low

--- Processing Bucket 12/12: High_High_High ---
✅ creative_marketer: Premium_Engaged_Profitable | Rule: AOV_High & Engagement_High & Profitability_High
✅ growth_strategist: High_Value_Loyal | Rule: AOV_High & Engagement_High & Profitability_High
✅ data_scientist: High_Value_Customers | Rule: AOV_High & Engagement_High & Profitability_High
✅ risk_manager: VIP_Core | Rule: AOV_High & Engagement_High & Profitability_High
⚖️  Judge Selected: High_Value_Loyal | Rule: AOV_High & Engagement_High & Profitability_High
✅ Generated 12 segment names and rules
✅ Size constraints applied — 12 final segments
📈 Computing segment 