In [8]:
import json
import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Dict, List, Tuple, Union

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel

# Load environment variables (python-dotenv) so the API key is never hard-coded in the notebook
load_dotenv()

# Shared OpenAI client used by every API-calling function in this notebook.
# os.getenv returns None when OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


# Original Altice Data Sample

In [9]:
# Load the call-summary CSV and keep only the first 10 rows as a working sample
df = pd.read_csv("FullCallDataSummary.csv").head(10)

# Display the sample as a table (last expression of the cell)
df

Unnamed: 0,NaturalId,Conversation
0,Call1,"[Agent] ""Thank you for choosing Optimum Busine..."
1,Call10,"[Agent] ""Thank you for choosing Optimum Busine..."
2,Call100,"[Agent] ""Good morning. Thank you for calling O..."
3,Call101,"[Agent] ""Good morning. Thank you for calling O..."
4,Call102,"[Agent] ""Hold on one second, hold on, do not d..."
5,Call103,"[Agent] ""Thank you for calling Optimum. No off..."
6,Call104,"[Agent] ""Thank you for calling Optimum Busines..."
7,Call105,"[Agent] ""Thank you for calling O Business. Thi..."
8,Call106,"[Agent] ""Thank you for calling Optimum Busines..."
9,Call107,"[Agent] ""For calling Optimum for Business now ..."


# Categorical Classification Function

In [None]:
# ClassificationResult: the outcome of classifying one transcript.
# classify_call_transcripts builds one instance per row and later dumps them
# with model_dump() to fill the output columns.
class ClassificationResult(BaseModel):
    # Label chosen for the transcript ("None" is used for missing transcripts and errors)
    classification: str
    # Model-provided justification, or the error / missing-data message
    explanation: str

def classify_call_transcripts(
    classifications: List[str],
    dataframe: pd.DataFrame,
    context_prompt: str,
    transcript_column: str,
    classification_column: str = "Classification",
    explanation_column: str = "Explanation"
) -> pd.DataFrame:
    """
    Classify call transcripts using LLM analysis.

    Every row is sent to the chat-completions endpoint on its own and the JSON
    answer is parsed into a ``ClassificationResult``. Rows whose transcript is
    missing, or whose request/parsing raises, are labelled "None" with an
    explanatory message instead of aborting the run.

    Args:
        classifications (List[str]): List of possible classifications (including "None")
        dataframe (pd.DataFrame): DataFrame with call data
        context_prompt (str): Context/question for classification
        transcript_column (str): Name of the column containing transcript text
        classification_column (str): Name for the output classification column
        explanation_column (str): Name for the output explanation column

    Returns:
        pd.DataFrame: Copy of ``dataframe`` with the classification and
        explanation columns added. Results are written positionally, so the
        frame may carry any index (e.g. a filtered subset) and may be empty.

    Note:
        The label returned by the model is stored as-is; it is not checked
        against ``classifications``.
    """

    # Initialize results list (one entry per row, in row order)
    results: List[ClassificationResult] = []

    # Create classifications string for the prompt
    classifications_str = ", ".join(classifications)

    # Define the system prompt
    # NOTE(review): the prompt asks for a "comprehensive explanation" but the JSON
    # template says "One sentence" - confirm which one is intended.
    system_prompt = f"""
    You are an expert at analyzing sales call transcripts for classification purposes.
    
    Your task: {context_prompt}
    
    Available classifications: {classifications_str}
    
    For each conversation transcript, you must:
    1. Select exactly ONE classification from the provided list
    2. Provide a comprehensive explanation that includes:
        - Key customer statements or behaviors that influenced your decision
        - Any commitments, objections, or next steps mentioned
        - Your reasoning for why this classification best fits the conversation
    
    Return your response as a JSON object with this exact format:
    {{
        "classification": "selected_classification_from_list",
        "explanation": "One sentence explaining why you chose this classification"
    }}
    
    Important: The classification must be exactly one of the provided options: {classifications_str}
    """

    total = len(dataframe)
    print(f"Processing {total} conversations...")

    def report_progress(done: int) -> None:
        # Count processed rows instead of relying on the index label, which may be
        # non-numeric or non-sequential (e.g. after filtering the frame).
        if done % 10 == 0:
            print(f"Processed {done}/{total} conversations...")

    # Process each conversation
    for position, (_, row) in enumerate(dataframe.iterrows(), start=1):
        # First column is treated as the call ID.
        # NOTE(review): in the sample frame shown above the first column is
        # "Unnamed: 0", not "NaturalId" - confirm this is the intended ID.
        call_id = str(row.iloc[0])
        transcript_text = row[transcript_column]  # Use the specified column name

        if pd.isna(transcript_text):
            # Handle missing transcript without spending an API call
            results.append(ClassificationResult(
                explanation="No transcript text available for analysis",
                classification="None"
            ))
            report_progress(position)
            continue

        try:
            # Create the user prompt with the conversation
            user_prompt = f"""
            Call ID: {call_id}
            
            Transcript:-
            {transcript_text}
            
            Analyze this transcript and return the JSON with your classification and explanation.
            """

            # Make the API call, asking for a JSON object back
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0,
                max_tokens=300,
                response_format={"type": "json_object"}
            )

            # Parse the JSON payload into a ClassificationResult
            response_text = response.choices[0].message.content.strip()
            parsed_response = json.loads(response_text)
            results.append(ClassificationResult(
                explanation=parsed_response["explanation"],
                classification=parsed_response["classification"]
            ))

        except Exception as e:
            # Best effort: record the failure for this row and keep going
            print(f"Error processing call {call_id}: {str(e)}")
            results.append(ClassificationResult(
                explanation=f"Error during processing: {str(e)}",
                classification="None"
            ))

        report_progress(position)

    print("Classification complete!")

    # Attach results positionally. Assigning plain lists (rather than a Series
    # with its own 0..n-1 index) avoids index alignment, which would otherwise
    # produce NaN for frames with a non-default index and a KeyError for empty ones.
    merged_df = dataframe.copy()
    merged_df[classification_column] = [result.classification for result in results]
    merged_df[explanation_column] = [result.explanation for result in results]

    return merged_df

 Use Categorical Classification to Identify Call Outcome Stage

In [23]:
# Labels the model may choose from; they mirror the four stages described in context_prompt
classifications = [
    "Definitive Sale on the Call",
    "No Interest from Customer", 
    "Potential Interest as a Lead",
    "Inconclusive"
]

# Task description inserted into the system prompt by classify_call_transcripts
context_prompt = """Analyze this sales call transcript and classify it into one of the following sales process stages based on the customer's level of engagement, interest, and the outcome of the conversation:

1. **Definitive Sale on the Call** - Customer commits to purchasing, signs up, or agrees to a contract during this specific call
2. **No Interest from Customer** - Customer explicitly declines, shows no interest, or is not a viable prospect
3. **Potential Interest as a Lead** - Customer shows some interest, asks questions, or requests information but doesn't commit
4. **Inconclusive** - The conversation is unclear or the customer's interest level is not clear

Consider the following factors when classifying:
- Customer's explicit statements about interest level
- Whether any commitment or agreement was made
- Customer's tone and engagement level
- Specific next steps mentioned
- Whether the customer requested follow-up information or meetings
- Any objections raised and how they were addressed

Focus on the customer's behavior and statements rather than the salesperson's actions. A call should only be classified as "Definitive Sale on the Call" if there is clear evidence of a commitment or agreement made during this specific conversation."""

# Run the classifier on the sample: labels are written to the "stage" column,
# explanations to the default "Explanation" column
results_with_sales_stage = classify_call_transcripts(classifications, df, context_prompt, "Conversation", "stage")
results_with_sales_stage

Processing 10 conversations...
Processed 10/10 conversations...
Classification complete!


Unnamed: 0,NaturalId,Conversation,stage,Explanation
0,Call1,"[Agent] ""Thank you for choosing Optimum Busine...",Potential Interest as a Lead,The customer showed interest in the services a...
1,Call10,"[Agent] ""Thank you for choosing Optimum Busine...",Definitive Sale on the Call,"The customer, Heather, explicitly agreed to an..."
2,Call100,"[Agent] ""Good morning. Thank you for calling O...",Potential Interest as a Lead,The customer showed interest in porting their ...
3,Call101,"[Agent] ""Good morning. Thank you for calling O...",Definitive Sale on the Call,The customer committed to the service by agree...
4,Call102,"[Agent] ""Hold on one second, hold on, do not d...",Definitive Sale on the Call,"The customer, Rosa, explicitly committed to se..."
5,Call103,"[Agent] ""Thank you for calling Optimum. No off...",Definitive Sale on the Call,The customer committed to purchasing services ...
6,Call104,"[Agent] ""Thank you for calling Optimum Busines...",Definitive Sale on the Call,The customer committed to upgrading their serv...
7,Call105,"[Agent] ""Thank you for calling O Business. Thi...",Potential Interest as a Lead,"The customer, Ali, shows interest in installin..."
8,Call106,"[Agent] ""Thank you for calling Optimum Busines...",Definitive Sale on the Call,The customer explicitly committed to the servi...
9,Call107,"[Agent] ""For calling Optimum for Business now ...",Definitive Sale on the Call,The customer committed to the service installa...


# Theme Analysis Function

In [14]:
# Modified Theme Analysis Functions - No Batching, Parallel Processing Only

def _make_api_call(messages: List[Dict], model: str = "gpt-4o-mini", max_tokens: int = 2000) -> Dict:
    """Centralized API call with error handling."""
    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0,
            max_tokens=max_tokens,
            response_format={"type": "json_object"}
        )
        return json.loads(response.choices[0].message.content.strip())
    except Exception as e:
        print(f"API call failed: {e}")
        return {}

def _create_theme_prompt(context_prompt: str, transcript: str) -> Tuple[str, str]:
    """Create system and user prompts for theme generation."""
    system_prompt = f"""You are an expert at creating MECE categorization frameworks.

Task: {context_prompt}

Generate 3-10 themes that are:
- Mutually Exclusive: No overlap
- Collectively Exhaustive: All content fits

Return JSON: {{"themes": [{{"themeName": "...", "themeDescription": "..."}}], "mece_validation": "..."}}"""

    user_prompt = f"Analyze this transcript and create MECE themes:\n\n{transcript}"
    
    return system_prompt, user_prompt

def _create_classification_prompt(context_prompt: str, themes: List[Theme], transcript: str, single_theme: bool = True) -> Tuple[str, str]:
    """Build the (system, user) prompt pair for assigning themes to one transcript.

    The system message lists every theme as "- name: description" and states
    whether exactly one or several themes may be assigned; the user message
    carries the analysis context followed by the transcript.
    """
    if single_theme:
        instruction = "Assign exactly ONE theme"
    else:
        instruction = "Assign one or more themes"

    theme_lines = "\n".join(f"- {theme.themeName}: {theme.themeDescription}" for theme in themes)

    classifier_instructions = f"""Classify transcript using these themes:

{theme_lines}

Rules:
1. {instruction}
2. Use only provided themes
3. Assign transcript to at least one theme

Return JSON: {{"classifications": {{"id": ["Theme"]}}}}"""

    transcript_message = f"Context: {context_prompt}\n\nTranscript:\n{transcript}"

    return classifier_instructions, transcript_message


In [15]:
def _generate_themes_parallel(dataframe: pd.DataFrame, transcript_column: str, context_prompt: str, max_workers: int = 4) -> List[Theme]:
    """Generate themes using parallel processing - no batching, each transcript processed individually.

    One API call is made per transcript on a thread pool; the per-transcript
    themes are flattened and, when more than one theme was produced, passed to
    ``_merge_themes`` for consolidation.

    Args:
        dataframe: Frame holding the transcripts.
        transcript_column: Column containing the transcript text.
        context_prompt: Analysis question inserted into the prompts.
        max_workers: Size of the thread pool.

    Returns:
        The merged list of themes, or the raw list when zero or one theme was generated.
    """
    def process_single_transcript(row):
        transcript = row[transcript_column]
        if pd.isna(transcript):
            # Missing transcript: nothing to analyse. Skipping keeps a literal
            # "nan" out of the prompt, matching how classify_call_transcripts
            # treats missing text.
            return []

        system_prompt, user_prompt = _create_theme_prompt(context_prompt, transcript)

        response = _make_api_call([
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ])

        # _make_api_call returns {} on failure, which yields no themes for this row
        return [Theme(**theme) for theme in response.get("themes", [])]

    print(f"Generating themes from {len(dataframe)} individual transcripts...")

    # Process each transcript individually in parallel (executor.map keeps row order)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        transcript_results = list(executor.map(process_single_transcript, [row for _, row in dataframe.iterrows()]))

    # Flatten results
    all_themes = [theme for transcript_themes in transcript_results for theme in transcript_themes]

    print(f"Generated {len(all_themes)} total themes from individual transcripts")

    # Merge similar themes (pointless for zero or one theme)
    if len(all_themes) > 1:
        return _merge_themes(all_themes, context_prompt)

    return all_themes

def _merge_themes(themes: List[Theme], context_prompt: str) -> List[Theme]:
    """Merge similar themes semantically.

    Sends every candidate theme to the model and asks for a consolidated set
    of 3-10 themes.

    Args:
        themes: Candidate themes collected from the individual transcripts.
        context_prompt: Analysis question inserted into the prompt.

    Returns:
        The merged themes. When the response carries no "themes" key (for
        example because the API call failed and ``_make_api_call`` returned
        ``{}``), the input ``themes`` are returned unchanged.
    """
    themes_text = "\n".join([f"- {t.themeName}: {t.themeDescription}" for t in themes])

    system_prompt = f"""Merge similar themes while maintaining MECE principles.

Task: {context_prompt}

Merge these themes into 3-10 final themes:
{themes_text}

Return JSON: {{"themes": [{{"themeName": "...", "themeDescription": "..."}}], "mece_validation": "..."}}"""

    user_prompt = "Create final MECE framework by merging similar themes."

    response = _make_api_call([
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ])

    merged_payload = response.get("themes")
    if merged_payload is None:
        # Fall back to the unmerged themes. They are already Theme objects, so
        # they must not go through Theme(**theme) again - unpacking a Theme
        # instance with ** raises TypeError.
        return themes

    return [Theme(**theme) for theme in merged_payload]


In [16]:
def _normalize_theme_labels(raw_labels) -> List[str]:
    """Coerce the model's theme assignment into a non-empty list of label strings."""
    if isinstance(raw_labels, str):
        # A bare string must be wrapped; otherwise downstream `labels[0]`
        # would pick its first character.
        raw_labels = [raw_labels]
    if not isinstance(raw_labels, (list, tuple)):
        return ["Unclassified"]
    labels = [str(label) for label in raw_labels if label]
    return labels or ["Unclassified"]


def _classify_transcripts_parallel(dataframe: pd.DataFrame, transcript_column: str, themes: List[Theme], 
                                 context_prompt: str, id_column: str, single_theme: bool = True, 
                                 max_workers: int = 4) -> Tuple[Dict[str, List[str]], str]:
    """Classify transcripts using parallel processing - no batching, each transcript processed individually.

    Args:
        dataframe: Frame holding the transcripts.
        transcript_column: Column containing the transcript text.
        themes: Themes the model may assign.
        context_prompt: Analysis question inserted into the prompts.
        id_column: Column whose (stringified) value keys the result mapping.
        single_theme: Whether the model is told to assign exactly one theme.
        max_workers: Size of the thread pool.

    Returns:
        Tuple of (mapping of transcript id -> list of theme names, summary string).
        Transcripts that are missing, or whose response cannot be interpreted,
        map to ["Unclassified"]. Rows sharing an id overwrite each other.
    """
    def process_single_transcript(row):
        transcript = row[transcript_column]
        transcript_id = str(row[id_column])

        if pd.isna(transcript):
            # Nothing to classify; avoid sending "nan" to the model
            return {transcript_id: ["Unclassified"]}

        system_prompt, user_prompt = _create_classification_prompt(context_prompt, themes, transcript, single_theme)

        response = _make_api_call([
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ])

        # Extract the classification for this single transcript. The prompt asks for
        # {"classifications": {"id": [...]}}, but the model may key the entry by the
        # real transcript id or return the labels directly, so accept those shapes too.
        classifications = response.get("classifications", {})
        if isinstance(classifications, dict):
            if "id" in classifications:
                raw_labels = classifications["id"]
            elif len(classifications) == 1:
                raw_labels = next(iter(classifications.values()))
            else:
                raw_labels = None
        else:
            raw_labels = classifications

        return {transcript_id: _normalize_theme_labels(raw_labels)}

    print(f"Classifying {len(dataframe)} individual transcripts...")

    # Process each transcript individually in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        transcript_results = list(executor.map(process_single_transcript, [row for _, row in dataframe.iterrows()]))

    # Merge all per-transcript mappings into one dict
    all_classifications = {}
    for transcript_result in transcript_results:
        all_classifications.update(transcript_result)

    return all_classifications, f"Generated {len(themes)} themes. Classified {len(all_classifications)} transcripts."


In [17]:
def mece_theme_analysis(
    dataframe: pd.DataFrame,
    transcript_column: str,
    context_prompt: str,
    id_column: str = None,
    target_column: str = "Theme_Analysis",
    themes_per_transcript: Union[int, str] = 1,
    max_workers: int = 4
) -> Tuple[pd.DataFrame, MECEThemeAnalysis]:
    """
    Run the two-phase MECE theme analysis (theme generation, then classification).

    Both phases handle each transcript individually on a thread pool; there is
    no batching.

    Args:
        dataframe: DataFrame with transcript data
        transcript_column: Column containing transcript text
        context_prompt: Analysis context/question
        id_column: ID column (defaults to the first column when falsy)
        target_column: Output column name
        themes_per_transcript: 1 for a single theme; any other value (e.g. "multiple")
            keeps every assigned theme, comma-separated
        max_workers: Number of parallel workers

    Returns:
        Tuple of (copy of the DataFrame with the theme column, MECE analysis results).
        When no themes could be generated, the theme column holds
        "Error: No themes generated" and the analysis object is empty.
    """
    # NOTE(review): Theme / MECEThemeAnalysis are not defined in the visible
    # cells of this notebook - confirm they are defined before this cell runs.
    if not id_column:
        id_column = dataframe.columns[0]
    single_theme = themes_per_transcript == 1

    print(f"MECE Analysis: {len(dataframe)} transcripts (no batching, parallel processing)")

    # Phase 1: Generate themes
    themes = _generate_themes_parallel(dataframe, transcript_column, context_prompt, max_workers)
    if not themes:
        failed_df = dataframe.assign(**{target_column: "Error: No themes generated"})
        empty_analysis = MECEThemeAnalysis(themes=[], theme_mappings={}, mece_validation="Error")
        return failed_df, empty_analysis

    print(f"Generated {len(themes)} themes")

    # Phase 2: Classify transcripts
    theme_mappings, mece_validation = _classify_transcripts_parallel(
        dataframe, transcript_column, themes, context_prompt, id_column, single_theme, max_workers
    )

    # Turn each transcript's assigned labels into the single cell value to display
    theme_map = {}
    for id_val, assigned_labels in theme_mappings.items():
        if not assigned_labels:
            cell_value = "Unclassified"
        elif single_theme:
            cell_value = assigned_labels[0]
        else:
            cell_value = ", ".join(assigned_labels)
        theme_map[str(id_val)] = cell_value

    # Apply themes to a copy of the dataframe; ids without a mapping become "Unclassified"
    result_df = dataframe.copy()
    id_as_text = result_df[id_column].astype(str)
    result_df[target_column] = id_as_text.map(theme_map).fillna("Unclassified")

    analysis = MECEThemeAnalysis(
        themes=themes,
        theme_mappings=theme_mappings,
        mece_validation=mece_validation
    )
    return result_df, analysis


Theme Analysis Usage

In [21]:
# Define your analysis question
context_prompt = "Identify the primary sales techniques or methodologies used by the sales agent in each call transcript. Focus on approaches used to convert the customer that are OUTSIDE OF logistical or product-based selling. The techniques should be a specific, commonly recognized sales strategy."

# Run MECE theme analysis on the frame that already carries the "stage" column
theme_results, analysis = mece_theme_analysis(
    dataframe=results_with_sales_stage,
    transcript_column='Conversation',
    context_prompt=context_prompt,
    id_column='NaturalId',
    target_column='sales_technique',
    themes_per_transcript=1,  # Single theme per transcript
    max_workers=4
)

# Display results
# (the emoji below were stored as mojibake - UTF-8 bytes decoded as cp1252 - and are restored here)
print("📊 Results:")
print(theme_results[['NaturalId', 'sales_technique']])

print("\n🎯 Generated Themes:")
for i, theme in enumerate(analysis.themes, 1):
    print(f"{i}. {theme.themeName}: {theme.themeDescription}")

print(f"\n✅ MECE Validation: {analysis.mece_validation}")

MECE Analysis: 10 transcripts (no batching, parallel processing)
Generating themes from 10 individual transcripts...
Generated 71 total themes from individual transcripts
Generated 9 themes
Classifying 10 individual transcripts...
📊 Results:
  NaturalId       sales_technique
0     Call1  Consultative Selling
1    Call10  Consultative Selling
2   Call100         Cross-Selling
3   Call101  Consultative Selling
4   Call102      Building Rapport
5   Call103      Building Rapport
6   Call104  Consultative Selling
7   Call105  Consultative Selling
8   Call106  Consultative Selling
9   Call107  Consultative Selling

🎯 Generated Themes:
1. Consultative Selling: The agent engages in a dialogue to understand the customer's specific needs and challenges, asking probing questions to tailor solutions that align with the customer's business requirements.
2. Value Proposition Highlighting: The agent emphasizes the benefits and features of the service, including speed, reliability, and cost savi

# Unique Value Splitter Function

In [None]:
def unique_value_splitter(
    dataframe: pd.DataFrame,
    splitter_column: str
) -> Dict[str, pd.DataFrame]:
    """
    Split a dataframe into separate dataframes based on unique values in a specified column.

    Args:
        dataframe (pd.DataFrame): DataFrame to split
        splitter_column (str): Name of the column to use for splitting

    Returns:
        Dict[str, pd.DataFrame]: Dictionary where keys are the unique values
        converted with ``str()`` (in order of first appearance) and values are
        independent copies of the matching rows. Rows whose splitter value is
        missing are kept under the stringified missing value (e.g. "nan").

    Note:
        Distinct values with the same string form (e.g. 1 and "1") share a key;
        the later group replaces the earlier one.
    """

    splitter_values = dataframe[splitter_column]

    # Get unique values from the splitter column
    unique_values = splitter_values.unique()

    print(f"Splitting dataframe into {len(unique_values)} groups based on '{splitter_column}' column...")
    print(f"Unique values found: {list(unique_values)}")

    # Create dictionary to store split dataframes
    split_dataframes = {}

    # Split dataframe for each unique value
    for value in unique_values:
        if pd.isna(value):
            # NaN never compares equal to itself, so `== value` would select no
            # rows and silently drop every record with a missing value.
            row_mask = splitter_values.isna()
        else:
            row_mask = splitter_values == value

        # Filter dataframe for current value
        filtered_df = dataframe[row_mask].copy()

        # Store in dictionary with value as key
        split_dataframes[str(value)] = filtered_df

        print(f"  - '{value}': {len(filtered_df)} rows")

    print("Dataframe splitting complete!")

    return split_dataframes

Split Dataframe w/ Outcome Stage and Sales Technique by Outcome Stage

In [None]:
# Split theme_results (which carries both "stage" and "sales_technique") by outcome stage
split_results = unique_value_splitter(theme_results, "stage")

# Display each group with its name and row count
print("\nSplit DataFrames:")
for group_name, group_df in split_results.items():
    print(f"\n{group_name} ({len(group_df)} rows):")
    display(group_df)

Access a specific split DataFrame by its group name -- e.g. split_results["Definitive Sale on the Call"]:

In [None]:
#split_results["Potential Interest as a Lead"]
#split_results["Definitive Sale on the Call"]
#split_results["No Interest from Customer"]
#split_results["Inconclusive"]

# Comparator


In [None]:
import pandas as pd
from typing import List, Dict
from pydantic import BaseModel
from openai import OpenAI
from dotenv import load_dotenv
import os

# Load environment
# NOTE(review): this repeats the setup from the first cell and rebinds the
# module-level `client`; it is only needed if this cell is run on its own.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(api_key=api_key)

# -------------------------------
# Schema for structured comparison
# -------------------------------
# One entry of ComparisonOutput.group_summaries: a group's name and its narrative summary.
# (Documented with comments rather than a docstring so the model's schema is left untouched.)
class GroupSummary(BaseModel):
    group_name: str
    summary: str

# Structure the comparison response is parsed into; comparison_function passes this
# class as `text_format` and returns its model_dump().
# (Documented with comments rather than a docstring so the model's schema is left untouched.)
class ComparisonOutput(BaseModel):
    introduction: str
    key_findings: List[str]
    similarities: List[str]
    differences: List[str]
    # One GroupSummary per compared group
    group_summaries: List[GroupSummary]

# -------------------------------
# Comparison function
# -------------------------------
def comparison_function(grouped_dfs: Dict[str, pd.DataFrame],
                        columns_to_analyze: List[str],
                        context_prompt: str) -> Dict:
    """
    Compare multiple groups of dataframes directly against each other.

    The selected columns of each group are flattened to one "a | b | c" line
    per row (at most 150 rows per group), sent to the model in a single
    request, and the answer is parsed into ``ComparisonOutput``.

    Args:
        grouped_dfs: dictionary of {group_name: dataframe}
        columns_to_analyze: list of column names to analyze
        context_prompt: guiding analysis question (e.g. "What makes sales calls effective?")

    Returns:
        Dict with the ``ComparisonOutput`` fields: "introduction" (str),
        "key_findings", "similarities", "differences" (lists of str) and
        "group_summaries" (list of {"group_name", "summary"} dicts).
        If the request or parsing fails, a placeholder dict with the same keys
        and types is returned (plus the legacy "overall_comparison" key), so
        callers can consume both outcomes the same way.

    Raises:
        KeyError: if a name in ``columns_to_analyze`` is not a column of a group.
    """
    # Build compact summaries for each group
    group_texts = {}
    for group_name, df in grouped_dfs.items():
        # cap records per group; itertuples (unlike apply(axis=1)) also works for empty groups
        capped_rows = df[columns_to_analyze].astype(str).head(150)
        row_lines = [" | ".join(values) for values in capped_rows.itertuples(index=False, name=None)]
        group_texts[group_name] = "\n".join(row_lines)

    # Prepare prompt with all groups
    group_descriptions = "\n\n".join(
        [f"### {name}:\n{txt}" for name, txt in group_texts.items()]
    )

    messages = [
        {
            "role": "system",
            "content": """
            You are an expert data analyst. Your job is to compare groups of records,
            identify similarities and differences, and explain what variables contribute to outcomes.
            Always return valid JSON matching the schema.
            """,
        },
        {
            "role": "user",
            # The requested fields are named after the ComparisonOutput schema so the
            # instructions and the enforced output format agree.
            "content": f"""
            Context: {context_prompt}

            Here are the grouped records (truncated samples shown for each):

            {group_descriptions}

            Please provide:
            - introduction: a short narrative comparing all groups directly
            - key_findings: the most important takeaways from the comparison
            - similarities: what traits or variables appear across groups
            - differences: what distinguishes successful vs unsuccessful outcomes
            - group_summaries: return as a JSON list of objects, each with keys "group_name" and "summary"
            """,
        },
    ]

    try:
        response = client.responses.parse(
            model="gpt-4o-mini",
            input=messages,
            text_format=ComparisonOutput,
            temperature=0,
            max_output_tokens=800,
        )

        parsed: ComparisonOutput = response.output_parsed
        return parsed.model_dump()

    except Exception as e:
        # Best effort: report the failure and hand back a placeholder that has the
        # same keys and value types as a successful result.
        print(f"Comparison failed: {e}")
        return {
            "introduction": "No comparison generated",
            "key_findings": [],
            "similarities": ["N/A"],
            "differences": ["N/A"],
            "group_summaries": [{"group_name": name, "summary": "No summary"} for name in grouped_dfs.keys()],
            # legacy key kept for callers written against the earlier fallback shape
            "overall_comparison": "No comparison generated",
        }
 

In [None]:
# Groups to compare, relabelled with shorter names for the prompt
grouped_dfs = {
    "Definitive Sale": split_results["Definitive Sale on the Call"],
    "Potential Interest": split_results["Potential Interest as a Lead"],
}

results = comparison_function(
    grouped_dfs=grouped_dfs,
    # Must match the target_column given to mece_theme_analysis ("sales_technique");
    # the previous name "Sales_Technique_Used" is not a column of these frames.
    columns_to_analyze=["sales_technique"],
    context_prompt="Compare sales call outcomes. What makes sales calls effective vs ineffective?"
)

# Print clean structured output
print("\n=== INTRODUCTION ===")
print(results["introduction"])

print("\n=== KEY FINDINGS ===")
for finding in results["key_findings"]:
    print(f"- {finding}")

print("\n=== SIMILARITIES ===")
for sim in results["similarities"]:
    print(f"- {sim}")

print("\n=== DIFFERENCES ===")
for diff in results["differences"]:
    print(f"- {diff}")

print("\n=== GROUP SUMMARIES ===")
for summary in results["group_summaries"]:
    print(f"[{summary['group_name']}] {summary['summary']}")
