In [1]:
import pandas as pd
from pathlib import Path
from typing import Dict, List, Union
from pydantic import BaseModel
from openai import OpenAI
import os
from dotenv import load_dotenv
import json

# Load environment variables
load_dotenv()

# Initialize OpenAI client with API key from environment
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


# Original Altice Data Sample

In [2]:
# Load the CSV file
df = pd.read_csv("FullCallDataSummary.csv").head(10)

# Display the data as a table
df

Unnamed: 0,NaturalId,Conversation
0,Call1,"[Agent] ""Thank you for choosing Optimum Busine..."
1,Call10,"[Agent] ""Thank you for choosing Optimum Busine..."
2,Call100,"[Agent] ""Good morning. Thank you for calling O..."
3,Call101,"[Agent] ""Good morning. Thank you for calling O..."
4,Call102,"[Agent] ""Hold on one second, hold on, do not d..."
5,Call103,"[Agent] ""Thank you for calling Optimum. No off..."
6,Call104,"[Agent] ""Thank you for calling Optimum Busines..."
7,Call105,"[Agent] ""Thank you for calling O Business. Thi..."
8,Call106,"[Agent] ""Thank you for calling Optimum Busines..."
9,Call107,"[Agent] ""For calling Optimum for Business now ..."


# Categorical Classification Function

In [4]:
# Define ClassificationResult model
class ClassificationResult(BaseModel):
    classification: str
    explanation: str

def classify_call_transcripts(
    classifications: List[str],
    dataframe: pd.DataFrame,
    context_prompt: str,
    transcript_column: str,
    classification_column: str = "Classification",
    explanation_column: str = "Explanation"
) -> pd.DataFrame:
    """
    Classify call transcripts using LLM analysis.
    
    Args:
        classifications (List[str]): List of possible classifications (including "None")
        dataframe (pd.DataFrame): DataFrame with call data
        context_prompt (str): Context/question for classification
        transcript_column (str): Name of the column containing transcript text
        classification_column (str): Name for the output classification column
        explanation_column (str): Name for the output explanation column
    
    Returns:
        pd.DataFrame: Original DataFrame with added classification columns
    """
    
    # Initialize results list
    results: List[ClassificationResult] = []
    
    # Create classifications string for the prompt
    classifications_str = ", ".join(classifications)
    
    # Define the system prompt
    system_prompt = f"""
    You are an expert at analyzing sales call transcripts for classification purposes.
    
    Your task: {context_prompt}
    
    Available classifications: {classifications_str}
    
    For each conversation transcript, you must:
    1. Select exactly ONE classification from the provided list
    2. Provide a comprehensive explanation that includes:
        - Key customer statements or behaviors that influenced your decision
        - Any commitments, objections, or next steps mentioned
        - Your reasoning for why this classification best fits the conversation
    
    Return your response as a JSON object with this exact format:
    {{
        "classification": "selected_classification_from_list",
        "explanation": "One sentence explaining why you chose this classification"
    }}
    
    Important: The classification must be exactly one of the provided options: {classifications_str}
    """
    
    print(f"Processing {len(dataframe)} conversations...")
    
    # Process each conversation
    for idx, row in dataframe.iterrows():
        call_id = str(row.iloc[0])  # First column is call ID
        transcript_text = row[transcript_column]  # Use the specified column name
        
        if pd.isna(transcript_text):
            # Handle missing transcript
            results.append(ClassificationResult(
                explanation="No transcript text available for analysis",
                classification="None"
            ))
            continue
            
        try:
            # Create the user prompt with the conversation
            user_prompt = f"""
            Call ID: {call_id}
            
            Transcript:-
            {transcript_text}
            
            Analyze this transcript and return the JSON with your classification and explanation.
            """
            
            # Make the API call using structured output
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0,
                max_tokens=300,
                response_format={"type": "json_object"}
            )
            
            # Parse the response
            response_text = response.choices[0].message.content.strip()
            
            # Parse JSON and create ClassificationResult
            import json
            parsed_response = json.loads(response_text)
            result = ClassificationResult(
                explanation=parsed_response["explanation"],
                classification=parsed_response["classification"]
            )
            results.append(result)
                
        except Exception as e:
            print(f"Error processing call {call_id}: {str(e)}")
            results.append(ClassificationResult(
                explanation=f"Error during processing: {str(e)}",
                classification="None"
            ))
        
        # Progress indicator
        if (idx + 1) % 10 == 0:
            print(f"Processed {idx + 1}/{len(dataframe)} conversations...")
    
    print("Classification complete!")

    # Convert results list to DataFrame
    results_df = pd.DataFrame([result.model_dump() for result in results])
    
    # Merge with original dataframe using variable column names
    merged_df = dataframe.copy()
    merged_df[classification_column] = results_df['classification']
    merged_df[explanation_column] = results_df['explanation']
    
    return merged_df

# Use Categorical Classification to Identify Call Outcome Stage

In [6]:
# Define the classifications list
classifications = [
    "Closed Sale",
    "Strong Lead",
    "Moderate Lead",
    "Weak Lead",
    "Not a Lead", 
    "Insufficient Data"
]

# Define the context prompt
context_prompt = """Analyze this sales call transcript and classify it into one of the following sales process stages based on the customer's level of engagement, interest, and the outcome of the conversation:

1. **Closed Sale** - Customer commits to purchasing/signing up during this call
2. **Strong Lead** - High intent, decision maker, budget available, urgent need/timeline
3. **Moderate Lead** - Moderate intent, needs identified, budget constraints or longer timeline
4. **Weak Lead** - Low intent, unclear needs, budget/authority concerns
5. **Not a Lead** - Customer explicitly declines, no budget/authority, or not viable
6. **Insufficient Data** - Call incomplete or customer interest unclear


Consider the following factors when classifying:
- Customer's explicit statements about interest level
- Whether any commitment or agreement was made
- Customer's tone and engagement level
- Specific next steps mentioned
- Whether the customer requested follow-up information or meetings
- Any objections raised and how they were addressed

Focus on the customer's behavior and statements rather than the salesperson's actions. A call should only be classified as "Definitive Sale on the Call" if there is clear evidence of a commitment or agreement made during this specific conversation."""

#Run classify_call_transcripts
results_with_sales_stage = classify_call_transcripts(classifications, df, context_prompt, "Conversation", "stage")
results_with_sales_stage

Processing 10 conversations...
Processed 10/10 conversations...
Classification complete!


Unnamed: 0,NaturalId,Conversation,stage,Explanation
0,Call1,"[Agent] ""Thank you for choosing Optimum Busine...",Moderate Lead,The customer expressed a clear interest in set...
1,Call10,"[Agent] ""Thank you for choosing Optimum Busine...",Strong Lead,The customer expressed a clear need for system...
2,Call100,"[Agent] ""Good morning. Thank you for calling O...",Moderate Lead,The customer expressed interest in porting the...
3,Call101,"[Agent] ""Good morning. Thank you for calling O...",Closed Sale,The customer committed to installing the inter...
4,Call102,"[Agent] ""Hold on one second, hold on, do not d...",Closed Sale,"The customer, Rosa, explicitly committed to se..."
5,Call103,"[Agent] ""Thank you for calling Optimum. No off...",Closed Sale,The customer confirmed their order for service...
6,Call104,"[Agent] ""Thank you for calling Optimum Busines...",Closed Sale,The customer committed to upgrading their serv...
7,Call105,"[Agent] ""Thank you for calling O Business. Thi...",Strong Lead,"The customer, Ali, demonstrates high intent to..."
8,Call106,"[Agent] ""Thank you for calling Optimum Busines...",Closed Sale,The customer explicitly committed to the servi...
9,Call107,"[Agent] ""For calling Optimum for Business now ...",Closed Sale,The customer committed to the service installa...


In [None]:
# Usage Example: Optimized Category Extractor with Parallel Processing
# This demonstrates the improved performance and functionality

# Example usage with parallel processing
extracted_categories_optimized = category_extractor_optimized(
    dataframe=df,
    transcript_column="Conversation", 
    context_prompt="Extract names of products offered by Optimum telecommunications to business customers mentioned in the conversations between Optimum Business sales agents and customers. Exclude competitor mentions and non-Optimum related products",
    max_workers=8  # Process 8 transcripts in parallel
)

# View results
print(f"Found {extracted_categories_optimized.total_categories} products:")
for category in extracted_categories_optimized.categories:
    print(f"- {category.categoryName}: {category.categoryDefinition}")

# Performance comparison note:
# The optimized version processes transcripts in parallel, significantly reducing processing time
# for large datasets while maintaining the same accuracy and functionality.


# Category Extractor without category definitions

In [17]:
# Category Name Extractor Function (No Definitions)
from typing import Dict, List, Union
from pydantic import BaseModel
from concurrent.futures import ThreadPoolExecutor
import json

class CategoryName(BaseModel):
    categoryName: str

class CategoryNameExtractionResult(BaseModel):
    categories: List[CategoryName]
    total_categories: int

def category_name_extractor(
    dataframe: pd.DataFrame,
    transcript_column: str,
    context_prompt: str,
    target_column: str = "Category_Extraction",
    max_workers: int = 8
) -> CategoryNameExtractionResult:
    """
    Extract category names from transcripts through open-ended generation with parallel processing.
    Processes transcripts in parallel and generates 1-10 category names per call (no definitions).
    
    Args:
        dataframe: DataFrame with transcript data
        transcript_column: Name of the column containing transcript text
        context_prompt: Context/question for category extraction
        target_column: Column name for storing extraction results (used internally)
        max_workers: Maximum number of parallel workers (default: 8)
    
    Returns:
        CategoryNameExtractionResult: Object containing all extracted category names
    """
    print(f"Starting category name extraction for {len(dataframe)} transcripts...")
    print(f"Context: {context_prompt}")
    
    # Store all unique category names
    all_categories = set()  # Set to automatically handle duplicates
    
    # Process transcripts in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        futures = [
            executor.submit(
                _extract_category_names_from_transcript,
                str(row[transcript_column]),
                context_prompt,
                idx + 1
            )
            for idx, row in dataframe.iterrows()
        ]
        
        # Collect results as they complete
        for i, future in enumerate(futures):
            try:
                transcript_categories = future.result()
                
                # Add to master collection
                for category_name in transcript_categories:
                    all_categories.add(category_name)
                
                # Progress indicator (every 10 completions)
                if (i + 1) % 10 == 0:
                    print(f"Processed {i + 1}/{len(dataframe)} transcripts...")
                    
            except Exception as e:
                print(f"Error processing transcript {i + 1}: {str(e)}")
    
    # Convert to result object
    categories_list = [
        CategoryName(categoryName=name)
        for name in sorted(all_categories)  # Sort for consistent output
    ]
    
    result = CategoryNameExtractionResult(
        categories=categories_list,
        total_categories=len(categories_list)
    )
    
    print(f"\nCategory name extraction complete!")
    print(f"Total unique category names found: {len(categories_list)}")
    
    # Print all category names
    print("\nExtracted Category Names:")
    for i, category in enumerate(categories_list, 1):
        print(f"{i}. {category.categoryName}")
    
    return result

def _extract_category_names_from_transcript(
    transcript_text: str, 
    context_prompt: str, 
    transcript_idx: int
) -> List[str]:
    """Extract category names from a single transcript - names only version"""
    
    # Truncate transcript if too long
    processed_transcript = transcript_text[:8000] + "... [truncated]" if len(transcript_text) > 8000 else transcript_text
    
    system_prompt = f"""You are an expert at extracting category names from text.

Your task: {context_prompt}

You must:
1. Extract 1-10 distinct category names from the transcript
2. Provide exact names/identifiers (be precise, avoid duplicates)
3. Return only category names explicitly mentioned or clearly referenced in the transcript
4. Do NOT provide definitions or descriptions

Return your response as a JSON object with this exact format:
{{
    "categoryNames": [
        "Exact name or identifier",
        "Another category name",
        "Third category name"
    ]
}}"""

    user_prompt = f"""Extract category names from the following transcript based on the criteria provided.

Context: {context_prompt}

Transcript (ID: {transcript_idx}):
{processed_transcript}

Please extract all relevant category names following the instructions above. Return only the names, no definitions."""

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0,
            max_tokens=1500,  # Reduced since no definitions needed
            response_format={"type": "json_object"}
        )
        
        parsed_response = json.loads(response.choices[0].message.content.strip())
        
        # Extract category names
        category_names = [
            str(name) for name in parsed_response["categoryNames"]
        ]
        
        print(f"  Extracted {len(category_names)} category names: {category_names}")
        return category_names
        
    except Exception as e:
        print(f"  Error during category name extraction: {str(e)}")
        return []


Extractor w/o definitions usage

In [18]:
# Usage Example: Category Name Extractor (No Definitions)
# This demonstrates extracting only category names without definitions

# Example usage - extract only product names (no descriptions)
extracted_names = category_name_extractor(
    dataframe=df,
    transcript_column="Conversation", 
    context_prompt="Extract names of products offered by Optimum telecommunications to business customers mentioned in the conversations between Optimum Business sales agents and customers. Exclude competitor mentions and non-Optimum related products",
    max_workers=8  # Process 8 transcripts in parallel
)

# View results - only names, no definitions
print(f"Found {extracted_names.total_categories} unique product names:")
for category in extracted_names.categories:
    print(f"- {category.categoryName}")

# Benefits of this version:
# - Faster processing (no definitions to generate)
# - Lower token usage (reduced max_tokens)
# - Simpler output (just names)
# - Better for when you only need the category names


Starting category name extraction for 10 transcripts...
Context: Extract names of products offered by Optimum telecommunications to business customers mentioned in the conversations between Optimum Business sales agents and customers. Exclude competitor mentions and non-Optimum related products
  Extracted 4 category names: ['TV', 'internet', 'telephone', 'fiber']
  Extracted 6 category names: ['internet service', 'mobile phones', 'tablets', 'point of sale', 'cameras', 'cash register']
  Extracted 6 category names: ['fiber', 'X gigabit', 'X mega', 'X giga', 'product protection', 'emergency backup']
  Extracted 7 category names: ['Optimum Business Class', 'Optimum Mobile', 'Wi Fi', 'gig service for business class', 'TV services', 'internet', 'mobile plan']
  Extracted 7 category names: ['Optimum Mobile', 'internet service', 'phone number', 'gig speed of service', 'modem', 'router', 'wireless failover service']
  Extracted 8 category names: ['Optimum Mobile', 'phone system', 'internet', 

# Category Extractor Function

This notebook contains the Category Extractor function for extracting categories from transcript conversations through open-ended generation.

## Function Overview

The `category_extractor` function processes each transcript individually and generates 1-10 categories per call based on the provided context prompt. Unlike the MECE theme analysis, this function:

- **No MECE constraints**: Categories don't need to be mutually exclusive
- **Individual processing**: Goes through calls one by one (no batching)
- **Open-ended extraction**: Focuses on comprehensive extraction based on prompts
- **Returns categories only**: Only extracts categories with definitions, no classification

**Example use cases:**
- Extracting all product names mentioned
- Identifying company names referenced
- Finding technical terms used
- Extracting software mentions

In [15]:
# Optimized Category Extractor Function with Parallel Processing
from typing import Dict, List, Union
from pydantic import BaseModel
from concurrent.futures import ThreadPoolExecutor
import json

class Category(BaseModel):
    categoryName: str
    categoryDefinition: str

class CategoryExtractionResult(BaseModel):
    categories: List[Category]
    total_categories: int

def category_extractor_optimized(
    dataframe: pd.DataFrame,
    transcript_column: str,
    context_prompt: str,
    target_column: str = "Category_Extraction",
    max_workers: int = 8
) -> CategoryExtractionResult:
    """
    Extract categories from transcripts through open-ended generation with parallel processing.
    Processes transcripts in parallel and generates 1-10 categories per call.
    
    Args:
        dataframe: DataFrame with transcript data
        transcript_column: Name of the column containing transcript text
        context_prompt: Context/question for category extraction
        target_column: Column name for storing extraction results (used internally)
        max_workers: Maximum number of parallel workers (default: 8)
    
    Returns:
        CategoryExtractionResult: Object containing all extracted categories
    """
    print(f"Starting category extraction for {len(dataframe)} transcripts...")
    print(f"Context: {context_prompt}")
    
    # Store all unique categories with their definitions
    all_categories = {}  # category_name -> category_definition
    
    # Process transcripts in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        futures = [
            executor.submit(
                _extract_categories_from_transcript_optimized,
                str(row[transcript_column]),
                context_prompt,
                idx + 1
            )
            for idx, row in dataframe.iterrows()
        ]
        
        # Collect results as they complete
        for i, future in enumerate(futures):
            try:
                transcript_categories = future.result()
                
                # Add to master collection
                for category in transcript_categories:
                    category_name = category.categoryName
                    if category_name not in all_categories:
                        all_categories[category_name] = category.categoryDefinition
                    else:
                        # Use longer definition if available
                        existing_def = all_categories[category_name]
                        new_def = category.categoryDefinition
                        if len(new_def) > len(existing_def):
                            all_categories[category_name] = new_def
                
                # Progress indicator (every 10 completions)
                if (i + 1) % 10 == 0:
                    print(f"Processed {i + 1}/{len(dataframe)} transcripts...")
                    
            except Exception as e:
                print(f"Error processing transcript {i + 1}: {str(e)}")
    
    # Convert to result object
    categories_list = [
        Category(categoryName=name, categoryDefinition=definition)
        for name, definition in all_categories.items()
    ]
    
    result = CategoryExtractionResult(
        categories=categories_list,
        total_categories=len(categories_list)
    )
    
    print(f"\nCategory extraction complete!")
    print(f"Total unique categories found: {len(categories_list)}")
    
    # Print all categories
    print("\nExtracted Categories:")
    for i, category in enumerate(categories_list, 1):
        print(f"{i}. {category.categoryName}: {category.categoryDefinition}")
    
    return result

def _extract_categories_from_transcript_optimized(
    transcript_text: str, 
    context_prompt: str, 
    transcript_idx: int
) -> List[Category]:
    """Extract categories from a single transcript - optimized version"""
    
    # Truncate transcript if too long
    processed_transcript = transcript_text[:8000] + "... [truncated]" if len(transcript_text) > 8000 else transcript_text
    
    system_prompt = f"""You are an expert at extracting categories from text.

Your task: {context_prompt}

You must:
1. Extract 1-10 distinct categories from the transcript
2. Provide exact names/identifiers (be precise, avoid duplicates)
3. Include clear definitions for each category
4. Return only categories explicitly mentioned or clearly referenced in the transcript

Return your response as a JSON object with this exact format:
{{
    "categories": [
        {{
            "categoryName": "Exact name or identifier",
            "categoryDefinition": "Clear description of what this category represents"
        }}
    ]
}}"""

    user_prompt = f"""Extract categories from the following transcript based on the criteria provided.

Context: {context_prompt}

Transcript (ID: {transcript_idx}):
{processed_transcript}

Please extract all relevant categories following the instructions above."""

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0,
            max_tokens=2000,
            response_format={"type": "json_object"}
        )
        
        parsed_response = json.loads(response.choices[0].message.content.strip())
        
        # Create Category objects
        categories = [
            Category(
                categoryName=str(category["categoryName"]),
                categoryDefinition=str(category["categoryDefinition"])
            ) for category in parsed_response["categories"]
        ]
        
        print(f"  Extracted {len(categories)} categories: {[c.categoryName for c in categories]}")
        return categories
        
    except Exception as e:
        print(f"  Error during category extraction: {str(e)}")
        return []


# Usage Example: Extract Product Names

In [16]:
# Load your data
df = results_with_sales_stage
# Extract product names
extracted_categories = category_extractor(
    dataframe=df,
    transcript_column="Conversation", 
    context_prompt="Extract names of products offered by Optimum telecommunications to business customers mentioned in the conversations between Optimum Business sales agents and customers. Exclude competitor mentions and non-Optimum related products"
)

# View results
print(f"Found {extracted_categories.total_categories} products:")
for category in extracted_categories.categories:
    print(f"- {category.categoryName}: {category.categoryDefinition}")

Starting category extraction for 10 transcripts...
Context: Extract names of products offered by Optimum telecommunications to business customers mentioned in the conversations between Optimum Business sales agents and customers. Exclude competitor mentions and non-Optimum related products
Processing transcript 1/10...
  Extracted 5 categories: ['Optimum Business Class', 'Optimum Mobile', 'Gig Service for Business Class', 'Wi-Fi Service', 'Static IP']
Processing transcript 2/10...
  Extracted 7 categories: ['Optimum Mobile', 'Internet Service', 'Phone System', 'Modem', 'Connection Backup', 'Business Features', 'Faster Speed Upgrade']
Processing transcript 3/10...
  Extracted 5 categories: ['Optimum Mobile', 'Fiber Internet', 'Business Phone Line', 'Secondary Line', 'Static IP']
Processing transcript 4/10...
  Extracted 5 categories: ['Internet Service', 'Telephone Service', 'Fiber Internet', 'Double Play', 'Free Installation']
Processing transcript 5/10...
  Extracted 8 categories: ['H

In [13]:
import pandas as pd

# Create a new DataFrame with two columns: 'categoryName' and 'categoryDefinition'
categories_df = pd.DataFrame(
    [(cat.categoryName, cat.categoryDefinition) for cat in extracted_categories.categories],
    columns=["categoryName", "categoryDefinition"]
)

# Display the new DataFrame
display(categories_df)


Unnamed: 0,categoryName,categoryDefinition
0,Optimum Business Class,A service package offered by Optimum that incl...
1,Optimum Mobile,A mobile service plan offered by Optimum that ...
2,Gig Service for Business Class,High-speed internet service offered by Optimum...
3,Wi-Fi Service,Wireless internet service provided by Optimum ...
4,Static IP,A service option that provides a fixed IP addr...
5,Internet Service,High-speed internet service provided by Optimu...
6,Phone System,A telecommunication system that includes featu...
7,Modem,A device provided by Optimum Business that con...
8,Connection Backup,A feature that automatically activates to main...
9,Business Features,Additional features offered by Optimum Busines...


In [14]:
categories_df.to_csv("extracted_categories.csv", index=False)


## Notes

1. **OpenAI API Key Required**: Make sure your `.env` file contains `OPENAI_API_KEY=your_key_here`
2. **Rate Limits**: The function processes transcripts individually, which may take time for large datasets
3. **Token Limits**: Long transcripts are automatically truncated to ~2000 tokens for processing
4. **Category Deduplication**: Duplicate categories are automatically merged, keeping the longer definition
5. **Error Handling**: Failed transcript processing is logged and continues with the next transcript