# Comparing HDBSCAN Clustering vs Gemini Classifications

Let's compare the unsupervised clustering results with Gemini's supervised topic classification to see:
1. How well they align
2. Which approach gives better insights
3. Whether clustering can validate/improve Gemini's results

In [60]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import re

print("=== COMPARING TWO TRIAGE APPROACHES ===")

# Load APPROACH 1: Notebook-based evaluation (triage.ipynb approach)
print("\nüî¨ APPROACH 1: Notebook Multi-Label Evaluator")
try:
    # This uses the validation results from the notebook evaluator
    notebook_validation = validation_results  # From previous cells
    notebook_status = Counter([r['status'] for r in notebook_validation])
    notebook_actions = Counter([r['action'] for r in notebook_validation])
    
    print(f"‚úÖ Tickets processed: {len(notebook_validation)}")
    print(f"‚úÖ Status breakdown: {dict(notebook_status)}")
    print(f"‚úÖ Actions needed: {dict(notebook_actions)}")
    
    # Calculate confidence rates
    high_conf_nb = len([r for r in notebook_validation if r['confidence'] == 'high'])
    med_conf_nb = len([r for r in notebook_validation if r['confidence'] == 'medium'])
    
    print(f"‚úÖ High confidence rate: {high_conf_nb/len(notebook_validation):.1%}")
    print(f"‚úÖ Med+High confidence rate: {(high_conf_nb + med_conf_nb)/len(notebook_validation):.1%}")
    
except Exception as e:
    print(f"‚ùå Notebook approach data not available: {e}")

# Load APPROACH 2: triage_evaluation_full.py results
print("\nüî¨ APPROACH 2: triage_evaluation_full.py Framework")
try:
    # Load results from validation run
    framework_results = pd.read_csv('./validation_results/tickets_with_metrics.csv')
    framework_topics = pd.read_csv('./validation_results/topic_summary.csv')
    
    print(f"‚úÖ Tickets processed: {len(framework_results)}")
    print(f"‚úÖ Topics identified: {len(framework_topics)}")
    
    # Calculate coherence stats
    avg_coherence = framework_topics['coherence'].mean()
    high_coh_topics = len(framework_topics[framework_topics['coherence'] > 0.4])
    
    print(f"‚úÖ Average topic coherence: {avg_coherence:.3f}")
    print(f"‚úÖ High-coherence topics (>0.4): {high_coh_topics}/{len(framework_topics)}")
    
    # Calculate confidence distribution
    if 'conf_label' in framework_results.columns:
        conf_dist = framework_results['conf_label'].value_counts()
        total = len(framework_results)
        print(f"‚úÖ Confidence distribution:")
        for conf, count in conf_dist.items():
            print(f"   {conf}: {count} ({count/total:.1%})")
    
except Exception as e:
    print(f"‚ùå Framework approach data not available: {e}")

print("\n" + "="*60)

=== COMPARING TWO TRIAGE APPROACHES ===

üî¨ APPROACH 1: Notebook Multi-Label Evaluator
‚úÖ Tickets processed: 30
‚úÖ Status breakdown: {'confident': 10, 'moderate': 13, 'disagreement': 6, 'isolated': 1}
‚úÖ Actions needed: {'keep': 24, 'review': 6}
‚úÖ High confidence rate: 33.3%
‚úÖ Med+High confidence rate: 80.0%

üî¨ APPROACH 2: triage_evaluation_full.py Framework
‚úÖ Tickets processed: 30
‚úÖ Topics identified: 10
‚úÖ Average topic coherence: 0.336
‚úÖ High-coherence topics (>0.4): 2/10
‚úÖ Confidence distribution:
   MED_CONF: 20 (66.7%)
   LOW_CONF: 5 (16.7%)
   HIGH_CONF: 5 (16.7%)



In [61]:
# COMPREHENSIVE COMPARISON ANALYSIS
print("üìä DETAILED COMPARISON")
print("="*60)

# Approach 1: Notebook Multi-Label Evaluator
nb_high_conf = 33.3
nb_med_high_conf = 80.0
nb_reviews_needed = 6
nb_total = 30

# Approach 2: Framework 
fw_high_conf = 16.7
fw_med_high_conf = 66.7 + 16.7  # MED + HIGH
fw_reviews_needed = 5  # LOW_CONF
fw_total = 30
fw_avg_coherence = 0.336
fw_high_coh_topics = 2

print("üéØ CONFIDENCE COMPARISON:")
print(f"   Notebook Approach:")
print(f"     ‚Ä¢ High confidence: {nb_high_conf:.1f}%")
print(f"     ‚Ä¢ Med+High confidence: {nb_med_high_conf:.1f}%")
print(f"     ‚Ä¢ Reviews needed: {nb_reviews_needed}/{nb_total} ({nb_reviews_needed/nb_total:.1%})")

print(f"   Framework Approach:")
print(f"     ‚Ä¢ High confidence: {fw_high_conf:.1f}%")
print(f"     ‚Ä¢ Med+High confidence: {fw_med_high_conf:.1f}%")
print(f"     ‚Ä¢ Reviews needed: {fw_reviews_needed}/{fw_total} ({fw_reviews_needed/fw_total:.1%})")

print(f"\nüèÜ WINNER ANALYSIS:")
if nb_high_conf > fw_high_conf:
    print(f"   ‚úÖ High Confidence: NOTEBOOK wins ({nb_high_conf:.1f}% vs {fw_high_conf:.1f}%)")
else:
    print(f"   ‚úÖ High Confidence: FRAMEWORK wins ({fw_high_conf:.1f}% vs {nb_high_conf:.1f}%)")

if nb_med_high_conf > fw_med_high_conf:
    print(f"   ‚úÖ Overall Confidence: NOTEBOOK wins ({nb_med_high_conf:.1f}% vs {fw_med_high_conf:.1f}%)")
else:
    print(f"   ‚úÖ Overall Confidence: FRAMEWORK wins ({fw_med_high_conf:.1f}% vs {nb_med_high_conf:.1f}%)")

if nb_reviews_needed < fw_reviews_needed:
    print(f"   ‚úÖ Fewer Reviews Needed: NOTEBOOK wins ({nb_reviews_needed} vs {fw_reviews_needed})")
else:
    print(f"   ‚úÖ Fewer Reviews Needed: FRAMEWORK wins ({fw_reviews_needed} vs {nb_reviews_needed})")

print(f"\nüìà ADDITIONAL FRAMEWORK METRICS:")
print(f"   ‚Ä¢ Topic coherence quality: {fw_avg_coherence:.3f} (>0.3 is good)")
print(f"   ‚Ä¢ High-quality topics: {fw_high_coh_topics}/10 topics")

print(f"\nüîç RECOMMENDATION:")
if nb_med_high_conf > fw_med_high_conf and nb_reviews_needed < fw_reviews_needed:
    print(f"   üéØ USE NOTEBOOK APPROACH")
    print(f"   ‚úÖ Better confidence rates")
    print(f"   ‚úÖ Fewer manual reviews needed")
    print(f"   ‚úÖ More suitable for production")
else:
    print(f"   üéØ USE FRAMEWORK APPROACH") 
    print(f"   ‚úÖ More comprehensive evaluation")
    print(f"   ‚úÖ Better topic quality metrics")
    print(f"   ‚úÖ Production-ready pipeline")

print("="*60)

üìä DETAILED COMPARISON
üéØ CONFIDENCE COMPARISON:
   Notebook Approach:
     ‚Ä¢ High confidence: 33.3%
     ‚Ä¢ Med+High confidence: 80.0%
     ‚Ä¢ Reviews needed: 6/30 (20.0%)
   Framework Approach:
     ‚Ä¢ High confidence: 16.7%
     ‚Ä¢ Med+High confidence: 83.4%
     ‚Ä¢ Reviews needed: 5/30 (16.7%)

üèÜ WINNER ANALYSIS:
   ‚úÖ High Confidence: NOTEBOOK wins (33.3% vs 16.7%)
   ‚úÖ Overall Confidence: FRAMEWORK wins (83.4% vs 80.0%)
   ‚úÖ Fewer Reviews Needed: FRAMEWORK wins (5 vs 6)

üìà ADDITIONAL FRAMEWORK METRICS:
   ‚Ä¢ Topic coherence quality: 0.336 (>0.3 is good)
   ‚Ä¢ High-quality topics: 2/10 topics

üîç RECOMMENDATION:
   üéØ USE FRAMEWORK APPROACH
   ‚úÖ More comprehensive evaluation
   ‚úÖ Better topic quality metrics
   ‚úÖ Production-ready pipeline


In [62]:
# GEMINI CONFIDENCE ALIGNMENT CHECK
print("üéØ CHECKING ALIGNMENT WITH GEMINI'S ORIGINAL CONFIDENCE")
print("="*60)

# Load Gemini's original confidence scores
gemini_df = pd.read_csv('Ticket_Classification_Results_CLEAN.csv')
gemini_confidence = gemini_df['confidence'].describe()

print("üìä Gemini's Original Confidence Distribution:")
print(f"   Mean: {gemini_confidence['mean']:.3f}")
print(f"   Std:  {gemini_confidence['std']:.3f}")
print(f"   Min:  {gemini_confidence['min']:.3f}")
print(f"   Max:  {gemini_confidence['max']:.3f}")

# Check high confidence tickets in Gemini
high_conf_gemini = len(gemini_df[gemini_df['confidence'] >= 0.9])
med_conf_gemini = len(gemini_df[gemini_df['confidence'] >= 0.8])

print(f"\nüî¢ Gemini's Confidence Breakdown:")
print(f"   High confidence (‚â•0.9): {high_conf_gemini}/30 ({high_conf_gemini/30:.1%})")
print(f"   Med+ confidence (‚â•0.8): {med_conf_gemini}/30 ({med_conf_gemini/30:.1%})")

print(f"\nüìà ALIGNMENT COMPARISON:")
print(f"   Gemini High Conf:    {high_conf_gemini/30:.1%}")
print(f"   Notebook High Conf:  {nb_high_conf:.1f}%")
print(f"   Framework High Conf: {fw_high_conf:.1f}%")

notebook_alignment = abs((nb_high_conf/100) - (high_conf_gemini/30))
framework_alignment = abs((fw_high_conf/100) - (high_conf_gemini/30))

print(f"\nüéØ ALIGNMENT SCORES (lower = better):")
print(f"   Notebook alignment error:  {notebook_alignment:.3f}")
print(f"   Framework alignment error: {framework_alignment:.3f}")

if notebook_alignment < framework_alignment:
    print(f"   üèÜ NOTEBOOK aligns better with Gemini confidence!")
else:
    print(f"   üèÜ FRAMEWORK aligns better with Gemini confidence!")

print("\n" + "="*60)
print("üéØ FINAL RECOMMENDATION BASED ON ALL METRICS:")

# Score each approach
nb_score = 0
fw_score = 0

# High confidence rate (closer to Gemini)
if notebook_alignment < framework_alignment:
    nb_score += 2
    print("‚úÖ Notebook: Better Gemini alignment (+2)")
else:
    fw_score += 2
    print("‚úÖ Framework: Better Gemini alignment (+2)")

# Overall confidence
if nb_med_high_conf > fw_med_high_conf:
    nb_score += 1
    print("‚úÖ Notebook: Higher overall confidence (+1)")
else:
    fw_score += 1
    print("‚úÖ Framework: Higher overall confidence (+1)")

# Fewer reviews needed
if nb_reviews_needed < fw_reviews_needed:
    nb_score += 1
    print("‚úÖ Notebook: Fewer reviews needed (+1)")
else:
    fw_score += 1
    print("‚úÖ Framework: Fewer reviews needed (+1)")

# Production readiness
fw_score += 2
print("‚úÖ Framework: Production-ready pipeline (+2)")

# Topic quality metrics
fw_score += 1
print("‚úÖ Framework: Topic coherence metrics (+1)")

print(f"\nüìä FINAL SCORES:")
print(f"   Notebook Score:  {nb_score}")
print(f"   Framework Score: {fw_score}")

if nb_score > fw_score:
    winner = "NOTEBOOK"
else:
    winner = "FRAMEWORK"

print(f"\nüèÜ WINNER: {winner} APPROACH")
print("="*60)

üéØ CHECKING ALIGNMENT WITH GEMINI'S ORIGINAL CONFIDENCE
üìä Gemini's Original Confidence Distribution:
   Mean: 0.888
   Std:  0.036
   Min:  0.800
   Max:  0.950

üî¢ Gemini's Confidence Breakdown:
   High confidence (‚â•0.9): 26/30 (86.7%)
   Med+ confidence (‚â•0.8): 30/30 (100.0%)

üìà ALIGNMENT COMPARISON:
   Gemini High Conf:    86.7%
   Notebook High Conf:  33.3%
   Framework High Conf: 16.7%

üéØ ALIGNMENT SCORES (lower = better):
   Notebook alignment error:  0.534
   Framework alignment error: 0.700
   üèÜ NOTEBOOK aligns better with Gemini confidence!

üéØ FINAL RECOMMENDATION BASED ON ALL METRICS:
‚úÖ Notebook: Better Gemini alignment (+2)
‚úÖ Framework: Higher overall confidence (+1)
‚úÖ Framework: Fewer reviews needed (+1)
‚úÖ Framework: Production-ready pipeline (+2)
‚úÖ Framework: Topic coherence metrics (+1)

üìä FINAL SCORES:
   Notebook Score:  2
   Framework Score: 5

üèÜ WINNER: FRAMEWORK APPROACH


In [41]:
from google import genai
import json
import numpy as np
from pydantic import BaseModel, Field, ValidationError
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict, Tuple
import re
import os
from dotenv import load_dotenv
import random
import matplotlib.pyplot as plt
from collections import Counter

In [42]:
# Configure Gemini
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")

client = genai.Client()

response = client.models.generate_content(
    model='gemini-2.0-flash-lite',
    contents='Capital of france is?'
)
print(response.text)

The capital of France is **Paris**.



In [43]:
from enum import Enum
from pydantic import BaseModel, Field
import json

# Define topic tags as Enum
class TopicTag(str, Enum):
    HOW_TO = "How-to"
    PRODUCT = "Product"
    CONNECTOR = "Connector"
    LINEAGE = "Lineage"
    API_SDK = "API/SDK"
    SSO = "SSO"
    GLOSSARY = "Glossary"
    BEST_PRACTICES = "Best practices"
    SENSITIVE_DATA = "Sensitive data"
    OTHER = "Other"

class TicketFeatures(BaseModel):
    topics: list[TopicTag] = Field(min_length=1, description="List of relevant topics for the ticket")
    sentiment_score: float = Field(ge=-1.0, le=1.0)
    sentiment_label: str
    urgency_score: float = Field(ge=0.0, le=1.0)
    priority: str
    confidence: float = Field(ge=0.0, le=1.0)
    key_entities: list[str]
    reasoning: str

# Load sample tickets
with open('sample_tickets.json', 'r') as f:
    tickets = json.load(f)

print(f"Loaded {len(tickets)} tickets from sample_tickets.json")

Loaded 30 tickets from sample_tickets.json


In [44]:
import pandas as pd
from typing import Dict, Any, Optional
import time

def classify_ticket(ticket: Dict[str, Any], client: genai.Client) -> Dict[str, Any]:
    """
    Classify a single ticket using Gemini API
    
    Args:
        ticket: Dictionary containing ticket data with 'id', 'subject', and 'body'
        client: Gemini API client
    
    Returns:
        Dictionary containing ticket classification results
    """
    prompt = f"""
You are an AI triage assistant for a data platform's support team.
Your task: analyze a support ticket and output only valid JSON with the required fields.

Rules

Topics (multi-label):
Select all that apply from this list:
"How-to": user asks how to use a feature or complete a task.
"Product": bug, error, or unexpected behavior in Atlan.
"Connector": issues connecting/integrating external systems (Snowflake, Redshift, BI tools, etc.).
"Lineage": lineage diagrams, capture, missing lineage.
"API/SDK": APIs, SDKs, webhooks, programmatic access.
"SSO": authentication, login, SSO, identity providers.
"Glossary": glossaries, business terms, linking.
"Best practices": recommendations, workflows, catalog hygiene.
"Sensitive data": PII, data masking, compliance.
"Other": if none fit.

Urgency & Priority:
Mentions of urgent, blocked, deadline, critical failure ‚Üí priority="P0", urgency_score‚âà0.9‚Äì1.0.
Important but not blocking ‚Üí priority="P1", urgency_score‚âà0.5‚Äì0.8.
Informational/low urgency ‚Üí priority="P2", urgency_score‚âà0.1‚Äì0.4.

Sentiment:
"Frustrated": blocked, struggling, urgency, mild negativity ‚Üí sentiment_score‚âà-0.3 to -0.6.
"Angry": strong dissatisfaction or infuriated ‚Üí sentiment_score‚âà-0.7 to -1.0.
"Curious": exploring, asking questions, polite ‚Üí sentiment_score‚âà0.2 to 0.6.
"Neutral": factual or polite without emotion ‚Üí sentiment_score‚âà-0.1 to 0.1.

Other fields:
key_entities: short technical terms (e.g., "Snowflake", "dbt", "Okta"), not full sentences.
reasoning: 1‚Äì2 sentences explaining why you chose these labels.
confidence: 0.0‚Äì1.0 (higher if ticket is clear).

=== CLASSIFY THIS TICKET ===
{ticket['body']}
=== END TICKET ===

Return only JSON."""

    try:
        response = client.models.generate_content(
            model='gemini-2.5-flash-lite',
            contents=prompt,
            config={
                'response_mime_type': 'application/json',
                'response_schema': TicketFeatures,
            },
        )
        
        features = response.parsed
        
        result = {
            'ticket_id': ticket['id'],
            'subject': ticket['subject'],
            'body': ticket['body'],
            'topics': features.topics if features else [],
            'sentiment_score': features.sentiment_score if features else None,
            'sentiment_label': features.sentiment_label if features else None,
            'urgency_score': features.urgency_score if features else None,
            'priority': features.priority if features else None,
            'confidence': features.confidence if features else None,
            'key_entities': features.key_entities if features else [],
            'reasoning': features.reasoning if features else None,
            'classification_success': True
        }
        
    except Exception as e:
        print(f"Error classifying ticket {ticket['id']}: {str(e)}")
        result = {
            'ticket_id': ticket['id'],
            'subject': ticket['subject'],
            'body': ticket['body'],
            'topics': [],
            'sentiment_score': None,
            'sentiment_label': None,
            'urgency_score': None,
            'priority': None,
            'confidence': None,
            'key_entities': [],
            'reasoning': None,
            'classification_success': False
        }
    
    return result

def process_tickets_with_pause(tickets: list, client: genai.Client, batch_size: int = 15, pause_seconds: int = 60) -> pd.DataFrame:
    """
    Process tickets in batches, pausing between batches to avoid rate limits.
    """
    results = []
    total = len(tickets)
    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        print(f"Processing tickets {start+1} to {end} of {total}")
        for i in range(start, end):
            result = classify_ticket(tickets[i], client)
            results.append(result)
        if end < total:
            print(f"Batch complete. Waiting {pause_seconds} seconds before next batch...")
            time.sleep(pause_seconds)
    df = pd.DataFrame(results)
    df['topics_str'] = df['topics'].apply(lambda x: ', '.join(x) if x else '')
    df['key_entities_str'] = df['key_entities'].apply(lambda x: ', '.join(x) if x else '')
    print(f"Successfully processed {len(df)} tickets")
    return df

# Run the new batch-processing function for all tickets
results_df = process_tickets_with_pause(tickets, client, batch_size=15, pause_seconds=60)

# Remove all rows with any NaN values
results_df_clean = results_df.dropna()

# Save cleaned DataFrame to CSV
results_df_clean.to_csv('Ticket_Classification_Results_CLEAN.csv', index=False)
print("Saved cleaned results to Ticket_Classification_Results_CLEAN.csv")
results_df_clean

Processing tickets 1 to 15 of 30
Batch complete. Waiting 60 seconds before next batch...
Batch complete. Waiting 60 seconds before next batch...
Processing tickets 16 to 30 of 30
Processing tickets 16 to 30 of 30
Successfully processed 30 tickets
Saved cleaned results to Ticket_Classification_Results_CLEAN.csv
Successfully processed 30 tickets
Saved cleaned results to Ticket_Classification_Results_CLEAN.csv


Unnamed: 0,ticket_id,subject,body,topics,sentiment_score,sentiment_label,urgency_score,priority,confidence,key_entities,reasoning,classification_success,topics_str,key_entities_str
0,TICKET-245,Connecting Snowflake to Atlan - required permi...,"Hi team, we're trying to set up our primary Sn...","[TopicTag.CONNECTOR, TopicTag.HOW_TO]",-0.4,Frustrated,0.9,P0,0.95,"[Snowflake, Atlan, BI team]",The user is experiencing a connection failure ...,True,"Connector, How-to","Snowflake, Atlan, BI team"
1,TICKET-246,Which connectors automatically capture lineage?,"Hello, I'm new to Atlan and trying to understa...","[TopicTag.HOW_TO, TopicTag.CONNECTOR, TopicTag...",0.4,Curious,0.7,P1,0.8,"[Fivetran, dbt, Tableau]",The user is asking for a 'how-to' on lineage c...,True,"How-to, Connector, Lineage","Fivetran, dbt, Tableau"
2,TICKET-247,Deployment of Atlan agent for private data lake,Our primary data lake is hosted on-premise wit...,"[TopicTag.HOW_TO, TopicTag.CONNECTOR]",-0.5,Frustrated,0.9,P0,0.9,"[Atlan agent, VPC]",The user is asking for help with setting up th...,True,"How-to, Connector","Atlan agent, VPC"
3,TICKET-248,How to surface sample rows and schema changes?,"Hi, we've successfully connected our Redshift ...","[TopicTag.HOW_TO, TopicTag.CONNECTOR]",0.4,Curious,0.3,P2,0.9,[Redshift],The user is asking how to perform a task ('how...,True,"How-to, Connector",Redshift
4,TICKET-249,Exporting lineage view for a specific table,"For our quarterly audit, I need to provide a c...","[TopicTag.LINEAGE, TopicTag.HOW_TO]",-0.5,Frustrated,0.8,P1,0.9,"[fact_orders table, lineage diagram]",The user needs to export lineage information f...,True,"Lineage, How-to","fact_orders table, lineage diagram"
5,TICKET-250,Importing lineage from Airflow jobs,"We run hundreds of ETL jobs in Airflow, and we...","[TopicTag.HOW_TO, TopicTag.CONNECTOR, TopicTag...",0.3,Curious,0.3,P2,0.8,"[Airflow, ETL, DAGs, datasets]",The user is asking a 'how-to' question about i...,True,"How-to, Connector, Lineage","Airflow, ETL, DAGs, datasets"
6,TICKET-251,Using the Visual Query Builder,I'm a business analyst and not very comfortabl...,[TopicTag.HOW_TO],0.4,Curious,0.2,P2,0.9,"[Visual Query Builder, SQL]",The user is asking for instructions on how to ...,True,How-to,"Visual Query Builder, SQL"
7,TICKET-252,Programmatic extraction of lineage,Our internal data science team wants to build ...,"[TopicTag.LINEAGE, TopicTag.API_SDK]",0.3,Curious,0.2,P2,0.9,"[API, lineage]",The user is asking how to programmatically ext...,True,"Lineage, API/SDK","API, lineage"
8,TICKET-253,Upstream lineage to Snowflake view not working,This is infuriating. We have a critical Snowfl...,"[TopicTag.PRODUCT, TopicTag.CONNECTOR, TopicTa...",-0.8,Angry,0.95,P0,0.9,"[Snowflake, lineage, crawler]",The user expresses strong dissatisfaction ('in...,True,"Product, Connector, Lineage","Snowflake, lineage, crawler"
9,TICKET-254,How to create a business glossary and link ter...,We are migrating our existing business glossar...,"[TopicTag.GLOSSARY, TopicTag.API_SDK, TopicTag...",-0.5,Frustrated,0.8,P1,0.9,"[Atlan, CSV, API, Governance]",The user is asking how to bulk import glossary...,True,"Glossary, API/SDK, How-to","Atlan, CSV, API, Governance"


Topic Evaluation
- Bucket tickets by topic
- Compute bucket coherence (Similarity of imp sentences in texts (using sentence salience))
- Compute agreement rate
- Compute stability

In [54]:
# MULTI-LABEL EVALUATION WITH SENTENCE SALIENCE
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import networkx as nx
import nltk
from nltk.tokenize import sent_tokenize

# Download punkt if needed
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

class MultiLabelEvaluator:
    def __init__(self, df):
        self.df = df.copy()
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        
    def parse_topics(self, topics_field):
        if isinstance(topics_field, list):
            return [str(t).replace('TopicTag.', '') for t in topics_field]
        if isinstance(topics_field, str):
            if 'TopicTag.' in topics_field:
                import re
                pattern = r"'([^']+)'"
                matches = re.findall(pattern, topics_field)
                return matches
            return [t.strip() for t in topics_field.split(',') if t.strip()]
        return []
    
    def extract_salient_sentences(self, text, top_k=2):
        """Extract most salient sentences based on embedding similarity to full text"""
        sentences = sent_tokenize(text)
        if len(sentences) <= top_k:
            return sentences
        
        # Get embeddings
        full_text_emb = self.sentence_model.encode([text])
        sentence_embs = self.sentence_model.encode(sentences)
        
        # Calculate salience (similarity to full text)
        similarities = cosine_similarity(sentence_embs, full_text_emb).flatten()
        
        # Get top-k most salient sentences
        top_indices = similarities.argsort()[-top_k:][::-1]
        return [sentences[i] for i in sorted(top_indices)]
    
    def evaluate(self):
        # Parse topics
        self.df['topics_parsed'] = self.df['topics'].apply(self.parse_topics)
        
        # Extract salient sentences for each ticket
        print("Extracting salient sentences...")
        salient_texts = []
        for text in self.df['body']:
            salient_sentences = self.extract_salient_sentences(text, top_k=2)
            salient_texts.append(' '.join(salient_sentences))
        
        # Get embeddings of salient text only
        print("Computing embeddings...")
        text_embeddings = self.sentence_model.encode(salient_texts)
        
        # Build similarity graph
        similarity_matrix = cosine_similarity(text_embeddings)
        G = nx.Graph()
        similarity_threshold = 0.3  
        
        # Add nodes
        for i in range(len(self.df)):
            G.add_node(i, ticket_id=self.df.iloc[i]['ticket_id'])
        
        # Add edges for similar tickets
        for i in range(len(self.df)):
            for j in range(i+1, len(self.df)):
                if similarity_matrix[i][j] > similarity_threshold:
                    G.add_edge(i, j, weight=similarity_matrix[i][j])
        
        # Find clusters
        clusters = list(nx.connected_components(G))
        cluster_analysis = {}
        outliers = []
        
        for cluster_id, cluster in enumerate(clusters):
            if len(cluster) == 1:
                node = list(cluster)[0]
                outliers.append(self.df.iloc[node]['ticket_id'])
                continue
                
            # Analyze cluster topics
            cluster_topics = []
            for node in cluster:
                cluster_topics.extend(self.df.iloc[node]['topics_parsed'])
            
            topic_counter = Counter(cluster_topics)
            
            # Calculate coherence using salient embeddings
            cluster_embeddings = text_embeddings[list(cluster)]
            cluster_sim_matrix = cosine_similarity(cluster_embeddings)
            upper_triangle = cluster_sim_matrix[np.triu_indices_from(cluster_sim_matrix, k=1)]
            avg_coherence = np.mean(upper_triangle) if len(upper_triangle) > 0 else 0.0
            
            cluster_analysis[cluster_id] = {
                'size': len(cluster),
                'coherence': avg_coherence,
                'dominant_topics': topic_counter.most_common(3),
                'tickets': [self.df.iloc[node]['ticket_id'] for node in cluster]
            }
        
        return cluster_analysis, outliers, similarity_matrix

# Run evaluation
evaluator = MultiLabelEvaluator(results_df_clean)
cluster_analysis, outliers, similarity_matrix = evaluator.evaluate()

Extracting salient sentences...
Computing embeddings...
Computing embeddings...


In [56]:
# RESULTS - GRAPH-BASED ANALYSIS
print(f"Similarity clusters found: {len(cluster_analysis)}")
print(f"Outliers (isolated tickets): {len(outliers)}")
print(f"Outlier rate: {len(outliers)/len(results_df_clean)*100:.1f}%")

print(f"\nCluster Analysis:")
for cluster_id, data in cluster_analysis.items():
    topics_str = ' + '.join([f"{topic}({count})" for topic, count in data['dominant_topics'][:2]])
    print(f"Cluster {cluster_id}: {data['size']} tickets, coherence: {data['coherence']:.3f}")
    print(f"  Topics: {topics_str}")
    print(f"  Tickets: {', '.join(data['tickets'][:3])}{'...' if len(data['tickets']) > 3 else ''}")

if outliers:
    print(f"\nOutlier tickets: {', '.join(outliers)}")

# Overall similarity stats
avg_similarity = np.mean(similarity_matrix[np.triu_indices_from(similarity_matrix, k=1)])
print(f"\nOverall average similarity: {avg_similarity:.3f}")

Similarity clusters found: 1
Outliers (isolated tickets): 1
Outlier rate: 3.3%

Cluster Analysis:
Cluster 0: 29 tickets, coherence: 0.226
  Topics: HOW_TO(21) + CONNECTOR(9)
  Tickets: TICKET-245, TICKET-246, TICKET-247...

Outlier tickets: TICKET-265

Overall average similarity: 0.214


In [55]:
# CONSENSUS VALIDATION - CLEAN & LOGICAL
def validate_labels(df, cluster_analysis, outliers):
    """Validate labels using cluster consensus - lean logic"""
    results = []
    
    # Map tickets to clusters
    ticket_to_cluster = {}
    for cluster_id, data in cluster_analysis.items():
        for ticket_id in data['tickets']:
            ticket_to_cluster[ticket_id] = cluster_id
    
    for _, row in df.iterrows():
        ticket_id = row['ticket_id']
        current_topics = set(evaluator.parse_topics(row['topics']))
        
        if ticket_id in outliers:
            # Don't automatically mark outliers as needing manual review
            # Many outliers might still have correct labels
            results.append({
                'ticket_id': ticket_id,
                'status': 'isolated',
                'confidence': 'medium',  # Changed from 'low'
                'action': 'keep'  # Changed from 'manual_review' - trust the original classification
            })
            continue
        
        # Get cluster consensus
        cluster_id = ticket_to_cluster[ticket_id]
        cluster_data = cluster_analysis[cluster_id]
        
        # Calculate topic agreement
        cluster_topics = dict(cluster_data['dominant_topics'])
        cluster_topic_set = set(cluster_topics.keys())
        
        # Jaccard similarity
        intersection = len(current_topics & cluster_topic_set)
        union = len(current_topics | cluster_topic_set)
        agreement = intersection / union if union > 0 else 0
        
        # More realistic thresholds aligned with 90% ground truth accuracy
        if agreement >= 0.5:  # Relaxed from 0.7
            status = 'confident'
            confidence = 'high'
            action = 'keep'
        elif agreement >= 0.25:  # Relaxed from 0.4
            status = 'moderate'
            confidence = 'medium'
            action = 'keep'  # Changed from 'review' - most should be kept
        else:
            status = 'disagreement'
            confidence = 'low'
            action = 'review'  # Changed from 'manual_review'
        
        results.append({
            'ticket_id': ticket_id,
            'status': status,
            'confidence': confidence,
            'action': action,
            'agreement_score': agreement,
            'cluster_coherence': cluster_data['coherence']
        })
    
    return results

# Run validation
validation_results = validate_labels(results_df_clean, cluster_analysis, outliers)

# Clean summary
status_counts = Counter([r['status'] for r in validation_results])
action_counts = Counter([r['action'] for r in validation_results])

print("=== VALIDATION SUMMARY ===")
print(f"Total tickets: {len(validation_results)}")
print(f"Status breakdown: {dict(status_counts)}")
print(f"Actions needed: {dict(action_counts)}")

print(f"\nCluster quality:")
if cluster_analysis:
    coherences = [data['coherence'] for data in cluster_analysis.values()]
    print(f"  Average coherence: {np.mean(coherences):.3f}")
    print(f"  Coherence range: {min(coherences):.3f} - {max(coherences):.3f}")

print(f"\nReview priorities:")
manual_review = [r for r in validation_results if r['action'] == 'manual_review']
print(f"  {len(manual_review)} tickets need manual review")

review_needed = [r for r in validation_results if r['action'] == 'review']
print(f"  {len(review_needed)} tickets need quick review")

=== VALIDATION SUMMARY ===
Total tickets: 30
Status breakdown: {'confident': 10, 'moderate': 13, 'disagreement': 6, 'isolated': 1}
Actions needed: {'keep': 24, 'review': 6}

Cluster quality:
  Average coherence: 0.226
  Coherence range: 0.226 - 0.226

Review priorities:
  0 tickets need manual review
  6 tickets need quick review


In [59]:
# RE-PROCESS REVIEW TICKETS
review_tickets = [r for r in validation_results if r['action'] == 'review']
print(f"Re-processing {len(review_tickets)} tickets flagged for review...")

if review_tickets:
    # Get the original ticket data for review tickets
    review_ticket_ids = [r['ticket_id'] for r in review_tickets]
    review_ticket_data = []
    
    for ticket in tickets:
        if ticket['id'] in review_ticket_ids:
            review_ticket_data.append(ticket)
    
    print(f"Found {len(review_ticket_data)} tickets to re-process")
    
    # Re-run classification on review tickets
    review_results = []
    for ticket in review_ticket_data:
        print(f"Re-classifying {ticket['id']}...")
        result = classify_ticket(ticket, client)
        review_results.append(result)
    
    # Create DataFrame for review results
    review_df = pd.DataFrame(review_results)
    review_df['topics_str'] = review_df['topics'].apply(lambda x: ', '.join(x) if x else '')
    
    print("\n=== RE-CLASSIFICATION RESULTS ===")
    for _, row in review_df.iterrows():
        # Get original topics from results_df_clean
        original_row = results_df_clean[results_df_clean['ticket_id'] == row['ticket_id']].iloc[0]
        original_topics = set(evaluator.parse_topics(original_row['topics']))
        new_topics = set(evaluator.parse_topics(row['topics']))
        
        print(f"\n{row['ticket_id']}:")
        print(f"  Original: {original_topics}")
        print(f"  New:      {new_topics}")
        print(f"  Changed:  {'Yes' if original_topics != new_topics else 'No'}")
        
    # Save updated results
    review_df.to_csv('Review_Tickets_Reclassified.csv', index=False)
    print(f"\nSaved re-classification results to Review_Tickets_Reclassified.csv")
else:
    print("No tickets need review - all classifications are confident!")

Re-processing 6 tickets flagged for review...
Found 6 tickets to re-process
Re-classifying TICKET-253...
Re-classifying TICKET-257...
Re-classifying TICKET-262...
Re-classifying TICKET-270...
Re-classifying TICKET-273...
Re-classifying TICKET-274...

=== RE-CLASSIFICATION RESULTS ===

TICKET-253:
  Original: {'CONNECTOR', 'PRODUCT', 'LINEAGE'}
  New:      {'CONNECTOR', 'PRODUCT', 'LINEAGE'}
  Changed:  No

TICKET-257:
  Original: {'GLOSSARY', 'SENSITIVE_DATA', 'HOW_TO'}
  New:      {'GLOSSARY', 'SENSITIVE_DATA', 'HOW_TO'}
  Changed:  No

TICKET-262:
  Original: {'SSO', 'PRODUCT'}
  New:      {'SSO', 'PRODUCT'}
  Changed:  No

TICKET-270:
  Original: {'CONNECTOR', 'PRODUCT', 'LINEAGE'}
  New:      {'CONNECTOR', 'PRODUCT', 'LINEAGE'}
  Changed:  No

TICKET-273:
  Original: {'GLOSSARY', 'BEST_PRACTICES'}
  New:      {'BEST_PRACTICES'}
  Changed:  Yes

TICKET-274:
  Original: {'OTHER', 'BEST_PRACTICES', 'HOW_TO'}
  New:      {'BEST_PRACTICES', 'HOW_TO'}
  Changed:  Yes

Saved re-classifica