In [5]:
import pandas as pd

# Read the call transcripts into a DataFrame; later cells read its
# "call_id" and "call_text" columns.
df = pd.read_csv(filepath_or_buffer="5TestCalls.csv")

# Preview up to the first ten rows as this cell's output.
df.head(n=10)

Unnamed: 0,call_id,call_text
0,Call1,"[Agent] ""Thank you for choosing Optimum Busine..."
1,Call10,"[Agent] ""Thank you for choosing Optimum Busine..."
2,Call100,"[Agent] ""Good morning. Thank you for calling O..."
3,Call101,"[Agent] ""Good morning. Thank you for calling O..."
4,Call102,"[Agent] ""Hold on one second, hold on, do not d..."


In [8]:
import pandas as pd  
from typing import List
from pydantic import BaseModel
from openai import OpenAI
from dotenv import load_dotenv
import os
from concurrent.futures import ThreadPoolExecutor

# Populate the process environment from a local .env file, then read the key
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

# Module-level OpenAI client; binary_classifier_parallel passes it to each worker
client = OpenAI(api_key=api_key)

# Schema for the structured output of a single binary classification.
# It is passed as `text_format` to `client.responses.parse`, so the model's
# reply is parsed into an instance of this class.
# NOTE: documented with comments rather than a class docstring on purpose --
# pydantic copies a model's docstring into the generated JSON schema
# (as its description), which would change the schema sent with each request.
class BinaryOutcome(BaseModel):
    # Identifier of the classified call; process_row_binary overwrites whatever
    # the model produced here with the row's real call_id.
    call_id: str
    # Free-text justification for the label, as requested in the user prompt.
    binary_explanation: str
    # The label itself; the prompt asks for either the positive or the
    # negative label string supplied by the caller.
    binary_label: str

# -------------------------------
# Helper function for one row
# -------------------------------
def _normalize_binary_label(raw_label, positive_label, negative_label):
    """Map a model-produced label onto the exact configured label when possible.

    The comparison ignores surrounding whitespace, surrounding quote
    characters and letter case, so ``" Yes "`` becomes ``"yes"`` when
    ``positive_label`` is ``"yes"``.

    Returns:
        ``positive_label`` or ``negative_label`` on a match, otherwise ``None``.
    """
    cleaned = str(raw_label).strip().strip("\"'").strip().casefold()
    for label in (positive_label, negative_label):
        if cleaned == str(label).strip().casefold():
            return label
    return None


def process_row_binary(row,
                       context_prompt,
                       input_data,
                       positive_label,
                       negative_label,
                       client):
    """Classify one transcript against a single binary question.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) that provides
            ``"call_id"`` and the ``input_data`` key.
        context_prompt: The binary question put to the model.
        input_data: Key in ``row`` that holds the transcript text.
        positive_label: Label the model should return when the evidence
            satisfies the question.
        negative_label: Label the model should return otherwise; also used as
            the fallback label when the request or parsing fails.
        client: Object exposing ``responses.parse(...)`` (the OpenAI client).

    Returns:
        dict with keys ``call_id``, ``binary_explanation`` and
        ``binary_label`` (the ``model_dump()`` of a ``BinaryOutcome``).

    Raises:
        KeyError: if ``row`` lacks ``"call_id"`` or ``input_data``. Errors from
            the API call or from parsing are caught, printed, and replaced by
            a fallback result instead of being raised.
    """
    call_id = row["call_id"]
    transcript = row[input_data]

    # The system prompt names the caller's labels instead of a fixed yes/no, so
    # it never contradicts the label choices listed in the user prompt.
    messages = [
        {
            "role": "system",
            "content": f"""
            You are a strict binary classifier. You must answer the user's binary question using evidence from the transcript. 
            A '{positive_label}' requires evidence that satisfies the question. 
            A '{negative_label}' means the evidence is missing, unclear, or contradictory. 
            Don't make large assumptions. 
            Always return valid JSON matching the schema.""",
        },
        {
            "role": "user",
            "content": f"""

            Question: {context_prompt}
            Transcript: {transcript}
            
            Respond ONLY with:
            - binary_explanation: reason why it was labeled yes or no, referencing evidence from the transcript
            - binary_label: "{positive_label}" or "{negative_label}"
            """,
        },
    ]

    try:
        response = client.responses.parse(
            model="gpt-4o-mini",
            input=messages,
            text_format=BinaryOutcome,
            temperature=1.0,
            max_output_tokens=300,
        )

        parsed: BinaryOutcome = response.output_parsed
        if parsed is None:
            # Fail with a readable message rather than an AttributeError below.
            raise ValueError("response contained no parsed output")

        # The model is never told the call id, so always use the row's own.
        parsed.call_id = call_id

        # Snap case/whitespace variants onto the configured label so the
        # result column only holds the two expected values when it can.
        normalized = _normalize_binary_label(
            parsed.binary_label, positive_label, negative_label
        )
        if normalized is None:
            # Leave the raw value in place but make the anomaly visible.
            print(f"Unexpected label for {call_id}: {parsed.binary_label!r}")
        else:
            parsed.binary_label = normalized

    except Exception as e:
        # Best-effort: one failed call must not abort the whole batch.
        # NOTE(review): a failure is reported with the negative label, so it is
        # only distinguishable from a real negative by the explanation text.
        print(f"Failed to parse {call_id}: {e}")
        parsed = BinaryOutcome(
            call_id=call_id,
            binary_explanation="No explanation found",
            binary_label=negative_label,
        )

    return parsed.model_dump()

# -------------------------------
# Parallel classifier
# -------------------------------
def binary_classifier_parallel(df,
                               context_prompt,
                               input_data="call_text",
                               positive_label="true",
                               negative_label="false",
                               explanation_col="binary_explanation",
                               label_col="binary_label",
                               max_workers=8) -> pd.DataFrame:
    """Classify every row of ``df`` against one binary question using threads.

    Each row is handed to ``process_row_binary`` on a thread pool, together
    with the module-level ``client``. The resulting explanation and label are
    attached to a copy of ``df`` as two new columns.

    Args:
        df: Input frame; each row must provide ``"call_id"`` and ``input_data``.
        context_prompt: The binary question to ask about each transcript.
        input_data: Name of the column holding the transcript text.
        positive_label: Label for a positive answer.
        negative_label: Label for a negative answer.
        explanation_col: Name of the output column for the explanation.
        label_col: Name of the output column for the label.
        max_workers: Size of the thread pool.

    Returns:
        A new DataFrame with the rows of ``df`` in their original order, a
        fresh 0..n-1 index, and ``explanation_col`` / ``label_col`` added.
        If a column of that name already exists it is overwritten. ``df``
        itself is not modified.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(
                process_row_binary,
                row,
                context_prompt,
                input_data,
                positive_label,
                negative_label,
                client
            )
            for _, row in df.iterrows()
        ]
        # Collected in submission order, so results[i] belongs to row i.
        results = [f.result() for f in futures]

    # Attach by position instead of merging on call_id: a key-based merge
    # multiplies rows when call_id values repeat and raises on an empty frame
    # (the empty results frame has no call_id column to join on).
    classified = df.reset_index(drop=True)
    classified[explanation_col] = [r["binary_explanation"] for r in results]
    classified[label_col] = [r["binary_label"] for r in results]
    return classified


In [9]:
def run_multiple_binary_classifiers(df, questions, max_workers=5):
    """
    Apply several yes/no questions to the same dataframe, one after another.

    `questions` is an iterable of dicts with the keys "question",
    "label_col" and "explanation_col". Every question is run through
    `binary_classifier_parallel` with the labels "yes" / "no", and its two
    result columns are added to the frame produced by the previous question.
    Returns the accumulated frame (a plain copy of `df` when `questions`
    is empty).
    """
    accumulated = df.copy()

    for spec in questions:
        prompt = spec["question"]
        print(f"Running classifier: {prompt}")

        # Feed the previous pass's output back in so the columns pile up.
        accumulated = binary_classifier_parallel(
            accumulated,
            context_prompt=prompt,
            positive_label="yes",
            negative_label="no",
            explanation_col=spec["explanation_col"],
            label_col=spec["label_col"],
            max_workers=max_workers,
        )

    return accumulated


In [10]:
# One entry per binary question. The output column names are derived from a
# short prefix: "<prefix>_label" and "<prefix>_explanation".
binary_questions = [
    {
        "question": prompt,
        "label_col": f"{prefix}_label",
        "explanation_col": f"{prefix}_explanation",
    }
    for prefix, prompt in (
        ("followup", "Did the prospect agree to a follow-up meeting?"),
        ("pricing", "Was pricing discussed in this call?"),
        ("productA", "Was Product A explicitly mentioned in this call?"),
        ("productB", "Was Product B explicitly mentioned in this call?"),
    )
]


In [13]:
# Run every configured question over the loaded calls; each question adds a
# label column and an explanation column to the result.
final_results = run_multiple_binary_classifiers(
    df,
    questions=binary_questions,
    max_workers=5,
)

# Show the first rows of the combined result.
display(final_results.head())


Running classifier: Did the prospect agree to a follow-up meeting?
Running classifier: Was pricing discussed in this call?
Running classifier: Was Product A explicitly mentioned in this call?
Running classifier: Was Product B explicitly mentioned in this call?


Unnamed: 0,call_id,call_text,followup_explanation,followup_label,pricing_explanation,pricing_label,productA_explanation,productA_label,productB_explanation,productB_label
0,Call1,"[Agent] ""Thank you for choosing Optimum Busine...","The prospect, Kimchi, indicated that they woul...",no,Pricing was clearly discussed during the call....,yes,Product A was never explicitly mentioned in th...,no,Product B was not explicitly mentioned in the ...,no
1,Call10,"[Agent] ""Thank you for choosing Optimum Busine...",The customer indicated a desire for a represen...,yes,Pricing was discussed multiple times during th...,yes,Product A was not explicitly mentioned in the ...,no,Product B is never explicitly mentioned in the...,no
2,Call100,"[Agent] ""Good morning. Thank you for calling O...",The customer agreed to a follow-up appointment...,yes,Pricing was indeed discussed in the call. The ...,yes,Product A was not explicitly mentioned in the ...,no,Product B was not explicitly mentioned in the ...,no
3,Call101,"[Agent] ""Good morning. Thank you for calling O...",The customer explicitly agreed to have interne...,yes,Pricing was explicitly mentioned when the agen...,yes,Product A was not mentioned as the conversatio...,no,Product B was not explicitly mentioned in the ...,no
4,Call102,"[Agent] ""Hold on one second, hold on, do not d...",The customer expressed that they would like to...,yes,Pricing was explicitly discussed in the call. ...,yes,Product A is never explicitly mentioned in the...,no,Product B is not explicitly mentioned in the c...,no


In [12]:
# Persist the classified calls without the DataFrame index.
# NOTE(review): this cell's execution count (12) is lower than the classifier
# cell's (13), so the file on disk may predate the results displayed above --
# re-run this cell after classification.
final_results.to_csv(
    path_or_buf="binary_classified_temp1_run1.csv",
    index=False,
)