In [None]:
import os
import re
import pandas as pd
import csv
from tqdm import tqdm
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
from openai import OpenAI

In [None]:
# Constants for file paths and URLs
PATH_TO_FILE = "/Users/pravin/Desktop/Src/call_analysis_results.csv"
TRANSCRIPT_FOLDER_PATH = '/Users/pravin/Desktop/transcripts_v3/transcripts_v3'
OLLAMA_URL = "http://localhost:11434/v1/"  # Local URL for the LLM API
OLLAMA_API_KEY = "ollama"  # API key for the LLM service

## Step 1: Load Transcripts from the Folder

In [1]:
# Path to the folder containing the transcript files
folder_path = '/Users/pravin/Desktop/transcripts_v3/transcripts_v3'

# Function to load all transcripts
def load_transcripts(folder_path):
    """
    Load all .txt transcript files from the specified folder.

    Args:
        folder_path (str): The path to the folder containing transcript files.

    Returns:
        list: A list of dictionaries with filenames as keys and file content as values.
    """
    transcripts = []
    # List and sort the filenames in ascending order
    filenames = sorted([filename for filename in os.listdir(folder_path) if filename.endswith(".txt")])
    
    # Load each transcript file in the sorted order
    for filename in filenames:
        with open(os.path.join(folder_path, filename), 'r') as file:
            transcripts.append({filename:file.read()})
    return transcripts

# Load all transcript files
transcripts = load_transcripts(folder_path)
print(f"Loaded {len(transcripts)} transcripts.")
# Function to extract the numerical part of the filename
def extract_number(key):
    return int(re.search(r'\d+', key).group())

# Sort the transcripts based on the number in the filename
sorted_transcripts = sorted(transcripts, key=lambda x: extract_number(list(x.keys())[0]))
sorted_transcripts[:5]


Loaded 200 transcripts.


[{'transcript_0.txt': "Member: Hi, I'm calling to get a case pre-authorized. My name is Emily Wilson and my member ID is MEM456789.\n\nPA Agent: Hi Emily, thank you for calling PA customer care. Can you please confirm your date of birth for verification purposes?\n\nMember: It's March 12, 1985.\n\nPA Agent: Thank you, Emily. Can you please provide me with some details about the case you're seeking pre-authorization for? What's the nature of the treatment or service you're looking to receive?\n\nMember: I'm scheduled to undergo an MRI scan for a knee injury. My doctor's office has already submitted the request, but I wanted to confirm the status and ensure that it's covered under my plan.\n\nPA Agent: I apologize for the delay, Emily. Let me just check on the status of your request. Can you please hold for just a moment?\n\nMember: Sure, thank you.\n\nPA Agent: Thank you for holding, Emily. I've located your request and I'm reviewing the details now. Can you please confirm the name of y

## Step 2: Extract Customer Portions from Each Transcript

In [2]:
# Function to extract customer dialogue from a list of transcripts
def extract_customer_transcripts(transcripts):
    """
    Extract the numerical part of a filename.

    Args:
        filename (str): The filename from which to extract the number.

    Returns:
        int: The extracted number from the filename.
    """
    customer_transcripts = []
    for transcript in transcripts:
        customer_lines = re.findall(r"Member: (.+)", transcript)
        customer_dialogue = ' '.join(customer_lines) # extract_customer_transcript(transcript)  
        customer_transcripts.append(customer_dialogue)
    return customer_transcripts

# Extract customer portions from all transcripts
transcript_texts = [list(transcript.values())[0] for transcript in sorted_transcripts]
customer_transcripts = extract_customer_transcripts(transcript_texts)
customer_transcripts[:5]

["Hi, I'm calling to get a case pre-authorized. My name is Emily Wilson and my member ID is MEM456789. It's March 12, 1985. I'm scheduled to undergo an MRI scan for a knee injury. My doctor's office has already submitted the request, but I wanted to confirm the status and ensure that it's covered under my plan. Sure, thank you. My doctor's name is Dr. Smith and the facility is Oakwood Medical Center. Okay. Okay, what does that mean? Okay, that sounds good. Can you give me a reference number for this case so I can follow up if needed? Great, thank you for your help. No, that's all. Thank you.",
 "Hi, I'm calling about a denied claim I received for my recent medical service. I was told that my policy doesn't cover it, but I'm certain it should be covered under my new policy. My member ID is MEM123456. The claim number is CLM789012, and the date of service was February 10th. I switched policies on January 1st. I was told that the new policy would cover the service I received, but the deni

## Step 3: Sentiment Analysis for Each Transcript


In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Load the sentiment analysis model
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Move model to MPS if available
if torch.backends.mps.is_available():
    model.to("mps")
else:
    model.to("cpu")

# Function to get the sentiment using the model
def get_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Move inputs to MPS if available
    if torch.backends.mps.is_available():
        inputs = {k: v.to("mps") for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=-1).item()

    # Return label based on prediction
    if predicted_class == 0:
        return "NEGATIVE"
    elif predicted_class == 1:
        return "NEUTRAL"
    else:
        return "POSITIVE"

# Analyze sentiment for each customer transcript
sentiments = [get_sentiment(text) for text in customer_transcripts]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [5]:
def build_prompt(transcript):
    """
    Build the LLM prompt for analyzing the conversation.

    Args:
        transcript (str): The conversation transcript to be analyzed.

    Returns:
        str: The formatted prompt for the LLM model.
    """
    prompt_template = f"""
    
    The following is a conversation between a customer and an agent. Analyze only the customer side of the conversation and do the following:

    1. Determine if the customer's issue was resolved or if follow-up action is needed.
    2. Determine the overall sentiment of the customer during the conversation (positive, negative, or neutral).

    Customer Transcript: {transcript}

    Please provide your answer in the format:

    Call Outcome: [Issue Resolved/Follow-up Action Needed]
    Sentiment: [Positive/Negative/Neutral]
    """
    
    return prompt_template.strip()

In [9]:
def llm(transcript):
    """
    Use LLM to analyze the transcript for call outcome and sentiment.

    Args:
        transcript (str): The conversation transcript to be analyzed.

    Returns:
        str: The LLM response containing call outcome and sentiment analysis.
    """    
    prompt=build_prompt(transcript)
           
    ollama_client = OpenAI(base_url=OLLAMA_URL, api_key="ollama")

    response = ollama_client.chat.completions.create(
                model="llama3.2",
                messages=[{"role": "user", "content": prompt}]
            )
    answer = response.choices[0].message.content
    return answer

In [10]:
from tqdm import tqdm

def determine_call_outcomes(customer_transcripts):
    """
    Determine call outcomes and sentiments using LLM for each transcript.

    Args:
        transcripts (list): A list of transcripts to be analyzed.

    Returns:
        list: A list of LLM responses for each transcript.
    """
    outcomes = []
    for transcript in tqdm(customer_transcripts, desc="Processing Transcripts"):
        outcome = llm(transcript)  # Reuse the function from earlier
        outcomes.append(outcome)
    return outcomes


In [11]:
# Determine call outcome for each transcript
call_outcomes = determine_call_outcomes(customer_transcripts)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processing Transcripts: 100%|██████████| 200/200 [39:49<00:00, 11.95s/it] 


In [34]:
import csv

# Specify the output CSV file name
csv_file_name = "call_outcomes.csv"

# Writing to the CSV file
with open(csv_file_name, mode='w', newline='') as file:
    writer = csv.writer(file)
    
    # Write the header
    writer.writerow(["Call_Outcome"])
    
    # Write the sentiments
    for call_outcome in call_outcomes:
        writer.writerow([call_outcome])

print(f"Sentiments written to {csv_file_name} successfully.")

Sentiments written to call_outcomes.csv successfully.


In [40]:
import pandas as pd

# Create a DataFrame to store results
df_results = pd.DataFrame({
    'Transcript': sorted_transcripts,
    'Customer Dialogue': customer_transcripts,
    'Sentiment': sentiments,  # 'POSITIVE' or 'NEGATIVE'
    'Call Outcome': call_outcomes
})


In [41]:
df_results.head()

Unnamed: 0,Transcript,Customer Dialogue,Sentiment,Call Outcome
0,"{'transcript_0.txt': 'Member: Hi, I'm calling ...","Hi, I'm calling to get a case pre-authorized. ...",POSITIVE,Call Outcome: Issue Resolved\nSentiment: Neutr...
1,"{'transcript_1.txt': 'Member: Hi, I'm calling ...","Hi, I'm calling about a denied claim I receive...",NEGATIVE,Call Outcome: Follow-up Action Needed\nSentime...
2,"{'transcript_2.txt': 'Member: Hi, I'm calling ...","Hi, I'm calling about my recent doctor's visit...",POSITIVE,Call Outcome: Follow-up Action Needed\nSentime...
3,"{'transcript_3.txt': 'Member: Hi, I'm calling ...","Hi, I'm calling about my recent visit to the d...",NEGATIVE,Call Outcome: Follow-up Action Needed\nSentime...
4,"{'transcript_4.txt': 'Member: Hi, I'd like to ...","Hi, I'd like to schedule an appointment with a...",POSITIVE,Call Outcome: Issue Resolved\nSentiment: Posit...


In [42]:
# Define a function to check and categorize the outcome
def categorize_outcome(call_outcome):
    call_outcome_lower = call_outcome.lower()
    if "issue"  in call_outcome_lower:
        return "Issue Resolved"
    elif "follow-up" in call_outcome_lower:
        return "Follow-up Action Needed"
    else:
        return "Unknown"

# Apply the function to the 'Call Outcome' column and create a new 'Output' column
df_results['Output'] = df_results['Call Outcome'].apply(categorize_outcome)

# Display the DataFrame
df_results.head()

Unnamed: 0,Transcript,Customer Dialogue,Sentiment,Call Outcome,Output
0,"{'transcript_0.txt': 'Member: Hi, I'm calling ...","Hi, I'm calling to get a case pre-authorized. ...",POSITIVE,Call Outcome: Issue Resolved\nSentiment: Neutr...,Issue Resolved
1,"{'transcript_1.txt': 'Member: Hi, I'm calling ...","Hi, I'm calling about a denied claim I receive...",NEGATIVE,Call Outcome: Follow-up Action Needed\nSentime...,Follow-up Action Needed
2,"{'transcript_2.txt': 'Member: Hi, I'm calling ...","Hi, I'm calling about my recent doctor's visit...",POSITIVE,Call Outcome: Follow-up Action Needed\nSentime...,Follow-up Action Needed
3,"{'transcript_3.txt': 'Member: Hi, I'm calling ...","Hi, I'm calling about my recent visit to the d...",NEGATIVE,Call Outcome: Follow-up Action Needed\nSentime...,Follow-up Action Needed
4,"{'transcript_4.txt': 'Member: Hi, I'd like to ...","Hi, I'd like to schedule an appointment with a...",POSITIVE,Call Outcome: Issue Resolved\nSentiment: Posit...,Issue Resolved


In [43]:
# Save results to a CSV file
df_results.to_csv('call_analysis_results.csv', index=False)
print("Results saved to call_analysis_results.csv")

Results saved to call_analysis_results.csv
