In [6]:
# Standard-library modules used throughout the notebook.
import os
import sys
# Add the grandparent and parent directories to the import path
# (presumably where the `utils` and `vectordb` packages imported in the next cell live — TODO confirm).
sys.path.append("../../")
sys.path.append("../")
from dotenv import load_dotenv
from pprint import pprint
# Load environment variables from the env file located two directories up.
load_dotenv("../../export.env")

True

In [7]:
from utils.sambanova_endpoint import SambaNovaEndpoint
from langchain.prompts import PromptTemplate, load_prompt
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS 
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.output_parsers import PydanticOutputParser, CommaSeparatedListOutputParser, StructuredOutputParser, ResponseSchema
from langchain_community.document_loaders import TextLoader
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from vectordb.vector_db import VectorDb

# load conversations

In [63]:
# Load every transcription file in the folder into a list of documents.
path="../data/conversations/transcription"
# os.listdir returns entries in arbitrary order; sorting makes documents[0]
# refer to the same conversation on every run.
conversations = sorted(os.listdir(path))
documents = []
for conversation in conversations:
    conversation_path=os.path.join(path, conversation)
    # Skip sub-directories (e.g. checkpoint folders): TextLoader is given file paths only.
    if not os.path.isfile(conversation_path):
        continue
    loader = TextLoader(conversation_path)
    documents.extend(loader.load())
documents

[Document(page_content="speaker1: fire paramedic thirty three what is the address of your emergency\nspeaker2: ys sir I need to: I: uh hh I need an ambulance as soon as possible sir\nspeaker1: Ok sir what's your address\nspeaker2: Los Angeles speaker2ifornia\nspeaker2: nine zero zero seven seven\nspeaker1: 's it Carolwood\nspeaker2: Carolwood Drive yes hh ∙hh\nspeaker3: yes\nspeaker3: xxx xxx xxx xxx xxx xxx xxx xxx\nspeaker1: ok sir what's the phone number you are speaker2ling from\nspeaker1: xxx ⌈xxx ⌉\nspeaker2: ⌊sir⌋\nspeaker1: and what's the problem exactly what happened\nspeaker2: uh sir I have a- we have a: ∙hh a- a: gentleman here that needs h:elp\nspeaker2: ↓and he's: not breathing here\nspeacke2: ∙hhh he's not breathing and we need\nspeacke2: we're pu- trying to pump him\nspeacke2: but he's not ∙hhh he⌈'s not⌉\nspeaker1: ⌊ o k⌋ey\nspeacke2: yes sir\nspeaker1: o key: how old is he\nspeacke2: he's uh fifty years old sir\nspeaker1: fifty o key ∙hh\nspeaker1: he's unconscious he'

# Model

In [8]:
# LLM used by every chain in this notebook, wrapped by SambaNovaEndpoint.
model = SambaNovaEndpoint(
    model_kwargs={
        "do_sample": True,
        "temperature": 0.01,
        "max_tokens_to_generate": 1500,
    },
)

# Summarization

In [195]:
# Three prompt templates for the summarization task, each loaded from YAML.
basic_summarization_prompt = load_prompt(
    "../prompts/basic_sumarization.yaml"
)  # basic example prompt
summarization_prompt = load_prompt(
    "../prompts/summarization.yaml"
)  # summarization prompt
brief_summarization_prompt = load_prompt(
    "../prompts/short_summary.yaml"
)  # brief summarization prompt


In [102]:
# Basic output parser: used as the last step of the chains built below.
output_parser = StrOutputParser()


In [197]:
# One chain per prompt variant: prompt -> model -> output parser.
basic_summarization_chain = (
    basic_summarization_prompt | model | output_parser
)
summarization_chain = (
    summarization_prompt | model | output_parser
)
brief_summarization_chain = (
    brief_summarization_prompt | model | output_parser
)

In [131]:
# Run the three summarization chains on the first loaded conversation.
input_variables = {
    "conversation": documents[0].page_content,
}
basic_summarization_response = basic_summarization_chain.invoke(
    input_variables
)
summarization_response = summarization_chain.invoke(
    input_variables
)
brief_summarization_response = brief_summarization_chain.invoke(
    input_variables
)


In [132]:
# Show each summary under its own banner (banner and body on separate lines).
print("\n\n---  basic sumarization ---", basic_summarization_response, sep="\n")
print("\n\n---  summarization ---", summarization_response, sep="\n")
print("\n\n---  brief summarization ---", brief_summarization_response, sep="\n")



---  basic sumarization ---

A call between a fire paramedic and a person in need of emergency services. The person is requesting an ambulance and reports that a man is not breathing and unconscious. The paramedic asks for the address, which is provided as Carolwood Drive in Los Angeles, and the phone number the person is calling from. The paramedic also asks about the man's age and condition, and the person reports that the man is 50 years old and not responding to CPR. The paramedic informs the person that they are on their way and asks if anyone witnessed what happened. The person reports that only a doctor was present and that they are ready for the paramedics to arrive. The paramedic confirms that they are less than a mile away and will be there shortly.


---  summarization ---

A call was made to the fire department's emergency number, reporting an unconscious and unresponsive person who was not breathing. The caller provided the address of the emergency, which was in Los Ange

## example custom output parser 

In [10]:


class MainTopic(BaseModel):
    """Structured classification result holding the main topic of a conversation."""

    main_topic_classes: str = Field(description="main topic class of the conversartion")

    # custom validation logic
    @validator("main_topic_classes")
    def topic_in_topic_list(cls, field):
        """Accept only values that belong to the known topic classes.

        The comparison ignores surrounding whitespace and letter case; the
        value itself is returned unchanged.

        Raises:
            ValueError: if the value is not one of the allowed topics.
        """
        allowed_topics = ("health emergency", "fire emergency", "terrorism emergency")
        # The original check was inverted (it rejected exactly the valid topics)
        # and its entries were misspelled, so valid labels could never match.
        if field.strip().lower() not in allowed_topics:
            raise ValueError("not a topic in the classes list")
        return field

# Parser that validates the model output against the MainTopic schema.
main_topic_classifcation_output_parser = PydanticOutputParser(pydantic_object=MainTopic)

# Classification prompt. The parser's format instructions are bound up front;
# `conversation` and `topic_classes` are supplied when the prompt is invoked.
# The system block is closed with a real newline escape (it previously ended in a literal "/n").
prompt = PromptTemplate(
    template= """<s>[INST] <<SYS>>given the folowing conversation:
\n
{conversation}.
\n
clasify it in one of the following main topic classes:
\n
{topic_classes}
\n
{format_instructions}
\n
<</SYS>>\n
[/INST]
""",
    input_variables=["conversation", "topic_classes"],
    partial_variables={"format_instructions":main_topic_classifcation_output_parser.get_format_instructions()}
)


# Main topic classification

In [109]:
# Two prompt templates for topic classification, loaded from YAML.
basic_topic_classifcation_prompt = load_prompt(
    "../prompts/basic_topic_classification.yaml"
)  # basic example prompt
topic_classification_prompt = load_prompt(
    "../prompts/topic_classifications.yaml"
)  # topic classification prompt

In [110]:
# Parser for the comma-separated topic list returned by the model.
list_output_parser = CommaSeparatedListOutputParser()
# Format instructions passed to the prompt as `format_instructions`.
# (A stray `list_output_parser.schema` expression with no effect was removed.)
list_format_instructions = list_output_parser.get_format_instructions()

In [111]:
# Topic classification chains: free-text answer vs. parsed list of topics.
basic_topic_classifcation_chain = (
    basic_topic_classifcation_prompt | model | output_parser
)
# Alternative using the pydantic parser:
#topic_classifcation_chain = topic_classification_prompt | model | main_topic_classifcation_output_parser
topic_classifcation_chain = (
    topic_classification_prompt | model | list_output_parser
)


In [112]:
# invoke chains
# Candidate labels offered to the model ("medical emergency" was previously misspelled).
topic_classes = ["medical emergency", "animals emergency", "terrorism emergency", "fire emergency", "undefined"]
#topic_classes = []
# The labels are rendered as a bulleted list inside the prompt.
input_variables={"conversation":documents[0].page_content, "topic_classes" : "\n\t- ".join(topic_classes), "format_instructions": list_format_instructions}

basic_topic_classifcation_response = basic_topic_classifcation_chain.invoke(input_variables)
topic_classifcation_response = topic_classifcation_chain.invoke(input_variables)



In [113]:
# Show both classification results, each under its own banner.
print("\n\n---  basic topic classification ---", basic_topic_classifcation_response, sep="\n")
print("\n\n--- topic classification ---", topic_classifcation_response, sep="\n")



---  basic topic classification ---

The main topic of this conversation is:

* Emergency medical services

The conversation is between a fire paramedic and a person who is requesting an ambulance for a medical emergency. The paramedic asks for information about the location, the patient's condition, and the situation, and the person on the phone provides the necessary details. The paramedic assures the person that they are on their way and offers additional instructions and reassurance.


--- topic classification ---
['Medical emergency', 'undefined']


# NER

In [109]:
# Basic NER prompt template, loaded from YAML.
basic_ner_prompt=load_prompt("../prompts/basic_ner.yaml")

In [110]:
# Basic output parser (re-created here; same construction as in the summarization section).
output_parser = StrOutputParser()

In [111]:
# Basic NER chain: prompt -> model -> output parser.
basic_ner_chain = basic_ner_prompt | model | output_parser

In [112]:
# Extract the listed entity types from the first conversation with the basic chain.
entities = ["city", "address", "customer_name", "payment_type"]
input_variables = {
    "conversation": documents[0].page_content,
    "entities": "\n\t- ".join(entities),  # rendered as a bulleted list in the prompt
}

basic_ner_response = basic_ner_chain.invoke(input_variables)


In [113]:
# Show the free-text NER answer under its banner.
print("\n\n--- Basic NER ---", basic_ner_response, sep="\n")



--- Basic NER ---

City: Los Angeles
Address: Carolwood Drive, 90077
Customer Name: (not provided)
Payment Type: (not provided)


In [47]:
# Structured output parser: one list-typed response field per entity name.
entities = ["city", "address", "customer_name", "payment_type"]

response_schemas = [
    ResponseSchema(name=entity_name, description=f"{entity_name}s find in conversation", type="list")
    for entity_name in entities
]

entities_output_parser = StructuredOutputParser.from_response_schemas(
    response_schemas
)

# Uncomment to inspect the instructions injected into the prompt:
#print(entities_output_parser.get_format_instructions())


In [115]:
# Commented-out inline definition of the NER prompt; the active prompt is
# loaded from the YAML file at the end of this cell.
# ner_prompt = PromptTemplate(
#     template= """<s>[INST] <<SYS>> Given the following conversation
#           {conversation}
#           extract th following named entities
#           {entities}
#           if there is not a given value for one entity in the list keep it blank
          
#           {format_instructions}
#           <</SYS>>/n
#           NER: [/INST]
# """,
#     input_variables=["conversation", "entities"],
#     partial_variables={"format_instructions":entities_output_parser.get_format_instructions()}
# )

# NER prompt template actually used by the chain below.
ner_prompt = load_prompt("../prompts/ner.yaml")

In [116]:
# NER chain: prompt -> model -> structured parser (the recorded run below shows a dict result).
ner_chain = ner_prompt | model | entities_output_parser

In [117]:
# Run the structured NER chain on the first conversation.
input_variables = dict(
    conversation=documents[0].page_content,
    entities="\n\t- ".join(entities),
    format_instructions=entities_output_parser.get_format_instructions(),
)

ner_response = ner_chain.invoke(input_variables)

In [118]:
# Show the type and content of the parsed NER result under its banner.
print("\n\n--- NER ---", type(ner_response), ner_response, sep="\n")



--- NER ---
<class 'dict'>
{'city': ['Los Angeles'], 'address': ['Carolwood Drive'], 'customer_name': [], 'payment_type': []}


# Sentiment Analysis 

In [173]:
# Three prompt templates for sentiment analysis, loaded from YAML.
basic_sentiment_analysis_prompt = load_prompt(
    "../prompts/basic_sentiment_analysis.yaml"
)
sentiment_analysis_prompt = load_prompt(
    "../prompts/sentiment_analysis.yaml"
)
sentiment_analysis_prompt2 = load_prompt(
    "../prompts/sentiment_analysis2.yaml"
)

In [174]:
# Basic output parser (re-created here; same construction as in the earlier sections).
output_parser = StrOutputParser()

In [175]:
# One chain per sentiment prompt: prompt -> model -> output parser.
basic_sentiment_analysis_chain = (
    basic_sentiment_analysis_prompt | model | output_parser
)
sentiment_analysis_chain = (
    sentiment_analysis_prompt | model | output_parser
)
sentiment_analysis_chain2 = (
    sentiment_analysis_prompt2 | model | output_parser
)

In [176]:
# Run the three sentiment chains on the first loaded conversation.
input_variables = {
    "conversation": documents[0].page_content,
}
basic_sentiment_analysis_response = basic_sentiment_analysis_chain.invoke(
    input_variables
)
sentiment_analysis_response = sentiment_analysis_chain.invoke(
    input_variables
)
sentiment_analysis_response2 = sentiment_analysis_chain2.invoke(
    input_variables
)


In [177]:
# Show the three sentiment results, each under its own banner.
print("\n\n---  basic sentiment analysis ---", basic_sentiment_analysis_response, sep="\n")
print("\n\n---  sentiment analysis ---", sentiment_analysis_response, sep="\n")
print("\n\n---  sentiment analysis 2 ---", sentiment_analysis_response2, sep="\n")



---  basic sentiment analysis ---

The overall customer's mood throughout the call can be described as "urgent". The customer is in a state of emergency and is seeking immediate assistance for a person who is not breathing and unconscious. The customer is anxious and concerned for the person's well-being and is trying to provide as much information as possible to the dispatcher to ensure that help arrives as quickly as possible. The customer's urgency and concern are evident in their tone and language throughout the call.


---  sentiment analysis ---

Negative


---  sentiment analysis 2 ---
panicked


# Factual Accuracy Analysis

In [41]:
# Build the facts vector database and expose it as a retriever.
vdb = VectorDb()
imput_path = "../data/facts"
retriever = vdb.create_vdb(
    imput_path,
    1500,  # presumably the chunk size — TODO confirm against VectorDb.create_vdb
    200,  # presumably the chunk overlap — TODO confirm
    "faiss",
    None,  # the recorded log below reports "Vector store saved to None"
).as_retriever()

2024-02-16 16:12:36,988 [INFO] - Total 1 files loaded
2024-02-16 16:12:36,988 [INFO] - Total 1 chunks created
2024-02-16 16:12:36,989 [INFO] - Load pretrained SentenceTransformer: hkunlp/instructor-large


load INSTRUCTOR_Transformer


2024-02-16 16:12:40,293 [INFO] - Use pytorch device: cpu
2024-02-16 16:12:40,294 [INFO] - Processing embeddings using hkunlp/instructor-large. This could take time depending on the number of chunks ...


max_seq_length  512


2024-02-16 16:12:40,698 [INFO] - Vector store saved to None


In [56]:
# Output parser for the factual check: a boolean verdict plus a list of errors.
factual_accuracy_analysis_response_schemas = [
    ResponseSchema(
        name="correct",
        description="wether or not the provided information is correct",
        type="bool",
    ),
    ResponseSchema(
        name="errors",
        description="list of summarized errors made by the agent, if there is no errors, emplty list",
        type="list",
    ),
]

factual_accuracy_analysis_output_parser = StructuredOutputParser.from_response_schemas(
    factual_accuracy_analysis_response_schemas
)

format_instructions = factual_accuracy_analysis_output_parser.get_format_instructions()
# Uncomment to inspect the generated instructions:
#print(format_instructions)

In [57]:
# Retrieval chain: documents fetched by the retriever are stuffed into the
# factual-accuracy prompt together with the conversation.
retrieval_qa_chat_prompt = load_prompt(
    "../prompts/factual_accuracy_analysis.yaml"
)

combine_docs_chain = create_stuff_documents_chain(model, retrieval_qa_chat_prompt)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)
model_response = retrieval_chain.invoke(
    {
        "input": documents[0].page_content,
        "format_instructions": format_instructions,
    }
)["answer"]

# Parse the raw answer with the structured parser defined above.
factual_accuracy_analysis_response = factual_accuracy_analysis_output_parser.invoke(
    model_response
)

In [60]:
# Show the type and content of the parsed factual-accuracy result.
print(
    "---  factual accuracy analysis ---",
    type(factual_accuracy_analysis_response),
    factual_accuracy_analysis_response,
    sep="\n",
)


---  factual accuracy analysis ---
<class 'dict'>
{'correct': False, 'errors': ['Agent provided medical instructions, which is not allowed', 'Agent did not instruct the user to go to the nearest hospital']}


# Methods

## Summarization method

In [9]:
def get_summary(conversation, model=model):
    """Summarize a conversation with the summarization prompt.

    Args:
        conversation: conversation text inserted into the prompt.
        model: LLM used in the chain; defaults to the notebook-level ``model``
            as bound when this cell was executed.

    Returns:
        The output of the prompt -> model -> StrOutputParser chain.
    """
    chain = load_prompt("../prompts/summarization.yaml") | model | StrOutputParser()
    print("summarizing")
    summary = chain.invoke({"conversation": conversation})
    print("summarizing done")
    return summary

## Main topic classification method

In [10]:

def classify_main_topic(conversation, classes, model=model):
    """Classify a conversation into the given topic classes.

    Args:
        conversation: conversation text inserted into the prompt.
        classes: iterable of class labels; rendered as a bulleted list.
        model: LLM used in the chain; defaults to the notebook-level ``model``
            as bound when this cell was executed.

    Returns:
        The output of the prompt -> model -> CommaSeparatedListOutputParser chain.
    """
    prompt_template = load_prompt("../prompts/topic_classifications.yaml")
    parser = CommaSeparatedListOutputParser()
    instructions = parser.get_format_instructions()
    chain = prompt_template | model | parser
    payload = {
        "conversation": conversation,
        "topic_classes": "\n\t- ".join(classes),
        "format_instructions": instructions,
    }
    print("cassification")
    predicted_topics = chain.invoke(payload)
    print("classification done")
    return predicted_topics
    
    

## named entity recognition method

In [11]:

def get_entities(conversation, entities, model=model):
    """Extract the requested named entities from a conversation.

    Args:
        conversation: conversation text inserted into the prompt.
        entities: entity names; each becomes a list-typed field of the
            structured output and a bullet in the prompt.
        model: LLM used in the chain; defaults to the notebook-level ``model``
            as bound when this cell was executed.

    Returns:
        The output of the prompt -> model -> StructuredOutputParser chain.
    """
    prompt_template = load_prompt("../prompts/ner.yaml")
    parser = StructuredOutputParser.from_response_schemas(
        [
            ResponseSchema(name=name, description=f"{name}s find in conversation", type="list")
            for name in entities
        ]
    )
    chain = prompt_template | model | parser
    payload = {
        "conversation": conversation,
        "entities": "\n\t- ".join(entities),
        "format_instructions": parser.get_format_instructions(),
    }
    print("extracting entities")
    found_entities = chain.invoke(payload)
    print("extracting entities done")
    return found_entities
    

## sentiment analysis method

In [12]:

def get_sentiment(conversation, model=model):
    """Run the sentiment-analysis prompt (``sentiment_analysis2.yaml``) on a conversation.

    Args:
        conversation: conversation text inserted into the prompt.
        model: LLM used in the chain; defaults to the notebook-level ``model``
            as bound when this cell was executed.

    Returns:
        The output of the prompt -> model -> StrOutputParser chain.
    """
    chain = load_prompt("../prompts/sentiment_analysis2.yaml") | model | StrOutputParser()
    print("sentiment analysis")
    sentiment = chain.invoke({"conversation": conversation})
    print("sentiment analysis done")
    return sentiment

## factual check method

In [13]:

def set_retriever(documents_path):
    """Create a FAISS vector database from ``documents_path`` and return it as a retriever.

    The positional values 1500 and 200 are passed straight to
    ``VectorDb.create_vdb`` (presumably chunk size and overlap — TODO confirm).
    """
    vector_store = VectorDb().create_vdb(documents_path, 1500, 200, "faiss", None)
    return vector_store.as_retriever()

def factual_accuracy_analysis(conversation, retriever, model=model):
    """Check a conversation against the documents returned by ``retriever``.

    Args:
        conversation: conversation text, passed to the retrieval chain as ``input``.
        retriever: retriever handed to ``create_retrieval_chain``.
        model: LLM used in the chain; defaults to the notebook-level ``model``
            as bound when this cell was executed.

    Returns:
        The retrieval chain's ``answer`` parsed by a StructuredOutputParser
        built from the ``correct`` (bool) and ``errors`` (list) schemas.
    """
    factual_accuracy_analysis_response_schemas = [ResponseSchema(name="correct",
                                                                 description="wether or not the provided information is correct",
                                                                 type="bool"
                                                                 ),
                                                  ResponseSchema(name="errors",
                                                                 description="list of summarized errors made by the agent, if there is no errors, emplty list" ,
                                                                 type="list")
                                                ]
    factual_accuracy_analysis_output_parser = StructuredOutputParser.from_response_schemas(factual_accuracy_analysis_response_schemas)
    format_instructions=factual_accuracy_analysis_output_parser.get_format_instructions()
    retrieval_qa_chat_prompt = load_prompt("../prompts/factual_accuracy_analysis.yaml")
    combine_docs_chain = create_stuff_documents_chain(
        model, retrieval_qa_chat_prompt
    )
    retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)
    input_variables={"input":conversation,
                     "format_instructions":format_instructions
                    }
    # Announce the step before the retrieval/LLM call, as the other helpers do,
    # so the start and done messages bracket the slow work.
    print("factual check")
    model_response=retrieval_chain.invoke(input_variables)["answer"]
    factual_accuracy_analysis_response=factual_accuracy_analysis_output_parser.invoke(model_response)
    print("factual check done")
    return factual_accuracy_analysis_response


# complete analysis 

In [14]:
# Reload every transcription file in the folder for the complete analysis.
path="../data/conversations/transcription"
# os.listdir returns entries in arbitrary order; sorting makes documents[0]
# refer to the same conversation on every run.
conversations = sorted(os.listdir(path))
documents = []
for conversation in conversations:
    conversation_path=os.path.join(path, conversation)
    # Skip sub-directories (e.g. checkpoint folders): TextLoader is given file paths only.
    if not os.path.isfile(conversation_path):
        continue
    loader = TextLoader(conversation_path)
    documents.extend(loader.load())
documents

[Document(page_content="speaker1: fire paramedic thirty three what is the address of your emergency\nspeaker2: ys sir I need to: I: uh hh I need an ambulance as soon as possible sir\nspeaker1: Ok sir what's your address\nspeaker2: Los Angeles speaker2ifornia\nspeaker2: nine zero zero seven seven\nspeaker1: 's it Carolwood\nspeaker2: Carolwood Drive yes hh ∙hh\nspeaker3: yes\nspeaker3: xxx xxx xxx xxx xxx xxx xxx xxx\nspeaker1: ok sir what's the phone number you are speaker2ling from\nspeaker1: xxx ⌈xxx ⌉\nspeaker2: ⌊sir⌋\nspeaker1: and what's the problem exactly what happened\nspeaker2: uh sir I have a- we have a: ∙hh a- a: gentleman here that needs h:elp\nspeaker2: ↓and he's: not breathing here\nspeacke2: ∙hhh he's not breathing and we need\nspeacke2: we're pu- trying to pump him\nspeacke2: but he's not ∙hhh he⌈'s not⌉\nspeaker1: ⌊ o k⌋ey\nspeacke2: yes sir\nspeaker1: o key: how old is he\nspeacke2: he's uh fifty years old sir\nspeaker1: fifty o key ∙hh\nspeaker1: he's unconscious he'

In [145]:
def call_analysis(conversation, documents_path, classes_list, entities_list):
    """Run every analysis step sequentially on one conversation.

    Args:
        conversation: conversation text to analyse.
        documents_path: path used to build the retriever for the factual check.
        classes_list: topic classes for the classification step.
        entities_list: entity names for the NER step.

    Returns:
        dict with keys ``summary``, ``classification``, ``entities``,
        ``sentiment``, ``factual_analysis`` and ``quality_score`` (always None).
    """
    # The retriever is built first; the dict values are then evaluated top to bottom.
    facts_retriever = set_retriever(documents_path)
    return {
        "summary": get_summary(conversation),
        "classification": classify_main_topic(conversation, classes_list),
        "entities": get_entities(conversation, entities_list),
        "sentiment": get_sentiment(conversation),
        "factual_analysis": factual_accuracy_analysis(conversation, facts_retriever),
        "quality_score": None,
    }

# Topic classes and entity names for the end-to-end run on the first conversation
# ("medical emergency" was previously misspelled in the label list).
classes = ["medical emergency", "animals emergency", "terrorism emergency", "fire emergency", "undefined"]
entities = ["city", "address", "customer_name", "payment_type"]
pprint(call_analysis(conversation=documents[0].page_content, documents_path="../data/facts", classes_list=classes, entities_list=entities))

2024-02-19 17:12:43,126 [INFO] - Total 1 files loaded
2024-02-19 17:12:43,127 [INFO] - Total 1 chunks created
2024-02-19 17:12:43,128 [INFO] - Load pretrained SentenceTransformer: hkunlp/instructor-large


load INSTRUCTOR_Transformer


2024-02-19 17:12:45,547 [INFO] - Use pytorch device: cpu
2024-02-19 17:12:45,548 [INFO] - Processing embeddings using hkunlp/instructor-large. This could take time depending on the number of chunks ...


max_seq_length  512


2024-02-19 17:12:46,070 [INFO] - Vector store saved to None


summarizing
summarizing done
cassification
classification done
extracting entities
extracting entities done
sentiment analysis
sentiment analysis done
factual check
factual check done
{'classification': ['Medical emergency', 'undefined'],
 'entities': {'address': ['Carolwood Drive'],
              'city': ['Los Angeles'],
              'customer_name': [],
              'payment_type': []},
 'factual_analysis': {'correct': False,
                      'errors': ['Agent provided medical instructions, which '
                                 'is not allowed',
                                 'Agent did not instruct the user to go to the '
                                 'nearest hospital']},
 'quality_score': None,
 'sentiment': 'panicked',
 'summary': '\n'
            "A call was made to the fire department's emergency number, "
            'reporting an unconscious and unresponsive person who was not '
            'breathing. The caller provided the address of the emergency, '
       

In [15]:
import concurrent.futures

def call_analysis_parallel(conversation, documents_path, classes_list, entities_list):
    """Run the analysis steps on one conversation using a thread pool.

    Retriever creation, summary, classification, NER and sentiment are
    submitted together; the factual check is submitted once the retriever
    is available.

    Args:
        conversation: conversation text to analyse.
        documents_path: path used to build the retriever for the factual check.
        classes_list: topic classes for the classification step.
        entities_list: entity names for the NER step.

    Returns:
        dict with keys ``summary``, ``classification``, ``entities``,
        ``sentiment``, ``factual_analysis`` and ``quality_score`` (always None).
    """
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submitting tasks to executor
        retriever_future = executor.submit(set_retriever, documents_path=documents_path)
        summary_future = executor.submit(get_summary, conversation=conversation)
        # Use the classes_list argument; the notebook-level `classes` global was
        # used here before, silently ignoring the caller's value.
        classification_future = executor.submit(classify_main_topic, conversation=conversation, classes=classes_list)
        entities_future = executor.submit(get_entities, conversation=conversation, entities=entities_list)
        sentiment_future = executor.submit(get_sentiment, conversation=conversation)
        # The factual check needs the retriever, so wait for it before submitting.
        retriever=retriever_future.result()
        factual_analysis_future = executor.submit(factual_accuracy_analysis, conversation=conversation, retriever = retriever)

        # Retrieving results
        summary = summary_future.result()
        classification = classification_future.result()
        entities = entities_future.result()
        sentiment = sentiment_future.result()
        factual_analysis = factual_analysis_future.result()

    quality_score = None  # Assuming this doesn't require parallel execution

    return {
        "summary": summary,
        "classification": classification,
        "entities": entities,
        "sentiment": sentiment,
        "factual_analysis": factual_analysis,
        "quality_score": quality_score
    }
# Topic classes and entity names for the parallel end-to-end run on the first conversation
# ("medical emergency" was previously misspelled in the label list).
classes = ["medical emergency", "animals emergency", "terrorism emergency", "fire emergency", "undefined"]
entities = ["city", "address", "customer_name", "payment_type"]
pprint(call_analysis_parallel(conversation=documents[0].page_content, documents_path="../data/facts", classes_list=classes, entities_list=entities))

extracting entitiessentiment analysis
summarizing

cassification
model called
model called
model called
model called
classification done
summarizing done


2024-02-19 17:20:08,280 [INFO] - Total 1 files loaded
2024-02-19 17:20:08,281 [INFO] - Total 1 chunks created
  from tqdm.autonotebook import trange
2024-02-19 17:20:09,381 [INFO] - Load pretrained SentenceTransformer: hkunlp/instructor-large


load INSTRUCTOR_Transformer


2024-02-19 17:20:11,561 [INFO] - Use pytorch device: cpu
2024-02-19 17:20:11,562 [INFO] - Processing embeddings using hkunlp/instructor-large. This could take time depending on the number of chunks ...


max_seq_length  512


2024-02-19 17:20:11,925 [INFO] - Loading faiss.
2024-02-19 17:20:11,951 [INFO] - Successfully loaded faiss.
2024-02-19 17:20:11,957 [INFO] - Vector store saved to None


model called
extracting entities done
sentiment analysis done
factual check
factual check done
{'classification': ['Medical emergency', 'undefined'],
 'entities': {'address': ['Carolwood Drive'],
              'city': ['Los Angeles'],
              'customer_name': [],
              'payment_type': []},
 'factual_analysis': {'correct': False,
                      'errors': ['Agent provided medical instructions, which '
                                 'is not allowed',
                                 'Agent did not instruct the user to go to the '
                                 'nearest hospital']},
 'quality_score': None,
 'sentiment': 'panicked',
 'summary': '\n'
            "A call was made to the fire department's emergency number, "
            'reporting an unconscious and unresponsive person who was not '
            'breathing. The caller provided the address of the emergency, '
            'which was in Los Angeles, and mentioned that a doctor was present '
            'an