In [2]:
import os

from langchain_openai import OpenAI, OpenAIEmbeddings
from pinecone import Pinecone

In [3]:
%pwd

'/home/rohwid/GitHub/question-and-answer-chatbot/notebooks'

In [4]:
# Step one directory up so that relative paths used below (such as `.env`)
# are resolved from the project root instead of the `notebooks/` folder.
os.chdir(os.pardir)

In [5]:
%pwd

'/home/rohwid/GitHub/question-and-answer-chatbot'

In [6]:
# Load KEY=VALUE pairs from the `.env` file in the current directory into
# os.environ so the credentials can be read in the next cell.
with open('.env') as env_file:
    for raw_line in env_file:
        entry = raw_line.strip()
        # Blank lines, comments and lines without '=' carry no KEY=VALUE pair;
        # feeding them to os.environ.update() would raise a ValueError.
        if not entry or entry.startswith('#') or '=' not in entry:
            continue
        # Split on the first '=' only, so values that themselves contain '='
        # (e.g. base64-padded secrets) are kept intact.
        key, value = entry.split('=', 1)
        os.environ[key.strip()] = value.strip()

In [7]:
# Read the credentials and the index name from the process environment.
# Each name is bound to None when the corresponding variable is not set.
OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_INDEX = (
    os.environ.get(variable_name)
    for variable_name in ('OPENAI_API_KEY', 'PINECONE_API_KEY', 'PINECONE_INDEX')
)

In [8]:
# Create the Pinecone client and open a handle to the index named by
# PINECONE_INDEX.
# NOTE(review): `Pinecone` here is the client class imported from `pinecone`.
# A later cell imports another `Pinecone` from `langchain_pinecone`, which
# rebinds the name, so this cell has to run before that import.
pincone = Pinecone(api_key=PINECONE_API_KEY)
index = pincone.Index(PINECONE_INDEX)

In [9]:
from langchain_pinecone import Pinecone

In [10]:
# Import the vector store under its own name. `langchain_pinecone.Pinecone` is
# a deprecated alias of `PineconeVectorStore`, and using it also hides the
# `pinecone.Pinecone` client class that created `index`.
from langchain_pinecone import PineconeVectorStore

# Embedding model used to embed the search queries.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large", dimensions=1536)
# Metadata field of each Pinecone record that holds the document text.
text_field = "text"

# Question reused by the retrieval, QA and chatbot cells below.
query = "What are the specific features or aspects that users appreciate the most in our application?"

# LangChain wrapper around the Pinecone index, scoped to namespace 'ns-0'.
vectorstore = PineconeVectorStore(
    index, embeddings, text_field, namespace='ns-0'
)

In [11]:
# Show the 3 documents most similar to the search query.
vectorstore.similarity_search(query, k=3)

[Document(page_content='google_store_review_comments: Awesome UI, best music app out there!', metadata={'author_app_version': '1.1.0.91', 'author_name': 'A Google user', 'pseudo_author_id': '286593453219054880269', 'review_id': '672a155a-e81e-4d28-bdeb-a74c031bc072', 'review_likes': '1', 'review_rating': '5', 'review_timestamp': '2014-05-27 15:17:20', 'row': 3.0, 'source': '672a155a-e81e-4d28-bdeb-a74c031bc072'}),
 Document(page_content='google_store_review_comments: I enjoy the awesome UI of this app, and it has all the music one can ask for.', metadata={'author_app_version': '1.1.0.91', 'author_name': 'A Google user', 'pseudo_author_id': '234382942865437071667', 'review_id': 'bfa8876b-470e-4640-83a7-77427f7f37e8', 'review_likes': '4', 'review_rating': '5', 'review_timestamp': '2014-05-27 14:36:02', 'row': 1.0, 'source': 'bfa8876b-470e-4640-83a7-77427f7f37e8'}),
 Document(page_content='google_store_review_comments: Excellent music app', metadata={'author_app_version': '1.1.0.112', 'au

In [12]:
from langchain.chains import RetrievalQA

In [13]:
# Completion model (temperature 0) that backs the RetrievalQA chain below.
llm = OpenAI(
    model_name='gpt-3.5-turbo-instruct',
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
)

In [14]:
# Build a "stuff" RetrievalQA chain on top of the vector store retriever,
# then answer the query with it.
retriever = vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

qa_result = qa.invoke(query)

In [15]:
qa_result

{'query': 'What are the specific features or aspects that users appreciate the most in our application?',
 'result': ' The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app.'}

## Chatbot with Chain and Memory

Create a prompt that instructs the model to act as a chatbot. The prompt embeds the answer produced by the retrieval QA chain above, the current human input, and the chat history. The chat history is kept by `ConversationBufferMemory` (the "memory"), and LangChain's `LLMChain` ties everything together, so that each new human input is answered with the previous turns of the conversation as context.

In [16]:
# Import from the submodules: importing these names from the `langchain`
# package root is deprecated.
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

In [17]:

# Instruction part of the chatbot prompt; {text} receives the answer that the
# RetrievalQA chain produced earlier.
command = """
%INSTRUCTIONS:
You are very helpful chatbot.

%TEXT: {text}
"""

command_template = PromptTemplate(
    input_variables=["text"],
    template=command,
)

command_prompt = command_template.format(text=qa_result['result'])

# Conversation part: the chat history (filled from memory) followed by the
# new human input.
chat = """
{chat_history}
Human: {human_input}
Chatbot:"""

# `command_prompt` is already-formatted text that is about to become part of a
# new template. Escape any braces in it so that a '{' or '}' inside the QA
# answer is rendered literally instead of being parsed as a template variable.
escaped_command = command_prompt.replace("{", "{{").replace("}", "}}")
template = escaped_command + chat

prompt = PromptTemplate(
    input_variables=["chat_history", "human_input"],
    template=template,
)

# Buffer memory exposes the previous turns under the {chat_history} variable.
memory = ConversationBufferMemory(memory_key="chat_history")

chat_chain = LLMChain(
    llm=OpenAI(model_name='gpt-3.5-turbo-instruct', openai_api_key=OPENAI_API_KEY),
    prompt=prompt,
    verbose=True,  # print the fully formatted prompt on every call
    memory=memory,
)

Performs chat by taking the human input.

In [18]:
# First turn of the conversation: ask the chatbot the original query.
chat_chain.predict(human_input=query)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
%INSTRUCTIONS:
You are very helpful chatbot.

%TEXT:  The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app.


Human: What are the specific features or aspects that users appreciate the most in our application?
Chatbot:[0m

[1m> Finished chain.[0m


' The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app.'

## Test the Memory

Checking the memory by asking about the previous chat.

In [19]:
# Follow-up turn: a correct answer has to draw on the conversation memory.
chat_chain.predict(human_input="What was I asked you about?")



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
%INSTRUCTIONS:
You are very helpful chatbot.

%TEXT:  The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app.

Human: What are the specific features or aspects that users appreciate the most in our application?
AI:  The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app.
Human: What was I asked you about?
Chatbot:[0m

[1m> Finished chain.[0m


' You asked me about the specific features or aspects that users appreciate the most in our application.'

### Correctness using an additional reference

We can evaluate our generation based on correctness, which would rely on the internal knowledge of the LLM. This might not be the best approach, since we cannot be sure that the LLM has the correct knowledge. To address this, we create our evaluator with `requires_reference=True`, so that an additional reference is used to evaluate the correctness of the generation.

In [20]:
# Evaluation questions that are sent to the chatbot one by one.
prompts = [
    "What are the specific features or aspects that users appreciate the most in our application?", 
    "In comparison to our application, which music streaming platform are users most likely to compare ours with?", 
    "What are the primary reasons users express dissatisfaction with Spotify?", 
    "Can you identify emerging trends or patterns in recent user reviews thatmay impact our product strategy?"
]

In [21]:
# Ask the chatbot every prompt in order and collect the trimmed answers.
# The chain carries memory, so each call also sees the earlier turns.
predicts = []

for prompt in prompts:
    answer = chat_chain.predict(human_input=prompt)
    predicts.append(answer.strip())

predicts



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
%INSTRUCTIONS:
You are very helpful chatbot.

%TEXT:  The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app.

Human: What are the specific features or aspects that users appreciate the most in our application?
AI:  The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app.
Human: What was I asked you about?
AI:  You asked me about the specific features or aspects that users appreciate the most in our application.
Human: What are the specific features or aspects that users appreciate the most in our application?
Chatbot:[0m

[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
%INSTRUCTIONS:
You are very helpful chatbot.

%TEXT:  The users

['The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app. Is there anything else you would like to know?',
 'I am not sure which specific platform users compare ours with, but they may compare it to other popular music streaming services such as Spotify, Apple Music, or Pandora.',
 'Some users have expressed dissatisfaction with Spotify due to limited music options, technical issues, and the lack of certain features.',
 'Based on recent user reviews, it seems users are looking for more personalized and curated music recommendations, as well as a seamless and intuitive user experience. It may be beneficial to focus on these areas in our product strategy to meet the evolving needs of our users.']

In [22]:
# Separate completion model (temperature 0) used by the evaluators below.
evaluation_llm = OpenAI(
    model_name='gpt-3.5-turbo-instruct',
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
)

In [23]:
# Hand-written reference answers; they are paired by position with the
# predictions when the evaluator runs.
references = [
    "Users frequently praise the intuitive UI design, extensive variety of music choices, and seamless user experience.",
    "Users often draw comparisons with Pandora when discussing our application's features and functionalities.",
    "Common concerns among dissatisfied users include occasional playback issues, difficulties in discovering new music, and a desire for a more personalized recommendation system.",
    "Recent reviews highlight an increasing demand for enhanced social sharing features, indicating a potential opportunity for improving the platform's community engagement."
]

In [24]:
from langchain.evaluation import load_evaluator

# Reference-based correctness evaluator: the evaluation LLM grades each
# prediction against the supplied reference answer.
evaluator = load_evaluator(
    "labeled_criteria",
    criteria="correctness",
    llm=evaluation_llm,
    requires_reference=True,
)

eval_results = []

# Iterate over the prompts together with their predictions and references.
# Previously `input=prompt` read the loop variable left over from the cell
# that generated the predictions, so every evaluation was given the LAST
# prompt as its input instead of the prompt that produced the prediction.
for prompt, predict, reference in zip(prompts, predicts, references):
    eval_result = evaluator.evaluate_strings(
        prediction=predict,
        input=prompt,
        reference=reference,
    )

    eval_results.append(eval_result)

In [25]:
eval_results

[{'reasoning': 'Step 1: Check for correctness\n- The submission mentions the UI, variety of music, and overall experience, which aligns with the reference.\n- The submission also mentions continuous improvement, which is not explicitly mentioned in the reference but can be inferred as a positive aspect of the app.\n- The submission does not contain any false or misleading information.\n- Conclusion: The submission is correct, accurate, and factual.\n\nConclusion:',
  'value': 'Y',
  'score': 1},
 {'reasoning': 'Step 1: Is the submission correct?\n- The submission mentions that users may compare the product to other popular music streaming services such as Spotify, Apple Music, or Pandora.\n- The reference also mentions that users often compare the product to Pandora.\n- Therefore, the submission is correct.\n\nStep 2: Is the submission accurate?\n- The submission mentions that users may compare the product to other popular music streaming services such as Spotify, Apple Music, or Pando

## Retrieval Augmented Generation (RAG) evaluation

Retrieval Augmented Generation (RAG) is one of the most popular use cases for LLMs, but it is also one of the most difficult to evaluate. We want RAG models to use the provided context to correctly answer a question, write a summary, or generate a response. This is a challenging task for LLMs, and it is difficult to evaluate whether the model is using the context correctly. 

Langchain has a handy `ContextQAEvalChain` class that allows you to evaluate your RAG models. It takes a `context` and a `question` as well as a `prediction` and a `reference` to evaluate the correctness of the generation. The evaluator returns a dictionary with the following values:

`reasoning`: String "chain of thought reasoning" from the LLM generated prior to creating the score    
`score`: Binary integer 0 to 1, where 1 would mean that the output is correct, and 0 otherwise  
`value`: A "CORRECT" or "INCORRECT" corresponding to the score

In [26]:
# Questions for the RAG evaluation; each one is also used as the
# similarity-search query that retrieves its context.
questions = [
    "What are the specific features or aspects that users appreciate the most in our application?", 
    "In comparison to our application, which music streaming platform are users most likely to compare ours with?", 
    "What are the primary reasons users express dissatisfaction with Spotify?", 
    "Can you identify emerging trends or patterns in recent user reviews thatmay impact our product strategy?"
]

In [32]:
# Build one context string per question from the 3 most similar documents.
contexts = []

for question in questions:
    documents = vectorstore.similarity_search(
        question,  # our search query
        k=3  # return 3 most relevant docs
    )

    context = []
    for document in documents:
        # page_content looks like "<field name>: <review text>". Cut at the
        # FIRST colon only, so a review that itself contains ':' is not
        # truncated; if there is no colon at all, keep the whole content
        # instead of failing with an IndexError.
        _, separator, review_text = document.page_content.partition(':')
        if not separator:
            review_text = document.page_content
        context.append(review_text.strip())

    contexts.append(' '.join(context))

contexts

In [33]:
# Wrap every question together with its retrieved context in a
# "answer only from the context" instruction prompt.
prompts = [
    f"""Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

    {context}

    Question: {question}"""
    for question, context in zip(questions, contexts)
]

prompts

["Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n    Users frequently praise the intuitive UI design, extensive variety of music choices, and seamless user experience.\n\n    Question: What are the specific features or aspects that users appreciate the most in our application?",
 "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n    Users often draw comparisons with Pandora when discussing our application's features and functionalities.\n\n    Question: In comparison to our application, which music streaming platform are users most likely to compare ours with?",
 "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n    Common concerns among d

In [34]:
# Send the context-augmented prompts through the chatbot and keep the
# whitespace-trimmed answers.
predicts = []

for prompt in prompts:
    reply = chat_chain.predict(human_input=prompt)
    predicts.append(reply.strip())

predicts



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
%INSTRUCTIONS:
You are very helpful chatbot.

%TEXT:  The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app.

Human: What are the specific features or aspects that users appreciate the most in our application?
AI:  The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app.
Human: What was I asked you about?
AI:  You asked me about the specific features or aspects that users appreciate the most in our application.
Human: What are the specific features or aspects that users appreciate the most in our application?
AI:  The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app. Is there anythi

['As mentioned by users in their reviews, they particularly appreciate the intuitive UI design, extensive variety of music choices, and seamless user experience in our application. Is there anything else you would like to know?',
 'As indicated by user comparisons, Pandora is a popular music streaming platform that users may compare our app with in terms of features and functionalities. Is there anything else you would like to know?',
 'Some common reasons for dissatisfaction among users include occasional playback issues, difficulties in discovering new music, and a desire for a more personalized recommendation system on Spotify. Is there anything else you would like to know?',
 "Based on recent reviews, there seems to be an increasing demand for enhanced social sharing features, which could be a potential opportunity for improving our platform's community engagement. Is there anything else you would like to know?"]

As we can see, without the context the generation is incorrect. Now let's see if our evaluator can detect that as well. As reference we will use the output of the vectorstore similarity search.

In [None]:
# Hand-written reference answers, one per entry of `questions` and in the
# same order (they are zipped together in the evaluation cell).
references = [
    "Users frequently praise the intuitive UI design, extensive variety of music choices, and seamless user experience.",
    "Users often draw comparisons with Pandora when discussing our application's features and functionalities.",
    "Common concerns among dissatisfied users include occasional playback issues, difficulties in discovering new music, and a desire for a more personalized recommendation system.",
    "Recent reviews highlight an increasing demand for enhanced social sharing features, indicating a potential opportunity for improving the platform's community engagement."
]

In [36]:
from langchain.evaluation import load_evaluator

# Evaluator that grades an answer with the "context_qa" chain.
evaluator = load_evaluator("context_qa", llm=evaluation_llm)

# Grade every (question, prediction) pair and collect the verdicts.
# NOTE(review): the context_qa evaluator appears to take its grounding text
# from `reference`; confirm whether the extra `context=` keyword has any effect.
eval_results = []
zipped = zip(questions, predicts, contexts, references)

for question, predict, context, reference in zipped:
    eval_results.append(
        evaluator.evaluate_strings(
            input=question,
            prediction=predict,
            context=context,
            reference=reference,
        )
    )

In [37]:
eval_results

[{'reasoning': 'CORRECT', 'value': 'CORRECT', 'score': 1},
 {'reasoning': 'CORRECT', 'value': 'CORRECT', 'score': 1},
 {'reasoning': 'CORRECT', 'value': 'CORRECT', 'score': 1},
 {'reasoning': 'CORRECT', 'value': 'CORRECT', 'score': 1}]

## Pairwise comparison and scoring

Pairwise comparison or Scoring is a method for evaluating LLMs that asks the model to choose between two generations or generate scores for the quality. Those methods are useful for evaluating whether a model can generate a better response than another/previous model.
This can also be used to generate preference data or AI Feedback for RLAIF or DPO. 

Let's first look at the pairwise comparison. Here we first generate two generations and then ask the LLM to choose between them.

In [40]:
# Generate two answers to the same request. The second request is the first
# one without its trailing period, so that a cached result is not reused.
prompt = "Write a short email to your boss about the meeting tomorrow."
pred_a = chat_chain.predict(human_input=prompt).strip()

prompt = prompt.rstrip(".")
pred_b = chat_chain.predict(human_input=prompt).strip()



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
%INSTRUCTIONS:
You are very helpful chatbot.

%TEXT:  The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app.

Human: What are the specific features or aspects that users appreciate the most in our application?
AI:  The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app.
Human: What was I asked you about?
AI:  You asked me about the specific features or aspects that users appreciate the most in our application.
Human: What are the specific features or aspects that users appreciate the most in our application?
AI:  The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app. Is there anythi

In [41]:
# Sanity check: the two generations must not be identical.
assert pred_a != pred_b

Now, let's use our LLM to select its preferred generation.

In [42]:
# Plain questions used to produce the two answer sets for the pairwise
# comparison. This rebinds `prompts`, replacing the context-augmented
# prompts built earlier.
prompts = [
    "What are the specific features or aspects that users appreciate the most in our application?", 
    "In comparison to our application, which music streaming platform are users most likely to compare ours with?", 
    "What are the primary reasons users express dissatisfaction with Spotify?", 
    "Can you identify emerging trends or patterns in recent user reviews thatmay impact our product strategy?"
]

In [45]:
# First set of answers ("A") for the pairwise comparison.
predicts_a = []

for prompt in prompts:
    answer_a = chat_chain.predict(human_input=prompt)
    predicts_a.append(answer_a.strip())

predicts_a



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
%INSTRUCTIONS:
You are very helpful chatbot.

%TEXT:  The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app.

Human: What are the specific features or aspects that users appreciate the most in our application?
AI:  The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app.
Human: What was I asked you about?
AI:  You asked me about the specific features or aspects that users appreciate the most in our application.
Human: What are the specific features or aspects that users appreciate the most in our application?
AI:  The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app. Is there anythi

['As mentioned by users in their reviews, they particularly appreciate the intuitive UI design, extensive variety of music choices, and seamless user experience in our application. Is there anything else you would like to know?',
 'As indicated by user comparisons, Pandora is a popular music streaming platform that users may compare our app with in terms of features and functionalities. Is there anything else you would like to know?',
 'Some common reasons for dissatisfaction among users include occasional playback issues, difficulties in discovering new music, and a desire for a more personalized recommendation system on Spotify. Is there anything else you would like to know?',
 "Based on recent reviews, there seems to be an increasing demand for enhanced social sharing features, which could be a potential opportunity for improving our platform's community engagement. Is there anything else you would like to know?"]

In [46]:
# Second set of answers ("B") for the pairwise comparison, generated by
# asking the same prompts again.
predicts_b = []

for prompt in prompts:
    answer_b = chat_chain.predict(human_input=prompt)
    predicts_b.append(answer_b.strip())

predicts_b



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
%INSTRUCTIONS:
You are very helpful chatbot.

%TEXT:  The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app.

Human: What are the specific features or aspects that users appreciate the most in our application?
AI:  The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app.
Human: What was I asked you about?
AI:  You asked me about the specific features or aspects that users appreciate the most in our application.
Human: What are the specific features or aspects that users appreciate the most in our application?
AI:  The users appreciate the awesome UI and the variety of music available in the app. They also enjoy the overall experience and the continuous improvement of the app. Is there anythi

['As mentioned by users in their reviews, they particularly appreciate the intuitive UI design, extensive variety of music choices, and seamless user experience in our application. Is there anything else you would like to know?',
 'As indicated by user comparisons, Pandora is a popular music streaming platform that users may compare our app with in terms of features and functionalities. Is there anything else you would like to know?',
 'Some common reasons for dissatisfaction among users include occasional playback issues, difficulties in discovering new music, and a desire for a more personalized recommendation system on Spotify. Is there anything else you would like to know?',
 "Based on recent reviews, there seems to be an increasing demand for enhanced social sharing features, which could be a potential opportunity for improving our platform's community engagement. Is there anything else you would like to know?"]

In [47]:
from langchain.evaluation import load_evaluator

# Pairwise evaluator: the evaluation LLM picks the better of two answers
# to the same input.
evaluator = load_evaluator("pairwise_string", llm=evaluation_llm)

eval_results = []
zipped = zip(predicts_a, predicts_b, prompts)

for predict_a, predict_b, prompt in zipped:
    eval_result = evaluator.evaluate_string_pairs(
        prediction=predict_a,
        # Compare against the B answer. Passing `predict_a` for both sides
        # compared each answer with itself and could only ever yield a tie.
        prediction_b=predict_b,
        input=prompt,
    )

    eval_results.append(eval_result)

This chain was only tested with GPT-4. Performance may be significantly worse with other models.


In [48]:
eval_results

[{'reasoning': '\n\n[[C]] Both Assistant A and B provide the same response, so it is a tie. Both responses are helpful, relevant, correct, and demonstrate depth of thought by mentioning specific features and aspects that users appreciate in the application.',
  'value': None,
  'score': 0.5},
 {'reasoning': '\n\n[[C]] Both Assistant A and B provide the same response, so it is a tie. They both accurately mention Pandora as a popular music streaming platform that users may compare the app with in terms of features and functionalities. They also ask if there is anything else the user would like to know, showing helpfulness and relevance. However, they do not demonstrate depth of thought as they simply state a fact without providing any further analysis or explanation.',
  'value': None,
  'score': 0.5},
 {'reasoning': "\n\n[[C]] Both Assistant A and B provide the same response, so it is a tie. Both responses address the user's question and provide relevant and accurate information about t