# 📍 NLP Term Project
⭐ **Team 5**: 2277018 Seoyeon Ye, 2277025 Eunsang Lee, 2277031 Ahyun 

- Setup
  - set parameters
  - load data
- Load KB
- Categorization using LLM
- Define prompt
- Generate answer
- Evaluate
- Save the answers

## Setup

In [7]:
!pip3 install -qU python-dotenv langchain langchain-community langchain-core langchain-text-splitters langchain_upstage oracledb langid python-dotenv faiss-cpu pymupdf4llm llama_index

### set parameters

In [4]:
import os

# Upstage API key. Either replace the placeholder below, or leave it untouched
# and export UPSTAGE_API_KEY in the environment so the secret never has to be
# written into the notebook.
api_key = "YOUR_API_HERE"
if api_key == "YOUR_API_HERE":
    # An explicitly edited key always wins; the environment is only a fallback.
    api_key = os.environ.get("UPSTAGE_API_KEY") or api_key

data_path = "./" # folder path containing testset.csv

UPSTAGE_API_KEY = api_key

### load data

In [8]:
# read csv file
import os
import pandas as pd

def read_data(data_path):
    """Load the question/answer CSV located at ``data_path``.

    Returns:
        A ``(prompts, answers)`` tuple holding the ``prompts`` and ``answers``
        columns of the file, in that order.
    """
    frame = pd.read_csv(data_path)
    return frame['prompts'], frame['answers']

# Load the questions and their reference answers from testset.csv under data_path.
prompts, answers = read_data(os.path.join(data_path, 'testset.csv'))

## Load KB

In [17]:
import pymupdf4llm
from langchain_upstage import UpstageEmbeddings
from langchain_community.vectorstores import FAISS

# Knowledge-base names; each one has a saved FAISS index under ./KB/<name>/.
dirs = ['business', 'ewha', 'history', 'law', 'philosophy', 'psychology']
embeddings = UpstageEmbeddings(model="embedding-passage", api_key=UPSTAGE_API_KEY)

# Map every knowledge-base name to its loaded FAISS vector store.
# NOTE: allow_dangerous_deserialization=True lets load_local unpickle the saved
# index data, so only point this at index folders you built yourself.
vector_stores = {
    name: FAISS.load_local('./KB/{0}/'.format(name), embeddings, allow_dangerous_deserialization=True)
    for name in dirs
}

print(type(vector_stores['business']))

<class 'langchain_community.vectorstores.faiss.FAISS'>


## Categorization using LLM

In [18]:
import langid
def check_ewha(query): # classify using which language used
    """Return True when langid classifies ``query`` as Korean ('ko'), else False.

    The generation loop uses this as the signal that a question should be
    answered from the Ewha knowledge base.
    """
    detected_lang, _score = langid.classify(query)
    return detected_lang == 'ko'

In [19]:
from langchain_upstage import ChatUpstage
from langchain_core.prompts import PromptTemplate

# Chat model shared by the classification chain and the answering chain;
# temperature=0 keeps the sampling as stable as possible.
llm = ChatUpstage(api_key = UPSTAGE_API_KEY, temperature=0)

# Routing prompt: asks the model to assign the question to one of the five
# listed subject categories and to reply only with "[ANSWER]: (X)".
prompt_template_option = PromptTemplate.from_template(
    """
    # Request
    You are an AI assistant designed to provide accurate and concise answers. 
    Please follow these instructions carefully to ensure the best response:
    
    1. Analyze the provided Question and determine the most appropriate category(1~5) from the list below.
    2. Choose one category based on its content.
    3. Provide the final answer in the following format:
       [ANSWER]: (X)  # Replace X with the category number (1~5)
       
    # Categories
    (1) Business 
    (2) History 
    (3) Law 
    (4) Philosophy 
    (5) Psychology
    
    ---
    # Important Notes
    - The answer must only indicate the category number; do not provide explanations or the answer to the question itself.
    - Answer which category the question belongs to, not the answer to the question itself. Keep this in mind.
    - Ensure your response strictly matches the required format: `[ANSWER]: (X)`.
    
    ---
    # Question
    {question}
    
    ---
    # Answer
    [ANSWER]: 
    """
)

# Classification chain: fill the routing prompt with a question, then call the LLM.
chain_option = prompt_template_option | llm

## Define prompt

In [20]:
# Answering prompt: takes the question, a role (persona) and the retrieved
# context, asks for step-by-step reasoning, and requires the reply to end
# with "[ANSWER]: (X)" where X is the chosen letter.
prompt_template = PromptTemplate.from_template(
    """
    # Question
    {question}
    
    # Requests
    - You are {role} intended to provide accurate and concise answers using the provided context.
    - Solve the question step by step.
    - Analyze the context and identify relevant information related to the question.
    - Finish your answer with "[ANSWER]: (X)" where X is the correct letter choice.
    
    # Context
    {context}
    
    # Steps:
    Your steps here.
    """
)

# Answering chain: fill the prompt above, then call the shared LLM.
chain = prompt_template | llm

## Generate answer

In [21]:
# function to extract an answer from response

import re

def extract_option(response):
    """Extract the category number from a classification response.

    The preferred form is ``[ANSWER]: (N)`` where N is a single digit 1-6.
    When the response does not contain that form, the result of
    ``extract_again_option`` is returned instead.

    Args:
        response: Raw text produced by the classification chain.

    Returns:
        The category digit as a one-character string (never None).
    """
    # [1-6] is a character class; the earlier "(1-6)" group only matched the
    # literal text "1-6", so this branch could never succeed.
    match = re.search(r"\[ANSWER\]:\s*\(([1-6])\)", response)
    if match:
        return match.group(1)  # the digit inside the parentheses
    return extract_again_option(response)


def extract_again_option(response):
    """Fallback extractor: return the last standalone digit 1-6 in ``response``.

    The whole text is scanned, including across line breaks, so a digit on a
    later line wins over one on an earlier line.

    Returns:
        The last matching digit as a string, or '5' (Psychology in the
        category prompt) when the response contains no such digit.
    """
    digits = re.findall(r"\b[1-6]\b", response)
    return digits[-1] if digits else '5'

In [None]:
# Classifier category number -> (knowledge-base name, role used in the answer prompt).
CATEGORY_ROUTES = {
    '1': ('business', 'business expert'),
    '2': ('history', 'paleoanthropologist'),
    '3': ('law', 'legal expert'),
    '4': ('philosophy', 'philosophy expert'),
}
# Any other value (including the extractor's '5' default) is answered as psychology.
DEFAULT_ROUTE = ('psychology', 'psychology expert')
# Questions flagged by check_ewha skip classification and use the Ewha knowledge base.
EWHA_ROUTE = ('ewha', 'Ewha University Senator')


def _answer_with_kb(question, kb_name, role):
    """Retrieve context for ``question`` from one knowledge base and return the LLM's answer text."""
    retriever = vector_stores[kb_name].as_retriever(
        search_type="mmr",        # Maximal Marginal Relevance ("similarity" is the alternative)
        search_kwargs={"k": 5}    # number of chunks to retrieve
    )
    relevant_docs = retriever.invoke(question)
    relevant_info = "\n".join(doc.page_content for doc in relevant_docs)
    response = chain.invoke({"question": question, "role": role, "context": relevant_info})
    return response.content


responses = []
for prompt in prompts:
    if check_ewha(prompt): # question from ewha
        kb_name, role = EWHA_ROUTE
    else:
        # Ask the LLM which subject the question belongs to, then route on it.
        response_option = chain_option.invoke({"question": prompt})
        opt = extract_option(response_option.content)
        print(opt)
        kb_name, role = CATEGORY_ROUTES.get(opt, DEFAULT_ROUTE)

    responses.append(_answer_with_kb(prompt, kb_name, role))
    

## Evaluate

In [None]:
# function to extract an answer from response

import re

def extract_answer(response):
    """Extract the chosen answer letter from a model response.

    The preferred form is ``[ANSWER]: (A)`` (optionally followed by the answer
    text, e.g. "[ANSWER]: (A) convolutional networks"). When the response does
    not contain that form, the result of ``extract_again`` is returned instead.

    Args:
        response: Raw text produced by the answering chain.

    Returns:
        The answer letter as a one-character string, or None when no letter
        could be extracted.
    """
    # [A-Z] is a character class; the earlier "(A-Z)" group only matched the
    # literal text "A-Z", so this branch could never succeed.
    match = re.search(r"\[ANSWER\]:\s*\(([A-Z])\)", response)
    if match:
        return match.group(1)  # the letter inside the parentheses (e.g., A)
    return extract_again(response)


def extract_again(response):
    """Fallback extractor: return the last standalone capital letter in ``response``.

    The whole text is scanned, including across line breaks, so a letter on a
    later line wins over one on an earlier line (e.g. a leading "I").

    Returns:
        The last standalone A-Z letter as a string, or None if there is none.
    """
    letters = re.findall(r"\b[A-Z]\b", response)
    return letters[-1] if letters else None

In [None]:
# print accuracy

# Number of responses whose extracted letter appears in the reference answer.
cnt = 0

for answer, response in zip(answers, responses):
    print("-"*10)
    generated_answer = extract_answer(response)
    print(response)

    # No letter could be extracted: report it and count the item as wrong.
    if generated_answer is None:
        print("extraction fail")
        continue

    print(f"generated answer: {generated_answer}, answer: {answer}")
    # Containment check: the extracted letter only has to occur in the reference answer.
    if generated_answer in answer:
        cnt += 1

print()
# Avoid ZeroDivisionError when no responses were generated.
total = len(responses)
accuracy = (cnt/total)*100 if total else 0.0
print(f"acc: {accuracy}%")

## Save the answers

In [None]:
# Optional: remove the surrounding triple quotes to save the raw responses to a text file.
"""
file_path = "answers_{0}.txt".format("history")
with open(file_path, "w", encoding='utf-8') as file:
    for item in responses:
        file.write(item + "\n========\n")
"""