# 2. Retrieval Augmented Generation

In [None]:
from dotenv import load_dotenv, find_dotenv
# Load environment variables from the .env file located by find_dotenv();
# the return value is intentionally discarded.
_ = load_dotenv(find_dotenv())

## 2.1 Document and Data Preprocessing

### 2.1.1 Preprocessing: PDF to Text Conversion

`pdfplumber`: Plumb a PDF for detailed information about each text character, rectangle, and line. Plus: Table extraction and visual debugging.

https://github.com/jsvine/pdfplumber

Several other Python libraries help users to extract information from PDFs. As a broad overview, pdfplumber distinguishes itself from other PDF processing libraries by combining these features:

* Easy access to detailed information about each PDF object
* Higher-level, customizable methods for extracting text and tables
* Tightly integrated visual debugging
* Other useful utility functions, such as filtering objects via a crop-box

It's also helpful to know what features pdfplumber does not provide:

* PDF generation
* PDF modification
* Optical character recognition (OCR)
* Strong support for extracting tables from OCR'ed documents

**Specific comparisons**

* `pdfminer.six` provides the foundation for pdfplumber. It primarily focuses on parsing PDFs, analyzing PDF layouts and object positioning, and extracting text. It does not provide tools for table extraction or visual debugging.
* `PyPDF2` is a pure-Python library "capable of splitting, merging, cropping, and transforming the pages of PDF files. It can also add custom data, viewing options, and passwords to PDF files." It can extract page text, but does not provide easy access to shape objects (rectangles, lines, etc.), table-extraction, or visually debugging tools.
* `pymupdf` is substantially faster than pdfminer.six (and thus also pdfplumber) and can generate and modify PDFs, but the library requires installation of non-Python software (MuPDF). It also does not enable easy access to shape objects (rectangles, lines, etc.), and does not provide table-extraction or visual debugging tools.
* `camelot`, `tabula-py`, and `pdftables` all focus primarily on extracting tables. In some cases, they may be better suited to the particular tables you are trying to extract.

In [None]:
import pdfplumber
from pathlib import Path

# Resolve the PDF path to an absolute path and open it with pdfplumber.
# NOTE(review): the PDF handle is not closed in this cell; `pages` is reused
# by the cells below.
file_path = Path("fis_issue22-3.pdf").resolve()
pdf = pdfplumber.open(file_path)
pages = pdf.pages

In [None]:
# Inspect the first page: print its dimensions and extracted text, then
# render a low-resolution preview as the cell output.
page = pages[0]
print(f'page width x height = {page.width} x {page.height}')
print('[Extracted Text]')
print(page.extract_text())
page.to_image(resolution=50)

In [None]:
# Inspect the fifth page: print its dimensions and render a low-resolution
# preview as the cell output.
page = pages[4]
print(f'page width x height = {page.width} x {page.height}')
page.to_image(resolution=50)

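Bounding box of `pdfplumber.page.Page` = (`x0`, `top`, `x1`, `bottom`). If `relative` is `True`, the coordinates are interpreted as offsets from the top-left corner of the page's own bounding box: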
* (`x0`, `top`) = (x, y) coordinates of the top-left corner of the box
* (`x1`, `bottom`) = (x, y) coordinates of the bottom-right corner of the box

In [None]:
# Split the current page at its horizontal midpoint into a left and a right
# half. Each bounding box is (x0, top, x1, bottom); `relative=True` and
# `strict=True` are passed straight to pdfplumber's `crop`.
left = page.crop((0.0, 0.0, 0.5*page.width, page.height), relative=True, strict=True)
right = page.crop((0.5*page.width, 0.0, page.width, page.height), relative=True, strict=True)
# Render the left half as the cell output.
left.to_image(resolution=50)

In [None]:
import re
# remove un used strings like 
def clean_text(s: str) -> str:
    """Blank out unwanted characters and drop long runs of spaces.

    Every ASCII control character other than tab, newline and carriage
    return, and every code point from U+007F through U+00FF, is replaced by
    a single space. Afterwards each run of three or more consecutive spaces
    is removed entirely (newlines are not affected).
    """
    unwanted_chars = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]')
    long_space_run = re.compile(r' {3,}')
    blanked = unwanted_chars.sub(' ', s)
    return long_space_run.sub('', blanked)

# Extract the text of the left half-page and print it after cleaning.
print(clean_text(left.extract_text()))

`Layout-Parser` is a unified toolkit for deep learning based document image analysis

* https://layout-parser.github.io
* https://github.com/Layout-Parser/layout-parser
* https://github.com/Layout-Parser/layout-parser/blob/main/examples/Deep%20Layout%20Parsing.ipynb
* https://tesseract-ocr.github.io/tessdoc/Installation.html
* https://github.com/tesseract-ocr/tesseract
* https://yunwoong.tistory.com/51

In [None]:
import numpy as np
import layoutparser as lp
import torchvision.ops.boxes as bops
from io import BytesIO
from PIL import Image
from pdfplumber.page import Page, CroppedPage
from layoutparser.models.detectron2 import Detectron2LayoutModel

def inference_page(p: Page | CroppedPage, model: Detectron2LayoutModel) -> tuple:
    """Run layout detection on a pdfplumber page.

    The page is rendered to an image, written to an in-memory PNG
    (``quantize=False``), reloaded through PIL as RGB and converted to a
    NumPy array, which is then passed to ``model.detect``.

    Returns:
        ``(blocks, img)``: the detection result and the RGB array that was
        given to the model.
    """
    png_buffer = BytesIO()
    p.to_image().save(png_buffer, format='png', quantize=False)
    rgb_array = np.array(Image.open(png_buffer).convert('RGB'))
    return model.detect(rgb_array), rgb_array

# Candidate PubLayNet configs (the second one is used below):
# lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config
# lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config
# Build the detector: MODEL.ROI_HEADS.SCORE_THRESH_TEST is set to 0.8 and
# class ids 0-4 are mapped to layout element names.
model = Detectron2LayoutModel(
    'lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config', 
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
    label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}
)

# Use the deep learning model to detect layout elements on the left
# half-page, then draw the detected boxes with their element types.
blocks, img = inference_page(left, model)
lp.draw_box(img, blocks, box_width=3, box_alpha=0.2, show_element_type=True)

It returns the coordinates (`x_1`, `y_1`, `x_2`, `y_2`) of each detected layout block: (`x_1`, `y_1`) is the top-left corner and (`x_2`, `y_2`) is the bottom-right corner.

In [None]:
import matplotlib.pyplot as plt

def draw_block_with_coords(block, img):
    """Show ``img`` with the bounding box of ``block`` and its corner coordinates.

    ``block`` must expose ``x_1``, ``y_1``, ``x_2`` and ``y_2``. The box is
    outlined in red, the top-left and bottom-right corners are marked with
    red dots, and each corner is labelled with its coordinates.
    """
    left_x, top_y = block.x_1, block.y_1
    right_x, bottom_y = block.x_2, block.y_2

    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    ax.imshow(img)

    # Outline of the bounding box, anchored at the top-left corner.
    outline = plt.Rectangle(
        (left_x, top_y),
        right_x - left_x,
        bottom_y - top_y,
        fill=False, edgecolor='red', lw=1.5,
    )
    ax.add_patch(outline)

    # Red dots on the top-left and bottom-right corners, in that order.
    for corner_x, corner_y in ((left_x, top_y), (right_x, bottom_y)):
        ax.plot(corner_x, corner_y, 'ro')

    # Coordinate labels next to each corner.
    ax.text(left_x - 10, top_y - 8.5,
            f'(x1,y1)=({left_x:.2f}, {top_y:.2f})',
            fontsize=12, color='red')
    ax.text(right_x - 100, bottom_y + 30,
            f'(x2,y2)=({right_x:.2f}, {bottom_y:.2f})',
            fontsize=12, color='red')

    ax.set_xlabel('x')
    ax.set_ylabel('y')
    plt.show()

# Take the `.block` attribute of the first detected element and print its
# corner coordinates with four decimals.
block = blocks[0].block
print(f'Coordinates of the first block:')
print(f'x1: {block.x_1:.4f}, y1: {block.y_1:.4f}')
print(f'x2: {block.x_2:.4f}, y2: {block.y_2:.4f}')

# Plot the block on top of the page image.
draw_block_with_coords(block, img)

In [None]:
# Store all the PDF pages for later use.
# Pages that are taller than wide are kept whole; every other page is split
# at its horizontal midpoint and stored as two entries (left half first).
# NOTE: the loop rebinds the notebook-level names `page`, `left` and `right`.
PDF_PAGES = []
for page in pages:
    if page.height > page.width:
        PDF_PAGES.append(page)
    else:
        left = page.crop((0.0, 0.0, 0.5*page.width, page.height), relative=True, strict=True)
        right = page.crop((0.5*page.width, 0.0, page.width, page.height), relative=True, strict=True)
        PDF_PAGES.append(left)
        PDF_PAGES.append(right)

### 2.1.2 Load Documents 

`Document` is a basic class of object to represent a piece of information. 

* `page_content`, `metadata`

In [None]:
import json
from langchain_core.documents import Document

# Minimal example: a Document holding some text plus a metadata dict.
doc = Document(
    page_content='I am a page content',
    metadata = {'page': 0}
)
# Show the instance attributes as the cell output.
doc.__dict__

In [None]:
# Load the pre-extracted pages. The file is opened explicitly as UTF-8 so the
# text is decoded the same way on every platform instead of depending on the
# locale's default encoding.
with open('./fis_issue22_3.json', 'r', encoding='utf-8') as f:
    all_data = json.load(f)

# Wrap every record ('page_content' + 'metadata') in a Document.
docs = []
for data in all_data:
    doc = Document(
        page_content = data['page_content'],
        metadata = data['metadata']
    )
    docs.append(doc)

# Preview the first document and report how many were loaded.
print(docs[0].page_content)
print(f'\nTotal Documents: {len(docs)}')

### 2.1.3 Text Splitter

Benefits for splitting documents:

* **Efficient resource management**: Inputting an entire document into an LLM is costly and hinders efficient answer extraction from extensive information, sometimes causing hallucination issues. Therefore, the aim is to extract only the information needed for the response.
* **Accurate information retrieval**: Segmenting documents aids in extracting only the relevant information for a given query. By focusing on specific topics or content within each segment, it ensures the retrieval of highly pertinent information.

At a high level, text splitters work as follows:

1. Split the text up into small, semantically meaningful chunks (often sentences).
2. Start combining these small chunks into a larger chunk until you reach a certain size (as measured by some function).
3. Once you reach that size, make that chunk its own piece of text and then start creating a new chunk of text with some overlap (to keep context between chunks).

That means there are two different axes along which you can customize your text splitter:

1. How the text is split
2. How the chunk size is measured

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Configure the splitter; chunk sizes are measured with `len`.
text_splitter = RecursiveCharacterTextSplitter(
    # the maximum size of the split text chunks.
    chunk_size=250,
    # the number of overlapping characters between split text chunks.
    chunk_overlap=0,
    # the function to calculate the length of the text.
    length_function=len,
    # whether the delimiter is a regular expression.
    is_separator_regex=False,
    
)

# Collect the page texts and their metadata as two parallel lists, then split
# them; the metadata list is passed alongside the texts to `create_documents`.
texts = []
metadatas = []
for data in all_data:
    texts.append(data['page_content'])
    metadatas.append(data['metadata'])
documents = text_splitter.create_documents(texts, metadatas)

# Compare the number of input pages with the number of resulting chunks.
print(f'Original number of pages: {len(all_data)}')
print(f'Total Documents: {len(documents)}')

`RecursiveCharacterTextSplitter` is designed for general text and operates by using a specified list of characters as parameters. It splits the text in the order of the provided characters until the resulting chunks are small enough. By default, it uses the characters `["\n\n", "\n", " ", ""]`, splitting the text recursively from paragraphs to sentences to words.

In [None]:
# Print the unsplit text of the record at index 4 for comparison with its chunks.
print(all_data[4]['page_content'])

In [None]:
# Keep only the chunks whose metadata 'page' value is 4 and print them with a
# 1-based counter.
split_documents = [d for d in documents if d.metadata['page'] == 4]
for i, sd in enumerate(split_documents):
    print(f'\n[Split Documents for Page 4] - {i+1}')
    print(sd.page_content)

## 2.2 Embeddings 

Embedding models create a vector representation of a piece of text that captures the semantic meaning of the text.

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# OpenAI embedding model; the API key is read from the OPENAI_API_KEY
# environment variable and 1024-dimensional vectors are requested.
openai_embeddings = OpenAIEmbeddings(
    api_key = os.getenv('OPENAI_API_KEY'),
    model = 'text-embedding-3-small', # text-embedding-ada-002
    dimensions = 1024  # 1536 for text-embedding-ada-002
)

# Hugging Face embedding model, configured to run on CPU and to normalize
# the embeddings it produces.
model_name = 'intfloat/multilingual-e5-large-instruct'
hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cpu'},  # cuda, cpu
    encode_kwargs={'normalize_embeddings': True},
)

Similarity calculation

* `Cosine similarity` is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them. It is thus a judgment of orientation and not magnitude: two vectors with the same orientation have a cosine similarity of 1, two vectors at 90° have a similarity of 0, and two vectors diametrically opposed have a similarity of -1, independent of their magnitude.

$$\text{cosine\_similarity}(\mathbf{a}, \mathbf{b}) = \frac{\mathbf{a} \cdot \mathbf{b}}{\vert\mathbf{a}\vert \vert\mathbf{b}\vert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \sqrt{\sum_{i=1}^{n} b_i^2}}$$



In [None]:
# Embed the same greeting in English and Korean with both models.
e1_en = np.array(openai_embeddings.embed_query('Hello, world!'))
e1_kr = np.array(openai_embeddings.embed_query('안녕하세요, 세계!'))
e2_en = np.array(hf_embeddings.embed_query('Hello, world!'))
e2_kr = np.array(hf_embeddings.embed_query('안녕하세요, 세계!'))

# The dot product is used as the similarity score below.
# NOTE(review): this equals cosine similarity only for unit-length vectors.
# The Hugging Face model is created with normalize_embeddings=True; confirm
# that the OpenAI vectors are unit-length as well.

# they are not quite similar since it is mostly trained on English
print(f'OpenAI Similarity: {e1_en @ e1_kr: .4f}')
# they are similar since it is trained on multi-lingual
print(f'HuggingFace Similarity: {e2_en @ e2_kr: .4f}')
# they are not similar, since they are trained on different models
# (the dot product requires both vectors to have the same length)
print(f'Two Embeddings Similarity: {e1_en @ e2_en: .4f}')

visualize embeddings

In [None]:
# Visualize the first n dimensions of the four embeddings as a heat map,
# one row per embedding.
n = 100
fig, ax = plt.subplots(1, 1, figsize=(16, 6), sharex=True)
# Stack the four vectors and keep only their first n components.
es = np.array([e1_en, e1_kr, e2_en, e2_kr])[:, :n]
titles = ['OpenAI(EN)', 'OpenAI(KR)','HuggingFace(EN)', 'HuggingFace(KR)']
m = ax.matshow(es, cmap='coolwarm', interpolation='nearest', aspect=4)
# Label each row with the model/language it belongs to.
ax.set_yticks(np.arange(len(titles)))
ax.set_yticklabels(titles)
ax.set_title('Embeddings (First 100 dimensions)', y=1.1, fontsize=16)
# Ticks at 0, 10, ..., 90 and n-1, labelled 1, 10, ..., 90 and n.
ax.set_xticks(list(range(0, n, 10))+[n-1])
ax.set_xticklabels([1] + list(range(10, n, 10))+[n])
ax.set_xlabel('n-th Dimension')
# Put the x tick marks at the bottom of the axes.
plt.gca().xaxis.tick_bottom()
plt.colorbar(m, orientation='vertical', fraction=0.35, shrink=0.40, pad=0.05, label='normalized value', ax=ax)
plt.show()

## 2.3 Vector Store and Retriever

A vector store takes care of storing embedded data and performing vector searches.

In [None]:
# FAISS 
from langchain_community.vectorstores import FAISS

# Build a FAISS vector store from the split documents, embedding them with
# the Hugging Face model.
faiss_db = FAISS.from_documents(
    documents,     # embedding documents
    hf_embeddings  # embedding model
)


`FAISS` is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. It also contains supporting code for evaluation and parameter tuning. FAISS is written in C++ with complete wrappers for Python/numpy. Some of the most useful algorithms are implemented on the GPU. It is developed by Facebook AI Research.

* https://github.com/facebookresearch/faiss

In [None]:
# similarity search with vector store
# NOTE: this result is overwritten by the next call and never printed; the
# two query strings also differ slightly in spacing.
similar_docs = faiss_db.similarity_search('재정 융자사업의 개념은 무엇인가요?', k=3)

# returns the list of similar documents with the score(L2 distance)
similar_docs = faiss_db.similarity_search_with_score('재정융자 사업의 개념은 무엇인가요?', k=3)

# Each result is a (document, score) pair.
print('[Search Results]')
for i, (doc, score) in enumerate(similar_docs):
    print(f'Document {i}(s={score:.4f}) - Page: {doc.metadata["page"]}')
    print(f'{doc.page_content}\n')

In [None]:
# Render the entry at index 2 of PDF_PAGES at resolution 70 as the cell output.
PDF_PAGES[2].to_image(resolution=70)

In [None]:
# Same scored search, restricted with a metadata filter (page == 10).
similar_docs = faiss_db.similarity_search_with_score(
    '재정융자 사업의 개념은 무엇인가요?', k=3, filter=dict(page=10)
)

# Each result is a (document, score) pair.
print('[Search Results]')
for i, (doc, score) in enumerate(similar_docs):
    print(f'Document {i}(s={score:.4f}) - Page: {doc.metadata["page"]}')
    print(f'{doc.page_content}\n')

In [None]:
# Render the entry at index 10 of PDF_PAGES at resolution 70 as the cell output.
PDF_PAGES[10].to_image(resolution=70)

A retriever is an interface that returns documents given an unstructured query. It is more general than a vector store. 

The `MMR (Maximal Marginal Relevance)` method avoids redundancy in query results by balancing relevance and diversity. Instead of retrieving only the most relevant items, MMR ensures that the results include diverse perspectives and new information, preventing the selection of very similar items. This approach helps users gain a broader understanding of a topic by considering both the relevance of documents to the query and their dissimilarity from already selected documents.

In [None]:
# similarity search with retriever
# we can use LCEL style query
retriever = faiss_db.as_retriever(
    search_type = 'mmr',    # "similarity", "mmr", or "similarity_score_threshold".
    search_kwargs={'k': 3}  # {"score_threshold": 0.75}
)
similar_docs = retriever.invoke('재정 융자사업의 개념은 무엇인가요?')
print('[Search Results]')
# The retriever yields plain documents without scores, so only the page and
# the content are printed. Referencing a `score` variable here would show a
# stale value from an earlier cell, or raise NameError in a fresh kernel.
for i, doc in enumerate(similar_docs):
    print(f'Document {i} - Page: {doc.metadata["page"]}')
    print(f'{doc.page_content}\n')

There are also other retriever classes, such as `EnsembleRetriever`, `SelfQueryRetriever`

* `EnsembleRetriever`: An ensemble of retrievers that aggregates their results.
* `SelfQueryRetriever`: A retriever that uses an LLM to turn the user's natural-language question into a structured query (a search string plus metadata filters) by itself.

## 2.4 Naive Retrieval Augmented Generation

<img src="https://lh3.googleusercontent.com/d/1J7JasPNjGfAySxvZoqG7aV29F1nYuxXQ" width="75%">

1. **Indexing**: Documents are split into chunks, encoded into vectors, and stored in a vector database. (Relevant to `VectorStore`)
2. **Retrieval**: Retrieve the Top k chunks most relevant to the question based on semantic similarity. (Relevant to `Retriever`)
3. **Generation**: Input the original question and the retrieved chunks together into LLM to generate the final answer. (Relevant to `LCEL pipeline`)

### 2.4.1 Step 1: Load data

In [None]:
import json
from langchain_core.documents import Document

# Load the pre-extracted pages. The file is opened explicitly as UTF-8 so the
# text is decoded the same way on every platform instead of depending on the
# locale's default encoding.
with open('./fis_issue22_3.json', 'r', encoding='utf-8') as f:
    all_data = json.load(f)

### 2.4.2 Step 2: Split documents

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter with chunk_size=300 and chunk_overlap=50, both measured with `len`.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300, chunk_overlap=50, length_function=len,
)

# Collect every page's text, then join all pages with blank lines and pass
# the result to the splitter as one single text.
texts = []
for data in all_data:
    texts.append(data['page_content'])
# NOTE: `texts` changes from a list to a single string here.
texts = '\n\n'.join(texts)
documents = text_splitter.create_documents([texts])

print(f'Original number of pages: {len(all_data)}')
print(f'Total Documents: {len(documents)}')
# Preview up to 767 characters of the second '\n\n'-separated segment.
print(texts.split('\n\n')[1][:767])

<img src="https://lh3.googleusercontent.com/d/1JCH8jNIWbWFyPb2bEaFkgMYrP1R5AB7w" width="50%">

visualize chunks: https://chunkviz.up.railway.app/

### 2.4.3 Step 3: Create embeddings and store them in a vector store

In [None]:
import os
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Create the OpenAI embedding model (API key read from OPENAI_API_KEY) and
# build a FAISS vector store from the split documents.
openai_embeddings = OpenAIEmbeddings(api_key = os.getenv('OPENAI_API_KEY'))
faiss_db = FAISS.from_documents(documents, openai_embeddings)

### 2.4.4 Step 4: Create a retriever

In [None]:
# Retriever using the 'similarity_score_threshold' search type; the search
# kwargs request k=3 and a score_threshold of 0.75.
retriever = faiss_db.as_retriever(
    search_type='similarity_score_threshold',  # "similarity", "mmr", or "similarity_score_threshold".
    search_kwargs={'k': 3, 'score_threshold': 0.75}
)

# Run a sample query and print the content of each returned document.
similar_docs = retriever.invoke('재정 융자사업의 개념은 무엇인가요?')
print('[Search Results]')
for i, doc in enumerate(similar_docs):
    print(f'Document {i}')
    print(f'{doc.page_content}\n')

### 2.4.5 Step 5: Prompt Template

In [None]:
from langchain_core.prompts import PromptTemplate

# Prompt text with two placeholders, {question} and {context}. The doubled
# braces around the JSON example are escapes for literal braces.
template = '''You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
When you answer the question you must quote the context number and the relevant text.
The output format should be a JSON object with the following structure:
```json
{{
    "context_id": "The retrieved context number(integer)",
    "quote": "The relevant text(part of the context)",
    "answer": "The answer to the question"
}}
```
Answer in Korean.

## Question:
{question}

## Context:
{context}

## Answer:
'''

# Wrap the text in a PromptTemplate that declares both input variables.
prompt = PromptTemplate(
    template=template, 
    input_variables=['question', 'context']
)

### 2.4.6 Step 6: Define LLM

In [None]:
from langchain_openai import ChatOpenAI

# Chat model used for generation; `response_format` of type 'json_object' is
# forwarded through `model_kwargs`.
model_openai = ChatOpenAI(
    model='gpt-4o-mini',  # latest model
    temperature=0.5,
    model_kwargs = {'response_format': {'type': 'json_object'}}
)

### 2.4.7 Step 7: Create Chain and Run

In [None]:
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import JsonOutputParser
from langchain.pydantic_v1 import BaseModel, Field

# Data model describing the JSON answer; it is handed to JsonOutputParser as
# `pydantic_object` when the chain is built.
class Answer(BaseModel):
    # Number of the retrieved context the answer relies on.
    context_id: int = Field(..., title='The retrieved context number(integer)')
    # Passage quoted from that context.
    quote: str = Field(..., title='The relevant text(part of the context)')
    # Answer text for the question.
    answer: str = Field(..., title='The answer to the question')

def numbering_context(context: list) -> str:
    """Format context items as numbered ``[source N]`` lines.

    Each item is rendered with its default string formatting, prefixed with a
    1-based source number and terminated by a newline. An empty list yields
    an empty string.
    """
    return ''.join(
        f'[source {number}] {item}\n'
        for number, item in enumerate(context, start=1)
    )

# Turn the LANGCHAIN_TRACING_V2 flag on for the duration of this run.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'

# Pipeline: the input passes through unchanged as 'question', while
# 'context' is the retriever output formatted by numbering_context; both
# fill the prompt, which goes to the chat model, whose output is parsed as
# JSON.
chain = (
    {'question': RunnablePassthrough(), 'context': retriever | RunnableLambda(numbering_context)}
    | prompt
    | model_openai
    | JsonOutputParser(pydantic_object=Answer)
)

question = '1990년대 이전의 재정 융자사업은 어떻게 진행되었나요?'
answer = chain.invoke(question)

# Turn the tracing flag off again.
os.environ['LANGCHAIN_TRACING_V2'] = 'false'

Check LangSmith: https://smith.langchain.com/public/377158a0-fc0b-42a7-8a1f-34a39943f949/r

In [None]:
from pprint import pprint
# Show the parsed answer: referenced context number, quote, then the answer.
print(f'Referred Context: {answer["context_id"]}')
pprint(f'Quote: {answer["quote"]}')
print()
pprint(answer['answer'])

## 3. Application Development

In [None]:
from dotenv import load_dotenv, find_dotenv
# Load environment variables from the .env file located by find_dotenv();
# the return value is intentionally discarded.
_ = load_dotenv(find_dotenv())

### Scenario: Travel Enthusiast's Quest for a Perfect Vacation

Meet Alex, an avid traveler planning an unforgettable trip to three of the world's most iconic cities: New York, Paris, and Seoul. With a passion for discovering hidden gems and indulging in local cuisines, Alex turns to the innovative Travel Search Guide powered by Retrieval-Augmented Generation (RAG). Alex begins by entering preferences into the guide—luxurious hotels, must-see attractions, and top-rated restaurants. The guide instantly retrieves and presents detailed options from a curated database, including renowned hotels like The Ritz in Paris and The Shilla in Seoul, attractions such as Central Park and the Eiffel Tower, and culinary delights from Le Bernardin in New York to Tosokchon Samgyetang in Seoul. With comprehensive information at Alex’s fingertips, including ratings, contact details, and accessibility options, planning the perfect itinerary becomes effortless. 

In [None]:
# Read the travel guide database and split it into records on blank lines.
# NOTE(review): no encoding is given, so the locale default is used; confirm
# the file's encoding and pass it explicitly.
with open('./travel_guides_db.txt', 'r') as f:
    tg_data = f.read().split('\n\n')

# Report the number of records and preview the first one.
print(f'Total records: {len(tg_data)}\n')
print(tg_data[0])

In [None]:
from langchain_core.documents import Document
from pprint import pprint
# Build one Document per record, extracting type and name from its header.
docs = []
for i, data in enumerate(tg_data):
    # The header is the text before the first '###'; after stripping leading
    # '#'/space characters and trailing whitespace it is expected to look
    # like '<type>: <name>'.
    name_and_type = data.split('###')[0].lstrip('## ').rstrip()
    # Split on the first ': ' only, so a name that itself contains ': ' does
    # not break the two-value unpacking.
    typ, name = name_and_type.split(': ', 1)
    doc = Document(
        page_content = data,
        metadata = {'id': i, 'type': typ, 'name': name}
    )
    docs.append(doc)

# Preview the first five names.
list_of_availables = [d.metadata['name'] for d in docs]
pprint(list_of_availables[:5])

TODO: Create Embeddings, Vector Store, and Retriever

In [None]:
# Register pysqlite3 under the name 'sqlite3' in sys.modules so that later
# `import sqlite3` statements resolve to pysqlite3.
# NOTE(review): presumably needed because Chroma requires a newer SQLite than
# the one bundled with this Python; confirm.
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules['pysqlite3']

import os
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import JsonOutputParser
from langchain.pydantic_v1 import BaseModel, Field
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [None]:
# Data model for answer format
# Data model describing the JSON answer; it is handed to JsonOutputParser as
# `pydantic_object` when the chain is built.
class Answer(BaseModel):
    # Numbers of the retrieved contexts the answer relies on.
    context_id: list[int] = Field(..., title='The retrieved context number(list of integer) e.g., [1, 3]')
    # Passages quoted from those contexts.
    quote: list[str] = Field(..., title="The relevant text(part of the context, list of strings), e.g., ['quote from context_id=1', 'quote from context_id=3']")
    # Final answer text for the question.
    answer: str = Field(..., title='The final answer to the question')

# pipeline formatting context with numbering
def numbering_context(context: list) -> str:
    """Format context items as numbered ``[source N]`` lines.

    Each item is rendered with its default string formatting, prefixed with a
    1-based source number and terminated by a newline. An empty list yields
    an empty string.
    """
    return ''.join(
        f'[source {number}] {item}\n'
        for number, item in enumerate(context, start=1)
    )

# os.environ['LANGCHAIN_TRACING_V2'] = 'true'
# Define the LLM used by the self-query retriever.
llm = ChatOpenAI(
    model='gpt-4o-mini',
    temperature=0.5,
)
# Create the embedding model (API key read from OPENAI_API_KEY) and build a
# Chroma vector store from the travel documents.
openai_embeddings = OpenAIEmbeddings(api_key = os.getenv('OPENAI_API_KEY'))
vectorstore = Chroma.from_documents(docs, embedding=openai_embeddings)

# create self-query retriever
# Metadata attributes described to the LLM.
metadata_fields = [
    AttributeInfo(name='id', type='int', description='The unique identifier of the document'),
    AttributeInfo(name='type', type='str', description='The type of the document: one of ["Hotel", "Restaurant", "Attraction"]'),
]
# Description of the document contents handed to the retriever. The literal
# opens with exactly three quotes so that no stray quote character ends up at
# the start of the text.
contents_description = '''The documents about three cities in New York, Paris and Seoul. 
We have three types of documents: Hotel, Restaurant, and Attraction.'''
# The search kwargs request k=5 and a score_threshold of 0.65 for the
# 'similarity_score_threshold' search type.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectorstore,
    document_contents=contents_description,
    metadata_field_info=metadata_fields,
    search_type='similarity_score_threshold',
    search_kwargs={'k': 5, 'score_threshold': 0.65}
)

In [None]:
# filterout the documents with the metadata
def remove_duplicates_by_id(documents: list) -> list:
    """Return ``documents`` without repeats of the same ``metadata['id']``.

    The first document seen for each id is kept and the original order is
    preserved; the input list is not modified.
    """
    seen_ids = set()
    unique_documents = []
    for document in documents:
        doc_id = document.metadata['id']
        if doc_id in seen_ids:
            continue
        seen_ids.add(doc_id)
        unique_documents.append(document)
    return unique_documents

# Retrieve documents for a sample question, then drop entries that share the
# same metadata id and compare the counts before and after.
# NOTE: this rebinds the notebook-level name `documents`.
documents = retriever.invoke(
    'What activities and highlights can visitors enjoy at the tower in the mountain of Seoul?'
)
print(f'Before removing duplicates: {len(documents)}')
documents = remove_duplicates_by_id(documents)
print(f'After removing duplicates: {len(documents)}')

In [None]:
# Prompt text with two placeholders, {question} and {context}. The doubled
# braces around the JSON example are escapes for literal braces. Every entry
# of the example object except the last ends with a comma, so the structure
# shown to the model is well-formed JSON.
template = '''You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
When you answer the question you must quote the context number and the relevant text.
The output format should be a JSON object with the following structure:
```json
{{
    "context_id": "The retrieved context number(list of integer) e.g., [1, 3]",
    "quote": "The relevant text(part of the context, list of strings), e.g., ['quote from context_id=1', 'quote from context_id=3']",
    "answer": "The final answer to the question"
}}
```

[Question]:
{question}

[Context]:
{context}

[Answer]:
'''

# Wrap the text in a PromptTemplate that declares both input variables.
prompt = PromptTemplate(
    template=template, 
    input_variables=['question', 'context']
)

# Chat model used for generation; `response_format` of type 'json_object' is
# forwarded through `model_kwargs`.
model_openai = ChatOpenAI(
    model='gpt-4o-mini',
    temperature=0.5,
    model_kwargs = {'response_format': {'type': 'json_object'}}
)

# Pipeline: the input passes through unchanged as 'question', while
# 'context' is the retriever output with duplicate ids removed and then
# formatted by numbering_context; both fill the prompt, which goes to the
# chat model, whose output is parsed as JSON.
chain = (
    {'question': RunnablePassthrough(), 
     'context': retriever | RunnableLambda(remove_duplicates_by_id) | RunnableLambda(numbering_context)}
    | prompt
    | model_openai
    | JsonOutputParser(pydantic_object=Answer)
)

question = 'What activities and highlights can visitors enjoy at the tower in the mountain of Seoul?'

# Tracing toggles are left commented out; uncomment both to trace this run.
# os.environ['LANGCHAIN_TRACING_V2'] = 'true'
answer = chain.invoke(question)
# os.environ['LANGCHAIN_TRACING_V2'] = 'false'

Check the detail process in LangSmith: https://smith.langchain.com/public/8c10b8f4-adf9-43ae-879b-272b2bb0daad/r

In [None]:
# Show the parsed answer: referenced context numbers, quotes, then the answer.
print(f'Referred Context: {answer["context_id"]}')
pprint(f'Quote: {answer["quote"]}')
print()
pprint(answer['answer'])