# Recency Filtering

Showcase capabilities of the recency filtering node postprocessor

In [1]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.indices.postprocessor import (
    FixedRecencyPostprocessor,
    EmbeddingRecencyPostprocessor
)
from llama_index.node_parser import SimpleNodeParser
from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.response.notebook_utils import display_response
import os
import openai

ModuleNotFoundError: No module named 'llama_index.data_structs.node'

In [None]:
import sys

# Python 3 has no builtin `reload` and no `sys.setdefaultencoding`, so the
# Python 2 idiom `reload(sys); sys.setdefaultencoding("utf-8")` fails with a
# NameError on a fresh kernel. It is also unnecessary: Python 3 strings are
# Unicode and the default encoding is already UTF-8. Keep only a cheap check.
assert sys.getdefaultencoding() == "utf-8", sys.getdefaultencoding()

In [None]:
# Credentials must come from the environment, never from a literal in the notebook.
# NOTE(review): a literal key was previously committed in this cell; treat it as
# compromised and revoke it.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )
openai.api_key = os.environ["OPENAI_API_KEY"]

### Parse Documents into Nodes, add to Docstore

In this example, there are 2 different versions of PG's essay. They are largely identical **except** 
for the notes section and the amount of money raised for Viaweb. 

Amount raised — V1: 10k, V2: 30k (V2 also has no notes section at the end)

Document date — V1: 2023-01-01, V2: 2023-02-03

The idea is to encourage the index to fetch the most recent info (which is V2)

In [2]:
# load documents
from llama_index.storage.storage_context import StorageContext


def get_file_metadata(file_name: str):
    """Return the metadata dict for a file, keyed on whether it is a draft.

    A file whose name contains "draft" gets the later date so that it counts
    as the more recent version; any other file gets the earlier date.
    """
    # the draft is deliberately dated after the original
    date = "2023-02-03" if "draft" in file_name else "2023-01-01"
    return {"date": date}

# The two essay versions to load: the original first, the draft second.
essay_paths = [
    "./../data/paul_graham/paul_graham_essay.txt",
    "./../data/paul_graham/paul_graham_essay_draft_no_notes.txt",
]
# get_file_metadata is passed as the per-file metadata callback.
reader = SimpleDirectoryReader(
    input_files=essay_paths,
    file_metadata=get_file_metadata,
)
documents = reader.load_data()

NameError: name 'SimpleDirectoryReader' is not defined

In [None]:
# The first loaded document is the version that still has the "notes" section.
# Show its metadata followed by its full text.
old_document = documents[0]
print(old_document.extra_info, old_document.get_text(), sep="\n")

In [None]:
# The second loaded document is the version without the "notes" section.
# Show its metadata followed by its full text.
new_document = documents[1]
print(new_document.extra_info, new_document.get_text(), sep="\n")

In [None]:
# NOTE(review): scratch sanity-check output; nothing later in the notebook uses it.
print("test")

### Build Index

In [None]:
# Build a vector store index over the loaded documents.
index = VectorStoreIndex.from_documents(
    documents,
)

### Define Recency Postprocessors

In [None]:
# Fixed recency postprocessor, reusing the index's service context.
node_postprocessor = FixedRecencyPostprocessor(
    service_context=index.service_context,
)

In [None]:
# Default EmbeddingRecencyPostprocessor: filters nodes that have
# similar node embeddings
# (reminder: documents are made of several embeddings)
node_postprocessor_emb_node = EmbeddingRecencyPostprocessor(service_context=index.service_context)

In [None]:
# Document-level EmbeddingRecencyPostprocessor: filters nodes that have
# similar document embeddings
# (reminder: documents are made of several embeddings)
# NOTE(review): `storage_context` and `embedding_filter_level` are not defined
# anywhere in this notebook; confirm that the installed
# EmbeddingRecencyPostprocessor accepts these keyword arguments.
node_postprocessor_emb_doc = EmbeddingRecencyPostprocessor(
    service_context=index.service_context,
    storage_context=index.storage_context,
    embedding_filter_level="documents",
)

### Query Index

In [None]:
# Funding question: the two versions give different answers
#   old doc answer = $10K
#   new doc answer = $30K
query_funding = "How much did Paul raise from Julian for his startup Viaweb?"

# Notes-section question: the answer is only in the old document
query_notes_section = "Did John Collison read a draft of this document?"

In [None]:
# naive query: no node postprocessor, used as the baseline
query_engine = index.as_query_engine(
    similarity_top_k=3,
)
response_funding = query_engine.query(query_funding)
response_notes = query_engine.query(query_notes_section)

# Render both answers in this cell: the later query cells reassign
# `response_funding` / `response_notes`, so otherwise these results are never shown.
display_response(response_funding)
display_response(response_notes)

In [None]:
# query using the fixed recency node postprocessor
query_engine = index.as_query_engine(
    similarity_top_k=3,
    node_postprocessors=[node_postprocessor],
)
response_funding = query_engine.query(query_funding)
response_notes = query_engine.query(query_notes_section)

# Render both answers in this cell: the other query cells reassign
# `response_funding` / `response_notes`, so otherwise these results are never shown.
display_response(response_funding)
display_response(response_notes)

In [None]:
# query using the embedding-based (node-level) node postprocessor
query_engine = index.as_query_engine(
    similarity_top_k=3,
    node_postprocessors=[node_postprocessor_emb_node],
)
response_funding = query_engine.query(query_funding)
response_notes = query_engine.query(query_notes_section)

# Render both answers in this cell: the other query cells reassign
# `response_funding` / `response_notes`, so otherwise these results are never shown.
display_response(response_funding)
display_response(response_notes)

In [None]:
# query using the embedding-based document-filtering node postprocessor
query_engine = index.as_query_engine(
    similarity_top_k=3,
    node_postprocessors=[node_postprocessor_emb_doc],
)
response_funding = query_engine.query(query_funding)
response_notes = query_engine.query(query_notes_section)

# Render both answers in this cell: the other query cells reassign
# `response_funding` / `response_notes`, so otherwise these results are never shown.
display_response(response_funding)
display_response(response_notes)

### Query Index (Lower-Level Usage)

In this example we first get the full set of nodes from a query call, then send them to the node postprocessor, and
finally synthesize a response through a list index.

In [None]:
from llama_index import ListIndex

In [None]:
# Run the funding query with response_mode="no_text", then collect the
# underlying node of every entry in the response's source_nodes.
query_engine = index.as_query_engine(similarity_top_k=3, response_mode="no_text")
init_response = query_engine.query(query_funding)
resp_nodes = [scored.node for scored in init_response.source_nodes]

In [None]:
# Put the retrieved nodes in a list index and query it through the
# fixed recency node postprocessor.
list_index = ListIndex(resp_nodes)
query_engine = list_index.as_query_engine(
    node_postprocessors=[node_postprocessor]
)
# `query_str` is never defined in this notebook, so the original call raised a
# NameError; ask the same funding question that produced `resp_nodes`.
response = query_engine.query(query_funding)
display_response(response)