## Setup and Import Libraries

In [2]:
import os
import pandas as pd
import numpy as np
import nest_asyncio
from llama_index.core.schema import Document
from llama_index.llms.openai import OpenAI
from llama_index.core import (
    SimpleDirectoryReader, VectorStoreIndex, Settings, 
    StorageContext, load_index_from_storage
)
from llama_index.core.postprocessor import MetadataReplacementPostProcessor, SentenceTransformerRerank
from llama_index.core.schema import NodeWithScore, QueryBundle, TextNode
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.response.notebook_utils import display_response
from trulens_eval.feedback import GroundTruthAgreement
from trulens_eval import TruLlama, Tru
from trulens.core import Feedback, FeedbackMode
from trulens_eval import OpenAI as fOpenAI
from utils import build_sentence_window_index, get_sentence_window_query_engine, get_prebuilt_trulens_recorder
from copy import deepcopy
from dotenv import load_dotenv

import warnings
warnings.filterwarnings('ignore')

In [3]:
load_dotenv()

True

In [4]:
# load_dotenv() (previous cell) has already copied any .env entries into
# os.environ, so re-assigning a key that is present is a no-op. The original
# unconditional assignment raised an opaque
# "TypeError: str expected, not NoneType" when a key was missing, so a missing
# key is now handled explicitly.
# NOTE(review): HUGGINGFACE_API_KEY is treated as optional because no visible
# cell reads it; confirm that `utils` does not depend on it either.
for _key, _required in (("OPENAI_API_KEY", True), ("HUGGINGFACE_API_KEY", False)):
    _value = os.getenv(_key)
    if _value is not None:
        os.environ[_key] = _value
    elif _required:
        raise RuntimeError(
            f"{_key} is not set; add it to your .env file or export it "
            "before running this notebook."
        )

In [5]:
# Load the source PDF (path is relative to the notebook's working directory).
# `documents` is the list returned by the reader; the next cell joins the
# `.text` of each entry.
documents = SimpleDirectoryReader(
    input_files=["./eBook-How-to-Build-a-Career-in-AI.pdf"]
).load_data()

In [6]:
# Merge the text of every loaded Document into a single Document, with a blank
# line between the pieces.
document = Document(text="\n\n".join([doc.text for doc in documents]))

In [7]:
# NOTE(review): `llm` is not referenced again in the visible cells; the later
# build_sentence_window_index(...) calls construct their own OpenAI instance
# with the same arguments.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

## Sentence-window retrieval setup

In [8]:
# Sentence-window parser: the demo cells below show that each sentence becomes
# its own node and that the surrounding sentences are stored in the node's
# metadata under the "window" key. The sentence's own text is stored under
# "original_text" (per the key name passed here).
# With window_size=3, the demo shows node 0's window reaching three sentences
# ahead.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

In [9]:
text = "hello. how are you? I am fine!  "

nodes = node_parser.get_nodes_from_documents([Document(text=text)])

In [10]:
print([x.text for x in nodes])

['hello. ', 'how are you? ', 'I am fine!  ']


In [11]:
print(nodes[1].metadata["window"])

hello.  how are you?  I am fine!  


In [12]:
text = "hello. foo bar. cat dog. mouse"

nodes = node_parser.get_nodes_from_documents([Document(text=text)])

In [13]:
print([x.text for x in nodes])

['hello. ', 'foo bar. ', 'cat dog. ', 'mouse']


In [14]:
print(nodes[0].metadata["window"])

hello.  foo bar.  cat dog.  mouse


In [15]:
print(nodes[1].metadata["window"])

hello.  foo bar.  cat dog.  mouse


## Building the Index

In [18]:
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

In [19]:
Settings.node_parser = node_parser

In [21]:
# Build the vector index over the single merged `document`, using the
# embedding model and sentence-window parser stored on `Settings` in the two
# cells above.
# NOTE(review): confirm that `from_documents` honours the `node_parser=` keyword
# in the installed llama_index version. The documented hook is
# `transformations=[...]`, so the parser may only be taking effect through
# `Settings.node_parser`.
sentence_index = VectorStoreIndex.from_documents(
    [document], 
    embed_model=Settings.embed_model,
    node_parser=Settings.node_parser
)

In [22]:
sentence_index.storage_context.persist(persist_dir="./sentence_index")

## Building the PostProcessor

In [23]:
# Postprocessor that swaps each retrieved node's text for the value stored in
# its "window" metadata (demonstrated a few cells below, where 'foo bar. '
# becomes the full window).
postproc = MetadataReplacementPostProcessor(
    target_metadata_key="window"
)

In [24]:
# Wrap the toy nodes from the "hello. foo bar. cat dog. mouse" example with a
# dummy score so they can be passed to the postprocessor.
scored_nodes = [NodeWithScore(node=x, score=1.0) for x in nodes]
# Keep deep copies of the untouched nodes for a before/after comparison;
# presumably needed because the postprocessor rewrites node content in place.
nodes_old = [deepcopy(n) for n in nodes]

In [25]:
nodes_old[1].text

'foo bar. '

In [26]:
replaced_nodes = postproc.postprocess_nodes(scored_nodes)

In [27]:
print(replaced_nodes[1].text)

hello.  foo bar.  cat dog.  mouse


## Adding a Reranker

In [28]:
# Reranker that scores nodes against the query with the
# "BAAI/bge-reranker-base" model. top_n=2 presumably caps how many nodes it
# returns (the toy example below only has two, so this is not visible there).
rerank = SentenceTransformerRerank(
    top_n=2, model="BAAI/bge-reranker-base"
)

In [29]:
query = QueryBundle("I want a dog.")

In [30]:
scored_nodes = [
    NodeWithScore(node=TextNode(text="This is a cat"), score=0.6),
    NodeWithScore(node=TextNode(text="This is a dog"), score=0.4),
]

In [31]:
reranked_nodes = rerank.postprocess_nodes(
    scored_nodes, query_bundle=query
)

In [32]:
print([(x.text, x.score) for x in reranked_nodes])

[('This is a dog', np.float32(0.918274)), ('This is a cat', np.float32(0.0014040894))]


## Running the query engine

In [33]:
# Retrieve the 6 most similar sentence nodes, then pass them through the
# postprocessors in list order (assumed to be applied sequentially - confirm):
# `postproc` swaps in the window text first, so `rerank` scores the wider
# context and keeps its top_n.
sentence_window_engine = sentence_index.as_query_engine(
    similarity_top_k=6, node_postprocessors=[postproc, rerank]
)

In [34]:
window_response = sentence_window_engine.query(
    "What are the keys to building a career in AI?"
)

In [35]:
display_response(window_response)

**`Final Response:`** The keys to building a career in AI involve learning foundational technical skills, working on projects, finding a job, and being part of a supportive community. Additionally, collaborating with others, influencing, and being influenced by others are critical aspects for success in AI career development.

## Putting It All Together

In [36]:
def build_sentence_window_index(
    documents, llm, embed_model="BAAI/bge-small-en-v1.5",
    sentence_window_size=3, save_dir="sentence_index"
):
    """Build a sentence-window ``VectorStoreIndex``, or load it from ``save_dir``.

    Note: the import cell also imports a function with this name from
    ``utils``; running this cell shadows that import.

    Side effects: overwrites the global ``Settings.embed_model`` and
    ``Settings.node_parser`` (and ``Settings.llm`` when ``llm`` is not None).

    Caching: if ``save_dir`` already exists the persisted index is loaded
    as-is, and ``documents`` and ``sentence_window_size`` are NOT re-applied.
    Delete the directory to force a rebuild after changing either of them.

    Args:
        documents: List of ``Document`` objects to index when no persisted
            index exists.
        llm: LLM to install as the global default (``Settings.llm``). Pass
            ``None`` to leave the current default untouched.
        embed_model: Hugging Face model name used for embeddings.
        sentence_window_size: ``window_size`` passed to
            ``SentenceWindowNodeParser``.
        save_dir: Directory the index is persisted to or loaded from.

    Returns:
        The built or loaded index.
    """
    # Each sentence becomes a node; its surrounding context is stored under the
    # "window" metadata key for later replacement at query time.
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=sentence_window_size,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )

    # Global configuration used both when building and when loading the index.
    Settings.embed_model = HuggingFaceEmbedding(
        model_name=embed_model
    )
    Settings.node_parser = node_parser
    # The original body accepted `llm` but never used it; install it as the
    # default LLM so query engines created from the returned index use it.
    if llm is not None:
        Settings.llm = llm

    if not os.path.exists(save_dir):
        # Create a new index from the caller's documents. The original body
        # indexed the notebook-global `[document]` here, silently ignoring the
        # `documents` argument.
        # NOTE(review): confirm `from_documents` honours `node_parser=`; the
        # documented keyword is `transformations=[...]`, so the parser may only
        # be taking effect through `Settings.node_parser`.
        sentence_index = VectorStoreIndex.from_documents(
            documents,
            embed_model=Settings.embed_model,
            node_parser=Settings.node_parser
        )
        sentence_index.storage_context.persist(persist_dir=save_dir)
    else:
        # Load the existing index from disk instead of re-embedding.
        storage_context = StorageContext.from_defaults(persist_dir=save_dir)
        sentence_index = load_index_from_storage(storage_context)

    return sentence_index

In [37]:
def get_sentence_window_query_engine(
    sentence_index, similarity_top_k=6, rerank_top_n=2, rerank_model="BAAI/bge-reranker-base"
):
    """Create a query engine over ``sentence_index`` with window replacement and reranking.

    Note: the import cell also imports a function with this name from
    ``utils``; running this cell shadows that import.

    Args:
        sentence_index: Index to query; must provide ``as_query_engine``.
        similarity_top_k: Number of nodes to retrieve before postprocessing.
        rerank_top_n: ``top_n`` passed to ``SentenceTransformerRerank``.
        rerank_model: Model name passed to ``SentenceTransformerRerank``.

    Returns:
        The query engine produced by ``sentence_index.as_query_engine``.
    """
    # Same order as before: the window-replacement step is listed ahead of the
    # reranker.
    node_postprocessors = [
        MetadataReplacementPostProcessor(target_metadata_key="window"),
        SentenceTransformerRerank(top_n=rerank_top_n, model=rerank_model),
    ]

    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=node_postprocessors,
    )

In [38]:
# "./sentence_index" is the directory persisted earlier in this notebook, so
# build_sentence_window_index takes its load-from-storage branch here (see the
# "Loading ... from ./sentence_index" lines in the output) rather than
# re-embedding the document.
index = build_sentence_window_index(
    [document],
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    save_dir="./sentence_index",
)

Loading llama_index.core.storage.kvstore.simple_kvstore from ./sentence_index\docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from ./sentence_index\index_store.json.


In [39]:
query_engine = get_sentence_window_query_engine(index, similarity_top_k=6)

## TruLens Evaluation

In [41]:
# Load the evaluation questions, one per line. Surrounding whitespace and the
# trailing newline are stripped; blank lines are skipped so that no empty query
# is sent to the engine during evaluation. (The old comment said the line was
# converted to an integer, but the questions are kept as strings.)
with open('generated_questions/generated_questions.text', 'r') as file:
    eval_questions = [line.strip() for line in file if line.strip()]

In [42]:
def run_evals(eval_questions, tru_recorder, query_engine):
    """Run every evaluation question through ``query_engine`` under ``tru_recorder``.

    The recorder context is entered and exited once per question, in order.
    Responses are not collected here; only the recorder observes the queries.

    Args:
        eval_questions: Iterable of question strings.
        tru_recorder: Context manager wrapping each query (a TruLens recorder
            in this notebook).
        query_engine: Object exposing ``query(question)``.

    Returns:
        None.
    """
    for question in eval_questions:
        # The `as recording` and `response =` bindings were never read, so they
        # are dropped; the context manager protocol is unchanged.
        with tru_recorder:
            query_engine.query(question)

In [43]:
Tru().reset_database()

🦑 Initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `TruSession` to prevent this.


Updating app_name and app_version in apps table: 0it [00:00, ?it/s]
Updating app_id in records table: 0it [00:00, ?it/s]
Updating app_json in apps table: 0it [00:00, ?it/s]


### Sentence Window Size = 1

In [44]:
# Index for the window-size-1 experiment, persisted in its own directory.
# Caveat: build_sentence_window_index loads an existing `save_dir` as-is, so
# delete "sentence_index_1" to force a rebuild after changing the documents,
# the embedding model, or `sentence_window_size`.
sentence_index_1 = build_sentence_window_index(
    documents,
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    embed_model="BAAI/bge-small-en-v1.5",
    sentence_window_size=1,
    save_dir="sentence_index_1",
)

In [45]:
sentence_window_engine_1 = get_sentence_window_query_engine(sentence_index_1)

In [46]:
tru_recorder_1 = get_prebuilt_trulens_recorder(
    sentence_window_engine_1,
    app_id='sentence window engine 1'
)

instrumenting <class 'llama_index.embeddings.huggingface.base.HuggingFaceEmbedding'> for base <class 'llama_index.embeddings.huggingface.base.HuggingFaceEmbedding'>
instrumenting <class 'llama_index.embeddings.huggingface.base.HuggingFaceEmbedding'> for base <class 'llama_index.core.embeddings.multi_modal_base.MultiModalEmbedding'>
instrumenting <class 'llama_index.embeddings.huggingface.base.HuggingFaceEmbedding'> for base <class 'llama_index.core.base.embeddings.base.BaseEmbedding'>
instrumenting <class 'llama_index.embeddings.huggingface.base.HuggingFaceEmbedding'> for base <class 'llama_index.core.schema.TransformComponent'>
instrumenting <class 'llama_index.embeddings.huggingface.base.HuggingFaceEmbedding'> for base <class 'llama_index.core.schema.BaseComponent'>
instrumenting <class 'llama_index.embeddings.huggingface.base.HuggingFaceEmbedding'> for base <class 'pydantic.main.BaseModel'>
instrumenting <class 'llama_index.embeddings.huggingface.base.HuggingFaceEmbedding'> for base

In [47]:
run_evals(eval_questions, tru_recorder_1, sentence_window_engine_1)

In [49]:
# Tru().run_dashboard()

### Sentence Window Size = 3

In [50]:
# Same pipeline as the window-size-1 experiment, with sentence_window_size=3
# and a separate persistence directory.
# Caveat: an existing "sentence_index_3" directory is loaded as-is; delete it
# to force a rebuild after changing the inputs.
sentence_index_3 = build_sentence_window_index(
    documents,
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    embed_model="BAAI/bge-small-en-v1.5",
    sentence_window_size=3,
    save_dir="sentence_index_3",
)

# Query engine with the default retrieval and rerank settings
# (similarity_top_k=6, rerank_top_n=2).
sentence_window_engine_3 = get_sentence_window_query_engine(sentence_index_3)

# A distinct app_id is passed for this run, so its records are labelled
# separately from the window-size-1 run.
tru_recorder_3 = get_prebuilt_trulens_recorder(
    sentence_window_engine_3,
    app_id='sentence window engine 3'
)

instrumenting <class 'llama_index.embeddings.huggingface.base.HuggingFaceEmbedding'> for base <class 'llama_index.embeddings.huggingface.base.HuggingFaceEmbedding'>
instrumenting <class 'llama_index.embeddings.huggingface.base.HuggingFaceEmbedding'> for base <class 'llama_index.core.embeddings.multi_modal_base.MultiModalEmbedding'>
instrumenting <class 'llama_index.embeddings.huggingface.base.HuggingFaceEmbedding'> for base <class 'llama_index.core.base.embeddings.base.BaseEmbedding'>
instrumenting <class 'llama_index.embeddings.huggingface.base.HuggingFaceEmbedding'> for base <class 'llama_index.core.schema.TransformComponent'>
instrumenting <class 'llama_index.embeddings.huggingface.base.HuggingFaceEmbedding'> for base <class 'llama_index.core.schema.BaseComponent'>
instrumenting <class 'llama_index.embeddings.huggingface.base.HuggingFaceEmbedding'> for base <class 'pydantic.main.BaseModel'>
instrumenting <class 'llama_index.embeddings.huggingface.base.HuggingFaceEmbedding'> for base

In [51]:
run_evals(eval_questions, tru_recorder_3, sentence_window_engine_3)

In [56]:
# Tru().run_dashboard()