In [8]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
import os
import openai
from dotenv import load_dotenv

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")


In [10]:
from llama_index.core import SimpleDirectoryReader

document = SimpleDirectoryReader(
    input_files = ["introduction-to-natural-language-processing.pdf"]
).load_data()

In [11]:
len(document)

52

In [12]:
from llama_index.core import Document

document = Document(
    text="\n\n".join([doc.text for doc in document])
)

In [13]:
document

Document(id_='f909835c-0d4f-4fde-bc07-6d3e76349191', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Introduction to natural\nlanguage processing\nR. Kibble\nCO3354\n2013\nUndergraduate study in \nComputing and related programmes\nThis is an extract from a subject guide for an undergraduate course offered as part of the \nUniversity of London International Programmes in Computing. Materials for these programmes are developed by academics at Goldsmiths.\nFor more information, see: www.londoninternational.ac.uk\n\nThis guide was prepared for the University of London International Programmes by:\nR. KibbleThis is one of a series of subject guides published by the University. We regret that due to pressure of work the author is \nunable to enter into any correspondence relating to, or arising from, the guide. If you have any comments on this subject guide, favourable or unfavourable, please use the form at the back of thi

In [14]:
from llama_index.core.node_parser import SentenceWindowNodeParser

node_parser = SentenceWindowNodeParser.from_defaults(
    # how many sentences on either side to capture
    window_size=3,
    # the metadata key that holds the window of surrounding sentences
    window_metadata_key="window",
    # the metadata key that holds the original sentence
    original_text_metadata_key="original_sentence",
)

In [15]:
text = "hello. how are you? I am fine!."

nodes = node_parser.get_nodes_from_documents([Document(text=text)])

In [16]:
print([x.text for x in nodes])

['hello. ', 'how are you? ', 'I am fine!.']


In [17]:
print(nodes[1].metadata["window"])

hello.  how are you?  I am fine!.


In [18]:
from llama_index.llms.openai import OpenAI

llm = OpenAI(model="gpt-4o", temperature=0.1)



In [19]:
from llama_index.core import Settings

Settings.llm = llm
Settings.embed_model = "local:BAAI/bge-small-en-v1.5"
Settings.node_parser = node_parser

In [20]:
from llama_index.core import VectorStoreIndex

sentence_index = VectorStoreIndex.from_documents(
    [document]
)

In [21]:
sentence_index.storage_context.persist(persist_dir="./sentence_index")

In [22]:
from llama_index.core.postprocessor import MetadataReplacementPostProcessor

postprocessor = MetadataReplacementPostProcessor(
    target_metadata_key="window",
)

In [23]:
from llama_index.core.schema import NodeWithScore
from copy import deepcopy

scored_nodes = [NodeWithScore(node=x, score=1.0) for x in nodes]
nodes_old = [deepcopy(n) for n in nodes]

In [24]:
scored_nodes

[NodeWithScore(node=TextNode(id_='c581b1c9-eec4-468a-a368-5e51b38ec726', embedding=None, metadata={'window': 'hello.  how are you?  I am fine!.', 'original_sentence': 'hello. '}, excluded_embed_metadata_keys=['window', 'original_sentence'], excluded_llm_metadata_keys=['window', 'original_sentence'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='b42d9cd5-89be-4782-a4c8-057a16d0105b', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='a28a178122f579a666ab87184a33af68c551424b8779e43c89f92ea9fe916dc5'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='35f6cde6-93c8-4c99-974d-fa5f6123208d', node_type=<ObjectType.TEXT: '1'>, metadata={'window': 'hello.  how are you?  I am fine!.', 'original_sentence': 'how are you? '}, hash='7d0281d700ed558feb9302d9c4d0a7b9336e5249070e1641ae9ca5abfdf95070')}, text='hello. ', mimetype='text/plain', start_char_idx=0, end_char_idx=7, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_s

In [25]:
nodes_old[0].text

'hello. '

In [26]:
postprocessor.postprocess_nodes(scored_nodes)

[NodeWithScore(node=TextNode(id_='c581b1c9-eec4-468a-a368-5e51b38ec726', embedding=None, metadata={'window': 'hello.  how are you?  I am fine!.', 'original_sentence': 'hello. '}, excluded_embed_metadata_keys=['window', 'original_sentence'], excluded_llm_metadata_keys=['window', 'original_sentence'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='b42d9cd5-89be-4782-a4c8-057a16d0105b', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='a28a178122f579a666ab87184a33af68c551424b8779e43c89f92ea9fe916dc5'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='35f6cde6-93c8-4c99-974d-fa5f6123208d', node_type=<ObjectType.TEXT: '1'>, metadata={'window': 'hello.  how are you?  I am fine!.', 'original_sentence': 'how are you? '}, hash='7d0281d700ed558feb9302d9c4d0a7b9336e5249070e1641ae9ca5abfdf95070')}, text='hello.  how are you?  I am fine!.', mimetype='text/plain', start_char_idx=0, end_char_idx=7, text_template='{metadata_str}\n\n{content}', metadata_template='{

In [27]:
from llama_index.core.indices.postprocessor import SentenceTransformerRerank

# BAAI/bge-reranker-base
# link: https://huggingface.co/BAAI/bge-reranker-base
rerank = SentenceTransformerRerank(
    top_n=2, model="BAAI/bge-reranker-base"
)

In [28]:
sentence_window_engine = sentence_index.as_query_engine(
    similarity_top_k=6, node_postprocessors=[postprocessor, rerank]
)

In [29]:
window_response = sentence_window_engine.query(
    "What are the keys to building a career in AI?"
)

In [30]:
window_response.response

'The context does not provide specific guidance on building a career in AI. However, it does highlight the importance of understanding artificial intelligence techniques and the scientific study of language, as these have been crucial in developing applications like automatic translation and voice recognition. Engaging with educational resources, such as the Natural Language Toolkit and related studies, could be beneficial for those interested in AI, particularly in the field of natural language processing.'

In [31]:
from llama_index.core import load_index_from_storage
from llama_index.core import VectorStoreIndex, StorageContext

index_dir = "./sentence_index"
sentence_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=index_dir)
        )

# RAG

In [93]:
import os
import openai
from dotenv import load_dotenv
from llama_index.core import Document
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core import load_index_from_storage
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.indices.postprocessor import SentenceTransformerRerank

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")


documents = SimpleDirectoryReader(input_files = ["introduction-to-natural-language-processing.pdf"]).load_data()
document = Document(text="\n\n".join([doc.text for doc in documents]))

def get_sentence_window_index(documents, index_dir, sentence_window_size=3):
    Node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=3,
        window_metadata_key="window",
        original_text_metadata_key="original_sentence",
    )

    Settings.llm = OpenAI()
    Settings.embed_model = "local:BAAI/bge-small-en-v1.5"
    Settings.node_parser = Node_parser

    if not os.path.exists(index_dir):
        sentence_index = VectorStoreIndex.from_documents([document])
        sentence_index.storage_context.persist(persist_dir=index_dir)
        
    else:
        sentence_index = load_index_from_storage(StorageContext.from_defaults(persist_dir=index_dir))
    return sentence_index

def get_sentence_window_engine(sentence_index):
    
    postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window",)
    rerank = SentenceTransformerRerank(top_n=2, model="BAAI/bge-reranker-base") 
    sentence_window_engine = sentence_index.as_query_engine(similarity_top_k=6, node_postprocessors=[postprocessor, rerank])
    
    return sentence_window_engine


In [94]:
from trulens_eval import TruLlama, Tru, Feedback, feedback
feedback.OpenAI()

OpenAI(tru_class_info=trulens.providers.openai.provider.OpenAI, endpoint=Endpoint(OpenAIEndpoint), model_engine='gpt-4o-mini')

In [95]:
index_1 = get_sentence_window_index(documents, index_dir, sentence_window_size=3)
engine_1 = get_sentence_window_engine(index_1)

In [83]:
# Use the recorder as a context manager to log queries
with tru_query_engine_recorder as recording:
    response = engine_1.query("Who introduced the notions of finite-state machines and context-free grammar (CFG) to linguistics?")# The record of the app invocation can be retrieved from the `recording`:
rec = recording.get()  # Use .get if only one record

# Display the record
display(rec)

# Run the dashboard to visualize feedback
tru.run_dashboard()
print(response)



Record(record_id='record_hash_571b299d9e5b5e46edb402dd97c153a9', app_id='app_hash_a5eec600241273ea48ca31c413e6bddf', cost=Cost(n_requests=1, n_successful_requests=1, n_completion_requests=1, n_classification_requests=0, n_classes=0, n_embedding_requests=0, n_embeddings=0, n_tokens=645, n_stream_chunks=0, n_prompt_tokens=624, n_completion_tokens=21, n_cortex_guardrails_tokens=0, cost=0.000978, cost_currency='USD'), perf=Perf(start_time=datetime.datetime(2024, 10, 23, 16, 16, 45, 440870), end_time=datetime.datetime(2024, 10, 23, 16, 16, 50, 535569)), ts=datetime.datetime(2024, 10, 23, 16, 16, 50, 537551), tags='-', meta=None, main_input='Who introduced the notions of finite-state machines and context-free grammar (CFG) to linguistics?', main_output='Chomsky introduced the notions of finite-state machines and context-free grammar (CFG) to linguistics.', main_error=None, calls=[RecordAppCall(call_id='aec7cf9d-f8f4-492f-afd3-66d5f00160e4', stack=[RecordAppCallMethod(path=Lens().app, method=

Starting dashboard ...


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.1.2:58519 .
Chomsky introduced the notions of finite-state machines and context-free grammar (CFG) to linguistics.


In [96]:
window_response_1 = engine_1.query(
    "Who introduced the notions of finite-state machines and context-free grammar (CFG) to linguistics?"
)
window_response_1.response

'Chomsky introduced the notions of finite-state machines and context-free grammar (CFG) to linguistics.'

In [51]:
window_response = sentence_window_engine.query(
    "Who introduced the notions of finite-state machines and context-free grammar (CFG) to linguistics?"
)
window_response.response

'The notions of finite-state machines and context-free grammar (CFG) were introduced to linguistics by Chomsky.'

# Evaluation

In [86]:
eval_questions = [
    "Who introduced the notions of finite-state machines and context-free grammar (CFG) to linguistics?",
    "How did linguists test formal rules of grammar according to Chomsky’s approach?",
    "What has contributed to making the vision of computers understanding ordinary language and holding conversations with humans more plausible in the 21st century?",
    "Why is it often necessary to assign a part of speech (POS) to a word based on its function in context rather than its inherent meaning?"
]

In [87]:
from trulens.core import TruSession

session = TruSession()
session.reset_database()

Updating app_name and app_version in apps table: 0it [00:00, ?it/s]
Updating app_id in records table: 0it [00:00, ?it/s]
Updating app_json in apps table: 0it [00:00, ?it/s]


In [97]:
from trulens_eval import Tru
from trulens_eval.feedback.provider import OpenAI
from trulens_eval import Feedback
from trulens_eval.app import App
import numpy as np

tru = Tru()

# Initialize provider class
provider = OpenAI()

# Select context to be used in feedback. The location of context is app specific.

def get_evaluation_response(rag_engine, app_id, eval_questions):
    
    context = App.select_context(rag_engine)

    # Define a groundedness feedback function
    f_groundedness = (
        Feedback(provider.groundedness_measure_with_cot_reasons)
        .on(context.collect())  # Collect context chunks into a list
        .on_output()
    )

    # Question/answer relevance between overall question and answer.
    f_answer_relevance = (
        Feedback(provider.relevance)
        .on_input_output()
    )

    # Question/statement relevance between question and each context chunk.
    f_context_relevance = (
        Feedback(provider.context_relevance_with_cot_reasons)
        .on_input()
        .on(context)
        .aggregate(np.mean)
    )

    from trulens_eval import TruLlama

    # Initialize the recorder
    tru_query_engine_recorder = TruLlama(
        rag_engine,
        app_id= app_id,
        feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance])
    
    for question in eval_questions:
        with tru_query_engine_recorder as recording:
            response = rag_engine.query(question)
    records = recording.get()
    
    return records


## sentence window size = 1

In [98]:
index_dir = "./sentence_index_1"
sw_index_1 = get_sentence_window_index(documents, index_dir, sentence_window_size=1)
sw_engine_1 = get_sentence_window_engine(sw_index_1)

records = get_evaluation_response(
    sw_engine_1,
    app_id='sentence window engine 1',
    eval_questions = eval_questions
)

display(records)
tru.run_dashboard()

TypeError: OpenAI.__init__() got an unexpected keyword argument 'model'

In [47]:
run_evals(eval_questions, tru_recorder_1, sentence_window_engine_1)

NameError: name 'tru_recorder_1' is not defined