In [1]:
from bs4 import BeautifulSoup as bs
import requests

In [50]:
pycon_url = "https://in.pycon.org/cfp/pycon-india-2023/proposals"
resp = requests.get(pycon_url)
resp

<Response [200]>

In [51]:
soup = bs(resp.content, 'html.parser')
links = soup.find_all("h3", "proposal--title")

len(links)

268

talk: 35

workshops: 7

In [57]:
talks = links[:35]
workshops = links[35: 42]

talk_links = [t.a["href"] for t in talks]
workshop_links = [workshop.a["href"] for workshop in workshops]

In [33]:
PYCON_URL = "https://in.pycon.org"

def get_proposal_data(link):
    resp = requests.get(PYCON_URL+link)
    soup = bs(resp.content, 'html.parser')

    metadata = {}
    meta = soup.find("section", "col-sm-3 proposal-meta")
    if meta is not None:
        for r in meta.find_all("tr"):
            k, v = r.find_all('td')
            k = k.get_text()[:-1]
            v = v.get_text().strip()
            metadata[k] = v
    title = soup.find("h1", "proposal-title")
    metadata["title"] = title.get_text().strip()
    speaker_info = soup.find("p", "text-muted")
    metadata["speaker"] = speaker_info.b.get_text().strip()
    content = soup.find("section", "col-sm-8 proposal-writeup")
    
    return content.get_text(), metadata

In [41]:
c, m = get_proposal_data(workshop_links[3])

In [44]:
m

{'Section': 'Data Science, AI & ML',
 'Type': 'Workshops',
 'Target Audience': 'Intermediate',
 'Last Updated': '14 Sep, 2023',
 'title': 'All Them Data Engines: Pandas, Spark, Dask, Polars and more - Data Munging with Python circa 2023.',
 'speaker': 'shaurya shaurya3 (~shaurya3)'}

### LlamaIndex nodes
Now lets make the llamaIndex nodes from this and the metadata that can be obtained.

In [61]:
from llama_index.schema import TextNode
from tqdm import tqdm

In [58]:
len(talk_links+workshop_links)

42

In [62]:
nodes = []
for l in tqdm(talk_links+workshop_links):
    c, m = get_proposal_data(l)
    t = TextNode(text=c, metadata=m)
    nodes.append(t)

100%|███████████████████████████████████████████████████████████| 42/42 [00:21<00:00,  1.95it/s]


In [63]:
len(nodes)

42

In [64]:
# save the nodes for later
import pickle

with open("py23nodes.pkl", "wb") as f:
    pickle.dump(nodes, f)

## baseline LlamaIndex

In [65]:
from llama_index import VectorStoreIndex

baseline = VectorStoreIndex(nodes)

In [70]:
qe = baseline.as_query_engine()
q = "is there a llamaindex workshop?"
r = qe.query("is there a llamaindex workshop?")
print(r)

Yes, there is a LlamaIndex workshop mentioned in the context information.


In [68]:
baseline.storage_context.persist()

In [69]:
# evaluate
from llama_index.evaluation import FaithfulnessEvaluator, RelevancyEvaluator

service_context = baseline.service_context
faithfulness = FaithfulnessEvaluator(service_context=service_context)
relevancy = RelevancyEvaluator(service_context=service_context)

In [72]:
# attach to the same event-loop
import nest_asyncio

nest_asyncio.apply()

In [75]:
eval_result = faithfulness.evaluate_response(response=r)
"Pass" if eval_result.passing else "Fail"

'Fail'

In [86]:
r

Response(response='Yes, there is a LlamaIndex workshop mentioned in the context information.', source_nodes=[NodeWithScore(node=TextNode(id_='1e264a40-fe2f-4c0f-954b-32427f471b0a', embedding=None, metadata={'Section': 'Data Science, AI & ML', 'Type': 'Workshops', 'Target Audience': 'Beginner', 'Last Updated': '14 Sep, 2023', 'title': 'Mastering Retrieval Augmented Generation (RAG) with LlamaIndex and LLMs', 'speaker': 'ravi theja (~ravi1)'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='e14d7553deeccb8ea1d62416410198657cc8a709cd7f41f92b81101b1fea34fe', text="\n\nDescription:\nLlamaIndex is a toolkit designed to enhance the utility of Large Language Models (LLMs). It provides powerful capabilities to bridge the gap between your custom data and LLMs, thereby enabling the construction of sophisticated, data-driven applications. With LlamaIndex, harnessing the potential of retrieval and generation in language models becomes seamless and efficient.\

In [85]:
eval_result = relevancy.evaluate_response(
    query=q, response=r.response, contexts=[c.text for c in r.source_nodes]
)
eval_result

AttributeError: 'str' object has no attribute 'response'