# Test notebook for parsing and chunking text data.

In [8]:
from bs4 import BeautifulSoup
from langchain_text_splitters import RecursiveCharacterTextSplitter
import pandas as pd

In [2]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>"""

soup = BeautifulSoup(html_doc, 'html.parser')
soup.prettify() # Pretty print the parsed HTML

In [3]:
# Ways to navigate the parse tree
print(soup.title)
print(soup.title.name)
print(soup.title.string)
print(soup.title.parent.name)
print(soup.p)
print(soup.p['class'])
print(soup.a)
print(soup.find_all('a'))
print(soup.find(id="link3"))

<title>The Dormouse's story</title>
title
The Dormouse's story
head
<p class="title"><b>The Dormouse's story</b></p>
['title']
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>


# Try to do the same with a local text file.

In [14]:

df = pd.read_csv("../data/raw/test.csv", header=None)

all_text = "".join(df[0].tolist())

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=10,
    length_function=len,
)

chunks = text_splitter.split_text(all_text)

for chunk in chunks:
    print(chunk)
    print("\n---\n")


Retrieval-Augmented Generation (RAG) is a technique for enhancing the accuracy and reliability of

---

of large language models (LLMs) with facts fetched from external sources. It is a powerful approach

---

approach for building chatbots and question-answering systems.The first step in a RAG pipeline is

---

is data ingestion and processing. This involves loading documents from various sources, such as

---

such as text files, PDFs, or websites. After loading, the documents are split into smaller,

---

smaller, manageable chunks.These chunks are then converted into numerical representations called

---

called embeddings using a sentence-transformer model. The embeddings are stored in a specialized

---

vector database, which allows for efficient similarity search.When a user asks a question, their

---

their query is also converted into an embedding. The system then searches the vector database to

---

to find the text chunks with embeddings most similar to the query's embedd