In [None]:
import sys
import os

# Absolute path of the directory one level above the current working directory.
# NOTE: this is the project root only when the notebook is started from inside
# the "notebooks" directory, since it is derived from os.getcwd().
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Make packages under the parent directory importable. The membership check
# keeps the cell idempotent: re-running it does not pile up duplicate entries
# in sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

`WebPageLinkExtractor` is a Python class that extracts all links from a given webpage recursively up to a certain depth. The class takes a base URL as input along with optional parameters such as the maximum depth of recursion and the timeout value for fetching pages.

The class uses the `requests` library to fetch the content of the webpage and the `BeautifulSoup` library to extract all links from the HTML content. It then follows each link that belongs to the same domain as the base URL and extracts the links from each of those pages as well. This process continues recursively until the maximum depth of recursion is reached.

In [None]:
from data_loaders.websites import WebPageLinkExtractor

# Root page the crawl starts from.
base_url = 'https://python.langchain.com/en/latest/'
# NOTE(review): max_depth=1000 is far deeper than a documentation site is
# likely to be, so it presumably acts as "no practical limit" — confirm
# against WebPageLinkExtractor how the crawl terminates.
extractor = WebPageLinkExtractor(base_url, max_depth=1000)
# `links` is consumed by the next code cell, which filters it down to .html pages.
links = extractor.get_links()
# Report the depth attribute exposed by the extractor and the number of links collected.
print(f'\nMaximum depth reached: {extractor.current_max_depth}')
print(f'Total links found: {len(links)}\n')

Now keep only the `.html` links, because the code above collects every link it finds, including files such as `.md` or `.ipynb`.

In [None]:
# Keep only the URLs that end in '.html'; every other link is discarded.
html_links = list(filter(lambda url: url.endswith('.html'), links))
print(len(html_links))

The `nest_asyncio` library makes it possible to run asyncio event loops inside a Jupyter notebook. It is needed because the notebook kernel already runs its own asyncio event loop, and asyncio does not normally allow another event loop to be started while one is already running.

When using asyncio in a Jupyter notebook, nest_asyncio.apply() is used to patch the event loop and allow it to run in the notebook environment. This essentially allows the asyncio event loop to run inside the notebook's event loop without conflicts.

In [None]:
import nest_asyncio
# Patch asyncio so event loops can be nested inside the loop the notebook
# kernel is already running. Must be executed before any cell that drives
# its own event loop.
nest_asyncio.apply()

Use the `WebBaseLoader` class provided by the `langchain` package to asynchronously load all the html contents from the links

In [None]:
from langchain.document_loaders import WebBaseLoader

# One loader for the whole batch of filtered page URLs.
loader = WebBaseLoader(html_links)
# Throttle for the fetch — presumably caps outgoing requests at about 10 per
# second; confirm against the WebBaseLoader documentation.
loader.requests_per_second = 10
# NOTE(review): assumes a langchain release in which aload() is a blocking
# call that returns the list of loaded documents (later cells call len() on
# the result and index into it) — confirm for the installed version.
html_web_pages = loader.aload()

In [None]:
# Quick inspection of the load result: how many pages came back, then the
# first page's text, its repr, its metadata dict and the recorded source URL.
print(len(html_web_pages))
first_page = html_web_pages[0]
print(first_page.page_content)
print(first_page)
print(first_page.metadata)
print(first_page.metadata.get("source"))

Text Splitters

Note that for the tokenizer we defined the encoder as `"cl100k_base"`. This is a specific tiktoken encoder which is used by `gpt-3.5-turbo`. Other encoders exist and at the time of writing are summarized as:

| Encoder | Models |
| --- | --- |
| `cl100k_base` | `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002` |
| `p50k_base` | `text-davinci-003`, `code-davinci-002`, `code-cushman-002` |
| `r50k_base` | `text-davinci-001`, `davinci`, `text-similarity-davinci-001` |
| `gpt2` | `gpt2` |

You can find these details in the [Tiktoken `model.py` script](https://github.com/openai/tiktoken/blob/main/tiktoken/model.py), or look them up programmatically with `tiktoken.encoding_for_model`, e.g. `tiktoken.encoding_for_model("gpt-3.5-turbo")`.

In [None]:
from langchain.text_splitter import CharacterTextSplitter
# Measure chunk sizes in tokens with the `cl100k_base` tiktoken encoding, the
# one used by `gpt-3.5-turbo` and `text-embedding-ada-002`. The encoding must
# be named explicitly: when `encoding_name` is omitted the splitter falls back
# to its default encoding ("gpt2"), whose token counts differ.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",
    chunk_size=1000,   # target chunk size, in tokens
    chunk_overlap=20,  # tokens shared between neighbouring chunks
)

Then we split the loaded web pages into text chunks.

In [None]:
# Split every loaded page into chunks with the splitter configured in the
# previous code cell; the result is what later gets embedded and indexed.
web_pages_text_chunks = text_splitter.split_documents(html_web_pages)
# Sanity check: number of chunks produced, then the first chunk.
print(len(web_pages_text_chunks))
print(web_pages_text_chunks[0])

Generate embeddings using `OpenAIEmbeddings`.

In [None]:
import os
import getpass
from langchain.embeddings.openai import OpenAIEmbeddings

# SECURITY: never hardcode the OpenAI secret key in the notebook. A literal
# key was previously committed on this line; treat it as leaked and revoke it.
# The key is read from the OPENAI_API_KEY environment variable, falling back
# to an interactive prompt whose input is not echoed or stored in the notebook.
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
# Export the key so that later cells, which read OPENAI_API_KEY from the
# environment, also work when the key was entered at the prompt.
os.environ["OPENAI_API_KEY"] = openai_api_key

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=openai_api_key)

# Smoke test: embed one short sentence to confirm the credentials and model work.
temp_text = "This is a test document."
query_result = embeddings.embed_query(temp_text)
print(query_result)

Running Qdrant vector store to save the embeddings locally

In [None]:
from langchain.vectorstores import Qdrant
from langchain.embeddings.openai import OpenAIEmbeddings

# No key or model is passed here, so OpenAIEmbeddings() has to pick up its
# credentials by itself — presumably from the OPENAI_API_KEY environment
# variable; confirm. This rebinds the `embeddings` name from the previous cell.
embeddings = OpenAIEmbeddings()
# Address of the Qdrant server the collection is written to.
qdrant_url = "http://localhost:6333/"

# NOTE(review): only chunks 1..4 are embedded and stored — the [1:5] slice
# skips chunk 0 and everything from index 5 onwards. This looks like a
# cost-saving sample; confirm whether the full chunk list should be indexed.
qdrant = Qdrant.from_documents(documents=web_pages_text_chunks[1:5],
                               embedding=embeddings, 
                               url=qdrant_url, 
                               collection_name="langchain_documents")

Set the OpenAI API key as an environment variable in your system. In Linux or macOS, you can do this by running the following command in a terminal: 

`export OPENAI_API_KEY=<your_key_here>`.

Restart your Jupyter notebook to ensure the environment variable is loaded.

In [None]:
import os
import qdrant_client

# Same collection name that the indexing cell wrote the documents to.
collection_name = "langchain_documents"
# Local Qdrant endpoint. The URL already contains port 6333; qdrant_port
# repeats it for the client constructor used in the similarity-search cell.
qdrant_url = "http://localhost:6333/"
qdrant_port = 6333
# Raises KeyError if OPENAI_API_KEY is not set in the environment.
openai_api_key = os.environ["OPENAI_API_KEY"]
# Example question used by the retrieval and question-answering cells.
query = "What wrappers are provided by SearxNG search API"

Retrieval

Similarity search
The simplest scenario for using the Qdrant vector store is to perform a similarity search. Under the hood, the query is encoded with the `embedding_function` and used to find similar documents in the Qdrant collection.

In [None]:
# Connect to the Qdrant server described by the configuration cell.
client = qdrant_client.QdrantClient(url=qdrant_url, port=qdrant_port)

# Wrap the existing collection. Use the shared `collection_name` setting
# instead of repeating the literal, so the name is changed in one place only.
# Queries are embedded with the current `embeddings` object.
qdrant = Qdrant(client=client,
                collection_name=collection_name,
                embedding_function=embeddings.embed_query)

found_docs = qdrant.similarity_search(query)
# An empty or missing index yields no hits; report that instead of failing
# with an IndexError on found_docs[0].
if found_docs:
    print(found_docs[0].page_content)
else:
    print(f"No documents in collection '{collection_name}' matched the query.")

Sometimes we want to perform the search and also obtain a relevance score, to know how good a particular result is.

In [None]:
# Same query as the plain similarity search, but every hit comes back paired
# with a score. Note that this rebinds `found_docs` to the scored results.
found_docs = qdrant.similarity_search_with_score(query)
# Unpack the first hit into its document and its score
# (raises IndexError if the search returned nothing).
document, score = found_docs[0]
print(document.page_content)
print(f"\nScore: {score}")

In [None]:

from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from qdrant_client import QdrantClient


# Fresh Qdrant client for the question-answering step, addressed by URL only.
client = QdrantClient(url=qdrant_url)

# Embedding model and vector-store wrapper, both built from the shared
# configuration values (API key, collection name).
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=openai_api_key,
)
qdrant = Qdrant(
    client=client,
    collection_name=collection_name,
    embedding_function=embeddings.embed_query,
)

# Fetch the two chunks closest to the question; they become the context
# documents handed to the chain.
search_results = qdrant.similarity_search(query, k=2)

# Build the "stuff" question-answering chain on top of the OpenAI LLM and run
# it on the retrieved documents plus the question.
llm = OpenAI(openai_api_key=openai_api_key, temperature=0.2)
chain = load_qa_chain(llm, chain_type="stuff")
chain_inputs = {"input_documents": search_results, "question": query}
results = chain(chain_inputs, return_only_outputs=True)

print(results["output_text"])