In [1]:
import json
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
RETRIEVAL_SOURCES_PATH = '../data/retrieval_sources.json'

def _get_src_urls(src_path: str = RETRIEVAL_SOURCES_PATH):
    """Read the retrieval-source JSON file and return its URLs as one sequence.

    Args:
        src_path: Path to a JSON file whose top-level object has ``'bsky'``
            and ``'skyware'`` keys (presumably lists of URL strings).

    Returns:
        The ``'bsky'`` entries followed by the ``'skyware'`` entries.

    Raises:
        OSError: If ``src_path`` cannot be opened.
        KeyError: If either expected key is missing from the file.
    """
    # JSON text is UTF-8; an explicit encoding keeps decoding independent of
    # the platform's locale default.
    with open(src_path, encoding='utf-8') as retrieval_src_file:
        retrieval_srcs = json.load(retrieval_src_file)
    # The file is fully parsed at this point, so the handle can be closed
    # before the two groups are concatenated.
    return retrieval_srcs['bsky'] + retrieval_srcs['skyware']

In [4]:
# Fetch every configured source page. Each loader call is kept as its own
# entry, so `docs` holds one result per URL (flattened in a later cell).
urls = _get_src_urls()
docs = [
    WebBaseLoader(source_url).load()
    for source_url in urls
]

# Preview the first 1000 characters of the first document from the first URL.
print(
    docs[0][0].page_content.strip()[:1000]
)

The AT Protocol | Bluesky






Skip to main contentBlueskyDocsBlogShowcaseGitHubSearchGet StartedTutorialsStarter TemplatesAdvanced GuidesThe AT ProtocolFederation ArchitectureLinks, mentions, and rich textRate LimitsLabels and moderationPostsTimestampsFirehoseResolving IdentitiesCustom SchemasBackfilling the NetworkRead-After-WriteService AuthPDS EntrywayoEmbed and Post Embed WidgetAction Intent LinksOAuth Client ImplementationAPI Hosts and AuthHTTP ReferenceSupportAdvanced GuidesThe AT ProtocolOn this pageThe AT Protocol
The AT Protocol (Authenticated Transfer Protocol, or atproto) is a standard for public conversation and an open-source framework for building social apps.
It creates a standard format for user identity, follows, and data on social apps, allowing apps to interoperate and users to move across them freely. It is a federated network with account portability.
Basic Concepts​
Identity​
Users are identified by domain names on the AT Protocol. These domains map to cryptogra

In [6]:
# Splitter built via the tiktoken-based constructor: 100-unit chunks with a
# 50-unit overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=50,
    chunk_size=100,
)

# Collapse the per-URL results into a single flat list before splitting.
docs_list = [
    loaded_doc
    for per_url_docs in docs
    for loaded_doc in per_url_docs
]
doc_splits = text_splitter.split_documents(docs_list)

In [7]:
# Sanity check: show the text of the first chunk produced by the splitter.
doc_splits[0].page_content.strip()

'The AT Protocol | Bluesky'

In [8]:
# Embed every chunk with OpenAI embeddings and index them in memory, then
# expose the index through the retriever interface.
vectorstore = InMemoryVectorStore.from_documents(
    embedding=OpenAIEmbeddings(),
    documents=doc_splits,
)
retriever = vectorstore.as_retriever()

In [10]:
from langchain.tools.retriever import create_retriever_tool

# Wrap the retriever as a named tool; the name and description are what a
# tool-calling model sees when deciding whether to use it.
retriever_tool = create_retriever_tool(
    retriever=retriever,
    name="retrieve_bsky_docs",
    description="Search and return information about the Bluesky social app and Bluesky labelers.",
)

In [11]:
# Smoke test: call the tool directly with a sample query and display the result.
retriever_tool.invoke({'query': 'label configuration'})



Evaluate document relevance

In [17]:
from pydantic import BaseModel, Field
from typing import Literal
from langchain.chat_models import init_chat_model
from langgraph.graph import MessagesState

# Prompt template for the relevance grader. `{context}` receives the retrieved
# document text and `{question}` the user question; both are filled in with
# `str.format` by `evaluate_documents`.
EVAL_PROMPT = (
    "You are an evaluator assessing the relevance of a retrieved document to a user question. \n"
    "Here is the retrieved document: \n\n {context} \n\n"
    "Here is the user question: {question} \n"
    "If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n"
    # Fixed the malformed instruction ("binary score 'yes' or 'no' score").
    "Give a binary score, 'yes' or 'no', to indicate whether the document is relevant to the question."
)

class EvaluateDocuments(BaseModel):
    """Structured-output schema for the document relevance grader.

    Passed to ``with_structured_output`` so the chat model answers with a
    single ``binary_score`` field instead of free text.
    """

    # Restricting the field to the two accepted literals means variants such
    # as "Yes" or "relevant" are rejected by validation instead of silently
    # falling through an equality check against 'yes'.
    binary_score: Literal['yes', 'no'] = Field(
        description="Relevance score: 'yes' if relevant, or 'no' if not relevant"
    )

# Chat model used by `evaluate_documents` to grade retrieved context.
eval_model = init_chat_model("openai:gpt-4.1", temperature=0)

def evaluate_documents(state: MessagesState) -> Literal['generate_answer', 'rewrite_question']:
    """Grade the retrieved context and return the name of the next step.

    Args:
        state: Conversation state. ``state['messages'][0]`` is read as the
            user question and ``state['messages'][-1]`` as the retrieved
            context.

    Returns:
        ``'generate_answer'`` when the grader scores the context ``'yes'``,
        otherwise ``'rewrite_question'``.
    """
    messages = state['messages']
    # NOTE(review): the question is always taken from the first message; if a
    # rewritten question is appended later it is not used here. Confirm this
    # is intended.
    question = messages[0].content
    context = messages[-1].content

    prompt = EVAL_PROMPT.format(context=context, question=question)

    grader = eval_model.with_structured_output(EvaluateDocuments)
    response = grader.invoke([{'role': 'user', 'content': prompt}])

    # Normalise before comparing so answers like "Yes" or " yes " are not
    # misrouted to the rewrite branch by a case or whitespace difference.
    score = str(response.binary_score).strip().lower()

    return 'generate_answer' if score == 'yes' else 'rewrite_question'

In [None]:
from langchain_core.messages import convert_to_messages

# Hand-built conversation for exercising `evaluate_documents`: a user question,
# an assistant turn that calls the retriever tool, and a tool reply whose
# content ("sike!") is unrelated to the question.
# NOTE(review): `input` shadows the built-in `input()` for the rest of the
# kernel session; consider renaming once no other cell depends on this name.
input = {
    "messages": convert_to_messages(
        [
            {
                "role": "user",
                "content": "What does the Bluesky documentation say about Personal Data Servers?",
            },
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "1",
                        "name": "retrieve_bsky_docs",
                        "args": {"query": "bluesky personal data server"},
                    }
                ],
            },
            {"role": "tool", "content": "sike!", "tool_call_id": "1"},
        ]
    )
}
evaluate_documents(input)

'rewrite_question'

Rewrite question

In [19]:
# Chat model shared by `rewrite_question` and `generate_answer`.
response_model = init_chat_model("openai:gpt-4.1", temperature=0)

# Template asking the model to restate a question more clearly; `{question}`
# is substituted with `str.format`. The dashed lines fence off the original
# question from the surrounding instructions.
REWRITE_PROMPT = (
    "Look at the input and try to reason about the underlying semantic intent / meaning.\n"
    "Here is the initial question:\n"
    " ------- \n"
    "{question}\n"
    " ------- \n"
    "Formulate an improved question:"
)


def rewrite_question(state: MessagesState):
    """Ask the response model for a clearer phrasing of the first message.

    The content of ``state["messages"][0]`` is inserted into
    ``REWRITE_PROMPT`` and sent to ``response_model``. The reply text is
    returned as a single user-role message under the ``"messages"`` key.
    """
    original_question = state["messages"][0].content
    model_reply = response_model.invoke(
        [{"role": "user", "content": REWRITE_PROMPT.format(question=original_question)}]
    )
    rewritten_message = {"role": "user", "content": model_reply.content}
    return {"messages": [rewritten_message]}

In [20]:
# Hand-built conversation for exercising `rewrite_question`: a user question,
# an assistant turn that calls the retriever tool, and an unrelated tool reply.
# NOTE(review): `input` shadows the built-in `input()` for the rest of the
# kernel session; consider renaming once no other cell depends on this name.
input = {
    "messages": convert_to_messages(
        [
            {
                "role": "user",
                "content": "What does the Bluesky documentation say about Personal Data Servers?",
            },
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "1",
                        "name": "retrieve_bsky_docs",
                        "args": {"query": "bluesky personal data server"},
                    }
                ],
            },
            {"role": "tool", "content": "sike!", "tool_call_id": "1"},
        ]
    )
}

# Show the rewritten question, which is the content of the last returned message.
response = rewrite_question(input)
print(response["messages"][-1]["content"])

What information does the Bluesky documentation provide regarding Personal Data Servers, including their purpose, functionality, and role within the Bluesky ecosystem?


In [None]:
# Prompt template for the final answer. `{question}` and `{context}` are
# filled in with `str.format` by `generate_answer`.
GENERATE_PROMPT = (
    "You are an assistant for question-answering tasks about Bluesky's moderation system. Use the following pieces of retrieved context to answer the question.\n"
    "If the question asks for label configuration, label definitions, or moderation settings, provide the appropriate configuration in the correct format (JSON, code snippets, or structured data as needed).\n"
    "Try your best to suggest label names, descriptions, and severity, based on the provided question and context.\n"
    "If the question asks for general information or explanations, use three sentences maximum and keep the answer concise.\n"
    # The newline keeps the question text from running straight into the
    # "Context:" label in the rendered prompt.
    "Question: {question}\n"
    "Context: {context}"
)

def generate_answer(state: MessagesState):
    """Answer the question using the retrieved context.

    The first message's content is used as the question and the last
    message's content as the context. Both are inserted into
    ``GENERATE_PROMPT`` and sent to ``response_model``. The model's reply is
    returned as the only entry under the ``"messages"`` key.
    """
    conversation = state["messages"]
    filled_prompt = GENERATE_PROMPT.format(
        question=conversation[0].content,
        context=conversation[-1].content,
    )
    answer = response_model.invoke([{"role": "user", "content": filled_prompt}])
    return {"messages": [answer]}