In [53]:
%pip install -qU instructor openai pydantic graphviz requests pandas beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [213]:
from bs4 import BeautifulSoup, NavigableString, Tag
import pandas as pd
import re

def parse_html_to_dataframe(html_content):
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Initialize lists to store the extracted data
    section_names = []
    section_ids = []
    section_text_names = []
    section_texts = []
    
    # Find all span elements with class 'h3' or 'h4'
    for span in soup.find_all('span', class_='h3') + soup.find_all('span', class_='h4'):
        # Extract section name and ID
        a_tag = span.find('a', class_='selflink')
        if a_tag:
            section_id = a_tag.get('id')
            section_name = a_tag.text.strip()
            
            # Extract section text name (text after the section number)
            section_text_name = span.text.strip().split(None, 1)[1] if len(span.text.strip().split(None, 1)) > 1 else ""
            
            # Find the text content following the span
            section_text = ""
            next_element = span.next_sibling
            while next_element and not (isinstance(next_element, Tag) and next_element.name == 'span' and ('h3' or 'h4') in next_element.get('class', [])):
                if isinstance(next_element, NavigableString):
                    section_text += next_element.strip() + " "
                next_element = next_element.next_sibling
            
            # Clean up the section text
            section_text = re.sub(r'\s+', ' ', section_text).strip()
            
            # Append the extracted data to the lists
            section_names.append(section_name)
            section_ids.append(section_id)
            section_text_names.append(section_text_name)
            section_texts.append(section_text)
    
    # Create a DataFrame from the extracted data
    df = pd.DataFrame({
        'section_name': section_names,
        'section_id': section_ids,
        'section_text_name': section_text_names,
        'section_text': section_texts
    })
    
    return df

In [251]:
with open('rfc7231.html', 'r') as file:
    html_content = file.read()

df = parse_html_to_dataframe(html_content)

In [252]:
df

Unnamed: 0,section_name,section_id,section_text_name,section_text
0,1.1,section-1.1,Conformance and Error Handling,"The key words ""MUST"", ""MUST NOT"", ""REQUIRED"", ""SHALL"", ""SHALL NOT"", ""SHOULD"", ""SHOULD NOT"", ""RECOMMENDED"", ""MAY"", and ""OPTIONAL"" in this document are to be interpreted as described in [ ]. Conformance criteria and considerations regarding error handling are defined in ."
1,1.2,section-1.2,Syntax Notation,"This specification uses the Augmented Backus-Naur Form (ABNF) notation of [ ] with a list extension, defined in , that allows for compact definition of comma-separated lists using a '#' operator (similar to how the '*' operator indicates repetition). describes rules imported from other documents. shows the collected grammar with all list operators expanded to standard ABNF notation."
2,3.1,section-3.1,Representation Metadata,"Representation header fields provide metadata about the representation. When a message includes a payload body, the representation header fields describe how to interpret the representation data enclosed in the payload body. In a response to a HEAD request, the representation header fields describe the representation data that would have been enclosed in the payload body if the same request had been a GET. The following header fields convey representation metadata: +-------------------+-----------------+ | Header Field Name | Defined in... | +-------------------+-----------------+ | Content-Type | | | Content-Encoding | | | Content-Language | | | Content-Location | | +-------------------+-----------------+ HTTP uses Internet media types [ ] in the Content-Type ( ) and Accept ( ) header fields in order to provide open and extensible data typing and type negotiation. Media types define both a data format and various processing models: how to process that data in accordance with each context in which it is received. media-type = type ""/"" subtype *( OWS "";"" OWS parameter ) type = token subtype = token The type/subtype MAY be followed by parameters in the form of name=value pairs. parameter = token ""="" ( token / quoted-string )"
3,3.2,section-3.2,Representation Data,"The representation data associated with an HTTP message is either provided as the payload body of the message or referred to by the message semantics and the effective request URI. The representation data is in a format and encoding defined by the representation metadata header fields. The data type of the representation data is determined via the header fields Content-Type and Content-Encoding. These define a two-layer, ordered encoding model: representation-data := Content-Encoding( Content-Type( bits ) )"
4,3.3,section-3.3,Payload Semantics,"Some HTTP messages transfer a complete or partial representation as the message ""payload"". In some cases, a payload might contain only the associated representation's header fields (e.g., responses to HEAD) or only some part(s) of the representation data (e.g., the 206 (Partial Content) status code). The purpose of a payload in a request is defined by the method semantics. For example, a representation in the payload of a PUT request ( ) represents the desired state of the target resource if the request is successfully applied, whereas a representation in the payload of a POST request ( ) represents information to be processed by the target resource."
...,...,...,...,...
117,8.2.3,section-8.2.3,Registrations,The status code registry has been updated with the registrations below:
118,8.3.1,section-8.3.1,Considerations for New Header Fields,"Header fields are key:value pairs that can be used to communicate data about the message, its payload, the target resource, or the connection (i.e., control data). See for a general definition of header field syntax in HTTP messages. The requirements for header field names are defined in [ ]. Authors of specifications defining new fields are advised to keep the name as short as practical and not to prefix the name with ""X-"" unless the header field will never be used on the Internet. (The ""X-"" prefix idiom has been extensively misused in practice; it was intended to only be used as a mechanism for avoiding name collisions inside proprietary software or intranet processing, since the prefix would ensure that private names never collide with a newly registered Internet name; see [ ] for further information). New header field values typically have their syntax defined using ABNF ([ ]), using the extension defined in as necessary, and are usually constrained to the range of US-ASCII characters. Header fields needing a greater range of characters can use an encoding such as the one defined in [ ]. Leading and trailing whitespace in raw field values is removed upon field parsing ( ). Field definitions where leading or trailing whitespace in values is significant will have to use a container syntax such as quoted-string ( ). Because commas ("","") are used as a generic delimiter between field-values, they need to be treated with care if they are allowed in the field-value. Typically, components that might contain a comma are protected with double-quotes using the quoted-string ABNF production. For example, a textual date and a URI (either of which might contain a comma) could be safely carried in field-values like these: Example-URI-Field: ""http://example.com/a.html,foo"", ""http://without-a-comma.example.com/"" Example-Date-Field: ""Sat, 04 May 1996"", ""Wed, 14 Sep 2005"" Note that double-quote delimiters almost always are used with the quoted-string production; using a different syntax inside double-quotes will likely cause unnecessary confusion."
119,8.3.2,section-8.3.2,Registrations,"The ""Message Headers"" registry has been updated with the following permanent registrations: +-------------------+----------+----------+-----------------+ | Header Field Name | Protocol | Status | Reference | +-------------------+----------+----------+-----------------+ | Accept | http | standard | | | Accept-Charset | http | standard | | | Accept-Encoding | http | standard | | | Accept-Language | http | standard | | | Allow | http | standard | | | Content-Encoding | http | standard | | | Content-Language | http | standard | | | Content-Location | http | standard | | | Content-Type | http | standard | | | Date | http | standard | | | Expect | http | standard | | | From | http | standard | | | Location | http | standard | | | Max-Forwards | http | standard | | | MIME-Version | http | standard | | | Referer | http | standard | | | Retry-After | http | standard | | | Server | http | standard | | | User-Agent | http | standard | | | Vary | http | standard | | +-------------------+----------+----------+-----------------+ The change controller for the above registrations is: ""IETF (iesg@ietf.org) - Internet Engineering Task Force""."
120,8.4.1,section-8.4.1,Procedure,"Content coding registrations MUST include the following fields: o Name o Description o Pointer to specification text Names of content codings MUST NOT overlap with names of transfer codings ( ), unless the encoding transformation is identical (as is the case for the compression codings defined in ). Values to be added to this namespace require IETF Review (see ) and MUST conform to the purpose of content coding defined in this section. The ""HTTP Content Coding Registry"" has been updated with the registrations below: +----------+----------------------------------------+---------------+ | Name | Description | Reference | +----------+----------------------------------------+---------------+ | identity | Reserved (synonym for ""no encoding"" in | | | | Accept-Encoding) | | +----------+----------------------------------------+---------------+ This section is meant to inform developers, information providers, and users of known security concerns relevant to HTTP semantics and its use for transferring information over the Internet. Considerations related to message syntax, parsing, and routing are discussed in . The list of considerations below is not exhaustive. Most security concerns related to HTTP semantics are about securing server-side applications (code behind the HTTP interface), securing user agent"


In [256]:
# get the row for specific section
df[df['section_id'] == 'section-4.1']

Unnamed: 0,section_name,section_id,section_text_name,section_text
6,4.1,section-4.1,Overview,"The request method token is the primary source of request semantics; it indicates the purpose for which the client has made this request and what is expected by the client as a successful result. The request method's semantics might be further specialized by the semantics of some header fields when present in a request ( ) if those additional semantics do not conflict with the method. For example, a client can send conditional request header fields ( ) to make the requested action conditional on the current state of the target resource ([ ]). method = token HTTP was originally designed to be usable as an interface to distributed object systems. The request method was envisioned as applying semantics to a target resource in much the same way as invoking a defined method on an identified object would apply semantics. The method token is case-sensitive because it might be used as a gateway to object-based systems with case-sensitive method names. Unlike distributed objects, the standardized request methods in HTTP are not resource-specific, since uniform interfaces provide for better visibility and reuse in network-based systems [ ]. Once defined, a standardized method ought to have the same semantics when applied to any resource, though each resource determines for itself whether those semantics are implemented or allowed. This specification defines a number of standardized methods that are commonly used in HTTP, as outlined by the following table. By convention, standardized methods are defined in all-uppercase US-ASCII letters."


In [259]:

def filter_sections(df, sections):
    """
    Filter the DataFrame to include only specified sections and their subsections.
    
    :param df: pandas DataFrame to filter
    :param sections: list of section numbers to include
    :return: filtered pandas DataFrame
    """
    # Create a regular expression pattern to match main sections and any subsections
    pattern = r'^(' + '|'.join(map(str, sections)) + r')(\.\d+)*'
    
    # Filter the DataFrame based on the section_name column
    filtered_df = df[df['section_name'].str.match(pattern)]

    # drop rows with NaN values in any column
    filtered_df = filtered_df.dropna()
    
    return filtered_df

# Filter for sections
filtered_df = filter_sections(df, [4, 5, 6])
filtered_df

Unnamed: 0,section_name,section_id,section_text_name,section_text
6,4.1,section-4.1,Overview,"The request method token is the primary source of request semantics; it indicates the purpose for which the client has made this request and what is expected by the client as a successful result. The request method's semantics might be further specialized by the semantics of some header fields when present in a request ( ) if those additional semantics do not conflict with the method. For example, a client can send conditional request header fields ( ) to make the requested action conditional on the current state of the target resource ([ ]). method = token HTTP was originally designed to be usable as an interface to distributed object systems. The request method was envisioned as applying semantics to a target resource in much the same way as invoking a defined method on an identified object would apply semantics. The method token is case-sensitive because it might be used as a gateway to object-based systems with case-sensitive method names. Unlike distributed objects, the standardized request methods in HTTP are not resource-specific, since uniform interfaces provide for better visibility and reuse in network-based systems [ ]. Once defined, a standardized method ought to have the same semantics when applied to any resource, though each resource determines for itself whether those semantics are implemented or allowed. This specification defines a number of standardized methods that are commonly used in HTTP, as outlined by the following table. By convention, standardized methods are defined in all-uppercase US-ASCII letters."
7,4.2,section-4.2,Common Method Properties,"Request methods are considered ""safe"" if their defined semantics are essentially read-only; i.e., the client does not request, and does not expect, any state change on the origin server as a result of applying a safe method to a target resource. Likewise, reasonable use of a safe method is not expected to cause any harm, loss of property, or unusual burden on the origin server."
8,4.3,section-4.3,Method Definitions,"The GET method requests transfer of a current selected representation for the target resource. GET is the primary mechanism of information retrieval and the focus of almost all performance optimizations. Hence, when people speak of retrieving some identifiable information via HTTP, they are generally referring to making a GET request. It is tempting to think of resource identifiers as remote file system pathnames and of representations as being a copy of the contents of such files. In fact, that is how many resources are implemented (see for related security considerations). However, there are no such limitations in practice. The HTTP interface for a resource is just as likely to be implemented as a tree of content objects, a programmatic view on various database records, or a gateway to other information systems. Even when the URI mapping mechanism is tied to a file system, an origin server might be configured to execute the files with the request as input and send the output as the representation rather than transfer the files directly. Regardless, only the origin server needs to know how each of its resource"
9,5.1,section-5.1,Controls,Controls are request header fields that direct specific handling of the request. +-------------------+--------------------------+ | Header Field Name | Defined in... | +-------------------+--------------------------+ | Cache-Control | | | Expect | | | Host | | | Max-Forwards | | | Pragma | | | Range | | | TE | | +-------------------+--------------------------+
10,5.2,section-5.2,Conditionals,"The HTTP conditional request header fields [ ] allow a client to place a precondition on the state of the target resource, so that the action corresponding to the method semantics will not be applied if the precondition evaluates to false. Each precondition defined by"
...,...,...,...,...
101,6.6.2,section-6.6.2,501 Not Implemented,"The 501 (Not Implemented) status code indicates that the server does not support the functionality required to fulfill the request. This is the appropriate response when the server does not recognize the request method and is not capable of supporting it for any resource. A 501 response is cacheable by default; i.e., unless otherwise indicated by the method definition or explicit cache controls (see ). The 502 (Bad Gateway) status code indicates that the server, while acting as a gateway or proxy, received an invalid response from an inbound server it accessed while attempting to fulfill the request. The 503 (Service Unavailable) status code indicates that the server is currently unable to handle the request due to a temporary overload or scheduled maintenance, which will likely be alleviated after some delay. The server MAY send a Retry-After header field ( ) to suggest an appropriate amount of time for the client to wait before retrying the request. Note: The existence of the 503 status code does not imply that a server has to use it when becoming overloaded. Some servers might simply refuse the connection. The 504 (Gateway Timeout) status code indicates that the server, while acting as a gateway or proxy, did not receive a timely response from an upstream server it needed to access in order to complete the request."
102,6.6.3,section-6.6.3,502 Bad Gateway,"The 502 (Bad Gateway) status code indicates that the server, while acting as a gateway or proxy, received an invalid response from an inbound server it accessed while attempting to fulfill the request. The 503 (Service Unavailable) status code indicates that the server is currently unable to handle the request due to a temporary overload or scheduled maintenance, which will likely be alleviated after some delay. The server MAY send a Retry-After header field ( ) to suggest an appropriate amount of time for the client to wait before retrying the request. Note: The existence of the 503 status code does not imply that a server has to use it when becoming overloaded. Some servers might simply refuse the connection. The 504 (Gateway Timeout) status code indicates that the server, while acting as a gateway or proxy, did not receive a timely response from an upstream server it needed to access in order to complete the request."
103,6.6.4,section-6.6.4,503 Service Unavailable,"The 503 (Service Unavailable) status code indicates that the server is currently unable to handle the request due to a temporary overload or scheduled maintenance, which will likely be alleviated after some delay. The server MAY send a Retry-After header field ( ) to suggest an appropriate amount of time for the client to wait before retrying the request. Note: The existence of the 503 status code does not imply that a server has to use it when becoming overloaded. Some servers might simply refuse the connection. The 504 (Gateway Timeout) status code indicates that the server, while acting as a gateway or proxy, did not receive a timely response from an upstream server it needed to access in order to complete the request."
104,6.6.5,section-6.6.5,504 Gateway Timeout,"The 504 (Gateway Timeout) status code indicates that the server, while acting as a gateway or proxy, did not receive a timely response from an upstream server it needed to access in order to complete the request."


In [262]:
ideas = filtered_df['section_text_name'].tolist()
with open('ideas.txt', 'w') as file:
    file.write('\n'.join(ideas))

In [None]:
planning_meta_prompt = """
I'm about to give you the text in the following sections. Come up with a plan of how to model this content as a knowledge graph with Nodes and Edges. Which fields should you model as Nodes? Which fields should you model as Edges? Reply with a clear prompt to give to an AI model that captures the general idea without just listing the answer explicitly.  

Overview
Common Method Properties
Method Definitions
Controls
Conditionals
Content Negotiation
Authentication Credentials
Request Context
Overview of Status Codes
Informational 1xx
Successful 2xx
Redirection 3xx
Client Error 4xx
Server Error 5xx
Safe Methods
Idempotent Methods
Cacheable Methods
GET
HEAD
POST
PUT
DELETE
CONNECT
OPTIONS
TRACE
Expect
Max-Forwards
Quality Values
Accept
Accept-Charset
Accept-Encoding
Accept-Language
From
Referer
User-Agent
100 Continue
101 Switching Protocols
200 OK
201 Created
202 Accepted
203 Non-Authoritative Information
204 No Content
205 Reset Content
300 Multiple Choices
301 Moved Permanently
302 Found
303 See Other
305 Use Proxy
306 (Unused)
307 Temporary Redirect
400 Bad Request
402 Payment Required
403 Forbidden
404 Not Found
405 Method Not Allowed
406 Not Acceptable
408 Request Timeout
409 Conflict
410 Gone
411 Length Required
413 Payload Too Large
414 URI Too Long
415 Unsupported Media Type
417 Expectation Failed
426 Upgrade Required
500 Internal Server Error
501 Not Implemented
502 Bad Gateway
503 Service Unavailable
504 Gateway Timeout
505 HTTP Version Not Supported
"""

In [264]:
prompt = """
Objective: Develop a knowledge graph that captures the hierarchical and relational structure of HTTP methods, status codes, and headers, alongside their properties and interdependencies.

Nodes: Represent the key entities in the system. These include:

Method Types (e.g., GET, POST, PUT, DELETE): Each HTTP method has unique properties like idempotency, cacheability, and safety. These should be nodes because they encapsulate important method-specific attributes.
Status Codes (e.g., 200 OK, 404 Not Found): Each status code can be considered a node with attributes such as code group (1xx, 2xx, etc.), description, and the associated scenarios that trigger it.
Headers (e.g., Accept, User-Agent, Referer): HTTP headers define various metadata or control information about the request/response. Each header should be modeled as a node with its purpose, possible values, and applicable methods.
Edges: Represent the relationships and connections between these entities:

"Defines" Edge: Connect Method nodes to their respective properties such as Safe, Idempotent, or Cacheable.
"Triggers" Edge: Connect Methods to the possible Status Codes they can trigger.
"Requires" Edge: Connect Headers to the Methods or Status Codes that may depend on or utilize them.
"Relates To" Edge: Link various Status Codes to the HTTP Methods they typically accompany.
"Handles" Edge: Connect Conditionals and Controls to Methods to show which aspects of the HTTP request they manage.
Attributes for Nodes:

Method Nodes: Attributes include idempotency, safety, cacheability, and the HTTP verb.
Status Code Nodes: Attributes include the numeric code, description, and the code category (e.g., Informational, Success, Client Error).
Header Nodes: Attributes include possible values, impact on the request/response, and usage context (e.g., request or response headers).

Graph Hierarchy:
Start with high-level categories like Method Definitions, Status Codes, and Headers.
Within these, further split into specific entities (e.g., GET under Methods, 200 OK under Status Codes, Accept-Language under Headers).
Connect these entities with the appropriate relationships (Edges) to model the HTTP ecosystem.
"""


In [265]:
from pydantic import BaseModel, Field
from typing import List, Optional
from graphviz import Digraph

class Node(BaseModel):
    id: int
    label: str    
    color: str

    def __hash__(self):
        return hash((self.id, self.label, self.color))
    
    def __eq__(self, other):
        if isinstance(other, Node):
            return self.id == other.id and self.label == other.label and self.color == other.color
        return False

class Edge(BaseModel):
    source: int
    target: int
    label: str
    color: str = "black"

    def __hash__(self):
        return hash((self.source, self.target))
    
    def __eq__(self, other):
        if isinstance(other, Edge):
            return (self.source == other.source
                and self.target == other.target
                and self.label == other.label
                and self.color == other.color)
        return False

class KnowledgeGraph(BaseModel):
    nodes: Optional[List[Node]] = Field(..., default_factory=list)
    edges: Optional[List[Edge]] = Field(..., default_factory=list)

    def update(self, other: "KnowledgeGraph") -> "KnowledgeGraph":
        """Updates the current graph with the other graph, deduplicating nodes and edges."""
        if other.nodes is None:
            other.nodes = []

        if other.edges is None:
            other.edges = []
            
        return KnowledgeGraph(
            nodes=list(set(self.nodes + other.nodes)),
            edges=list(set(self.edges + other.edges)),
        )

    def draw(self, prefix: str = None):
        dot = Digraph(comment="Knowledge Graph")

        for node in self.nodes:  
            dot.node(str(node.id), node.label, color=node.color)

        for edge in self.edges:  
            dot.edge(
                str(edge.source), str(edge.target), label=edge.label, color=edge.color
            )
        dot.render(prefix, format="png", view=False)

In [266]:
from openai import OpenAI
import instructor

client = instructor.from_openai(OpenAI())

def generate_graph(input: List[str]) -> KnowledgeGraph:
    cur_state = KnowledgeGraph()  
    num_iterations = len(input)
    for i, inp in enumerate(input):
        new_updates = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": """You are an iterative knowledge graph builder.
                    You are given the current state of the graph, and you must append the nodes and edges to it.
                    Do not provide any duplicates and try to reuse nodes as much as possible.""",
                },
                {
                    "role": "user",
                    "content": f"""{prompt} Extract any new nodes and edges from the following:
                    # Part {i}/{num_iterations} of the input:

                    {inp}""",
                },
                {
                    "role": "user",
                    "content": f"""Here is the current state of the graph:
                    {cur_state.model_dump_json(indent=2)}""",
                },  
            ],
            response_model=KnowledgeGraph,
        )  # type: ignore

        # Update the current state
        cur_state = cur_state.update(new_updates)  
        cur_state.draw(prefix=f"./out/iteration_{i}")
    return cur_state

In [270]:
# make an array of the section_text_name and the section_text 
input = "<section-name>" + filtered_df['section_text_name'] + "</section-name> <section-text>" + filtered_df['section_text'] + "</section-text>"
# convert to list
input = input.tolist()
print(len(input))

In [None]:
# Future work - try to improve the graph by passing the whole graph, then iterating through the sections to review 
# and add/remove nodes and edges as needed now we have the whole graph constructed.