🥥 Content Refresher Agent (#1146)
KhoomeiK committed Jul 28, 2023
1 parent 15c88e7 commit d030610
Showing 7 changed files with 322 additions and 3 deletions.
19 changes: 19 additions & 0 deletions docs/development/workflows.mdx
@@ -19,6 +19,25 @@ Each frontend `Node` will have an associated `Block`.
`Node` represents the frontend view / position while the `Block` represents what will actually happen when that `Node` is run.
For example, a "SlackMessageBlock" is a `Block` that, when executed, sends the user a message on Slack.

## Adding a new block
To add a new block, start by updating the frontend:
- open next/src/services/workflow/node-block-definitions.ts
- add a `NodeBlockDefinition` for your block (name, type, description, icon, and input/output fields) and register it in `getNodeBlockDefinitions`; the `ContentRefresherAgent` definition in this commit is a complete example
- (block definitions will soon be served from the backend, so frontend edits won't be needed here)

Then update the backend:
- open platform/reworkd_platform/schemas/workflow/blocks
- add a new file for your block
- define the block's input and output types as classes
- add a class for the block (see the sketch after this list) with
- attributes: type, description, image_url, and input
- async method: `run`
- install dependencies for your block with `poetry add`
- open platform/reworkd_platform/web/api/workflow/blocks/web/__init__.py
- import your block
- add an if branch to get_block_runner
- `docker compose down; docker compose up --build`
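
Taken together, a new block file might look like the following minimal sketch. The `MyBlock*` names and the echo behaviour are illustrative placeholders rather than part of this commit; the real `ContentRefresherAgent` added below follows the same shape.

```python
from reworkd_platform.schemas.workflow.base import Block, BlockIOBase


class MyBlockInput(BlockIOBase):
    text: str


class MyBlockOutput(MyBlockInput):
    result: str


class MyBlock(Block):
    type = "MyBlock"
    description = "Echo the input text back as the result"
    image_url = "/tools/web.png"
    input: MyBlockInput

    async def run(self, workflow_id: str) -> MyBlockOutput:
        # A real block would call an external service or model here
        return MyBlockOutput(**self.input.dict(), result=self.input.text)
```

The matching branch in `get_block_runner` might then look something like this (its exact signature isn't shown in this commit, so treat the dispatch below as an assumption):

```python
if block.type == "MyBlock":
    return MyBlock(**block.dict())
```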

## Node types
- Triggers: Designate how workflows are run
- Actions: Performs a concrete piece of "work"
28 changes: 28 additions & 0 deletions next/src/services/workflow/node-block-definitions.ts
@@ -302,6 +302,33 @@ const FileUploadBlockDefinition: NodeBlockDefinition = {
output_fields: [],
};

const ContentRefresherAgent: NodeBlockDefinition = {
name: "Content Refresher Agent",
type: "ContentRefresherAgent",
description: "Refresh the content on an existing page",
image_url: "/tools/web.png",
icon: FaRobot,
input_fields: [
{
name: "url",
description: "The page whose content the agent will refresh",
type: "string",
},
],
output_fields: [
{
name: "original_content",
description: "The original content of the page",
type: "string",
},
{
name: "refreshed_content",
description: "The refreshed content for the page",
type: "string",
},
],
};

export const getNodeBlockDefinitions = (): NodeBlockDefinition[] => {
return [
APITriggerBlockDefinition,
@@ -316,6 +343,7 @@ export const getNodeBlockDefinitions = (): NodeBlockDefinition[] => {
TextInputWebhookBlockDefinition,
FileUploadBlockDefinition,
GenericLLMAgentBlockDefinition,
ContentRefresherAgent,
];
};

96 changes: 93 additions & 3 deletions platform/poetry.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions platform/pyproject.toml
@@ -49,6 +49,8 @@ stripe = "^5.4.0"
tabula-py = "^2.7.0"
slack-sdk = "^3.21.3"
python-docx = "^0.8.11"
scrapingbee = "^1.2.0"
anthropic = "^0.3.6"


[tool.poetry.dev-dependencies]
@@ -0,0 +1,173 @@
import re

import anthropic
import requests
from bs4 import BeautifulSoup
from loguru import logger
from scrapingbee import ScrapingBeeClient

from reworkd_platform.schemas.workflow.base import Block, BlockIOBase
from reworkd_platform.settings import settings


class ContentRefresherInput(BlockIOBase):
url: str


class ContentRefresherOutput(ContentRefresherInput):
original_content: str
refreshed_content: str


class ContentRefresherAgent(Block):
type = "ContentRefresherAgent"
description = "Refresh the content on an existing page"
input: ContentRefresherInput

async def run(self, workflow_id: str) -> ContentRefresherOutput:
logger.info(f"Starting {self.type}")
target_url = self.input.url

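# Step 1: scrape the target page and keep only its main-content paragraphs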
target_content = get_page_content(target_url)
logger.info(target_content)

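# Step 2: ask Claude for search keywords that describe the target content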
keywords = find_content_kws(target_content)
logger.info(keywords)

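# Step 3: find source articles on the same topic via web search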
source_urls = search_results(keywords)
if target_url in source_urls: # TODO: check based on content overlap
source_urls.remove(target_url)
logger.info(source_urls)

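# Step 4: scrape each source page for its main content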
source_contents = [
get_page_content(url)
for url in source_urls[:3] # TODO: remove limit of 3 sources
] # TODO: async/multithread the LLM calls
source_contents = [
content for content in source_contents if content is not None
]
logger.info(source_contents)

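# Step 5: for each source, have Claude list facts that the target lacks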
new_infos = "\n\n".join(
[
find_new_info(target_content, source_content)
for source_content in source_contents
]
)
logger.info(new_infos)

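# Step 6: have Claude rewrite the target article to include the new info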
updated_target_content = add_info(target_content, new_infos)
logger.info(updated_target_content)

return ContentRefresherOutput(
**self.input.dict(),
original_content=target_content,
refreshed_content=updated_target_content,
)


scraper = ScrapingBeeClient(
api_key=settings.scrapingbee_api_key,
)
claude = anthropic.Anthropic(
api_key=settings.anthropic_api_key,
)


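# Fetch a page (falling back to ScrapingBee when a plain request fails) and
# have Claude pick out only the main-content <p> paragraphs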
def get_page_content(url: str) -> str:
page = requests.get(url)
if page.status_code != 200:
page = scraper.get(url)

html = BeautifulSoup(page.content, "html.parser")

pgraphs = html.find_all("p")
pgraphs = "\n".join(
[
f"{i + 1}. " + re.sub(r"\s+", " ", p.text).strip()
for i, p in enumerate(pgraphs)
]
)

prompt = f"Below is a numbered list of the text in all the <p> tags on a web page:\n{pgraphs}\nSome of these lines may not be part of the main content of the page (e.g. footer text, ads, etc). Please list the line numbers that *are* part of the main content (i.e. the article's paragraphs) of the page. You can list consecutive line numbers as a range (e.g. 23-27) and separated by a comma."
response = claude.completions.create(
model="claude-2",
prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here are the line numbers of the main content:",
max_tokens_to_sample=500,
temperature=0,
)
line_nums = response.completion.strip()
if len(line_nums) == 0:
return ""

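# Claude returns entries like "3" or "5-12"; expand the ranges and strip the
# "N." numbering prefix that was added above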
pgraphs = pgraphs.split("\n")
content = []
for line_num in line_nums.split(","):
if "-" in line_num:
start, end = map(int, line_num.split("-"))
for i in range(start, end + 1):
text = ".".join(pgraphs[i - 1].split(".")[1:]).strip()
content.append(text)
else:
text = ".".join(pgraphs[int(line_num) - 1].split(".")[1:]).strip()
content.append(text)

return "\n".join(content)


def find_content_kws(content: str) -> str:
# Claude: find search keywords that content focuses on
prompt = f"Below is content from a web article:\n{content}\nPlease list the keywords that best describe the content of the article. Format them so we can use them to query a search engine effectively."
response = claude.completions.create(
model="claude-2",
prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here is a short search query that best matches the content of the article:",
max_tokens_to_sample=20,
temperature=0,
)
response_message = response.completion.strip()
return response_message


def search_results(search_query: str) -> list[str]:
# Query the Serper.dev Google search API for the given keywords
# (Serper's API takes the query in a JSON body)
response = requests.post(
"https://google.serper.dev/search",
headers={
"X-API-KEY": settings.serp_api_key or "",
"Content-Type": "application/json",
},
json={
"q": search_query,
},
)
response.raise_for_status()
search_results = response.json()
urls = [result["link"] for result in search_results["organic"]]
return urls


def find_new_info(target: str, source: str) -> str:
# Claude: info mentioned in source that is not mentioned in target
prompt = f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article."
response = claude.completions.create(
model="claude-2",
prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here is a list of claims in the SOURCE that are not in the TARGET:",
max_tokens_to_sample=5000,
temperature=0,
)
response_message = response.completion.strip()
new_info = "\n".join(response_message.split("\n\n"))
return new_info


def add_info(target: str, info: str) -> str:
# Claude: rewrite target to include the info
prompt = f"Below are notes from some SOURCE articles:\n{info}\n----------------\nBelow is the TARGET article:\n{target}\n----------------\nPlease rewrite the TARGET article to include the information from the SOURCE articles."
response = claude.completions.create(
model="claude-2",
prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here is a rewritten version of the target article that incorporates relevant information from the source articles:",
max_tokens_to_sample=5000,
temperature=0,
)
response_message = response.completion.strip()
return response_message
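
For orientation, here is a rough sketch of how this block might be exercised on its own. `Block`'s full field list isn't shown in this diff, so the constructor arguments below are assumptions, and a real run requires `scrapingbee_api_key`, `anthropic_api_key`, and `serp_api_key` to be set in settings.

```python
import asyncio

block = ContentRefresherAgent(
    type="ContentRefresherAgent",
    description="Refresh the content on an existing page",
    input=ContentRefresherInput(url="https://example.com/article"),
)
output = asyncio.run(block.run(workflow_id="wf-123"))  # workflow_id value is a placeholder
print(output.refreshed_content)
```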
2 changes: 2 additions & 0 deletions platform/reworkd_platform/settings.py
@@ -60,6 +60,8 @@ class Settings(BaseSettings):

replicate_api_key: Optional[str] = None
serp_api_key: Optional[str] = None
scrapingbee_api_key: Optional[str] = None
anthropic_api_key: Optional[str] = None

# Frontend URL for CORS
frontend_url: str = "http://localhost:3000"