🥥 Content Refresher Agent #1146

Merged · 5 commits · Jul 28, 2023
19 changes: 19 additions & 0 deletions docs/development/workflows.mdx
@@ -19,6 +19,25 @@ Each frontend `Node` will have an associated `Block`.
`Node` represents the frontend view / position while the `Block` represents what will actually happen when that `Node` is run.
For example, a "SlackMessageBlock" is a `Block` that, when executed, would send a user a message on "Slack".

## Adding a new block
To add a new block, start by updating the frontend:
- open `next/src/services/workflow/node-block-definitions.ts`
- add a `NodeBlockDefinition` for your block (name, type, description, image_url, icon, input_fields, output_fields), mirroring the existing definitions
- add it to the list returned by `getNodeBlockDefinitions`
- (soon, block definitions on the frontend will be served from the backend, and edits won't be needed here)

Then update the backend (a sketch follows this list):
- open `platform/reworkd_platform/schemas/workflow/blocks`
- add a new file for your block
- define the block's input and output types as classes
- add a class for the block with:
  - attributes: `type`, `description`, `image_url`, and `input`
  - an async method: `run`
- install any dependencies for your block with `poetry add`
- open `platform/reworkd_platform/web/api/workflow/blocks/web/__init__.py`
- import your block
- add an `if` branch to `get_block_runner` that dispatches to your block
- rebuild and restart: `docker compose down; docker compose up --build`
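
Below is a minimal sketch of the backend pattern, mirroring the `ContentRefresherAgent` block added in this PR. `EchoBlock` and its fields are hypothetical, purely for illustration:

```python
# Hypothetical example block; EchoBlock is not part of this PR.
from reworkd_platform.schemas.workflow.base import Block, BlockIOBase


class EchoInput(BlockIOBase):
    message: str


class EchoOutput(EchoInput):
    echoed_message: str


class EchoBlock(Block):
    type = "EchoBlock"
    description = "Echo a message back to the workflow"
    image_url = "/tools/web.png"
    input: EchoInput

    async def run(self, workflow_id: str) -> EchoOutput:
        # Real blocks do their work here (API calls, LLM calls, etc.)
        return EchoOutput(**self.input.dict(), echoed_message=self.input.message)
```

The `if` branch in `get_block_runner` would then map the `"EchoBlock"` type string to this class, following whatever dispatch pattern the existing branches in `platform/reworkd_platform/web/api/workflow/blocks/web/__init__.py` use.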

## Node types
- Triggers: designate how workflows are run
- Actions: perform a concrete piece of "work" (e.g. sending a Slack message or refreshing a page's content)
28 changes: 28 additions & 0 deletions next/src/services/workflow/node-block-definitions.ts
@@ -302,6 +302,33 @@ const FileUploadBlockDefinition: NodeBlockDefinition = {
output_fields: [],
};

const ContentRefresherAgent: NodeBlockDefinition = {
name: "Content Refresher Agent",
type: "ContentRefresherAgent",
description: "Refresh the content on an existing page",
image_url: "/tools/web.png",
icon: FaRobot,
input_fields: [
{
name: "url",
description: "The page whose content the agent will refresh",
type: "string",
},
],
output_fields: [
{
name: "original_content",
description: "The original content of the page",
type: "string",
},
{
name: "refreshed_content",
description: "The refreshed content for the page",
type: "string",
},
],
};

export const getNodeBlockDefinitions = (): NodeBlockDefinition[] => {
return [
APITriggerBlockDefinition,
@@ -316,6 +343,7 @@ export const getNodeBlockDefinitions = (): NodeBlockDefinition[] => {
TextInputWebhookBlockDefinition,
FileUploadBlockDefinition,
GenericLLMAgentBlockDefinition,
ContentRefresherAgent,
];
};

96 changes: 93 additions & 3 deletions platform/poetry.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions platform/pyproject.toml
@@ -49,6 +49,8 @@ stripe = "^5.4.0"
tabula-py = "^2.7.0"
slack-sdk = "^3.21.3"
python-docx = "^0.8.11"
scrapingbee = "^1.2.0"
anthropic = "^0.3.6"


[tool.poetry.dev-dependencies]
@@ -0,0 +1,173 @@
import re

import anthropic
import requests
from bs4 import BeautifulSoup
from loguru import logger
from scrapingbee import ScrapingBeeClient

from reworkd_platform.schemas.workflow.base import Block, BlockIOBase
from reworkd_platform.settings import settings


class ContentRefresherInput(BlockIOBase):
url: str


class ContentRefresherOutput(ContentRefresherInput):
original_content: str
refreshed_content: str


class ContentRefresherAgent(Block):
type = "ContentRefresherAgent"
description = "Refresh the content on an existing page"
input: ContentRefresherInput

async def run(self, workflow_id: str) -> ContentRefresherOutput:
logger.info(f"Starting {self.type}")
target_url = self.input.url

target_content = get_page_content(target_url)
logger.info(target_content)

keywords = find_content_kws(target_content)
logger.info(keywords)

source_urls = search_results(keywords)
if target_url in source_urls: # TODO: check based on content overlap
source_urls.remove(target_url)
logger.info(source_urls)

source_contents = [
get_page_content(url)
for url in source_urls[:3] # TODO: remove limit of 3 sources
] # TODO: async/multithread the LLM calls
source_contents = [
content for content in source_contents if content is not None
]
logger.info(source_contents)

new_infos = "\n\n".join(
[
find_new_info(target_content, source_content)
for source_content in source_contents
]
)
logger.info(new_infos)

updated_target_content = add_info(target_content, new_infos)
logger.info(updated_target_content)

return ContentRefresherOutput(
**self.input.dict(),
original_content=target_content,
refreshed_content=updated_target_content,
)


scraper = ScrapingBeeClient(
api_key=settings.scrapingbee_api_key,
)
claude = anthropic.Anthropic(
api_key=settings.anthropic_api_key,
)


def get_page_content(url: str) -> str:
page = requests.get(url)
if page.status_code != 200:
page = scraper.get(url)

html = BeautifulSoup(page.content, "html.parser")

pgraphs = html.find_all("p")
pgraphs = "\n".join(
[
f"{i + 1}. " + re.sub(r"\s+", " ", p.text).strip()
for i, p in enumerate(pgraphs)
]
)

prompt = f"Below is a numbered list of the text in all the <p> tags on a web page:\n{pgraphs}\nSome of these lines may not be part of the main content of the page (e.g. footer text, ads, etc). Please list the line numbers that *are* part of the main content (i.e. the article's paragraphs) of the page. You can list consecutive line numbers as a range (e.g. 23-27) and separated by a comma."
response = claude.completions.create(
model="claude-2",
prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here are the line numbers of the main content:",
max_tokens_to_sample=500,
temperature=0,
)
line_nums = response.completion.strip()
if len(line_nums) == 0:
return ""

pgraphs = pgraphs.split("\n")
content = []
for line_num in line_nums.split(","):
if "-" in line_num:
start, end = map(int, line_num.split("-"))
for i in range(start, end + 1):
text = ".".join(pgraphs[i - 1].split(".")[1:]).strip()
content.append(text)
else:
text = ".".join(pgraphs[int(line_num) - 1].split(".")[1:]).strip()
content.append(text)

return "\n".join(content)


def find_content_kws(content: str) -> str:
# Claude: find search keywords that content focuses on
prompt = f"Below is content from a web article:\n{content}\nPlease list the keywords that best describe the content of the article. Format them so we can use them to query a search engine effectively."
response = claude.completions.create(
model="claude-2",
prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here is a short search query that best matches the content of the article:",
max_tokens_to_sample=20,
temperature=0,
)
response_message = response.completion.strip()
return response_message


def search_results(search_query: str) -> list[str]:
# use SERP API
response = requests.post(
f"https://google.serper.dev/search",
headers={
"X-API-KEY": settings.serp_api_key or "",
"Content-Type": "application/json",
},
params={
"q": search_query,
},
)
response.raise_for_status()
search_results = response.json()
urls = [result["link"] for result in search_results["organic"]]
return urls


def find_new_info(target: str, source: str) -> str:
# Claude: info mentioned in source that is not mentioned in target
prompt = f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article."
response = claude.completions.create(
model="claude-2",
prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here is a list of claims in the SOURCE that are not in the TARGET:",
max_tokens_to_sample=5000,
temperature=0,
)
response_message = response.completion.strip()
new_info = "\n".join(response_message.split("\n\n"))
return new_info


def add_info(target: str, info: str) -> str:
# Claude: rewrite target to include the info
prompt = f"Below are notes from some SOURCE articles:\n{info}\n----------------\nBelow is the TARGET article:\n{target}\n----------------\nPlease rewrite the TARGET article to include the information from the SOURCE articles."
response = claude.completions.create(
Contributor review comment: Might be worth it to create some abstraction over the Claude models, or even just use LangChain, so we have retries with exponential backoff, etc. for prod.

model="claude-2",
prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here is a rewritten version of the target article that incorporates relevant information from the source articles:",
max_tokens_to_sample=5000,
temperature=0,
)
response_message = response.completion.strip()
return response_message
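
Following up on the reviewer's note above about retries: a minimal sketch of a backoff wrapper around these Claude calls, using `tenacity` (an assumed extra dependency, not added in this PR):

```python
# Hypothetical helper, not part of this PR; would require `poetry add tenacity`.
from tenacity import retry, stop_after_attempt, wait_exponential


@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=30))
def complete(prompt: str, assistant_prefix: str = "", max_tokens: int = 500) -> str:
    response = claude.completions.create(
        model="claude-2",
        prompt=f"\n\nHuman: {prompt}\n\nAssistant:{assistant_prefix}",
        max_tokens_to_sample=max_tokens,
        temperature=0,
    )
    return response.completion.strip()
```

Each helper above could then call `complete(...)` instead of hitting `claude.completions.create` directly, centralizing retries and model selection in one place.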
2 changes: 2 additions & 0 deletions platform/reworkd_platform/settings.py
@@ -60,6 +60,8 @@ class Settings(BaseSettings):

replicate_api_key: Optional[str] = None
serp_api_key: Optional[str] = None
scrapingbee_api_key: Optional[str] = None
anthropic_api_key: Optional[str] = None

# Frontend URL for CORS
frontend_url: str = "http://localhost:3000"
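
With pydantic's `BaseSettings`, these new keys are read from environment variables (presumably `REWORKD_PLATFORM_SCRAPINGBEE_API_KEY` and `REWORKD_PLATFORM_ANTHROPIC_API_KEY`, assuming the platform keeps its usual env prefix). Both default to `None`, so the ScrapingBee and Claude clients in the new block won't authenticate until they are set.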