Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🥥 Content Refresher Agent #1146

Merged
merged 5 commits into from
Jul 28, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion docs/development/workflows.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,23 @@ The workflow hierarchy follows a graph-like structure. The frontend models only
The backend models represent the mechanisms to actually perform work for a given node.
Each frontend `Node` will have an associated `Block`.
`Node` represents the frontend view / position while the `Block` represents what will actually happen when that `Node` is run.
For example, a "SlackMessageBlock" is a `Block` that, when executed, would send a user a message on "Slack".
For example, a "SlackMessageBlock" is a `Block` that, when executed, would send a user a message on "Slack".

## Adding a new block
To add a new block, start by updating the frontend:
- open next/src/services/workflow/node-block-definitions.ts
- add a `NodeBlockDefinition` for your block (name, type, description, image_url, icon, input_fields, output_fields) and register it in `getNodeBlockDefinitions`
- (soon block definitions on frontend will be set from backend and edits won't be needed here)

Then update the backend:
- open platform/reworkd_platform/schemas/workflow/blocks
- add a new file for your block
- define the block's input and output types as classes
- add a class for the block with
- attributes: type, description, image_url, and input
- async method: `run`
- install dependencies for your block with `poetry add`
- open platform/reworkd_platform/web/api/workflow/blocks/web/__init__.py
- import your block
- add an if branch to get_block_runner
- `docker compose down; docker compose up --build`
28 changes: 28 additions & 0 deletions next/src/services/workflow/node-block-definitions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,33 @@ const WebInteractionAgent: NodeBlockDefinition = {
output_fields: [],
};

// Frontend definition for the Content Refresher agent block.
// Takes the URL of an existing page and produces both the page's original
// content and a refreshed version (backend runner: ContentRefresherAgent
// in platform/reworkd_platform/schemas/workflow/blocks).
const ContentRefresherAgent: NodeBlockDefinition = {
  name: "Content Refresher Agent",
  type: "ContentRefresherAgent",
  description: "Refresh the content on an existing page",
  image_url: "/tools/web.png",
  icon: FaRobot,
  input_fields: [
    {
      name: "url",
      description: "The page whose content the agent will refresh",
      type: "string",
    },
  ],
  output_fields: [
    {
      name: "original_content",
      description: "The original content of the page",
      type: "string",
    },
    {
      name: "refreshed_content",
      description: "The refreshed content for the page",
      type: "string",
    },
  ],
};

export const getNodeBlockDefinitions = (): NodeBlockDefinition[] => {
return [
UrlStatusCheckBlockDefinition,
Expand All @@ -209,6 +236,7 @@ export const getNodeBlockDefinitions = (): NodeBlockDefinition[] => {
TriggerBlockDefinition,
SummaryWebhookBlockDefinition,
TextInputWebhookBlockDefinition,
ContentRefresherAgent
];
};

Expand Down
228 changes: 161 additions & 67 deletions platform/poetry.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions platform/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ networkx = "^3.1"
pusher = "^3.3.2"
pypdf2 = "^3.0.1"
python-multipart = "^0.0.6"
scrapingbee = "^1.2.0"
anthropic = "^0.3.6"


[tool.poetry.dev-dependencies]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
from loguru import logger
from reworkd_platform.settings import settings
from reworkd_platform.schemas.workflow.base import Block, BlockIOBase

import re
import requests
from scrapingbee import ScrapingBeeClient
from bs4 import BeautifulSoup
import anthropic


class ContentRefresherInput(BlockIOBase):
    """Input for ContentRefresherAgent."""

    # URL of the page whose content will be refreshed
    url: str


class ContentRefresherOutput(ContentRefresherInput):
    """Output of ContentRefresherAgent: the page text before and after refresh."""

    # Main-content text scraped from the target page
    original_content: str
    # Rewritten text that incorporates new facts found in related sources
    refreshed_content: str


class ContentRefresherAgent(Block):
    """Workflow block that refreshes a web page's content with information
    gathered from related pages found via web search."""

    type = "ContentRefresherAgent"
    description = "Refresh the content on an existing page"
    input: ContentRefresherInput

    async def run(self) -> BlockIOBase:
        """Scrape the target page, search for related sources, extract facts
        those sources add, and rewrite the page to include them.

        Returns a ContentRefresherOutput carrying both the original and the
        refreshed text. NOTE(review): the helper calls below are synchronous
        network/LLM calls, so this coroutine blocks the event loop while
        they run — consider offloading to a thread pool.
        """
        logger.info(f"Starting {self.type}")
        target_url = self.input.url

        # 1. Pull the main-content paragraphs of the page being refreshed.
        target_content = get_page_content(target_url)
        logger.info(target_content)

        # 2. Derive a search query describing that content.
        keywords = find_content_kws(target_content)
        logger.info(keywords)

        # 3. Find related pages, excluding the target page itself.
        source_urls = search_results(keywords)
        if target_url in source_urls:  # TODO: check based on content overlap
            source_urls.remove(target_url)
        logger.info(source_urls)

        source_contents = [
            get_page_content(url)
            for url in source_urls[:3]  # TODO: remove limit of 3 sources
        ]  # TODO: async/multithread the LLM calls
        source_contents = [
            content for content in source_contents if content is not None
        ]
        logger.info(source_contents)

        # 4. Collect facts present in the sources but missing from the target.
        new_infos = "\n\n".join(
            [
                find_new_info(target_content, source_content)
                for source_content in source_contents
            ]
        )
        logger.info(new_infos)

        # 5. Rewrite the target content to incorporate those facts.
        updated_target_content = add_info(target_content, new_infos)
        logger.info(updated_target_content)

        return ContentRefresherOutput(
            **self.input.dict(),
            original_content=target_content,
            refreshed_content=updated_target_content,
        )


# Module-level API clients shared by the helper functions below.
# ScrapingBee is used as a fallback scraper for pages that block plain requests.
scraper = ScrapingBeeClient(
    api_key=settings.scrapingbee_api_key,
)
# Anthropic client used for all LLM steps (content extraction, keyword
# generation, fact finding, rewriting).
claude = anthropic.Anthropic(
    api_key=settings.anthropic_api_key,
)


def get_page_content(url: str) -> str:
    """Fetch `url` and return its main article text, one paragraph per line.

    Falls back to ScrapingBee when a plain request fails, then asks Claude
    which numbered <p> tags belong to the main content (vs. ads, footers).
    Returns '' when no main content could be identified.
    """
    page = requests.get(url)
    if page.status_code != 200:
        # Plain request was blocked or failed: retry via the ScrapingBee proxy.
        page = scraper.get(url)

    html = BeautifulSoup(page.content, "html.parser")

    paragraphs = html.find_all("p")
    pgraphs = "\n".join(
        [
            f"{i+1}. " + re.sub(r"\s+", " ", p.text).strip()
            for i, p in enumerate(paragraphs)
        ]
    )

    prompt = f"Below is a numbered list of the text in all the <p> tags on a web page:\n{pgraphs}\nSome of these lines may not be part of the main content of the page (e.g. footer text, ads, etc). Please list the line numbers that *are* part of the main content (i.e. the article's paragraphs) of the page. You can list consecutive line numbers as a range (e.g. 23-27) and separated by a comma."
    response = claude.completions.create(
        model="claude-2",
        prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here are the line numbers of the main content:",
        max_tokens_to_sample=500,
        temperature=0,
    )
    line_nums = response.completion.strip()
    if not line_nums:
        return ""

    lines = pgraphs.split("\n")

    def _line_text(idx: int) -> str:
        # Strip the "N. " prefix we added above; keep any later dots intact.
        return ".".join(lines[idx - 1].split(".")[1:]).strip()

    content = []
    # Parse the LLM's reply defensively: it may contain stray words, trailing
    # punctuation, or out-of-range numbers. The original code called int() on
    # raw tokens and indexed the list unchecked, so any such reply raised
    # ValueError/IndexError. Here malformed tokens are skipped and indices
    # are bounds-checked.
    for token in line_nums.split(","):
        match = re.fullmatch(r"\s*(\d+)\s*(?:-\s*(\d+))?\s*", token)
        if not match:
            continue
        start = int(match.group(1))
        end = int(match.group(2)) if match.group(2) else start
        for i in range(start, end + 1):
            if 1 <= i <= len(lines):
                content.append(_line_text(i))

    return "\n".join(content)


def find_content_kws(content: str) -> str:
    """Ask Claude for a short search-engine query that describes `content`."""
    prompt = f"Below is content from a web article:\n{content}\nPlease list the keywords that best describe the content of the article. Format them so we can use them to query a search engine effectively."
    completion = claude.completions.create(
        model="claude-2",
        prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here is a short search query that best matches the content of the article:",
        max_tokens_to_sample=20,
        temperature=0,
    )
    # The completion itself is the search query; trim surrounding whitespace.
    return completion.completion.strip()


def search_results(search_query: str) -> list[str]:
    """Query the Serper (Google SERP) API and return the organic result URLs.

    Raises requests.HTTPError when the API responds with an error status.
    """
    # Serper expects the query in a JSON POST body ({"q": ...}); the original
    # code passed it via `params=`, which put it in the URL query string and
    # sent an empty body that the API does not read.
    response = requests.post(
        "https://google.serper.dev/search",
        headers={
            "X-API-KEY": settings.serp_api_key or '',
            "Content-Type": "application/json",
        },
        json={
            "q": search_query,
        },
    )
    response.raise_for_status()
    search_results = response.json()
    # Tolerate a response with no organic results instead of raising KeyError.
    urls = [result["link"] for result in search_results.get("organic", [])]
    return urls


def find_new_info(target: str, source: str) -> str:
    """Ask Claude to list facts present in `source` but absent from `target`."""
    prompt = f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article."
    completion = claude.completions.create(
        model="claude-2",
        prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here is a list of claims in the SOURCE that are not in the TARGET:",
        max_tokens_to_sample=5000,
        temperature=0,
    )
    bullets = completion.completion.strip()
    # Collapse the blank lines between bullet points into a single newline.
    return "\n".join(bullets.split("\n\n"))


def add_info(target: str, info: str) -> str:
    """Ask Claude to rewrite `target` so it incorporates the notes in `info`."""
    prompt = f"Below are notes from some SOURCE articles:\n{info}\n----------------\nBelow is the TARGET article:\n{target}\n----------------\nPlease rewrite the TARGET article to include the information from the SOURCE articles."
    completion = claude.completions.create(
        model="claude-2",
        prompt=f"\n\nHuman: {prompt}\n\nAssistant: Here is a rewritten version of the target article that incorporates relevant information from the source articles:",
        max_tokens_to_sample=5000,
        temperature=0,
    )
    # The completion is the rewritten article; trim surrounding whitespace.
    return completion.completion.strip()


if __name__ == "main":
print("MAIN")
agent = ContentRefresherAgent(
id="test",
type="ContentRefresherAgent",
input=ContentRefresherInput(
url="https://www.science.org/content/article/embattled-physicist-files-patent-unprecedented-ambient-superconductor"
)
)
output = agent.run()
print(output)
KhoomeiK marked this conversation as resolved.
Show resolved Hide resolved
2 changes: 2 additions & 0 deletions platform/reworkd_platform/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ class Settings(BaseSettings):

replicate_api_key: Optional[str] = None
serp_api_key: Optional[str] = None
scrapingbee_api_key: Optional[str] = None
anthropic_api_key: Optional[str] = None

# Frontend URL for CORS
frontend_url: str = "http://localhost:3000"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,18 @@
from reworkd_platform.schemas.workflow.blocks.url_status_check import (
UrlStatusCheckBlock,
)
from reworkd_platform.schemas.workflow.blocks.agents.content_refresher_agent import (
ContentRefresherAgent,
)


def get_block_runner(block: Block) -> Block:
if block.type == "IfCondition":
return IfCondition(**block.dict())
if block.type == "WebInteractionAgent":
return WebInteractionAgent(**block.dict())
if block.type == "ContentRefresherAgent":
return ContentRefresherAgent(**block.dict())
if block.type == "ManualTriggerBlock":
return ManualTriggerBlock(**block.dict())
if block.type == "UrlStatusCheck":
Expand Down
Loading