From 2a7620e006c6d3f65d439eb9ba430c3cd459b1bd Mon Sep 17 00:00:00 2001 From: Rishabh Date: Thu, 3 Aug 2023 16:30:17 -0700 Subject: [PATCH 1/7] pass source metadata to output + make output follow og doc format --- .../blocks/agents/content_refresher_agent.py | 35 +++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py b/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py index d6208f4ddc..cbbfc3c44f 100644 --- a/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py +++ b/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py @@ -35,21 +35,23 @@ async def run(self, workflow_id: str, **kwargs: Any) -> ContentRefresherOutput: keywords = await find_content_kws(target_content) logger.info(keywords) - source_urls = search_results(keywords) - if target_url in source_urls: # TODO: check based on content overlap - source_urls.remove(target_url) - logger.info(source_urls) + sources = search_results(keywords) + sources = [ + (url, title) for url, title in sources if url != target_url + ] # TODO: check based on content overlap + logger.info(sources) source_contents = [ - await get_page_content(url) - for url in source_urls[:3] # TODO: remove limit of 3 sources + (url, title, await get_page_content(url)) for url, title in sources[:3] ] source_contents = [ - content for content in source_contents if content is not None + (url, title, page_content) + for url, title, page_content in source_contents + if page_content is not None ] - logger.info(source_contents) + new_info = [ await find_new_info(target_content, source_content) for source_content in source_contents @@ -134,7 +136,7 @@ async def find_content_kws(content: str) -> str: ) -def search_results(search_query: str) -> list[str]: +def search_results(search_query: str) -> list[tuple[str, str]]: # use SERP API response = requests.post( f"https://google.serper.dev/search", @@ -147,14 +149,19 @@ def search_results(search_query: str) -> list[str]: }, ) response.raise_for_status() - urls = [result["link"] for result in response.json()["organic"]] - return urls + source_information = [ + (result["link"], result["title"]) for result in response.json()["organic"] + ] + return source_information + +async def find_new_info(target: str, source: tuple[str, str, str]) -> str: + source_info = f"{source[0]}, {source[1]}" + source_content = source[2] -async def find_new_info(target: str, source: str) -> str: # Claude: info mentioned in source that is not mentioned in target prompt = HumanAssistantPrompt( - human_prompt=f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article.", + human_prompt=f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source_content}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article.\n----------------\nAt the top of the list, include a source citation (you MUST include the url AND full source/article title). Source information: \n{source_info}", assistant_prompt="Here is a list of claims in the SOURCE that are not in the TARGET:", ) @@ -170,7 +177,7 @@ async def find_new_info(target: str, source: str) -> str: async def add_info(target: str, info: str) -> str: # Claude: rewrite target to include the info prompt = HumanAssistantPrompt( - human_prompt=f"Below are notes from some SOURCE articles:\n{info}\n----------------\nBelow is the TARGET article:\n{target}\n----------------\nPlease rewrite the TARGET article to include the information from the SOURCE articles.", + human_prompt=f"Below are notes from some SOURCE articles:\n{info}\n----------------\nBelow is the TARGET article:\n{target}\n----------------\nPlease rewrite the TARGET article to include the information from the SOURCE articles. Maintain the format of the TARGET article. At the end of the article, include a list of source references (source url, title, and any additional information) ONLY for added information from SOURCE articles using the following example format: 'Source: https://www.wisnerbaum.com/prescription-drugs/gardasil-lawsuit/, Gardasil Vaccine Lawsuit Update August 2023 - Wisner Baum' Do not add citations for any info in the TARGET article.", assistant_prompt="Here is a rewritten version of the target article that incorporates relevant information from the source articles:", ) From 605f8a2de7d416b874a86b88572806bec38e3213 Mon Sep 17 00:00:00 2001 From: Rishabh Date: Thu, 3 Aug 2023 16:45:28 -0700 Subject: [PATCH 2/7] fix source listing --- .../schemas/workflow/blocks/agents/content_refresher_agent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py b/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py index cbbfc3c44f..f11c9d54c3 100644 --- a/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py +++ b/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py @@ -161,7 +161,7 @@ async def find_new_info(target: str, source: tuple[str, str, str]) -> str: # Claude: info mentioned in source that is not mentioned in target prompt = HumanAssistantPrompt( - human_prompt=f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source_content}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article.\n----------------\nAt the top of the list, include a source citation (you MUST include the url AND full source/article title). Source information: \n{source_info}", + human_prompt=f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source_content}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article. Under the list, write a source citation (you MUST include the url AND full source title provided below: \n{source_info}", assistant_prompt="Here is a list of claims in the SOURCE that are not in the TARGET:", ) @@ -177,7 +177,7 @@ async def find_new_info(target: str, source: tuple[str, str, str]) -> str: async def add_info(target: str, info: str) -> str: # Claude: rewrite target to include the info prompt = HumanAssistantPrompt( - human_prompt=f"Below are notes from some SOURCE articles:\n{info}\n----------------\nBelow is the TARGET article:\n{target}\n----------------\nPlease rewrite the TARGET article to include the information from the SOURCE articles. Maintain the format of the TARGET article. At the end of the article, include a list of source references (source url, title, and any additional information) ONLY for added information from SOURCE articles using the following example format: 'Source: https://www.wisnerbaum.com/prescription-drugs/gardasil-lawsuit/, Gardasil Vaccine Lawsuit Update August 2023 - Wisner Baum' Do not add citations for any info in the TARGET article.", + human_prompt=f"Below are notes from some SOURCE articles:\n{info}\n----------------\nBelow is the TARGET article:\n{target}\n----------------\nPlease rewrite the TARGET article to include the information from the SOURCE articles. Maintain the format of the TARGET article. At the end of the article, include a list of sources (source url, title, and any additional information) ONLY for added information from SOURCE articles using the following example format: 'Source: https://www.wisnerbaum.com/prescription-drugs/gardasil-lawsuit/, Gardasil Vaccine Lawsuit Update August 2023 - Wisner Baum' Do not add citations for any info in the TARGET article.", assistant_prompt="Here is a rewritten version of the target article that incorporates relevant information from the source articles:", ) From 8ed1c0526c994fa086d8c11b9ec6d9629b64a44c Mon Sep 17 00:00:00 2001 From: Rishabh Date: Thu, 3 Aug 2023 16:55:49 -0700 Subject: [PATCH 3/7] improve var name --- .../schemas/workflow/blocks/agents/content_refresher_agent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py b/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py index f11c9d54c3..5c11a981c6 100644 --- a/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py +++ b/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py @@ -156,12 +156,12 @@ def search_results(search_query: str) -> list[tuple[str, str]]: async def find_new_info(target: str, source: tuple[str, str, str]) -> str: - source_info = f"{source[0]}, {source[1]}" + source_metadata = f"{source[0]}, {source[1]}" source_content = source[2] # Claude: info mentioned in source that is not mentioned in target prompt = HumanAssistantPrompt( - human_prompt=f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source_content}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article. Under the list, write a source citation (you MUST include the url AND full source title provided below: \n{source_info}", + human_prompt=f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source_content}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article. Under the list, write a source citation (you MUST include the url AND full source title provided below: \n{source_metadata}", assistant_prompt="Here is a list of claims in the SOURCE that are not in the TARGET:", ) From 840b0c5662f1f13cf1f74cdc4150e4f411013bf8 Mon Sep 17 00:00:00 2001 From: Rishabh Date: Thu, 3 Aug 2023 17:54:45 -0700 Subject: [PATCH 4/7] append metadata, not through prompt --- .../schemas/workflow/blocks/agents/content_refresher_agent.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py b/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py index 5c11a981c6..fb94b8e29f 100644 --- a/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py +++ b/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py @@ -161,7 +161,7 @@ async def find_new_info(target: str, source: tuple[str, str, str]) -> str: # Claude: info mentioned in source that is not mentioned in target prompt = HumanAssistantPrompt( - human_prompt=f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source_content}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article. Under the list, write a source citation (you MUST include the url AND full source title provided below: \n{source_metadata}", + human_prompt=f"Below is the TARGET article:\n{target}\n----------------\nBelow is the SOURCE article:\n{source_content}\n----------------\nIn a bullet point list, identify all facts, figures, or ideas that are mentioned in the SOURCE article but not in the TARGET article.", assistant_prompt="Here is a list of claims in the SOURCE that are not in the TARGET:", ) @@ -171,6 +171,7 @@ async def find_new_info(target: str, source: tuple[str, str, str]) -> str: ) new_info = "\n".join(response.split("\n\n")) + new_info += "\n" + source_metadata return new_info From 15df245444e2707bf96d4b5a0a08b267856a64f6 Mon Sep 17 00:00:00 2001 From: Rishabh Date: Thu, 3 Aug 2023 18:13:58 -0700 Subject: [PATCH 5/7] updates --- .../workflow/blocks/agents/content_refresher_agent.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py b/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py index fb94b8e29f..41657caa57 100644 --- a/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py +++ b/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py @@ -42,7 +42,9 @@ async def run(self, workflow_id: str, **kwargs: Any) -> ContentRefresherOutput: logger.info(sources) source_contents = [ - (url, title, await get_page_content(url)) for url, title in sources[:3] + (url, title, await get_page_content(url)) + for url, title in sources[:3] + # TODO: remove limit of 3 sources ] source_contents = [ @@ -178,7 +180,7 @@ async def find_new_info(target: str, source: tuple[str, str, str]) -> str: async def add_info(target: str, info: str) -> str: # Claude: rewrite target to include the info prompt = HumanAssistantPrompt( - human_prompt=f"Below are notes from some SOURCE articles:\n{info}\n----------------\nBelow is the TARGET article:\n{target}\n----------------\nPlease rewrite the TARGET article to include the information from the SOURCE articles. Maintain the format of the TARGET article. At the end of the article, include a list of sources (source url, title, and any additional information) ONLY for added information from SOURCE articles using the following example format: 'Source: https://www.wisnerbaum.com/prescription-drugs/gardasil-lawsuit/, Gardasil Vaccine Lawsuit Update August 2023 - Wisner Baum' Do not add citations for any info in the TARGET article.", + human_prompt=f"Below are notes from some SOURCE articles:\n{info}\n----------------\nBelow is the TARGET article:\n{target}\n----------------\nPlease rewrite the TARGET article to include the information from the SOURCE articles. Maintain the format of the TARGET article. After any source info that you add, include inline citations using the following example format: 'So this is a cited sentence at the end of a paragraph[1](https://www.wisnerbaum.com/prescription-drugs/gardasil-lawsuit/, Gardasil Vaccine Lawsuit Update August 2023 - Wisner Baum).' Do not add citations for any info in the TARGET article. Do not list citations separately at the end of the response", assistant_prompt="Here is a rewritten version of the target article that incorporates relevant information from the source articles:", ) From a4a89cb634d2f090876ebcb59e9577758bbf1bb8 Mon Sep 17 00:00:00 2001 From: Rishabh Date: Thu, 3 Aug 2023 18:37:05 -0700 Subject: [PATCH 6/7] error handle on content check --- .../blocks/agents/content_refresher_agent.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py b/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py index 41657caa57..fea662d9a3 100644 --- a/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py +++ b/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py @@ -37,20 +37,16 @@ async def run(self, workflow_id: str, **kwargs: Any) -> ContentRefresherOutput: sources = search_results(keywords) sources = [ - (url, title) for url, title in sources if url != target_url + source for source in sources if source["url"] != target_url ] # TODO: check based on content overlap logger.info(sources) - source_contents = [ - (url, title, await get_page_content(url)) - for url, title in sources[:3] - # TODO: remove limit of 3 sources - ] + for source in sources[:3]: # TODO: remove limit of 3 sources + source["content"] = await get_page_content(source["url"]) + logger.info(sources) source_contents = [ - (url, title, page_content) - for url, title, page_content in source_contents - if page_content is not None + source for source in sources if source.get("content", None) is not None ] logger.info(source_contents) @@ -138,7 +134,7 @@ async def find_content_kws(content: str) -> str: ) -def search_results(search_query: str) -> list[tuple[str, str]]: +def search_results(search_query: str) -> list[dict[str, str, str]]: # use SERP API response = requests.post( f"https://google.serper.dev/search", @@ -152,14 +148,19 @@ def search_results(search_query: str) -> list[tuple[str, str]]: ) response.raise_for_status() source_information = [ - (result["link"], result["title"]) for result in response.json()["organic"] + { + "url": result.get("link", None), + "title": result.get("title", None), + "date": result.get("date", None), + } + for result in response.json().get("organic", []) ] return source_information -async def find_new_info(target: str, source: tuple[str, str, str]) -> str: - source_metadata = f"{source[0]}, {source[1]}" - source_content = source[2] +async def find_new_info(target: str, source: dict[str, str, str]) -> str: + source_metadata = f"{source['url']}, {source['title']}, {source['date']}" + source_content = source["content"] # Claude: info mentioned in source that is not mentioned in target prompt = HumanAssistantPrompt( From 29fd358ba5f9de4d9d1582f4c483f6d0d0b50d20 Mon Sep 17 00:00:00 2001 From: Rishabh Date: Thu, 3 Aug 2023 18:45:10 -0700 Subject: [PATCH 7/7] mypy --- .../workflow/blocks/agents/content_refresher_agent.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py b/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py index fea662d9a3..dced587393 100644 --- a/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py +++ b/platform/reworkd_platform/schemas/workflow/blocks/agents/content_refresher_agent.py @@ -134,7 +134,7 @@ async def find_content_kws(content: str) -> str: ) -def search_results(search_query: str) -> list[dict[str, str, str]]: +def search_results(search_query: str) -> list[dict[str, str]]: # use SERP API response = requests.post( f"https://google.serper.dev/search", @@ -158,8 +158,10 @@ def search_results(search_query: str) -> list[dict[str, str, str]]: return source_information -async def find_new_info(target: str, source: dict[str, str, str]) -> str: - source_metadata = f"{source['url']}, {source['title']}, {source['date']}" +async def find_new_info(target: str, source: dict[str, str]) -> str: + source_metadata = f"{source['url']}, {source['title']}" + ( + f", {source['date']}" if source["date"] else "" + ) source_content = source["content"] # Claude: info mentioned in source that is not mentioned in target