In [None]:
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from langchain.requests import RequestsWrapper
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
import requests
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from bs4 import BeautifulSoup
import csv


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_community.utilities.requests import TextRequestsWrapper
* 'allow_population_by_field_name' has been renamed to 'populate_by_name'


In [14]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key used by the
# ChatOpenAI cells comes from — confirm.
import dotenv
dotenv.load_dotenv()
import os

In [3]:
from datetime import datetime
import urllib.parse


def generate_nyt_url(start_date: str, end_date: str, keywords: str):
    """
    Build a New York Times search URL for a keyword query within a date range.

    :param start_date: Start of the range as a YYYY-MM-DD string (inserted verbatim)
    :param end_date: End of the range as a YYYY-MM-DD string (inserted verbatim)
    :param keywords: Free-text search terms, e.g. 'retail sales' (percent-encoded)
    :return: The complete search URL as a string
    """
    # Only the keywords are percent-encoded (a space becomes %20); the dates
    # are placed into the query string exactly as given.
    query_params = [
        ("dropmab", "false"),
        ("endDate", end_date),
        ("lang", "en"),
        ("query", urllib.parse.quote(keywords)),
        ("sort", "best"),
        ("startDate", start_date),
    ]
    query_string = "&".join(f"{key}={value}" for key, value in query_params)

    return f"https://www.nytimes.com/search?{query_string}"

In [4]:
# Example date window and query used to build the search URL.
start_date = "2024-10-01"
end_date = "2024-11-01"
keywords = "retail sales"

# Generate the dynamic URL
# `url` is reused by the fetching and scraping cells further down.
url = generate_nyt_url(start_date, end_date, keywords)
print(url)

https://www.nytimes.com/search?dropmab=false&endDate=2024-11-01&lang=en&query=retail%20sales&sort=best&startDate=2024-10-01


In [7]:
# `requests_wrapper` was not defined in any visible cell, so this cell raised
# NameError on a fresh kernel. Build it here from the RequestsWrapper class
# imported in the first cell.
# NOTE(review): if the original wrapper was created with custom headers or
# cookies, pass them to the constructor here — confirm.
requests_wrapper = RequestsWrapper()

# Fetch the search results page and parse the returned HTML.
response = requests_wrapper.get(url)  # Assume this returns a string
soup = BeautifulSoup(response, "html.parser")

In [9]:
# Collect every <li> element that carries the CSS class "css-1l4w6pd".
# NOTE(review): the captured output shows these elements also have
# data-testid="search-bodega-result"; that attribute may be a steadier
# selector than the class name — confirm before switching.
articles = soup.find_all(
    "li", class_="css-1l4w6pd"
)  

In [10]:
# Display the matched <li> elements.
articles

[<li class="css-1l4w6pd" data-testid="search-bodega-result"><div class="css-1bdu3ax"><span class="css-17ubb9w"> </span><div class="css-1i8vfl5"><div class="css-e1lvw9"><p class="css-myxawk">Business</p><a href="/2024/10/31/business/amazon-q3-earnings.html?searchResultPosition=1"><h4 class="css-nsjm9t">Amazon Reports Record $15.3 Billion Profit</h4><p class="css-16nhkrn">The tech giant’s cloud computing business picked up steam and its retail business showed resiliency.</p><p class="css-1engk30">By Karen Weise</p><span class="css-coykat"><div class="css-1rq7uyl"></div><span class="css-h4mf4">PRINT EDITION</span>Cloud Computing Rise Fuels Amazon’s $15.3 Billion Profit<span class="css-1t2tqhf">|</span>November 2, 2024, Page B3</span></a></div><figure aria-label="media" class="css-tap2ym" role="group"><div data-testid="imageContainer-children-Image"><img alt="Amazon has been making its fulfillment and delivery network faster and more efficient, driving down its costs to fulfill orders." cl

In [16]:
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer

In [None]:
# Load the search URL through AsyncChromiumLoader.
# NOTE(review): this cell has no execution count and the following cell repeats
# the same two statements after calling nest_asyncio.apply(); presumably this
# one is a leftover attempt — confirm and consider removing it.
loader = AsyncChromiumLoader([url])
html = loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [23]:
# nest_asyncio.apply() is called before the loader runs.
# NOTE(review): presumably required because the notebook kernel already has a
# running event loop — confirm.
import nest_asyncio

nest_asyncio.apply()

from langchain_community.document_loaders import AsyncChromiumLoader

# Load the search URL; `html` is the loader's result and is passed to the
# BeautifulSoupTransformer cell below.
loader = AsyncChromiumLoader([url])
html = loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [24]:
# Reduce the loaded documents to the text found inside <li> tags.
bs_transformer = BeautifulSoupTransformer()
docs_transformed = bs_transformer.transform_documents(html, tags_to_extract=["li"])

In [27]:
# Display the transformed documents (metadata plus extracted text).
docs_transformed


[Document(metadata={'source': 'https://www.nytimes.com/search?dropmab=false&endDate=2024-11-01&lang=en&query=retail%20sales&sort=best&startDate=2024-10-01'}, page_content='Oct. 31 Business Amazon Reports Record $15.3 Billion Profit The tech giant’s cloud computing business picked up steam and its retail business showed resiliency. By Karen Weise PRINT EDITION Cloud Computing Rise Fuels Amazon’s $15.3 Billion Profit | November 2, 2024, Page B3 Oct. 31 Business Inside the Winter Wonderland Where Walmart Hatches Its Holiday Plans To prepare for ever-longer shopping season, thousands of employees were assembled for a retail jamboree in Florida’s August heat. By Jordyn Holman PRINT EDITION Heated Rush To Lay Plans For Holidays | October 31, 2024, Page B1 Oct. 1 Style As Plant Sales Move Online, the Sill Closes Its Last Retail Store A trendy company that embraced the plant craze of a decade ago has evolved to fill the needs of a changing demographic. By Sandra E. Garcia PRINT EDITION The Sil

In [31]:
# Show the full extracted text of the first transformed document.
docs_transformed[0].page_content

'Oct. 31 Business Amazon Reports Record $15.3 Billion Profit The tech giant’s cloud computing business picked up steam and its retail business showed resiliency. By Karen Weise PRINT EDITION Cloud Computing Rise Fuels Amazon’s $15.3 Billion Profit | November 2, 2024, Page B3 Oct. 31 Business Inside the Winter Wonderland Where Walmart Hatches Its Holiday Plans To prepare for ever-longer shopping season, thousands of employees were assembled for a retail jamboree in Florida’s August heat. By Jordyn Holman PRINT EDITION Heated Rush To Lay Plans For Holidays | October 31, 2024, Page B1 Oct. 1 Style As Plant Sales Move Online, the Sill Closes Its Last Retail Store A trendy company that embraced the plant craze of a decade ago has evolved to fill the needs of a changing demographic. By Sandra E. Garcia PRINT EDITION The Sill Shutters Its Last Retail Store | October 6, 2024, Page ST9 Oct. 17 Business Shoppers Spend More Freely Than Expected, Sign of Solid U.S. Economy A new report showing a r

In [35]:
# Rebinds the name ChatOpenAI to the langchain_openai class; the first cell
# imported a class of the same name from langchain.chat_models.
from langchain_openai import ChatOpenAI

# Notebook-level chat model; extract() reads this `llm`.
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini-2024-07-18")

In [None]:
from langchain.chains import create_extraction_chain

# Extraction schema: two string properties, both required.
# NOTE(review): the BadRequestError recorded under the scraping cell mentions
# 'date', which does not appear anywhere in this schema — presumably that run
# used an earlier version of `schema` still held by the kernel. Re-run this
# cell before scraping and confirm.
schema = {
    "properties": {
        "news_article_title": {"type": "string"},
        "news_article_summary": {"type": "string"},
    },
    "required": ["news_article_title", "news_article_summary"],
}


def extract(content: str, schema: dict):
    """Run an extraction chain over ``content`` and return its result.

    :param content: Text to pull structured records from
    :param schema: Dict describing the properties to extract
    :return: Whatever the chain's ``run`` call returns for ``content``
    """
    # Build the chain with the notebook-level ``llm``, then execute it.
    extraction_chain = create_extraction_chain(schema=schema, llm=llm)
    return extraction_chain.run(content)

In [38]:
import pprint

from langchain_text_splitters import RecursiveCharacterTextSplitter


def scrape_with_playwright(urls, schema, tags_to_extract=("span",)):
    """Load pages with AsyncChromiumLoader and run LLM extraction on the first chunk.

    :param urls: List of page URLs handed to AsyncChromiumLoader
    :param schema: Extraction schema dict forwarded to extract()
    :param tags_to_extract: HTML tag names whose text is kept by
        BeautifulSoupTransformer; defaults to ("span",), the value that was
        previously hard-coded here
    :return: The value returned by extract() for the first chunk, or an empty
        list when the pages produce no text to split
    """
    loader = AsyncChromiumLoader(urls)
    docs = loader.load()
    bs_transformer = BeautifulSoupTransformer()
    docs_transformed = bs_transformer.transform_documents(
        docs, tags_to_extract=list(tags_to_extract)
    )
    print("Extracting content with LLM")

    # Grab the first 1000 tokens of the site
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1000, chunk_overlap=0
    )
    splits = splitter.split_documents(docs_transformed)

    # With no chunks, splits[0] would raise IndexError; return an empty result
    # instead so the caller can tell nothing was extracted.
    if not splits:
        print("No content found to extract")
        return []

    # Process the first split
    extracted_content = extract(schema=schema, content=splits[0].page_content)
    pprint.pprint(extracted_content)
    return extracted_content


# Run the scraper on the single search URL built earlier in the notebook.
urls = [url]
extracted_content = scrape_with_playwright(urls, schema=schema)

USER_AGENT environment variable not set, consider setting it to identify your requests.


Extracting content with LLM


BadRequestError: Error code: 400 - {'error': {'message': "Invalid schema for function 'information_extraction': 'date' is not valid under any of the given schemas.", 'type': 'invalid_request_error', 'param': 'functions[0].parameters', 'code': 'invalid_function_parameters'}}

In [None]:
def summarize_article(url):
    """Download one article page and ask a chat model to clean and summarize it.

    :param url: URL of the article page to fetch
    :return: The chat model's response to the summarization prompt
        (presumably a message object whose ``.content`` holds the text — confirm)
    """
    article_response = requests_wrapper.get(url)
    # The search cell passes the result of requests_wrapper.get(...) straight to
    # BeautifulSoup as a string, so reading ``.text`` unconditionally would fail
    # on a str. Accept either a plain string or a response-like object.
    article_html = getattr(article_response, "text", article_response)
    article_soup = BeautifulSoup(article_html, "html.parser")

    # Find article content (this will depend on the structure of the NYTimes page)
    paragraphs = article_soup.find_all("p")
    content = " ".join(p.get_text() for p in paragraphs)

    # Define the prompt to clean and summarize the content using an LLM (e.g., GPT-3)
    prompt = """
    Clean the following article content, summarize it, and identify the key economic and political events discussed.
    Content:
    {content}
    """

    # Set up the LLM to run the prompt
    llm = ChatOpenAI(model="gpt-3.5-turbo")
    formatted_prompt = prompt.format(content=content)
    # Use invoke(), which accepts a plain prompt string; calling the chat model
    # object directly is the deprecated form and expects a list of messages.
    summary = llm.invoke(formatted_prompt)

    return summary


# Loop over the articles and summarize each one
summaries = []
for article in articles:
    anchor = article.find("a")
    # Skip entries with no usable link instead of failing on None["href"].
    if anchor is None or not anchor.get("href"):
        continue

    # The hrefs in the search results are site-relative
    # (e.g. "/2024/10/31/business/..."), so resolve them against the site root
    # before fetching; already-absolute URLs pass through unchanged.
    link = urllib.parse.urljoin("https://www.nytimes.com", anchor["href"])

    heading = article.find("h4")
    title = heading.text if heading is not None else ""

    summary = summarize_article(link)
    # Store the text, not the object's repr, when a message-like object with a
    # ``.content`` attribute is returned; plain strings are kept as they are.
    summary_text = getattr(summary, "content", summary)

    summaries.append({"Title": title, "Link": link, "Summary": summary_text})

# Save to CSV
with open("summarized_articles.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["Title", "Link", "Summary"])
    writer.writeheader()
    writer.writerows(summaries)