In [1]:
## Gathering content from the web has a few components:
#     Search: query to URL (e.g., using GoogleSearchAPIWrapper).
#     Loading: URL to HTML (e.g., using AsyncHtmlLoader, AsyncChromiumLoader, etc.).
#     Transforming: HTML to formatted text (e.g., using HTML2Text or Beautiful Soup).

In [2]:
from langchain_openai import AzureChatOpenAI
from util.config import load_configuration

PROP_FILE = 'app.properties'
configs = load_configuration(PROP_FILE)


def _setting(key):
    """Return the ``.data`` value stored under ``key`` in ``configs``."""
    return configs.get(key).data


# Each constant is read from the loaded properties under its own name.
API_TYPE = _setting("API_TYPE")
API_VERSION = _setting("API_VERSION")
API_KEY = _setting("API_KEY")
API_BASE = _setting("API_BASE")
LLM_ENGINE_GPT35_16K = _setting("LLM_ENGINE_GPT35_16K")
ADA_MODEL = _setting("ADA_MODEL")

In [3]:
# Chat model client shared by the rest of the notebook; every connection
# detail comes from the constants read out of the properties file.
llm = AzureChatOpenAI(
    azure_deployment=LLM_ENGINE_GPT35_16K,
    azure_endpoint=API_BASE,
    openai_api_version=API_VERSION,
    api_key=API_KEY,
)


In [24]:
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import BeautifulSoupTransformer

# Load HTML: fetch the pages listed in `urls` and keep the loader's result in `docs`.
urls = ["https://www.espn.com"]
loader = AsyncHtmlLoader(urls)
docs = loader.load()

Fetching pages: 100%|#####################################################################| 1/1 [00:01<00:00,  1.19s/it]


In [25]:
from langchain_community.document_transformers import Html2TextTransformer

# Run the loaded documents through Html2TextTransformer, then preview the
# first 500 characters of the first transformed document.
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)
first_page_text = docs_transformed[0].page_content
first_page_text[:500]

'Skip to main content  Skip to navigation\n\n<\n\n>\n\nMenu\n\n## ESPN\n\n  *   *   * scores\n\n  * NFL\n  * NBA\n  * MLB\n  * NHL\n  * Soccer\n  * MMA\n  * …\n\n    * NCAAF\n    * Sports Betting\n    * Boxing\n    * CFL\n    * NCAA\n    * Cricket\n    * F1\n    * Golf\n    * Horse\n    * LLWS\n    * NASCAR\n    * NBA G League\n    * NCAAM\n    * NCAAW\n    * Olympic Sports\n    * PLL\n    * Professional Wrestling\n    * Racing\n    * RN BB\n    * RN FB\n    * Rugby\n    * Tennis\n    * WNBA\n    * X Games\n    * UFL\n\n  * More ESPN\n  * Fan'

In [26]:
# Transform
#   HTML2Text converts HTML straight into plain text (with markdown-like formatting) and does no tag-level manipulation.
#   It is best suited to extracting human-readable text when individual HTML elements do not matter.
#   Beautiful Soup gives finer-grained control over the HTML: extracting specific tags, removing others, and cleaning content.
#   It is suited to pulling out specific information and cleaning up the HTML according to your needs.

bs_transformer = BeautifulSoupTransformer()
# Transform the documents loaded by AsyncHtmlLoader (`docs`), keeping only the
# content of <span> tags. `docs` is the only loaded-HTML variable this notebook
# defines, so the cell runs on a fresh kernel.
# NOTE(review): a transformer may modify the Document objects it is given —
# confirm for the installed version, and reload `docs` before re-running
# earlier transform cells if it does.
docs_transformed = bs_transformer.transform_documents(docs, tags_to_extract=["span"])
docs_transformed

[Document(page_content='Menu PGA TOUR LIVE LPGA: Featured Groups MLB NCAA Softball NCAA Baseball Biggest Roster Holes For Every NFL Team NFL Draft NBA Postseason Tracker NFL Free Agency ESPN Radio: Listen Live How To Watch NHL Games Watch Golf on ESPN ESPN Deportes Andscape espnW ESPNFC X Games SEC Network ESPN ESPN Fantasy Tournament Challenge Facebook X/Twitter Instagram Snapchat TikTok YouTube West Semifinals - Game 2 Series tied 1-1 East 2nd Round - Game 3 NYR lead series 3-0 16h Zach Lowe East Semifinals - Game 2 Series tied 1-1 1d Jeff Passan 1h David Schoenfield 1h Alden Gonzalez 12m Timothy Bradley Jr. 1d Nick Parkinson West 2nd Round - Game 2 Series tied 1-1 Photo by Ken Blaze/USA TODAY Sports Brad Penner-USA TODAY Sports Rich Schultz/Getty Images Dale Zanine-USA TODAY Sports Jan Kruger/Getty Images Andrew D. Bernstein/NBAE via Getty Images Todd Rosenberg/Getty Images Chris Gardner/Getty Images PGA TOUR LIVE MLB Biggest Roster Holes For Every NFL Team NFL Draft NBA Postseason 

In [28]:
# Scraping with extraction

from langchain.chains import create_extraction_chain

# Every extracted record carries these string fields, and all of them are required.
_ARTICLE_FIELDS = ("news_article_title", "news_article_summary")

schema = {
    "properties": {field: {"type": "string"} for field in _ARTICLE_FIELDS},
    "required": list(_ARTICLE_FIELDS),
}


def extract(content: str, schema: dict):
    """Run an extraction chain over ``content`` and return its result.

    Args:
        content: Text handed to the chain's ``run`` call.
        schema: Extraction schema passed to ``create_extraction_chain``.

    Returns:
        Whatever the chain's ``run`` call returns for ``content``.
    """
    # A new chain is built on every call, bound to the notebook-level ``llm``.
    # NOTE(review): the recorded output of the cell that calls this shows a
    # deprecation warning — confirm which call triggers it before upgrading.
    chain = create_extraction_chain(schema=schema, llm=llm)
    return chain.run(content)

In [32]:
import pprint

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the transformed documents with a tiktoken-based splitter
# (chunk_size=1000, chunk_overlap=0).
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=0,
    chunk_size=1000,
)
splits = splitter.split_documents(docs_transformed)

# Only the first chunk is sent through extraction; pretty-print what comes back.
first_chunk = splits[0]
extracted_content = extract(content=first_chunk.page_content, schema=schema)
pprint.pprint(extracted_content)

  warn_deprecated(


[{'news_article_title': 'Menu PGA TOUR LIVE LPGA: Featured Groups MLB NCAA '
                        'Softball NCAA Baseball Biggest Roster Holes For Every '
                        'NFL Team NFL Draft NBA Postseason Tracker NFL Free '
                        'Agency ESPN Radio: Listen Live How To Watch NHL Games '
                        'Watch Golf on ESPN ESPN Deportes Andscape espnW '
                        'ESPNFC X Games SEC Network ESPN ESPN Fantasy '
                        'Tournament Challenge Facebook X/Twitter Instagram '
                        'Snapchat TikTok YouTube West Semifinals - Game 2 '
                        'Series tied 1-1 East 2nd Round - Game 3 NYR lead '
                        'series 3-0 16h Zach Lowe East Semifinals - Game 2 '
                        'Series tied 1-1 1d Jeff Passan 1h David Schoenfield '
                        '1h Alden Gonzalez 12m Timothy Bradley Jr. 1d Nick '
                        'Parkinson West 2nd Round - Game 2 Series tied 1-1