In [1]:
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import SentenceTransformersTokenTextSplitter


# Press release that will be matched against the topic models.
url = 'https://www.federalreserve.gov/newsevents/pressreleases/bcreg20230829b.htm'

# Loader for the page and a token-based splitter with no overlap between
# consecutive chunks.
loader = WebBaseLoader(url)
splitter = SentenceTransformersTokenTextSplitter(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    chunk_overlap=0,
)

# Fetch the page, then cut it into chunks.
page = loader.load()
docs = splitter.split_documents(page)

# Keep only the raw text of each chunk.
docs_str = list(map(lambda chunk: chunk.page_content, docs))

### Load the topic models.
We load one topic model per section and, for each model, build a mapping from topic number to topic name.

In [24]:
from bertopic import BERTopic
# Section identifiers: 'Section1', 'Section1A' and 'Section7'.
sections = [f'Section{s}' for s in ['1', '1A', '7'] ]

# One saved BERTopic model per section, loaded from ../topic_models/.
topic_models = {
    s: BERTopic.load(f'../topic_models/topic_models_{s}', embedding_model='all-MiniLM-L6-v2')
    for s in sections
}

# Per-section lookup {topic number -> topic name}.
topics_names_dict = {}
for s, tm in topic_models.items():
    # Build the topic-info table once per model; both columns come from the
    # same table, so there is no need to compute it twice.
    topic_info = tm.get_topic_info()
    topics_names_dict[s] = dict(zip(topic_info['Topic'], topic_info['Name']))

In [31]:
# NOTE(review): numpy is no longer needed by this cell; the import is kept in
# case later cells rely on `np` — confirm, then move it to the imports cell.
import numpy as np

# For every section, run the document chunks through that section's topic
# model and collect the names of the distinct topics that were assigned.
topics_doc = {}
for s, tm in topic_models.items():
    # transform() returns a pair; only the first element (the topic number
    # assigned to each chunk) is used here, the second is discarded.
    topics, _ = tm.transform(docs_str)

    # Translate topic numbers to names with a plain dict lookup, then
    # de-duplicate with dict.fromkeys, which keeps first-seen order. Unlike
    # list(set(...)), the resulting order is the same on every run, and the
    # entries are plain Python strings rather than numpy string scalars.
    id_to_name = topics_names_dict[s]
    topics_doc[s] = list(dict.fromkeys(id_to_name.get(t) for t in topics))

In [32]:
# Display the unique topic names found for the document, keyed by section.
topics_doc

{'Section1': ['86_fdic_reserve fdic_submit resolution_orderly resolution',
  '878_monetary policy_monetary policies_instruments monetary_fiscal policies'],
 'Section1A': ['44_federal reserve_capital liquidity_regulatory capital_basel iii',
  '1305_america corporation_bank america_fdic_bank',
  '178_bearing liabilities_net income_deposits borrowings_loans securities',
  '557_funds rate_federal funds_federal reserve_actions federal',
  '3_libor_sofr_reference rates_usd libor',
  '786_spoe_spoe strategy_parent company_support agreement'],
 'Section7': ['2573_aenb_qualifying collateral_charge trust_lending trust',
  '2462_secured funding_collateralized financings_gs bank_financings consolidated',
  '2005_corporation 8217_financial markets_instability impact_certain corporation',
  '846_backed securities_fasb financial_rmbs residential_capital financial',
  '2013_cares act_programs facilities_provisions cares_consolidated appropriations']}

Above, we can see that the topics are all related to the Federal Reserve and/or monetary policy. For the API, we can return an object equal to `topics_doc`. As a further improvement, we could also include the topic probabilities.