In [1]:
import sys
# !{sys.executable} -m pip install --upgrade pip
# !{sys.executable} -m pip install langchain-text-splitters
# !{sys.executable} -m pip install langchain-pull-md

In [2]:
# URL of the article that is fetched, converted to Markdown and indexed
ARTICLE_URL = 'https://www.morningstar.com.au/stocks/10-best-us-dividend-aristocrats-buy-nowincluding-surprise-outperformer?user_segment=indinv'

# Directory where the Chroma database is persisted (relative to the working directory)
CHROMA_DB = './db/langchain_rag'
# Name of the Chroma collection that holds the article chunks
COLLECTION_NAME = 'us_div_stocks'

## Drop the ChromaDB collection

In [3]:
import chromadb
from chromadb.config import Settings

def drop_collection():
    """Delete the ``COLLECTION_NAME`` collection from the local Chroma database.

    Opens the persistent database at ``CHROMA_DB`` and removes the collection
    if it exists, printing which of the two outcomes occurred.

    Only a "collection not found" error is treated as a non-event. Any other
    failure (for example an unreadable database directory) propagates to the
    caller instead of being reported as "does not exist".
    """
    # Imported locally so this cell does not depend on another import cell
    from chromadb.errors import NotFoundError

    # Same client settings as the vector store created later in the notebook
    client = chromadb.PersistentClient(path=CHROMA_DB, settings=Settings(allow_reset=True))

    try:
        # Probe for the collection; a missing collection raises here
        client.get_collection(name=COLLECTION_NAME)
    except (NotFoundError, ValueError):
        # chromadb 1.x raises NotFoundError; ValueError is kept for older
        # releases -- TODO confirm against the installed version
        print(f"Collection '{COLLECTION_NAME}' does not exist, no deletion performed.")
        return

    # The collection exists, so delete it; errors here are real and must surface
    client.delete_collection(name=COLLECTION_NAME)
    print(f"Collection '{COLLECTION_NAME}' deleted successfully.")

In [4]:
# Drop the collection at each run
drop_collection()

Collection 'us_div_stocks' deleted successfully.


## Split the article on H2 headings

In [5]:
from langchain_pull_md import PullMdLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter

def get_document_chunks():
    """Fetch the article as Markdown and split it into one chunk per H2 section.

    Returns the list produced by ``MarkdownHeaderTextSplitter.split_text``;
    each chunk records its heading under the ``"Header 2"`` metadata key.
    """
    # PullMdLoader fetches ARTICLE_URL and converts the page to Markdown;
    # only the first loaded document is used
    article_markdown = PullMdLoader(url=ARTICLE_URL).load()[0].page_content
    # Split on level-2 headings only
    h2_splitter = MarkdownHeaderTextSplitter([("##", "Header 2")])
    return h2_splitter.split_text(article_markdown)

In [6]:
# Convert the article at ARTICLE_URL into document chunks, one per level-2 heading
document_chunks = get_document_chunks()

## Suppress warnings from torch

In [7]:
import warnings
warnings.filterwarnings('ignore', module='torch')

## Create embeddings and vector store

In [8]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma 
from chromadb.config import Settings

# Our embeddings model
embeddings_model = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

# Vector store; data saved locally under CHROMA_DB
vector_store = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings_model,
    client_settings=Settings(allow_reset=True),
    persist_directory=CHROMA_DB,
)

# Which chunks of the split article get indexed. Named here instead of an
# inline [1:13] so the choice is visible and easy to adjust.
# NOTE(review): presumably chunk 0 is page preamble and chunks 1-12 are the
# article sections (two intro headings plus the ten stocks) -- confirm
# against the current page layout.
ARTICLE_CHUNK_RANGE = slice(1, 13)
article_chunks = document_chunks[ARTICLE_CHUNK_RANGE]

# Fail loudly if the page layout changed and nothing is left to index
if not article_chunks:
    raise ValueError(
        f'No chunks selected by {ARTICLE_CHUNK_RANGE} '
        f'from {len(document_chunks)} document chunk(s)'
    )

# Persist documents; the generated ids are the cell's output
vector_store.add_documents(article_chunks)

['52f1c92b-1e24-4552-8cd5-25b837a0a24e',
 '85f41006-eefc-4ce0-a45d-49aa0b007d34',
 '57e093e8-a29c-4f32-ae3f-00dc3181140f',
 '1ae2c486-d7e5-43d1-86d4-13fd48e153e1',
 '02058fbe-9176-4a09-bcdb-00b047df4c5a',
 '02a0b66a-e0b9-42c6-be8e-a8004d6bb4d7',
 '613addfb-bde9-4c53-b203-d2fbd9450438',
 '38d0ed4b-fd29-49f3-a75e-a20f6dba162f',
 'd7bd6b7a-43df-4caa-adf5-e1ba7e5f88bc',
 '9c4038bd-de0d-4437-88c7-266c68029ddd',
 '01284386-1123-44b8-b290-0239e849154c',
 '43fb801d-a800-4a65-afa6-ba10ac4d5dd7']

## Load API Keys

In [9]:
# To read environment property file
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()

True

## Define the LLM

In [10]:
from langchain_groq import ChatGroq

# Ensure your GROQ_API_KEY is set in environment variables
# (e.g. through the .env file loaded earlier in the notebook).
# NOTE(review): Groq retires model ids over time -- confirm that
# 'llama3-8b-8192' is still served before relying on this cell.
llm = ChatGroq(temperature=0.5, model_name='llama3-8b-8192')

## Build the RAG Chain

In [11]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Retriever over the vector store, using its default search settings
retriever = vector_store.as_retriever()

# Prompt text: {context} receives the retrieved documents, {input} the query
prompt_template = """
    You are a helpful assistant. Answer the query below based on the provided context:
    context: {context}
    query: {input}
    """
prompt = ChatPromptTemplate.from_template(prompt_template)

# Chain that stuffs the retrieved documents into the prompt and calls the LLM
document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full RAG chain: retrieve first, then answer from the retrieved documents
qa_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

## Queries

### Stock Symbols

In [13]:
query = 'What stock symbols are discussed?'
response = qa_chain.invoke({'input': query})
print(response['answer'])

The stock symbols discussed are:

1. BDX (Becton Dickinson)
2. BF.B (Brown-Forman)
3. CLX (Clorox)
4. XOM (ExxonMobil)
5. MDT (Medtronic)
6. WST (West Pharmaceutical Services)
7. AMCR (Amcor)
8. PEP (PepsiCo)
9. NDSN (Nordson)
10. KMB (Kimberly-Clark)


### Fair Value for ExxonMobil

In [14]:
query = 'What is the fair value for ExxonMobil?'
response = qa_chain.invoke({'input': query})
print(response['answer'])

According to the given context, the fair value for ExxonMobil is $135.


### Summarize a Company

In [15]:
query = 'Summarize Medtronic'
response = qa_chain.invoke({'input': query})
print(response['answer'])

Here is a summary of Medtronic:

Medtronic is a leading medical device company with a diversified product portfolio that spans various therapeutic areas, including cardiac, diabetes, chronic pain, and acute care. The company has a strong position as a major vendor to hospital customers due to its innovative technology and diversified product portfolio. Medtronic has a history of introducing new products and technologies, and has invested heavily in internal research and development efforts as well as acquiring emerging technologies. The company has a strong financial position and has consistently returned a significant portion of its annual free cash flow to shareholders. Despite being 17% undervalued relative to Morningstar's fair value estimate, Medtronic's stock has been outperforming the market this year. Morningstar's senior equity analyst, Debbie Wang, believes that the company's distributions have been appropriate and that its standing as the largest pure-play medical device mak

### Short Summary

In [16]:
query = 'Summarize Medtronic, make it less than 200 characters'
response = qa_chain.invoke({'input': query})
print(response['answer'])

Medtronic: Largest medical-device company with diversified portfolio, strong innovation, and significant market share. Undervalued, trading 17% below fair value.


### Sector

In [17]:
query = 'Which sector Becton Dickinson is in?'
response = qa_chain.invoke({'input': query})
print(response['answer'])

According to the provided context, Becton Dickinson (BDX) is in the Healthcare sector.


### Undervalued?

In [18]:
query = 'How much undervalued Amcor is?'
response = qa_chain.invoke({'input': query})
print(response['answer'])

According to the context, Amcor stock trades 11% below Morningstar's $11 fair value estimate, which means it is 11% undervalued.


### Goal of the article

In [32]:
query = 'What is the main goal of the document?'
response = qa_chain.invoke({'input': query})
print(response['answer'])

The main goal of the document is to provide a list of top dividend aristocrats to buy, along with a brief analysis of each company, including their Morningstar ratings, forward dividend yield, and reasons why they are attractive investments.


### Economic Moat Rating

In [19]:
query = "What's the moat rating for Becton Dickinson?"
response = qa_chain.invoke({'input': query})
print(response['answer'])

According to the provided context, the moat rating for Becton Dickinson (BDX) is Narrow.


### Dividend Yield

In [20]:
query = "What's the dividend yield for Pepsi?"
response = qa_chain.invoke({'input': query})
print(response['answer'])

According to the provided context, the forward dividend yield for PepsiCo (PEP) is 3.97%.


### Stock Symbol

In [21]:
query = "What's the stock symbol for ExxonMobil?"
response = qa_chain.invoke({'input': query})
print(response['answer'])

The stock symbol for ExxonMobil is XOM.


## Load Watermark

In [22]:
from watermark import watermark
%load_ext watermark
# print(watermark())

## Display Modules

In [23]:
%watermark --iversions

langchain               : 0.3.27
langchain_groq          : 0.3.6
chromadb                : 1.0.15
langchain_text_splitters: 0.3.9
langchain_core          : 0.3.72
watermark               : 2.5.0
langchain_pull_md       : 0.1.1
sys                     : 3.13.5 (main, Jun 21 2025, 09:35:00) [GCC 15.1.1 20250425]
langchain_chroma        : 0.2.5
langchain_huggingface   : 0.3.1



In [24]:
# Retrieval check: list the id and heading metadata of the chunks that the
# retriever returns for this question
query = "What is the discount for Medtronic?"
retriever = vector_store.as_retriever()
retrieved_docs = retriever.invoke(query)
for doc in retrieved_docs:
    print(doc.id, doc.metadata)

613addfb-bde9-4c53-b203-d2fbd9450438 {'Header 2': 'Medtronic'}
57e093e8-a29c-4f32-ae3f-00dc3181140f {'Header 2': 'Becton Dickinson'}
38d0ed4b-fd29-49f3-a75e-a20f6dba162f {'Header 2': 'West Pharmaceutical'}
01284386-1123-44b8-b290-0239e849154c {'Header 2': 'Nordson'}


In [25]:
# Retrieval check: list the id and heading metadata of the chunks that the
# retriever returns for a summary request
query = 'Summarize Medtronic'
retriever = vector_store.as_retriever()
retrieved_docs = retriever.invoke(query)
for doc in retrieved_docs:
    print(doc.id, doc.metadata)

613addfb-bde9-4c53-b203-d2fbd9450438 {'Header 2': 'Medtronic'}
57e093e8-a29c-4f32-ae3f-00dc3181140f {'Header 2': 'Becton Dickinson'}
38d0ed4b-fd29-49f3-a75e-a20f6dba162f {'Header 2': 'West Pharmaceutical'}
01284386-1123-44b8-b290-0239e849154c {'Header 2': 'Nordson'}


In [26]:
# Retrieval check: list the id and heading metadata of the chunks that the
# retriever returns for a question spanning the whole article
query = 'what are stock symbols discussed?'
retriever = vector_store.as_retriever()
retrieved_docs = retriever.invoke(query)
for doc in retrieved_docs:
    print(doc.id, doc.metadata)

85f41006-eefc-4ce0-a45d-49aa0b007d34 {'Header 2': 'The 10 best dividend aristocrats to buy now'}
02a0b66a-e0b9-42c6-be8e-a8004d6bb4d7 {'Header 2': 'ExxonMobil'}
9c4038bd-de0d-4437-88c7-266c68029ddd {'Header 2': 'PepsiCo'}
52f1c92b-1e24-4552-8cd5-25b837a0a24e {'Header 2': 'What is a dividend aristocrat?'}
