In [None]:
!pip install --upgrade langchain chromadb


In [None]:
!pip install pdfplumber

In [None]:
!pip install chromadb==0.5.3

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os

#using NYU and UC Berkeley's PhD websites as starting urls to scrape the relevant PhD advertisement pages
output_folder = "scraped_data_wf_hmc"

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

In [2]:

# Base URL to start with
start_urls = ['https://homeloans.wellsfargo.com'
    ,"https://homeloans.wellsfargo.com/mortgage/ny/new-york/jeffery-b-deshields",
    'https://homeloans.wellsfargo.com/mortgage/ny/new-york/grace-bozick',
            'https://homeloans.wellsfargo.com/mortgage/ny/new-york/brian-p-mcnamara',
            'https://homeloans.wellsfargo.com/mortgage/ny/new-york/thomas-j-sharkey',
             'https://homeloans.wellsfargo.com/mortgage/ca/san-francisco/anna-weinstein']
results=[]
webpages=[]
for start_url in start_urls:
# Replace with your target page

# Step 1: Get the main page
    response = requests.get(start_url)
    soup = BeautifulSoup(response.text, 'html.parser')

# Step 2: Find all anchor tags
    links = set()
    for tag in soup.find_all('a', href=True):
        full_url = urljoin(start_url, tag['href'])  # Handle relative URLs
        links.add(full_url)

    print(f"\n🔗 Found {len(links)} links:\n")
    

# Step 3: Optional — Visit and scrape each linked page
    print("\n📄 Scraping text from each link...\n")
    for link in list(links):
        if 'mailto' or 'tel' not in link and link not in webpages:
            try:
                print(link)
                r = requests.get(link, timeout=10)
                sub_soup = BeautifulSoup(r.text, 'html.parser')
                text = sub_soup.get_text(strip=True)
                results.append(text)
                webpages.append(link)
                print(f"\nURL: {link}")
                #print(f"Content Preview: {text[:300]}...")  # Show first 300 characters
            except Exception as e:
                print(f"❌ Failed to scrape {link}: {e}")



🔗 Found 39 links:


📄 Scraping text from each link...

https://homeloans.wellsfargo.com/mortgage/tx

URL: https://homeloans.wellsfargo.com/mortgage/tx
https://homeloans.wellsfargo.com/mortgage/ny

URL: https://homeloans.wellsfargo.com/mortgage/ny
https://homeloans.wellsfargo.com/mortgage/dc

URL: https://homeloans.wellsfargo.com/mortgage/dc
https://homeloans.wellsfargo.com/mortgage/co

URL: https://homeloans.wellsfargo.com/mortgage/co
https://homeloans.wellsfargo.com/mortgage/sd

URL: https://homeloans.wellsfargo.com/mortgage/sd
https://www.wellsfargo.com/

URL: https://www.wellsfargo.com/
https://homeloans.wellsfargo.com/mortgage/ia

URL: https://homeloans.wellsfargo.com/mortgage/ia
https://www.wellsfargo.com/privacy-security/notice-of-data-collection/

URL: https://www.wellsfargo.com/privacy-security/notice-of-data-collection/
https://homeloans.wellsfargo.com/mortgage/pa

URL: https://homeloans.wellsfargo.com/mortgage/pa
https://homeloans.wellsfargo.com/mortgage/al

URL: https://hom

In [3]:
import torch

# Automatically choose GPU if available, otherwise fallback to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Using device: {device}")


Using device: cuda


In [4]:
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
import os
import pandas as pd

In [5]:
embeddings = OllamaEmbeddings(model="llama3.2")

In [6]:
db_location = "./chrome_langchain_db/hmcs"
#add_documents = not os.path.exists(db_location)

In [8]:
#Creating Documents from scraped data

documents = []
ids = []

for url in start_urls:
    try:
        #web,results=scrape(url)


        for i in range(len(results)):
            document = Document(
                page_content=results[i],
                metadata={"url":webpages[i]},
                id=str(i)
    )
            ids.append(str(i))
            documents.append(document)
    except:
        pass

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
documents=text_splitter.split_documents(documents)

In [11]:
#Creating Chroma Vector Database
vector_store = Chroma(
    collection_name="hmcs",
    persist_directory=db_location,
    embedding_function=embeddings
)

In [12]:
len(documents),len(ids)

(6132, 906)

In [14]:
vector_store.add_documents(documents=documents)

['a9dc9124-3a11-4055-9dd3-c2219d20624c',
 '6799c418-6eb2-491e-9429-c3d2fde0eab7',
 'c9bf18f1-b623-4004-b2a7-9f0422f97712',
 'c8cf3902-fc9c-4726-a948-65064f124a78',
 'b2f1c19a-4d31-4f52-bebd-5c41499031e4',
 'c8ac0924-8d0d-4670-92be-f3ca5c34b9c3',
 '8ff3e688-8107-4e81-9d81-d7ae544efc4d',
 '99b04b5e-98cb-413d-831d-f60ffa02a8a0',
 '014c147d-fc56-4799-82a3-a94ea1e6fe2a',
 'a3d06f64-274d-4bcd-b64d-18d1b0489430',
 '0904c811-e234-42bb-a036-7236e6058b4f',
 'f8de4a01-40f7-4ab6-b4e6-872396c8a6a3',
 '9489d5df-5a08-4193-87b1-044e5e0584c2',
 'f33b6d1a-d942-4c15-9a47-9f75f1a832f6',
 'ccdc10dd-1348-45c5-8563-7a9d92c13d7f',
 'd41c6076-e85d-4230-bb4c-94260257466e',
 '6d4c9b14-1eb5-4f0f-86fa-93333d45b39e',
 '18610530-533c-402b-bfc7-e4905456d21d',
 '9ca555d0-aaab-4bc2-ac5c-b48cde4494be',
 'ae26ead5-7d13-4a0b-a3fc-eb7c7749264f',
 '855ea107-9994-4725-afd0-a5d9304c40ac',
 '200f1bb5-e977-4d0e-b0dd-724c1ae64e83',
 'e920ef88-b842-4777-98ea-fdf3e87b7a30',
 '38304f9e-4cf6-496c-bf35-395f99166302',
 '5af58ef7-a7dd-

In [15]:
retriever = vector_store.as_retriever(
    search_kwargs={"k": 5})

In [16]:
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
#from vector import retriever

model = OllamaLLM(model="llama3.2")

In [21]:
#Creating template for QA model
template = """
You are an expert in real estate and mortgage information in US for Wells Fargo Bank.
When a user asks you property or mortgage rate information in specific states, 
generate names, contact information, and links to the websites of Home Mortgage Consultants(HMC) 
from the database. Ensure the link that is provided is correct and opens the contact page of the mortgage consultants.
If the link is not the contact page of the mortgage consultant, do not display their names or other details.
Make sure the email ids don't have unnecessary space and the information is accurate.
Results should be of format:

HMC Name
Email
Phone Number
Link 


Here are some relevant questions: {reviews}

Here is the question to answer: {question}
"""

In [22]:
prompt = ChatPromptTemplate.from_template(template)

In [23]:
chain = prompt | model

In [24]:
while True:
    print("\n\n-------------------------------")
    question = input("Ask your question (q to quit): ")
    print("\n\n")
    if question == "q":
        break
   
    reviews = retriever.invoke(question)
    result = chain.invoke({"reviews": reviews, "question": question})
    print(result)



-------------------------------
Ask your question (q to quit): I am looking for a fixed rate mortgage for a two bedroom apartment in New York.



Based on my database, here are some Home Mortgage Consultants (HMC) from Wells Fargo Bank in New York who specialize in fixed-rate mortgages for two-bedroom apartments:

1. Brian P. McNama
Email: brian.p.mcnama@wellsfargo.com
Phone Number: 212-661-1111
Link: https://homeloans.wellsfargo.com/mortgage/ny/new-york/brian-p-mcnamara

Please note that the link provided is to the same page as the provided document ID, which may not be the contact page of the mortgage consultant. I recommend verifying the contact information with Wells Fargo Bank directly.

If you would like me to search for additional HMCs or provide more information, please let me know!


-------------------------------
Ask your question (q to quit): I am looking for a fixed rate mortgage for a two bedroom apartment in California.



Based on my database, here are some Home Mortg