In [None]:
!pip install --upgrade langchain chromadb


In [None]:
!pip install pdfplumber

In [None]:
!pip install chromadb==0.5.3

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os

#using NYU and UC Berkeley's PhD websites as starting urls to scrape the relevant PhD advertisement pages
output_folder = "scraped_data_wf"

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

In [31]:

# Base URL to start with
start_urls = ['https://homeloans.wellsfargo.com'
    ,"https://homeloans.wellsfargo.com/mortgage/ny/new-york/jeffery-b-deshields",
    'https://homeloans.wellsfargo.com/mortgage/ny/new-york/grace-bozick',
            'https://homeloans.wellsfargo.com/mortgage/ny/new-york/brian-p-mcnamara',
            'https://homeloans.wellsfargo.com/mortgage/ny/new-york/thomas-j-sharkey']
results=[]
webpages=[]
for start_url in start_urls:
# Replace with your target page

# Step 1: Get the main page
    response = requests.get(start_url)
    soup = BeautifulSoup(response.text, 'html.parser')

# Step 2: Find all anchor tags
    links = set()
    for tag in soup.find_all('a', href=True):
        full_url = urljoin(start_url, tag['href'])  # Handle relative URLs
        links.add(full_url)

    print(f"\n🔗 Found {len(links)} links:\n")
    

# Step 3: Optional — Visit and scrape each linked page
    print("\n📄 Scraping text from each link...\n")
    for link in list(links):
        if 'mailto' or 'tel' not in link:
            try:
                print(link)
                r = requests.get(link, timeout=5)
                sub_soup = BeautifulSoup(r.text, 'html.parser')
                text = sub_soup.get_text(strip=True)
                results.append(text)
                webpages.append(link)
                print(f"\nURL: {link}")
                print(f"Content Preview: {text[:300]}...")  # Show first 300 characters
            except Exception as e:
                print(f"❌ Failed to scrape {link}: {e}")



🔗 Found 39 links:


📄 Scraping text from each link...

https://homeloans.wellsfargo.com/mortgage/nd

URL: https://homeloans.wellsfargo.com/mortgage/nd
Content Preview: Wells Fargo Home Mortgage Consultants in North Dakota | Mortgage, Home Mortgage Loans, Check RatesOpen mobile menuSwitch language toEspañolHome MortgageWells Fargo Home MortgageConsultants in North DakotaSearch by city and state or ZIP codeCity, State/Province, Zip or City & CountrySubmit a search.U...
https://homeloans.wellsfargo.com/es

URL: https://homeloans.wellsfargo.com/es
Content Preview: Todos los consultores de Wells Fargo Home Mortgage | Mortgage, Home Mortgage Loans, Check RatesOpen mobile menuSwitch language toEnglishHome MortgageWells FargoHome Mortgage ConsultoresBuscar por ciudad, país o código postalCiudad, Estado/Provincia, Código postal o Ciudad y PaísSubmit a search.Use m...
https://www.wellsfargo.com/privacy-security/terms/

URL: https://www.wellsfargo.com/privacy-security/terms/
Content Preview: Gen

In [32]:
import torch

# Automatically choose GPU if available, otherwise fallback to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Using device: {device}")


Using device: cuda


In [33]:
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
import os
import pandas as pd

In [34]:
embeddings = OllamaEmbeddings(model="llama3.2")

In [35]:
db_location = "./chrome_langchain_db/hmcs"
#add_documents = not os.path.exists(db_location)

In [36]:
#Creating Documents from scraped data

documents = []
ids = []

for url in urls:
    try:
        #web,results=scrape(url)


        for i in range(len(results)):
            document = Document(
                page_content=results[i],
                metadata={"url":webpages[i]},
                id=str(i)
    )
            ids.append(str(i))
            documents.append(document)
    except:
        pass

In [37]:
documents

[Document(id='0', metadata={'url': 'https://homeloans.wellsfargo.com/mortgage/nd'}, page_content='Wells Fargo Home Mortgage Consultants in North Dakota | Mortgage, Home Mortgage Loans, Check RatesOpen mobile menuSwitch language toEspañolHome MortgageWells Fargo Home MortgageConsultants in North DakotaSearch by city and state or ZIP codeCity, State/Province, Zip or City & CountrySubmit a search.Use my locationWells Fargo Home Mortgage ConsultantsNDFargoWells Fargo Home Mortgage ConsultantsNDEqual Housing Lender logoEqual Housing LenderWells Fargo Home Mortgage is a division of Wells Fargo Bank, N.A.LRC-0624Privacy, Cookies, Security & LegalDo Not Sell or Share My Personal InformationNotice of Data CollectionGeneral Terms of UseReport FraudSitemapAbout Wells FargoCareersDiversity and AccessibilityHome© 2025 Wells Fargo. NMLSR ID 399801'),
 Document(id='1', metadata={'url': 'https://homeloans.wellsfargo.com/es'}, page_content='Todos los consultores de Wells Fargo Home Mortgage | Mortgage,

In [38]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
documents=text_splitter.split_documents(documents)

In [39]:
#Creating Chroma Vector Database
vector_store = Chroma(
    collection_name="hmcs",
    persist_directory=db_location,
    embedding_function=embeddings
)

In [40]:
len(documents),len(ids)

(833, 124)

In [41]:
documents

[Document(metadata={'url': 'https://homeloans.wellsfargo.com/mortgage/nd'}, page_content='Wells Fargo Home Mortgage Consultants in North Dakota | Mortgage, Home Mortgage Loans, Check RatesOpen mobile menuSwitch language toEspañolHome MortgageWells Fargo Home MortgageConsultants in North DakotaSearch by city and state or ZIP codeCity, State/Province, Zip or City & CountrySubmit a search.Use my locationWells Fargo Home Mortgage ConsultantsNDFargoWells Fargo Home Mortgage ConsultantsNDEqual Housing Lender logoEqual Housing LenderWells Fargo Home Mortgage is a division of Wells Fargo Bank, N.A.LRC-0624Privacy, Cookies, Security & LegalDo Not Sell or Share My Personal InformationNotice of Data CollectionGeneral Terms of UseReport FraudSitemapAbout Wells FargoCareersDiversity and AccessibilityHome© 2025 Wells Fargo. NMLSR ID 399801'),
 Document(metadata={'url': 'https://homeloans.wellsfargo.com/es'}, page_content='Todos los consultores de Wells Fargo Home Mortgage | Mortgage, Home Mortgage L

In [42]:
vector_store.add_documents(documents=documents)

['ceda895e-a8d7-4f9a-8bca-16f1afda374f',
 'e6ef716a-51ab-4ab4-9fde-0541a55d9aab',
 'ea3c8f75-a7ba-4772-97c2-0ce49824e60f',
 '011ac1cc-dd4e-44e9-b237-3e83cfaed669',
 '7b128e23-c6da-4bfd-8b9e-c4137f2ebd73',
 'd0a89e05-56e2-4286-ac64-10f7dc3c4ad3',
 'bcb4e7e3-1280-4a1b-89d0-04cdf570ce14',
 'aee50fac-109e-493b-9a53-cbc7f0f35763',
 '78a79b44-c8cd-4bb9-979d-277f30d8c0d8',
 '6a1d1ece-3731-43fb-ae43-39bb12baf0a5',
 'c6ab4af7-9008-4f03-b606-6fb48435f8ba',
 '76106044-fe58-469f-879b-53cb3908e5a3',
 'ab146ebd-7f7a-46a6-8afb-a85e6e7a17ab',
 '877fac9b-a546-4ed2-8ff8-f0822c10329e',
 '8a70b2c6-7073-4cf6-a63e-6a9a35799166',
 '096c0241-f249-49f2-a385-3f9ccc2a3fe3',
 'ea11a868-db82-4807-9571-575a5f2c005e',
 'c3f256bb-e823-4e6f-bb25-06f7f36decd5',
 '8a369dfe-162b-472f-a50d-da56a910ebcd',
 '6aa81047-f483-47a7-8c3f-67c163579711',
 'e1c925d6-c727-4a73-beaf-0b4ce3517d1b',
 '026b806d-19cf-45d8-98d5-dadfd04f5ef8',
 'bef642cd-88ab-4600-b0db-3e4fbeb30092',
 'e04a7550-05be-4888-9eb6-737601f53081',
 '30e99210-f9e8-

In [43]:
retriever = vector_store.as_retriever(
    search_kwargs={"k": 5})

In [44]:
retriever

VectorStoreRetriever(tags=['Chroma', 'OllamaEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x000002A25DAA4390>, search_kwargs={'k': 5})

In [45]:
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
#from vector import retriever

model = OllamaLLM(model="llama3.2")

In [46]:
#Creating template for QA model
template = """
You are an expert in real estate and mortgage information in US for Wells Fargo Bank.
When a user asks you property information in specific states, 
generate names, contact information, and links to the websites of Home Mortgage Consultants(HMC) 
from the database. Ensure the link that is provided is correct and opens the contact page of the mortgage consultants.
If the link is not the contact page of the mortgage consultant, do not display their names or other details.
Results should be of format:

HMC Name
Email
Phone Number
Link 


Here are some relevant questions: {reviews}

Here is the question to answer: {question}
"""

In [47]:
prompt = ChatPromptTemplate.from_template(template)

In [48]:
chain = prompt | model

In [49]:
while True:
    print("\n\n-------------------------------")
    question = input("Ask your question (q to quit): ")
    print("\n\n")
    if question == "q":
        break
   
    reviews = retriever.invoke(question)
    result = chain.invoke({"reviews": reviews, "question": question})
    print(result)



-------------------------------
Ask your question (q to quit): i want a two bedroom apartment in New York.



Based on your request for a 2-bedroom apartment in New York, I've searched our database and found some information for you.

Here are the results:

1. HMC Name - Jeffery-B-Deshields
Email - [jeffrey.b.deshields@wellsfargo.com](mailto:jeffrey.b.deshields@wellsfargo.com)
Phone Number - Not available publicly
Link - https://web.secure.wellsfargo.com/mortgage/connect/contact-us?siteSuffix=Jeffery-B-Deshields

2. HMC Name - Thomas-J-Sharkey
Email - [thomas.j.sharkey@wellsfargo.com](mailto:thomas.j.sharkey@wellsfargo.com)
Phone Number - Not available publicly
Link - https://web.secure.wellsfargo.com/mortgage/connect/contact-us?siteSuffix=Thomas-J-Sharkey

3. HMC Name - Brian-P-Mcnamara
Email - [brian.p.mcnamara@wellsfargo.com](mailto:brian.p.mcnamara@wellsfargo.com)
Phone Number - Not available publicly
Link - https://web.secure.wellsfargo.com/mortgage/connect/contact-us?siteSuffix