In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin,urlparse
import os

# The crawl in this notebook is seeded with the Wells Fargo home-loans site (see start_urls in a later cell).
# NOTE(review): output_folder is created here, but none of the visible cells write to it — confirm it is still needed.
output_folder = "scraped_data_wf_hmc"

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

In [2]:
from transformers import pipeline

# Summarization pipeline; a later cell uses it to condense long scraped pages into the
# "summary" value stored in each document's metadata.
summarizer = pipeline("summarization", model="Falconsai/text_summarization")




Device set to use cuda:0


In [3]:
# Module-level crawl state that crawl() appends to in place; re-run this cell to start a fresh crawl.
results=[]  # visible text of each fetched page, filled by crawl()
visited=[]  # URLs recorded by crawl(); later used by index as the "url" metadata of each document


def is_valid_url(url):
    """Return True when ``url`` is absolute, i.e. it has both a scheme and a network location."""
    parts = urlparse(url)
    # Relative paths and scheme-only links (e.g. mailto:) carry no host part.
    if not parts.netloc:
        return False
    return bool(parts.scheme)

def extract_visible_text(html):
    """Return the text content of an HTML document as one whitespace-normalised string.

    ``<script>``, ``<style>`` and ``<noscript>`` elements are removed before the
    text is extracted, and every run of whitespace is collapsed to a single space.
    """
    document = BeautifulSoup(html, 'html.parser')

    # Drop elements whose contents are code or fallbacks rather than page text.
    for hidden in document.find_all(['script', 'style', 'noscript']):
        hidden.decompose()

    raw_text = document.get_text(separator=' ', strip=True)
    words = raw_text.split()
    return ' '.join(words)


# URLs whose fetch failed. They are tracked separately from `visited` so that
# `visited[i]` always names the page whose text is stored in `results[i]`.
_failed_urls = set()


def crawl(url, base_url, max_pages=100):
    """Recursively crawl pages under ``base_url`` (depth-first) and collect their visible text.

    For every page fetched successfully, the URL is appended to the module-level
    ``visited`` list and the page's visible text to the module-level ``results``
    list, in the same position, so the two lists stay index-aligned.

    Args:
        url: Page to fetch.
        base_url: Only URLs that contain this string are crawled.
        max_pages: Stop following links once ``results`` holds this many pages.

    Returns:
        None. The function works through its side effects on ``visited``,
        ``results`` and ``_failed_urls``, and prints progress messages.
    """
    # Skip URLs containing '/es/' (presumably the Spanish-language pages — confirm).
    if '/es/' in url:
        return
    if base_url not in url:
        return

    # Treat ".../page" and ".../page/" as the same page, whichever was seen first,
    # and never retry a URL that already failed (each retry can cost a full timeout).
    variants = {url, url + '/', url.rstrip('/')}
    if any(v in visited or v in _failed_urls for v in variants):
        return

    print(f"Crawling {url}")

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except Exception as e:
        # Best-effort crawl: report the failure and carry on with the other links.
        _failed_urls.add(url)
        print(f"Failed to fetch {url}: {e}")
        return

    html_content = response.text
    page_text = extract_visible_text(html_content)
    # Record URL and text together, only after both are available.
    visited.append(url)
    results.append(page_text)

    soup = BeautifulSoup(html_content, 'html.parser')
    for link_tag in soup.find_all('a', href=True):
        if len(results) >= max_pages:
            break
        # Relative hrefs are relative to the page they appear on, not to the site root.
        full_url = urljoin(url, link_tag['href'])
        if is_valid_url(full_url):
            crawl(full_url, base_url, max_pages)

In [26]:
# Seed URL for the crawl. It is also passed as base_url, so crawl() only follows
# URLs that contain this string.
start_urls=['https://homeloans.wellsfargo.com/']
crawl(start_urls[0],start_urls[0])

In [7]:
# Build one summary per crawled page, in the same order as `results`.
# Pages under 500 words are kept verbatim as their own summary; longer pages are
# cut to their first 512 words and condensed by the summarization pipeline.
summaries = []
for page_text in results:
    words = page_text.split()
    if len(words) < 500:
        summaries.append(page_text)
        continue
    generated = summarizer(" ".join(words[:512]), max_length=500, min_length=30, do_sample=False)
    summaries.append(generated[0]['summary_text'])

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [8]:
import torch

# Run on the GPU when PyTorch reports CUDA support; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

print(f"Using device: {device}")


Using device: cuda


In [9]:
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
import os
import pandas as pd

In [10]:
# Ollama embedding model; passed to the Chroma vector store as its embedding function.
embeddings = OllamaEmbeddings(model="llama3.2")

In [11]:
# Directory handed to Chroma as persist_directory for the vector store.
db_location = "./chrome_langchain_db/hmcs info"
#add_documents = not os.path.exists(db_location)

In [12]:
#Creating Documents from scraped data

# One Document per crawled page: the page text is the content, and the page URL and
# summary are stored as metadata. The id is the page's position in `results`.
# NOTE: visited[i] is assumed to be the URL of results[i]; that only holds if every
# URL recorded in `visited` was fetched successfully.
if len(summaries) != len(results):
    # A mismatch means the summary cell was not re-run after the last crawl.
    raise ValueError(
        f"summaries ({len(summaries)}) and results ({len(results)}) are out of sync; "
        "re-run the summarization cell"
    )

documents = []
ids = []

for i, (page_text, page_url, page_summary) in enumerate(zip(results, visited, summaries)):
    doc_id = str(i)
    documents.append(
        Document(
            page_content=page_text,
            metadata={"url": page_url, "summary": page_summary},
            id=doc_id,
        )
    )
    ids.append(doc_id)

In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Split every page-level document into chunks (chunk_size=1000, chunk_overlap=200).
# NOTE: this rebinds `documents` from one entry per page to one entry per chunk, so
# it no longer lines up one-to-one with `ids`.
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
documents=text_splitter.split_documents(documents)

In [14]:
#Creating Chroma Vector Database
vector_store = Chroma(
    collection_name="hmcsinfo",
    persist_directory=db_location,
    embedding_function=embeddings
)

In [15]:
# Sanity check: `documents` now holds the chunks produced by the splitter, while
# `ids` still has one entry per crawled page, so the two counts differ.
len(documents),len(ids)

(636, 100)

In [25]:
# The chunks carry no ids of their own, so without explicit ids every run of this
# cell stores another copy of each chunk under a fresh random id. Deterministic
# per-chunk ids make a re-run overwrite the same entries instead.
# NOTE: entries left over from an earlier, larger crawl are not removed by this.
chunk_ids = [f"chunk-{n}" for n in range(len(documents))]
vector_store.add_documents(documents=documents, ids=chunk_ids)

In [18]:
# Retriever over the vector store, configured to return 5 results per query (k=5).
retriever = vector_store.as_retriever(
    search_kwargs={"k": 5})

In [19]:
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
#from vector import retriever

# Ollama-served LLM; piped after the prompt template to form the QA chain.
model = OllamaLLM(model="llama3.2")

In [31]:
#Creating template for QA model
# Placeholders: {reviews} receives the documents returned by the retriever (despite the
# "relevant questions" wording in the prompt) and {question} receives the user's input.
template = """
You are an expert in real estate and mortgage information in US for Wells Fargo Bank.
When a user asks you property or mortgage rate information in specific states, 
generate names, contact information, links to the websites of Home Mortgage Consultants(HMC), and their reviews by customers.
from the database. The page content is the entire text of the webpage, the summary is the summary of the page,
url is the link to the page. Ensure the link that is provided is correct and opens the contact page of the mortgage consultants.
If the link is not the contact page of the mortgage consultant, do not display their names or other details.
Make sure that the address of the HMC is in the same state as requested by the user.
Make sure the email ids don't have unnecessary space and the information is accurate.
Results should be of format:

HMC Name
Email
Link 
Reviews


Here are some relevant questions: {reviews}

Here is the question to answer: {question}
"""

In [32]:
# Prompt object built from the template string; it exposes the {reviews} and {question} placeholders.
prompt = ChatPromptTemplate.from_template(template)

In [33]:
# QA chain: the filled-in prompt is piped into the LLM.
chain = prompt | model

In [34]:
# Interactive question loop: retrieve the chunks relevant to each question and let
# the chain answer from them. Enter "q" to stop.
while True:
    print("\n\n-------------------------------")
    # Strip surrounding whitespace so that "q " still quits.
    question = input("Ask your question (q to quit): ").strip()
    print("\n\n")
    if question.lower() == "q":
        break
    if not question:
        # A blank line has nothing to retrieve for; ask again instead of querying the model.
        continue

    reviews = retriever.invoke(question)
    result = chain.invoke({"reviews": reviews, "question": question})
    print(result)



-------------------------------
Ask your question (q to quit): i'm looking for a mortgage for a house in California



Based on the provided information, I've generated the names, contact information, links to the websites of Home Mortgage Consultants (HMC), and their reviews by customers for mortgage rate information in California.

 HMC Name
Email
Link 
Reviews


Coreen Jamison
coreen.jamison@wellsfargo.com
https://homeloans.wellsfargo.com/mortgage/ca/corte-madera/coreen
4.5/5 stars on Google, 4.9/5 stars on Yelp

Ed Walters
edward.j.walters@wellsfargo.com
https://homeloans.wellsfargo.com/mortgage/ca/corte-madera/edward-j-walters
4.8/5 stars on Google, 5/5 stars on Zillow


-------------------------------
Ask your question (q to quit): i'm looking for a mortgage for a house in Arizona.



Based on the information provided, I've generated some names, contact information, links to the websites of Home Mortgage Consultants (HMC), and their reviews by customers in Arizona.

Please note