# Import library

In [2]:
# dataframe
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D

# clustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans, DBSCAN
import scipy.cluster.hierarchy as shc

# model preparation
from sklearn import preprocessing as ppr
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

# modeling
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier   
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE

# model evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import *

In [3]:
import os
from dotenv import load_dotenv
dotenv_path = "../backend/.env"
load_dotenv(dotenv_path=dotenv_path)

True

In [4]:
import os

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", default=None)

# Explore Data + chunking 

In [5]:
DATA_CATEGORIZE_URL = {
    "interest_rate" : [
      "https://www.cnet.com/personal-finance/mortgage-rate-predictions-holiday-week-brings-higher-rates/",
      "https://finance.yahoo.com/news/15-countries-highest-mortgage-rates-210146206.html"
    ],
    "market_trends_collection" : ["https://www.linkedin.com/pulse/2024-mortgage-market-review-key-insights-trends-shaped-year-kexwe/",
                                  "https://themortgagereports.com/116167/2024-housing-market-recap",
                                  "https://www.bankrate.com/real-estate/housing-trends/",
                                  "https://www.freddiemac.com/research/forecast/20241126-us-economy-remains-resilient-with-strong-q3-growth#spotlight",
                                  "/home/quochungtran/Desktop/ML_project/LLM_project/data/pdf/cfpb_2023-mortgage-market-activity-and-trends_2024-12.pdf"
                                ],
    "eligibility" : [
        "https://www.hdfc.com/home-loan-eligibility-calculator",
        "https://www.icicibank.com/calculator/home-loan-eligibility-calculator#:~:text=When%20applying%20for%20a%20home%20loan%2C%20your%20salary%20is%20crucial,your%20Home%20Loan%20journey%20effectively.",
        "https://www.hdfc.com/blog/home-finance/understanding-home-loan-eligibility#:~:text=1.,Your%20overall%20personal%20profile%20viz.",
    ],
    "financial_choice": [
      "https://agrimhfc.com/home-loan-balance-transfer-or-top-up-loan/",
      "https://agrimhfc.com/home-loan-under-construction-property-benefits/"
    ],
    "refinancing": [
        "https://www.athena.com.au/learn/requirements-for-home-loan-refinancing",
        "https://www.investopedia.com/mortgage/refinance/when-and-when-not-to-refinance-mortgage/#:~:text=Since%20refinancing%20can%20cost%20between,when%20it's%20better%20to%20wait."
    ]
}

In [6]:
import requests
from bs4 import BeautifulSoup
from llama_index.core import SummaryIndex
from llama_index.readers.web import SimpleWebPageReader
from llama_index.readers.web import SpiderWebReader
from IPython.display import Markdown, display
import os

In [7]:
DATA_CATEGORIZE_URL.keys()

dict_keys(['interest_rate', 'market_trends_collection', 'eligibility', 'financial_choice', 'refinancing'])

In [10]:
from llama_index.readers.web import FireCrawlWebReader
from llama_index.readers.file import PDFReader
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.core.node_parser import SentenceSplitter
import time 

def DocumentReader(url, reader):
    return reader.load_data(url)

def get_pattern(url):
    """
    Determines the type of resource based on the URL.
    """
    if url.endswith(".pdf") or "pdf" in url:
        return "pdf"
    elif url.startswith("http://") or url.startswith("https://"):
        return "http"
    elif os.path.isfile(url):
        # Handles local file paths
        if url.endswith(".pdf"):
            return "pdf"
        else:
            return "unknown"
    else:
        return "unknown"

def open_ai_summarize_home_loan(doc_content):
    user_prompt = f"""
        You are an expert assistant specializing in financial topics, particularly home loans. 
        Summarize the text content below, ensuring it includes all relevant information about home loans, such as statistical data, key figures, and actionable insights. 

        Ensure the summary is:
        - Concise, avoiding unnecessary repetition.
        - Well-organized using sections or bullet points for clarity.
        - Within the token limit of 8190.

        Text content:
        {doc_content}
    """
    openai_messages = [
        {"role": "system", "content": "You are a highly intelligent assistant specializing in financial topics, dedicated to providing actionable and clear insights about home loans."},
        {"role": "user", "content": user_prompt}
    ]
    summarized_txt = openai_chat_complete(openai_messages)['content']
    print("Home Loan Insights Summary: ", summarized_txt)
    return summarized_txt

def loadDoc(url):
    print("Load data from url :", url)
    reader=None
    if get_pattern(url) == "pdf":
        reader = PDFReader()

    elif get_pattern(url) == "http":
        FIRE_CRAWL_API_KEY = os.environ.get("FIRE_CRAWL_API_KEY", default=None)
        reader = FireCrawlWebReader(
            api_key=FIRE_CRAWL_API_KEY,
            mode="crawl"
        )
    
    return DocumentReader(url, reader)

def loadData(url):
    print("Load data from url :", url)
    reader=None
    parser=None
    if get_pattern(url) == "pdf":
        reader = PDFReader()
        parser = SentenceSplitter(
            paragraph_separator="\n",
            chunk_size=1000,
            chunk_overlap=200,
        )
    elif get_pattern(url) == "http":
        FIRE_CRAWL_API_KEY = os.environ.get("FIRE_CRAWL_API_KEY", default=None)
        reader = FireCrawlWebReader(
            api_key=FIRE_CRAWL_API_KEY,
            mode="crawl"
        )
        parser = MarkdownNodeParser()

    parsed_nodes = parser.get_nodes_from_documents(DocumentReader(url, reader=reader))
    for i, node in enumerate(parsed_nodes):
        doc_content = node.get_content()
        if(len(doc_content) > 5000):
            node.set_content(open_ai_summarize_home_loan(doc_content))

    return parsed_nodes

def documentsLogging(documents):
    for i, node in enumerate(documents):
        print(f"""Doc Content     :{node.get_content()}""")
        print("===================================================")

In [45]:
pdfReader = PDFReader()
pdf_documents = DocumentReader("/home/quochungtran/Desktop/ML_project/LLM_project/data/pdf/cfpb_2023-mortgage-market-activity-and-trends_2024-12.pdf", pdfReader)
from llama_index.core.node_parser import SentenceSplitter
def split_text(
    documents,
    paragraph_separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
):
    text_splitter = SentenceSplitter(
        paragraph_separator=paragraph_separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.get_nodes_from_documents(documents)

texts = split_text(pdf_documents)

In [125]:
texts[0:5]

[TextNode(id_='49acda29-b0bf-4626-b316-72806e6eef59', embedding=None, metadata={'page_label': '1', 'file_name': 'cfpb_2023-mortgage-market-activity-and-trends_2024-12.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='8d1d5616-7738-4a69-90f5-d41cfb9c61cf', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'cfpb_2023-mortgage-market-activity-and-trends_2024-12.pdf'}, hash='e4784e68fbf46befc93c02da7e044922a283e618cf0bb4bc004ca943a081dc9c')}, text='1 \n \n      CONSUMER FINANCIAL PROTECTION BUREAU   |   DECEMBER  2024 \n \n \n \n2023 Mortgage Market \nActivity and Trends', mimetype='text/plain', start_char_idx=0, end_char_idx=117, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 TextNode(id_='e08cca3e-5826-40c9-936a-96ad52749738', embedding=None, metadata={'page_label': '2', 'file_name': 'cfpb_2023-mortgage-mar

In [126]:
documentsLogging(texts[0:5])

Doc Content     :1 
 
      CONSUMER FINANCIAL PROTECTION BUREAU   |   DECEMBER  2024 
 
 
 
2023 Mortgage Market 
Activity and Trends
Doc Content     :2 2023 MORTGAGE MARKET ACTIVITY AND TREND S 
 This is another in an occasional series of publications from the Consumer Financial Protection 
Bureau’s Office of Research. These publications are intended to further the CFPB’s objective of 
providing an evidence -based perspective on consumer financial markets, consumer behavior, 
and regulations to inform the public discourse.  See 12 U.S.C. §5493(d).
Doc Content     :3  2023 MORTGAGE MARKET ACTIVITY AND TRENDS  
 Table of contents 
 
Table of contents ......................................................................................................... 3 
1. Introduction  ............................................................................................................. 4 
2. Mortgage applications and originations  ...........................................................

In [8]:
FIRE_CRAWL_API_KEY = os.environ.get("FIRE_CRAWL_API_KEY", default=None)
webReader = FireCrawlWebReader(
    api_key=FIRE_CRAWL_API_KEY,
    mode="crawl"
)

web_documents = DocumentReader(DATA_URL[1], webReader)

def chunking_markdown(markdown_content):
    parser = MarkdownNodeParser()
    return parser.get_nodes_from_documents(markdown_content)

md_chunk_nodes = chunking_markdown(web_documents)

In [127]:
documentsLogging(md_chunk_nodes[0:2])

Doc Content     :![Gavin Harrigan pointing at a document showing current home loan industry trends](https://www.quantumfinance.com.au/wp-content/uploads/2023/12/current-home-loan-industry-trends.jpg)06Dec2023

With my extensive experience in the industry spanning nearly two decades, I’m here to guide you through the complexities of the current [home loan](https://www.quantumfinance.com.au/home-loans/) industry, property, and consumer behaviour trends in Australia as we head into 2024.

This blog breaks down these topics into simple, easy-to-understand content, drawing from my years of expertise and expert analysis.

Whether you’re buying a home, investing, or just curious about the market, I’ve broken down the latest trends to help you stay up-to-date.
Doc Content     :## Key Takeaways

- **Home Loan Trends**: Growth in digital lending, increasing importance of mortgage brokers, a shift towards refinancing, and changes in borrowing power.
- **Property Market Trends**: Variable house pr

# Indexing into vector database 

In [54]:
from openai import OpenAI
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams
import json


OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", default=None)
DEFAULT_COLLECTION_NAME = "mock_home_loan_faq_collection"

def get_openai_client():
    return OpenAI(api_key=OPENAI_API_KEY)

def get_qdrant_client():
    return QdrantClient("http://localhost:6333")

client        = get_openai_client()
qdrant_client = get_qdrant_client()

def get_embedding(text, model="text-embedding-3-small"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding

def create_collection(name, vector_size=1536, distance_op=Distance.DOT):
        if not qdrant_client.collection_exists(name):     
            return qdrant_client.create_collection(
                collection_name=name,
                vectors_config=VectorParams(size=vector_size, distance=distance_op)
            )

def search_vector(collection_name: str, query_vector: list, limit: int = 4):
        results = qdrant_client.search(
            collection_name=collection_name, query_vector=query_vector, limit=limit
        )
        return [x.payload for x in results]

def add_vectors(collection_name: str, vectors: dict):
        points = [
            PointStruct(id=k, vector=v["vector"], payload=v["payload"])
            for k, v in vectors.items()
        ]
        return qdrant_client.upsert(collection_name=collection_name, points=points, wait=True)

def add_doc_to_vector_db(doc_instance, collection_name=DEFAULT_COLLECTION_NAME):
    if doc_instance.get_content():
        vector = get_embedding(doc_instance.get_content())
        print(f"Embedding {doc_instance.get_content()} to vector")
        add_vectors(
            collection_name,
            {
                doc_instance.id_: {
                    "vector": vector,
                    "payload": {
                        "content": doc_instance.get_content()
                    }
                }
            }
        )
    else:
        print("Title and content is null")


def gen_doc_prompt(docs):
    """
    Document:
    Title: Uong atiso ...
    Content: ....
    """
    doc_prompt = ""
    for doc in docs:
        doc_prompt += f"Content: {doc['content']} \n"

    return "Document: \n + {}".format(doc_prompt)

# dict_keys(['interest_rate', 'market_trends_collection', 'eligibility', 'financial_choice', 'refinancing'])
def detect_collection(message):

    user_prompt = f"""
    Given the following the user's latest message, determine whether the user's intent is to ask for with topic 
    - "interest_rate" :  
    - "market_trends_collection": 
    - "eligibility": 
    - "financial_choice":
    - "refinancing":
    Latest User Message:
    {message}

    Classification (choose one or more related topic amongs "interest_rate", "market_trends_collection", "eligibility", "financial_choice", "refinancing"):
    Always return a list of topic, fox example:

    ["interest_rate","market_trends_collection"]
    """
    openai_messages = [
        {"role": "system", "content": "You are a highly intelligent assistant that helps classify customer queries"},
        {"role": "user", "content": user_prompt}
    ]
    
    return openai_chat_complete(openai_messages)


def openai_chat_complete(messages=(), model="gpt-4o-mini", raw=False):
    print("Chat complete for {}".format(messages))
    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    if raw:
        return response.choices[0].message
    output = response.choices[0].message
    return {
        "role": "assistant",
        "content": str(output.content)
    }

def process_string_to_list(input_string):
    return json.loads(input_string)

def answer_user_request(message):
    # Embedding text
    vector = get_embedding(message)
    print(f"Get vector of input {message}")

    collections_answer = detect_collection(message)
    collections        = process_string_to_list(collections_answer["content"])
    print(f"Get related topic : {collections}")
    # Search document
    for collection in collections:
        top_docs = search_vector(collection, vector, 4)

    gen_docs = gen_doc_prompt(top_docs)

    openai_messages = [
        {"role": "system", "content": "You are an amazing virtual assistant"},
        {"role": "user", "content": gen_docs},
        {"role": "user", "content": message},
    ]
    print(f"Openai messages: {openai_messages}")

    assistant_answer = openai_chat_complete(openai_messages)

    print(f"""{assistant_answer}""")
    return assistant_answer, gen_docs

In [5]:
# create collection in qdrant vectordb
create_collection(DEFAULT_COLLECTION_NAME)

In [41]:
# embedding document and add to vector db
for node in md_chunk_nodes+texts:
    add_doc_to_vector_db(node)

Embedding ![Gavin Harrigan pointing at a document showing current home loan industry trends](https://www.quantumfinance.com.au/wp-content/uploads/2023/12/current-home-loan-industry-trends.jpg)06Dec2023

With my extensive experience in the industry spanning nearly two decades, I’m here to guide you through the complexities of the current [home loan](https://www.quantumfinance.com.au/home-loans/) industry, property, and consumer behaviour trends in Australia as we head into 2024.

This blog breaks down these topics into simple, easy-to-understand content, drawing from my years of expertise and expert analysis.

Whether you’re buying a home, investing, or just curious about the market, I’ve broken down the latest trends to help you stay up-to-date. to vector
Embedding ## Key Takeaways

- **Home Loan Trends**: Growth in digital lending, increasing importance of mortgage brokers, a shift towards refinancing, and changes in borrowing power.
- **Property Market Trends**: Variable house price 

In [53]:
# try to look at the 
question = "What are the current interest rates and market trends in 2021?"
answer_user_request(question)

Get vector of input What are the current interest rates and market trends in 2021?
Chat complete for [{'role': 'system', 'content': 'You are a highly intelligent assistant that helps classify customer queries'}, {'role': 'user', 'content': '\n    Given the following the user\'s latest message, determine whether the user\'s intent is to ask for with topic \n    - "interest_rate" :  \n    - "market_trends_collection": \n    - "eligibility": \n    - "financial_choice":\n    - "refinancing":\n    Latest User Message:\n    What are the current interest rates and market trends in 2021?\n\n    Classification (choose one or more related topic amongs "interest_rate", "market_trends_collection", "eligibility", "financial_choice", "refinancing"):\n    Always return a list of topic, fox example:\n\n    ["interest_rate","market_trends_collection"]\n    '}]
Get related topic : ['interest_rate', 'market_trends_collection']
[{'content': '## 1\\. 2024 Mortgage Market Overview\n\nThe U.S. mortgage marke

{'role': 'assistant',
 'content': 'In 2021, the average interest rates for 30-year fixed mortgage loans were relatively low compared to subsequent years. According to the data provided in the document, the median interest rates for each month of 2021 were as follows:\n\n- January: 2.75%\n- February: 2.75%\n- March: 2.875%\n- April: 3.125%\n- May: 3.125%\n- June: 3%\n- July: 3%\n- August: 2.99%\n- September: 2.989%\n- October: 2.99%\n- November: 3.125%\n- December: 3.125%\n\nThese interest rates indicate a period of historically low borrowing costs, which contributed to increased demand for home purchases. The affordability challenge that emerged in later years, particularly in 2022 and beyond, was not as pronounced in 2021 due to these lower rates.\n\nOverall, the housing market in 2021 was characterized by high demand driven by low mortgage rates, leading to rising home prices as buyers competed for limited inventory.'}

In [33]:
name_collections = DATA_CATEGORIZE_URL.keys()
print(name_collections)
for collect_name in name_collections:
    qdrant_client.delete_collection(collect_name)

dict_keys(['interest_rate', 'market_trends_collection', 'eligibility', 'financial_choice', 'refinancing'])


In [34]:
## whole data pipeline ingestion:
import time 

for col_name, given_urls in DATA_CATEGORIZE_URL.items():
    print("collection name", col_name)
    create_collection(col_name)

    nodes = []
    for url in given_urls:
        nodes += loadData(url)
        time.sleep(30)

    for node in nodes:
        add_doc_to_vector_db(node, col_name)

collection name interest_rate
Load data from url : https://www.cnet.com/personal-finance/mortgage-rate-predictions-holiday-week-brings-higher-rates/
Chat complete for [{'role': 'system', 'content': 'You are a highly intelligent assistant specializing in financial topics, dedicated to providing actionable and clear insights about home loans.'}, {'role': 'user', 'content': '\n        You are an expert assistant specializing in financial topics, particularly home loans. \n        Summarize the text content below with the following focus:\n        1. **Key Details**: Include all relevant information about home loans, such as loan types, eligibility criteria, interest rates, fees, etc, and statistical data (e.g., percentages, thresholds, or averages).\n        2. **Actionable Insights**: Provide tips and strategies for borrowers to improve their chances of approval, reduce costs, and make informed decisions.\n        3. **FAQs**: Generate a list of frequently asked questions and their answe

# Evaluation

## Build golden dataset

In [None]:
import pandas as pd
from llama_index.core.evaluation import FaithfulnessEvaluator
from llama_index.core.evaluation import RelevancyEvaluator

# Creating the dataset with the categorized questions
data = []

# Adding 150 categorized questions
collections = [
    ("interest_rate", [
        "What are the current interest rates for home loans in 2024?",
        "How have home loan interest rates changed over the past year?",
        "What factors influence home loan interest rates?",
        "How do interest rates vary across different states or regions?",
        "What is the impact of inflation on home loan interest rates?",
        "Which countries or areas with highest interest rates?",
        "How can I ensure a lower mortgage rate?"
    ]),
    ("eligibility", [
        "What is the minimum credit score needed to qualify for a home loan?",
        "What documents are required to apply for a home loan?",
        "How does my debt-to-income ratio affect my loan eligibility?",
        "Are there specific home loan options for self-employed individuals?",
        "Can someone with a poor credit score get a home loan?",
        # co-applicant
        # what to do when buyers can not pay the debt
    ]),
    ("market_trends", [
        "Which states have the highest home loan demand in 2024?",
        "How do home loan rates differ in urban vs. rural areas?",
        "What are the top regions for refinancing activity in 2023?",
        "How does the housing market impact home loan trends in California?",
        "Are there specific challenges to getting a home loan in high-cost areas?"
        # apartement/ land/ purchase
    ]),
    ("refinancing_policy", [
        "What are the current trends in home loan refinancing?",
        "How does a drop in interest rates affect refinancing activity?",
        "Are there penalties for refinancing a home loan early?",
        "What are the steps to refinance a home loan?",
        "How does refinancing impact my overall loan cost?"
    ]),

]
    # ("assess_value", [
        
    # ]),
llm = OpenAI(model="gpt-4o")
faithful_evaluator  = FaithfulnessEvaluator(llm)
relevancy_evaluator = RelevancyEvaluator(llm)

# Expand to full dataset
for collection_name, questions in collections:
    for question in questions:
        rag_response, source_docs = answer_user_request(question)

        faithful_eval_res = faithful_evaluator.evaluate(
            response=rag_response['content'],
            contexts=[source_docs]
        )
        
        relevancy_eval_res = relevancy_evaluator.evaluate(
            query=question,
            response=rag_response['content'],
            contexts=[source_docs]
        )

        data.append({"collection": collection_name, 
                     "question": question, 
                     "rag_response":rag_response['content'],
                     "source": source_docs[:1000],
                     "Faithfulness Evaluation Result": "Pass" if faithful_eval_res.passing else "Fail",
                     "Relevancy Evaluation Result": "Pass" if relevancy_eval_res.passing else "Fail",
                    }
        )

# Convert to DataFrame
df = pd.DataFrame(data)

# Save to CSV
file_path = "home_loan_trending_faq.csv"
df.to_csv(file_path, index=False)

file_path

In [None]:
from llama_index.core.evaluation import DatasetGenerator
import time
docs = []
for _, given_urls in DATA_CATEGORIZE_URL.items():
    for url in given_urls:
        docs += loadDoc(url)
        time.sleep(30)

question_generator = DatasetGenerator.from_documents(docs)
# eval_questions = question_generator.generate_questions_from_nodes(5)

# eval_questions

Load data from url : https://www.cnet.com/personal-finance/mortgage-rate-predictions-holiday-week-brings-higher-rates/
Load data from url : https://finance.yahoo.com/news/15-countries-highest-mortgage-rates-210146206.html
Load data from url : https://www.linkedin.com/pulse/2024-mortgage-market-review-key-insights-trends-shaped-year-kexwe/
Load data from url : https://themortgagereports.com/116167/2024-housing-market-recap
Load data from url : https://www.bankrate.com/real-estate/housing-trends/
Load data from url : https://www.freddiemac.com/research/forecast/20241126-us-economy-remains-resilient-with-strong-q3-growth#spotlight
Load data from url : /home/quochungtran/Desktop/ML_project/LLM_project/data/pdf/cfpb_2023-mortgage-market-activity-and-trends_2024-12.pdf
Load data from url : https://www.hdfc.com/home-loan-eligibility-calculator
Load data from url : https://www.icicibank.com/calculator/home-loan-eligibility-calculator#:~:text=When%20applying%20for%20a%20home%20loan%2C%20your%20

In [115]:
# Set display options
pd.set_option('display.max_colwidth', None)  # This removes the limit on column width
pd.set_option('display.max_rows', None)  # If needed, show all rows
df.head()

Unnamed: 0,collection,question,rag_response,source,Faithfulness Evaluation Result,Relevancy Evaluation Result
0,interest_rate,What are the current interest rates for home loans in 2024?,"As of January 2024, the average mortgage rate for a 30-year fixed-rate mortgage is around 6.6%, which is a decline from a high of 7.79% in October 2023. Projections suggest that mortgage rates could drop to lower than 6% within the year, particularly as consumer demand is expected to increase during the spring season. However, experts indicate that rates may hold above 6.5% until early 2025.","Document: \n + Content: ## **""We Are Heading into A Housing Renaissance""**\n\nOn January 29, Jim Tobin, Chief Executive Officer of the National Association of Home Builders, appeared in an [interview](https://www.youtube.com/watch?v=KPLRPwZfuJQ) on _Yahoo Finance,_ where he discussed the outlook for mortgage rates in 2024. Tobin said that over the next six months, mortgage rates are expected to decline steeply. According to Freddie Mac, mortgage rates are closer to 6.5% than they are to 7%. Moreover, as consumer demand is expected to increase in the spring season, the outlook for housing ahead of 2024, especially over the first six months seems to be positive. Over the past few months, mortgage rates have declined by one full percentage point. Other important catalysts expected to shape the housing outlook include the vast builder incentives, including price cuts and the expected drop in mortgage rates to lower than 6%.\n\nOn January 25, Reuters [reported](https://www.reuters.com/markets/us",Pass,Pass
1,interest_rate,How have home loan interest rates changed over the past year?,"Over the past year, home loan interest rates have exhibited a significant upward trend. In January 2023, the monthly average interest rate for 30-year fixed-rate closed-end conventional conforming loans originated to prime borrowers was around 6.238%. By December 2023, this rate had risen to 7.169%, indicating a steady increase in interest rates throughout the year. This rise in interest rates has contributed to higher monthly mortgage payments for borrowers and has impacted overall mortgage market activity, including a decline in both home purchase and refinance loan applications and originations.","Document: \n + Content: ### Rate Fluctuations and Their Impact on Affordability\n\nThe mortgage market in 2024 was heavily influenced by interest rate fluctuations, which, though modest, had significant implications for affordability and loan demand. The Federal Reserve maintained a relatively high federal funds rate throughout the year, aiming to stabilize inflation without drastically impacting economic growth([10)](https://www.federalreserve.gov/monetarypolicy/2024-03-mpr-summary.htm). As a result, the average 30-year fixed mortgage rate hovered around 6.8%([11)](https://www.freddiemac.com/pmms). While this was a slight improvement from 2023’s highs, it remained well above pre-pandemic levels, making home financing less affordable for many borrowers.\n\nThese elevated rates placed downward pressure on both purchasing power and housing affordability. For example, a buyer with a 6.8% interest rate on a $400,000 mortgage faced monthly principal and interest payments around $2,600, compared",Pass,Pass
2,interest_rate,What factors influence home loan interest rates?,"Home loan interest rates are influenced by several key factors, including:\n\n1. **Federal Reserve Policies**: The Federal Reserve's decisions regarding interest rates impact mortgage rates. Changes in the benchmark short-term interest rate can lead to fluctuations in long-term rates, such as those for mortgages.\n\n2. **Investor Expectations**: Market expectations about future economic conditions, including inflation and growth, can influence mortgage rates. If investors anticipate rising inflation, they may demand higher yields on bonds, which in turn can push mortgage rates up.\n\n3. **10-Year Treasury Yields**: Mortgage rates closely track the yields on 10-year Treasury notes. As these yields rise or fall, mortgage rates tend to follow suit.\n\n4. **Economic Indicators**: Key economic indicators such as inflation rates, unemployment rates, and job market performance can also affect mortgage rates. Strong indicators may lead to higher rates, while weak indicators may result in lower rates.\n\n5. **Geopolitical Events**: Political instability, military conflicts, and elections can create economic uncertainty, which can lead to volatility in mortgage rates.\n\n6. **Credit Score and Financial Profile**: A borrower's credit score, debt-to-income ratio, and overall financial health can influence the interest rates offered by lenders. Borrowers with higher credit scores typically qualify for lower rates.\n\n7. **Loan Characteristics**: The type of loan—such as fixed-rate vs. adjustable-rate mortgages, as well as the size of the loan and down payment—can impact the interest rate. Larger down payments and lower-risk loans often receive better rates.\n\n8. **Market Competition**: The level of competition among lenders may also affect mortgage rates, as lenders may offer more attractive rates to attract borrowers. \n\nThese factors collectively create a dynamic environment that influences mortgage interest rates at any given time.","Document: \n + Content: ### Key Details:\n- Mortgage rates have increased, with the average 30-year fixed rate rising to around 7% following the Fed's interest rate cut.\n- Mortgage rates are influenced by investor expectations and the yield on the 10-year Treasury.\n- In the near term, rates are projected to fluctuate between 5.75% and 7.25% according to HousingWire's 2025 forecast.\n- Sample APRs for a 30-year fixed mortgage range from 6.29% to 6.33% with associated fees.\n\n### Actionable Insights:\n- Monitor economic indicators such as inflation and job market data to gauge the potential direction of mortgage rates.\n- Consider locking in a rate if you find an offer within your budget to protect against potential rate increases.\n- Improve your credit score to access better rates or consider making a larger down payment to reduce overall interest costs.\n- Seek pre-approval before house hunting to understand your borrowing capacity and negotiate confidently.\n\n### FAQs:\n1. **What factors influ",Pass,Pass
3,interest_rate,How do interest rates vary across different states or regions?,"Interest rates can vary across different states or regions due to several factors, including:\n\n1. **Local Economic Conditions**: States with stronger economies often have lower interest rates because lenders feel more secure in borrowers' ability to repay loans. Conversely, regions facing economic challenges may see higher rates to compensate for increased risk.\n\n2. **Cost of Living**: Areas with a higher cost of living might have higher interest rates. Lenders take into account the financial pressures borrowers face, which can differ by location.\n\n3. **Housing Market Dynamics**: In regions with rapidly appreciating home prices, lenders may adjust rates based on perceived risk and competition among borrowers, leading to variations in mortgage rates.\n\n4. **State Regulations**: Some states have unique regulations that affect lending practices, which can influence interest rates. For instance, states may impose usury laws that limit the maximum interest rate lenders can charge.\n\n5. **Local Competition**: Regions with a higher number of lenders can experience lower rates due to increased competition among financial institutions. Conversely, in areas with fewer lenders, rates may be higher due to less competition.\n\n6. **Property Taxes and Insurance Costs**: Locations with high property taxes or insurance costs can lead to higher overall borrowing costs, indirectly affecting interest rates offered by lenders.\n\n7. **Credit Unions and Regional Banks**: Local financial institutions may offer more competitive rates compared to national banks, reflecting their understanding of the regional market and borrower needs.\n\n8. **Loan Type and Purpose**: Different regions may have varying demand for types of loans (e.g., conventional, FHA, VA loans), which can influence the rates lenders provide.\n\nAs a result of these factors, it's essential for borrowers to shop around and compare offers from various lenders in their specific region to find the best rates available.","Document: \n + Content: # 30-Year Mortgage Rates vs. Federal Funds Rate (July-December 2024)\n\nAverage 30-year fixed mortgage rate\n\nFederal funds rate\n\n5%6%7%7/11/20247/17/20247/23/20247/29/20248/2/20248/8/20248/14/20248/20/20248/26/20248/30/20249/5/20249/11/20249/17/20249/23/20249/27/202410/3/202410/9/202410/15/202410/21/202410/25/202410/31/202411/6/202411/12/202411/18/202411/22/202411/28/202412/4/202412/10/202412/16/202412/20/2024\n\nSource: Bankrate, Federal Reserve \nContent: ## Why are mortgage rates higher after the Fed's rate cut?\n\nThe recent surge in longer-term Treasury yields and home loan rates was due in large part to the Fed's newly updated [Summary of Economic Projections](https://www.federalreserve.gov/monetarypolicy/files/fomcprojtabl20241218.pdf ""(opens in a new window)""), outlining expectations for just two 0.25% interest rate cuts in 2025, down from four previously.\n\nTo maintain [maximum employment](https://www.cnet.com/personal-finance/what-todays-jobs-report-means-for-m",Fail,Fail
4,interest_rate,What is the impact of inflation on home loan interest rates?,"Inflation significantly affects home loan interest rates in several ways:\n\n1. **Federal Reserve Response**: In response to rising inflation, the Federal Reserve often raises the federal funds rate to cool the economy. Higher federal funds rates generally lead to increased mortgage rates as lenders pass on the higher costs of borrowing to consumers.\n\n2. **Investor Expectations**: Mortgage rates are influenced by investor expectations about the economy and future inflation. If investors anticipate higher inflation, they may demand higher yields on bonds, leading to increased mortgage rates.\n\n3. **General Economic Conditions**: Inflation can cause higher living costs, which in turn affects household budgets. As consumers allocate more of their income towards essential expenses, the amount they can afford for mortgage payments may decrease, impacting overall demand for home loans.\n\n4. **Current Rate Environment**: In the current context, even though inflation has shown signs of moderation, it remained above the Federal Reserve's target, prompting the central bank to maintain elevated interest rates. This situation continues to sustain higher mortgage rates, presenting challenges for homebuyers seeking affordability.\n\n5. **Historical Context**: Historically, mortgage rates tend to be higher during periods of high inflation. For example, after a significant inflation surge, mortgage rates rose sharply, contributing to lower homebuying demand due to affordability concerns.\n\nIn summary, inflation impacts mortgage interest rates primarily through the actions of the Federal Reserve, investor expectations, and overall economic conditions, influencing both the rates themselves and the broader housing market dynamics.","Document: \n + Content: ### Inflation’s Role in Mortgage Rates and Borrower Concerns\n\nInflation continued to play a critical role in shaping mortgage rates in 2024. Although inflation showed signs of moderating, it remained above the Federal Reserve’s target, prompting the central bank to keep rates elevated as a preventive measure against further economic overheating([14)](https://www.federalreserve.gov/monetarypolicy/2024-03-mpr-summary.htm). This decision indirectly sustained higher mortgage rates, creating additional obstacles for buyers seeking affordability. High inflation affected not only interest rates but also general living costs, which placed further strain on household budgets and limited the portion of income many households could allocate toward mortgage payments.\n\nBrokers reported that many clients expressed concerns about the high rates, particularly those new to homeownership who lacked familiarity with rate fluctuations. As a result, brokers took a proactive role in e",Pass,Pass


In [117]:
faithfull_score = 0
relevancy_score = 0
for val in df['Faithfulness Evaluation Result'].values:
    faithfull_score += 1 if val == "Pass" else 0
for val in df['Relevancy Evaluation Result'].values:
    relevancy_score += 1 if val == "Pass" else 0

print(f"""Faithfulness score {faithfull_score * 100 / df.shape[0]}""")
print(f"""Relevancy    score {relevancy_score * 100 / df.shape[0]}""")

Faithfulness score 61.904761904761905
Relevancy    score 61.904761904761905


# RAG Evaluate
 

In [5]:
from llama_index.core.evaluation import CorrectnessEvaluator
from llama_index.llms.openai import OpenAI


llm = OpenAI(model="gpt-4o", temperature=0.0)


In [9]:
import nest_asyncio
nest_asyncio.apply()

query = (
    "Can you explain the theory of relativity proposed by Albert Einstein in"
    " detail?"
)

reference = """
Certainly! Albert Einstein's theory of relativity consists of two main components: special relativity and general relativity. Special relativity, published in 1905, introduced the concept that the laws of physics are the same for all non-accelerating observers and that the speed of light in a vacuum is a constant, regardless of the motion of the source or observer. It also gave rise to the famous equation E=mc², which relates energy (E) and mass (m).

General relativity, published in 1915, extended these ideas to include the effects of gravity. According to general relativity, gravity is not a force between masses, as described by Newton's theory of gravity, but rather the result of the warping of space and time by mass and energy. Massive objects, such as planets and stars, cause a curvature in spacetime, and smaller objects follow curved paths in response to this curvature. This concept is often illustrated using the analogy of a heavy ball placed on a rubber sheet, causing it to create a depression that other objects (representing smaller masses) naturally move towards.

In essence, general relativity provided a new understanding of gravity, explaining phenomena like the bending of light by gravity (gravitational lensing) and the precession of the orbit of Mercury. It has been confirmed through numerous experiments and observations and has become a fundamental theory in modern physics.
"""

response = """
Certainly! Albert Einstein's theory of relativity consists of two main components: special relativity and general relativity. Special relativity, published in 1905, introduced the concept that the laws of physics are the same for all non-accelerating observers and that the speed of light in a vacuum is a constant, regardless of the motion of the source or observer. It also gave rise to the famous equation E=mc², which relates energy (E) and mass (m).

However, general relativity, published in 1915, extended these ideas to include the effects of magnetism. According to general relativity, gravity is not a force between masses but rather the result of the warping of space and time by magnetic fields generated by massive objects. Massive objects, such as planets and stars, create magnetic fields that cause a curvature in spacetime, and smaller objects follow curved paths in response to this magnetic curvature. This concept is often illustrated using the analogy of a heavy ball placed on a rubber sheet with magnets underneath, causing it to create a depression that other objects (representing smaller masses) naturally move towards due to magnetic attraction.
"""

# Correctness evaluator
def correctnessEvaluator(model=llm,query="", response="", reference=""):
    evaluator = CorrectnessEvaluator(llm=model)
    return evaluator.evaluate(query=query, 
                              response=response, 
                              reference=reference)

result = correctnessEvaluator(model=llm, query=query, response=response, reference=reference)
print(f"""score : {result.score}, feedback: {result.feedback}""")

score : 2.0, feedback: The generated answer is mostly relevant but contains a significant mistake. It incorrectly states that general relativity involves the effects of magnetism and magnetic fields, which is not accurate. General relativity deals with the warping of space and time due to mass and energy, not magnetism. This error affects the correctness of the explanation, warranting a score of 2.0.


In [None]:
# Faithfulness
from llama_index.core.evaluation import FaithfulnessEvaluator

def faithfulnessEvaluator(response=""): # response containing retrieved doc and response also
    evaluator = FaithfulnessEvaluator()
    return evaluator.evaluate_response(response=response)

In [None]:
from llama_index.core.evaluation import DatasetGenerator

import asyncio

async def evaluate_query_engine(query_engine, questions, evaluator):
    tasks = [query_engine.aquery(q) for q in questions]
    results = await asyncio.gather(*tasks)
    print("finished query")
    total_correct = 0
    for q, r in zip(questions, results):
        # evaluate with gpt 4
        eval_result = (1 if evaluator.evaluate_response(query=q, response=r).passing else 0)
        total_correct += eval_result
    return total_correct, len(results)

def bencmarking_genererated_dataset(documents, number_question_per_node):
    question_generator = DatasetGenerator.from_documents(documents=documents)
    eval_question      = question_generator.generate_dataset_from_nodes(number_question_per_node)
    return eval_question

correct, total = asyncio.run(evaluate_query_engine(vector_query_engine, eval_questions[:5], evaluator))
print(correct/total)

In [None]:
# relevancy evaluator, evaluate how the relevant docment retreiving with original query
from llama_index.core.evaluation import EvaluationResult
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index.core import (
    TreeIndex,
    VectorStoreIndex,
    SimpleDirectoryReader,
    Response,
)
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import RelevancyEvaluator
from llama_index.core.node_parser import SentenceSplitter
import pandas as pd

pd.set_option("display.max_colwidth", 0)

gpt3 = OpenAI(temperature=0, model="gpt-3.5-turbo")
gpt4 = OpenAI(temperature=0, model="gpt-4o-mini")
evaluator      = RelevancyEvaluator(llm=gpt3)
evaluator_gpt4 = RelevancyEvaluator(llm=gpt4)

# define jupyter display function
def display_eval_df(
    query: str, response: Response, eval_result: EvaluationResult
) -> None:
    eval_df = pd.DataFrame(
        {
            "Query": query,
            "Response": str(response),
            "Source": response.source_nodes[0].node.text[:1000] + "...",
            "Evaluation Result": "Pass" if eval_result.passing else "Fail",
            "Reasoning": eval_result.feedback,
        },
        index=[0],
    )
    eval_df = eval_df.style.set_properties(
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        },
        subset=["Response", "Source"]
    )
    display(eval_df)

In [None]:
# Guiline evaluator
from llama_index.core.evaluation import GuidelineEvaluator

GUIDELINES = [
    "The response should fully answer the query.",
    "The response should avoid being vague or ambiguous.",
    (
        "The response should be specific and use statistics or numbers when possible."
    ),
]
llm = OpenAI(model="gpt-4o-mini")

evaluators = [
    GuidelineEvaluator(llm=llm, guidelines=guideline)
    for guideline in GUIDELINES
]

for guideline, evaluator in zip(GUIDELINES, evaluators):
    eval_result = evaluator.evaluate(
        query=sample_data["query"],
        contexts=sample_data["contexts"],
        response=sample_data["response"],
    )
    print("=====")
    print(f"Guideline: {guideline}")
    print(f"Pass: {eval_result.passing}")
    print(f"Feedback: {eval_result.feedback}")

In [None]:
# embedding semantic similarity
from llama_index.core.evaluation import SemanticSimilarityEvaluator
evaluator = SemanticSimilarityEvaluator()
result = await evaluator.aevaluate(
    response=response,
    reference=reference,
)