https://www.icicibank.com/personal-banking/loans

In [20]:
# crawling the sitemap url

import requests
from bs4 import BeautifulSoup
from pathlib import Path

OUTPUT_FILE = Path(r"D:\icici_rag\data\urls.txt")
url = "https://www.icicibank.com/personal-banking-sitemap.xml"

headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

result = requests.get(url, headers = headers)

print(f"Status code: {result.status_code}")

if result.status_code == 200:
    soup = BeautifulSoup(result.content,"lxml-xml")
    urls = [loc.text for loc in soup.find_all("loc")]
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for url in urls:
            f.write(url+"\n")
    print(f"saved to {OUTPUT_FILE}")


else:
    print(f"Request failed with status : {result.status_code}")


Status code: 200
saved to D:\icici_rag\data\urls.txt


In [21]:
# length of the urls
len(urls)

4370

In [22]:
# FILTERING HOME LOAN SITEMAP URLS

home_loan_related_patterns = [
    '/home'
]

home_loan_urls = [url for url in urls if any(pattern in url for pattern in home_loan_related_patterns)]


len(home_loan_urls)


169

In [23]:
home_loan_urls

['https://www.icicibank.com/personal-banking/faq/loan/home-loans-faqs',
 'https://www.icicibank.com/personal-banking/faq/loan/home-loan-cashback-offer-faqs',
 'https://www.icicibank.com/personal-banking/faq/insurance/home-insurance-faqs',
 'https://www.icicibank.com/personal-banking/accounts/home-assure-savings-account',
 'https://www.icicibank.com/personal-banking/loans/personal-loan/home-renovation',
 'https://www.icicibank.com/personal-banking/loans/home-loan',
 'https://www.icicibank.com/personal-banking/loans/home-loan/pahl-sanction-disbursement',
 'https://www.icicibank.com/personal-banking/loans/home-loan/instant-home-loan',
 'https://www.icicibank.com/personal-banking/loans/home-loan/instant-home-loan/fees-and-charges',
 'https://www.icicibank.com/personal-banking/loans/home-loan/instant-home-loan/interest-rates',
 'https://www.icicibank.com/personal-banking/loans/home-loan/instant-home-loan/index',
 'https://www.icicibank.com/personal-banking/loans/home-loan/top-up-home-loan',

Filter home loans by categories:
    

In [25]:
#Define your target categories with specific patterns
home_loan_categories = {
    # Loan variations and types
    'loan_types': [
        'top-up', 'balance-transfer', 'loan-against-property', 'land-loan',
        'home-renovation', 'home-improvement', 'construction-loan',
        'commercial-property', 'office-premises', 'lease-rental-discounting'
    ],
    
    # Financial details - CRITICAL for RAG
    'rates_interest': [
        'interest-rate', 'interest-rates', 'rates', 'home-loan-interest-rates',
        'home-overdraft-interest-rate', 'service', 'charges', 'forex', 'fees-and-charges', 'service-charges', 'penal-charges', 'charges',
        'fees', 'cost', 'penal-and-service-charges'
    ],
    
    
    # Calculators - Separate category for numerical content
    'calculators_tools': [
        'calculator', 'emi-calculator', 'affordability-calculator',
        'prepayment-calculator', 'dummy-calculator'
    ],
    
    # Eligibility and requirements
    'eligibility_docs': [
        'eligibility', 'documents-required', 'required-documents',
        'document-checklist', 'checklist', 'qualification', 'who-can-apply'
    ],
    
    # Application process - Step-by-step guidance
    'application_process': [
        'apply', 'application', 'process', 'how-to-apply', 'sanction-disbursement',
        'disbursement', 'track-my-loan', 'online-sanction', 'pahl-sanction',
        'hl-sanction', 'pabt-sanction'
    ],
    
    # Policies and terms - Legal context
    'policies_terms': [
        'policy', 'colending-policy', 'terms', 'conditions', 'cgtmse',
        'legal-valuation', 'escrow-services', 'tnc', 't&c', 'legal'
    ],
    
    # FAQs and help - Q&A format content
    'faqs_help': [
        'faqs', 'frequently-asked-questions', 'help', 'faq', 'home-loans-faqs',
        'affordable-home-loans-faqs', 'home-overdraft-faqs', 'step-up-home-loans-faqs'
    ],
    
    # Account management - Existing customers
    'account_management': [
        'existing-customers', 'rate-reset', 'home-loan-statement',
        'repayment', 'prepayment', 'emi-under-construction'
    ],
    
    # Insurance and protection
    'insurance_protection': [
        'insurance', 'insure', 'home-insurance', 'home-claims'
    ],
    
    # Government schemes
    'government_schemes': [
        'pmay', 'pradhan-mantri-awas-yojna', 'government-schemes'
    ],
    
    # Pre-approved offers
    'pre_approved_offers': [
        'pre-approved', 'pre-approved-home-loan', 'pre-approved-loan-against-property'
    ],

    'product_overviews': [
    'home-loan', 'instant-home-loan', 'pratham-home-loan', 'step-up-home-loans',
    'pragati-home-loan', 'express-home-loan', 'extraa-home-loans', 'money-saver',
    '30-years-tenure', 'affordable-home-loans', 'saral-housing-loan', 'index'
    ]
}

def categorize_home_loan_urls(all_urls):
    categorized = {category: [] for category in home_loan_categories.keys()}
    

    
    # Then categorize by content type
    for url in all_urls:
        url_lower = url.lower()
        # categorized_flag = False
        
        for category, patterns in home_loan_categories.items():
            if any(pattern in url_lower for pattern in patterns):
               
                categorized[category].append(url)
                # categorized_flag = True
                break
        
        # If doesn't match specific patterns, put in features_benefits as default
        # if not categorized_flag:
        #     categorized['features_benefits'].append(url)
    
    return categorized

Review and Validate Your Filtered URLs

In [26]:
# Get your categorized URLs
categorized_urls = categorize_home_loan_urls(home_loan_urls)

# Review what you got
for category, urls in categorized_urls.items():
    print(f"\n{category.upper()}: {len(urls)} URLs")
    for url in urls[:10]:
        print(f"  - {url}")
    if len(urls) > 3:
        print(f"  ... and {len(urls) - 3} more")



LOAN_TYPES: 62 URLs
  - https://www.icicibank.com/personal-banking/loans/personal-loan/home-renovation
  - https://www.icicibank.com/personal-banking/loans/home-loan/top-up-home-loan
  - https://www.icicibank.com/personal-banking/loans/home-loan/loan-against-property
  - https://www.icicibank.com/personal-banking/loans/home-loan/loan-against-property/fees-and-charges
  - https://www.icicibank.com/personal-banking/loans/home-loan/loan-against-property/interest-rate
  - https://www.icicibank.com/personal-banking/loans/home-loan/loan-against-property/index
  - https://www.icicibank.com/personal-banking/loans/home-loan/loan-against-property/required-documents
  - https://www.icicibank.com/personal-banking/loans/home-loan/loan-against-property/calculator
  - https://www.icicibank.com/personal-banking/loans/home-loan/loan-against-property/cgtmse
  - https://www.icicibank.com/personal-banking/loans/home-loan/loan-against-property/interest-rates
  ... and 59 more

RATES_INTEREST: 15 URLs
  - 

In [27]:
# classified_urls
classified_urls=0
for key, urls in categorized_urls.items():
    classified_urls+=len(urls)
classified_urls

165

Implement Category-Specific Content Extraction

In [28]:
def extract_content_by_category(soup, category):
    """Extract relevant content based on category"""
    
    # Remove unwanted elements
    for element in soup(['script', 'style', 'nav', 'footer', 'header', 'menu']):
        element.decompose()

    return soup.get_text(strip=True)


crawl with enhanced structure

In [29]:
import time
import requests
from bs4 import BeautifulSoup
def crawl_home_loan_comprehensive(categorized_urls, delay = 3):
    all_content=[]
    for category, urls in categorized_urls.items():
        if not urls:
            continue
        print(f"Crawling {category}:{len(urls)} URLs")
        for i, url in enumerate(urls):
            print(f"processsing {i+1}/{len(urls)} : {url}")
            content = extract_home_loan_content_enhanced(url, category)
            if content:
                all_content.append(content)
            time.sleep(delay)
    return all_content



def extract_home_loan_content_enhanced(url, category):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, headers = headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            content = {
                'url': url,
                'category': category,
                'title': soup.find('title').text if soup.find('title') else '',
                'content': extract_content_by_category(soup, category),
                'content_length': len(soup.get_text(strip=True)),
                'last_scraped': time.strftime('%Y-%m-%d %H:%M:%S')
            }
            return content
        else:
            print(f"Failed: Status {response.status_code}")
            return None
    except Exception as e:
        print(f"Error: {e}")
        return None



Prepare Data for RAG

In [30]:
# def prepare_home_loan_rag_data(scraped_data):
#     rag_chunks = []
#     for item in scraped_data:
#         content = item['content']
#         chunks = content.split('.')
#         for i, chunk in enumerate(chunks):
#             if len(chunk.strip()) > 50:
#                 rag_chunk = {
#                     'id': f"home_loan_{item['category']}_{hash(item['url'])}_{i}",
#                     'content': chunk.strip(),
#                     'metadata': {
#                         'loan_type': 'home_loan',
#                         'category': item['category'],
#                         'source_url': item['url'],
#                         'title': item['title'],
#                         'chunk_index': i
#                     }
#                 }
#                 rag_chunks.append(rag_chunk)

#     return rag_chunks

In [31]:
print("Starting Home Loan Data Collection")

Starting Home Loan Data Collection


In [32]:
import requests
from bs4 import BeautifulSoup
import time
import json


def extract_home_loan_content(url, category):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Extract structured content based on category
            content = {
                'url': url,
                'category': category,
                'title': soup.find('title').text if soup.find('title') else '',
                'content': extract_relevant_content(soup, category),
                'last_scraped': time.strftime('%Y-%m-%d %H:%M:%S')
            }
            return content
        else:
            print(f"Failed to fetch {url}: Status {response.status_code}")
            return None
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None

def extract_relevant_content(soup, category):
    # Remove scripts, styles, and navigation elements
    for element in soup(['script', 'style', 'nav', 'footer', 'header']):
        element.decompose()

    return soup.get_text(strip=True)


In [34]:
print("Starting Home Loan Data Collection...")
categorized_urls = categorize_home_loan_urls(home_loan_urls)
scraped_data = crawl_home_loan_comprehensive(categorized_urls)
#rag_chunks = prepare_home_loan_rag_data(scraped_data)

#Save results
import json
import os 
save_path = r"D:\icici_rag\data"
os.makedirs(save_path, exist_ok=True)
file_path = os.path.join(save_path, 'home_loan_raw_data_final.json')
with open(file_path, 'w',encoding='utf-8') as f:
    json.dump(scraped_data, f, indent=2)

# with open('home_loan_rag_chunks.json', 'w') as f:
#     json.dump(rag_chunks, f, indent=2)

print(f"Collected {len(scraped_data)} documents")
# print(f"Created {len(rag_chunks)} RAG chunks")

Starting Home Loan Data Collection...
Crawling loan_types:62 URLs
processsing 1/62 : https://www.icicibank.com/personal-banking/loans/personal-loan/home-renovation
processsing 2/62 : https://www.icicibank.com/personal-banking/loans/home-loan/top-up-home-loan
processsing 3/62 : https://www.icicibank.com/personal-banking/loans/home-loan/loan-against-property
processsing 4/62 : https://www.icicibank.com/personal-banking/loans/home-loan/loan-against-property/fees-and-charges
processsing 5/62 : https://www.icicibank.com/personal-banking/loans/home-loan/loan-against-property/interest-rate
processsing 6/62 : https://www.icicibank.com/personal-banking/loans/home-loan/loan-against-property/index
processsing 7/62 : https://www.icicibank.com/personal-banking/loans/home-loan/loan-against-property/required-documents
processsing 8/62 : https://www.icicibank.com/personal-banking/loans/home-loan/loan-against-property/calculator
processsing 9/62 : https://www.icicibank.com/personal-banking/loans/home-l