https://www.icicibank.com/personal-banking/loans

https://www.icicibank.com/personal-banking-sitemap.xml

In [22]:
# Filter education loans by categories:
# Step 1: Read all URLs from urls.txt

file_path = r"D:\icici_rag\data\urls.txt"

with open(file_path, "r", encoding="utf-8") as f:
    urls = [line.strip() for line in f if line.strip()]

print("All URLs:")
for u in urls:
    print(u)


All URLs:
https://www.icicibank.com/personal-banking
https://www.icicibank.com/personal-banking/inspection-documents-during-the-thirty-annual-general-meeting
https://www.icicibank.com/personal-banking/faq/infosys-smart-card-faqs
https://www.icicibank.com/personal-banking/faq/zero-balance-requirement-faq
https://www.icicibank.com/personal-banking/faq/internet-banking
https://www.icicibank.com/personal-banking/faq/internet-banking/frequently-asked-questions
https://www.icicibank.com/personal-banking/faq/internet-banking/pockets-mobile-app-faqs
https://www.icicibank.com/personal-banking/faq/internet-banking/data-card-recharge-faqs
https://www.icicibank.com/personal-banking/faq/internet-banking/pay-any-visa-credit-card-faqs
https://www.icicibank.com/personal-banking/faq/internet-banking/online-recharge-faqs
https://www.icicibank.com/personal-banking/faq/internet-banking/online-password-faqs
https://www.icicibank.com/personal-banking/faq/internet-banking/facebook-banking-faqs
https://www.ic

In [2]:
education_urls = [u for u in urls if "education" in u.lower()]

print("\nFiltered URLs (with 'education'):")
for u in education_urls:
    print(u)


Filtered URLs (with 'education'):
https://www.icicibank.com/personal-banking/faq/loan/education-loan-faqs
https://www.icicibank.com/personal-banking/faq/loan/insta-education-loan-faqs
https://www.icicibank.com/personal-banking/faq/accounts/child-education-plan-faqs
https://www.icicibank.com/personal-banking/accounts/child-education-plan-faqs
https://www.icicibank.com/personal-banking/loans/education-loan
https://www.icicibank.com/personal-banking/loans/education-loan/education-loan
https://www.icicibank.com/personal-banking/loans/education-loan/canada-select
https://www.icicibank.com/personal-banking/loans/education-loan/benefits-and-features
https://www.icicibank.com/personal-banking/loans/education-loan/courses-expenses-covered
https://www.icicibank.com/personal-banking/loans/education-loan/refer-a-friend
https://www.icicibank.com/personal-banking/loans/education-loan/tax-benefit-calculator
https://www.icicibank.com/personal-banking/loans/education-loan/insta-education-loan
https://

In [3]:
len(education_urls)

32

In [4]:
education_loan_categories = {
    # Loan variations and types

    # Financial details - Rates, charges, benefits
    "education_loan_rates_charges": [
        "interest-rates", "tax-benefit", "tax-benefit-calculator",
        "fees-charges", "prepayment", "emi-calculator", "emi-options"
    ],

    # Application process & eligibility
    "education_loan_application": [
        "eligibility", "eligibility-calculator", "apply-now", "how-to-apply",
        "documents-required", "sanction-process", "instant-sanction",
        "sanction-letter-download", "nomination"
    ],

    # Repayment & insurance
    "education_loan_repayment": [
        "repayment", "repayment-process", "repayment-options", "repayment-faqs",
        "loan-insurance", "loan-protection"
    ],

    # Country-specific schemes
    "education_loan_country_schemes": [
        "study-in-usa", "study-in-canada", "study-in-australia",
        "study-in-uk", "study-in-germany", "study-in-france",
        "study-in-new-zealand", "study-in-singapore"
    ],

    # Govt & special schemes
    "education_loan_govt_schemes": [
        "vidyalakshmi", "csis", "government-subsidy-schemes", "fund"
    ],

    # General FAQ / Help
    "education_loan_help": [
        "faq", "grievance-redressal", "contact-us", "important"
    ],
    "education_loan_types": [
        "education-loan", "insta-education-loan", "pre-approved-education-loan",
        "canada-select", "csis"
    ]
}

In [5]:
def categorize_loan_urls(all_urls, categories):
    """
    Categorize URLs based on keyword patterns from given categories dict.

    :param all_urls: list of URLs
    :param categories: dict of {category_name: [patterns]}
    :return: dict with categorized URLs
    """
    categorized = {category: [] for category in categories.keys()}

    for url in all_urls:
        url_lower = url.lower()
        for category, patterns in categories.items():
            if any(pattern in url_lower for pattern in patterns):
                categorized[category].append(url)
                break  # stop at first match

    return categorized


Review and Validate Your Filtered URLs

In [6]:
# Get your categorized URLs
categorized_urls = categorize_loan_urls(education_urls, education_loan_categories)

# Review what you got
for category, urls in categorized_urls.items():
    print(f"\n{category.upper()}: {len(urls)} URLs")
    for url in urls[:10]:
        print(f"  - {url}")
    if len(urls) > 3:
        print(f"  ... and {len(urls) - 3} more")



EDUCATION_LOAN_RATES_CHARGES: 2 URLs
  - https://www.icicibank.com/personal-banking/loans/education-loan/tax-benefit-calculator
  - https://www.icicibank.com/personal-banking/loans/education-loan/interest-rates

EDUCATION_LOAN_APPLICATION: 1 URLs
  - https://www.icicibank.com/personal-banking/products/online-safe-banking/customer-education/nomination-form

EDUCATION_LOAN_REPAYMENT: 0 URLs

EDUCATION_LOAN_COUNTRY_SCHEMES: 0 URLs

EDUCATION_LOAN_GOVT_SCHEMES: 2 URLs
  - https://www.icicibank.com/personal-banking/loans/education-loan/csis
  - https://www.icicibank.com/personal-banking/depositor-education-and-awareness-fund

EDUCATION_LOAN_HELP: 7 URLs
  - https://www.icicibank.com/personal-banking/faq/loan/education-loan-faqs
  - https://www.icicibank.com/personal-banking/faq/loan/insta-education-loan-faqs
  - https://www.icicibank.com/personal-banking/faq/accounts/child-education-plan-faqs
  - https://www.icicibank.com/personal-banking/accounts/child-education-plan-faqs
  - https://www.

In [7]:
categorized_urls

{'education_loan_rates_charges': ['https://www.icicibank.com/personal-banking/loans/education-loan/tax-benefit-calculator',
  'https://www.icicibank.com/personal-banking/loans/education-loan/interest-rates'],
 'education_loan_application': ['https://www.icicibank.com/personal-banking/products/online-safe-banking/customer-education/nomination-form'],
 'education_loan_repayment': [],
 'education_loan_country_schemes': [],
 'education_loan_govt_schemes': ['https://www.icicibank.com/personal-banking/loans/education-loan/csis',
  'https://www.icicibank.com/personal-banking/depositor-education-and-awareness-fund'],
 'education_loan_help': ['https://www.icicibank.com/personal-banking/faq/loan/education-loan-faqs',
  'https://www.icicibank.com/personal-banking/faq/loan/insta-education-loan-faqs',
  'https://www.icicibank.com/personal-banking/faq/accounts/child-education-plan-faqs',
  'https://www.icicibank.com/personal-banking/accounts/child-education-plan-faqs',
  'https://www.icicibank.com/p

In [8]:
#classified_urls
classified_urls=0
for key, urls in categorized_urls.items():
    classified_urls+=len(urls)
classified_urls

22

Implement Category-Specific Content Extraction

In [9]:
def extract_content_by_category(soup, category):
    """Extract relevant content based on category"""
    
    # Remove unwanted elements
    for element in soup(['script', 'style', 'nav', 'footer', 'header', 'menu']):
        element.decompose()

    return soup.get_text(strip=True)


crawl with enhanced structure

In [10]:
import time
import requests
from bs4 import BeautifulSoup
def crawl_loan_comprehensive(categorized_urls, delay = 3):
    all_content=[]
    for category, urls in categorized_urls.items():
        if not urls:
            continue
        print(f"Crawling {category}:{len(urls)} URLs")
        for i, url in enumerate(urls):
            print(f"processsing {i+1}/{len(urls)} : {url}")
            content = extract_loan_content_enhanced(url, category)
            if content:
                all_content.append(content)
            time.sleep(delay)
    return all_content



def extract_loan_content_enhanced(url, category):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, headers = headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            content = {
                'url': url,
                'category': category,
                'title': soup.find('title').text if soup.find('title') else '',
                'content': extract_content_by_category(soup, category),
                'content_length': len(soup.get_text(strip=True)),
                'last_scraped': time.strftime('%Y-%m-%d %H:%M:%S')
            }
            return content
        else:
            print(f"Failed: Status {response.status_code}")
            return None
    except Exception as e:
        print(f"Error: {e}")
        return None



Prepare Data for RAG

In [11]:
# def prepare_loan_rag_data(scraped_data):
#     rag_chunks = []
#     for item in scraped_data:
#         content = item['content']
#         chunks = content.split('.')
#         for i, chunk in enumerate(chunks):
#             if len(chunk.strip()) > 50:
#                 rag_chunk = {
#                     'id': f"home_loan_{item['category']}_{hash(item['url'])}_{i}",
#                     'content': chunk.strip(),
#                     'metadata': {
#                         'loan_type': 'home_loan',
#                         'category': item['category'],
#                         'source_url': item['url'],
#                         'title': item['title'],
#                         'chunk_index': i
#                     }
#                 }
#                 rag_chunks.append(rag_chunk)

#     return rag_chunks

In [12]:
import requests
from bs4 import BeautifulSoup
import time
import json


def extract_loan_content(url, category):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Extract structured content based on category
            content = {
                'url': url,
                'category': category,
                'title': soup.find('title').text if soup.find('title') else '',
                'content': extract_relevant_content(soup, category),
                'last_scraped': time.strftime('%Y-%m-%d %H:%M:%S')
            }
            return content
        else:
            print(f"Failed to fetch {url}: Status {response.status_code}")
            return None
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None

def extract_relevant_content(soup, category):
    # Remove scripts, styles, and navigation elements
    for element in soup(['script', 'style', 'nav', 'footer', 'header']):
        element.decompose()

    return soup.get_text(strip=True)


In [13]:
print("Starting Education Loan Data Collection...")
categorized_urls = categorize_loan_urls(education_urls, education_loan_categories)
scraped_data = crawl_loan_comprehensive(categorized_urls)
# rag_chunks = prepare_loan_rag_data(scraped_data)


import json
import os

save_path = r"D:\icici_rag\data"
os.makedirs(save_path, exist_ok=True)
file_path = os.path.join(save_path, 'loan_raw_data_final.json')

# Load existing data if file exists
if os.path.exists(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        existing_data = json.load(f)
else:
    existing_data = []

# Extend the list instead of append
existing_data.extend(scraped_data)

# Save back to file
with open(file_path, 'w', encoding='utf-8') as f:
    json.dump(existing_data, f, indent=2, ensure_ascii=False)

print(f"Collected {len(scraped_data)} new documents, total now: {len(existing_data)}")


Starting Education Loan Data Collection...
Crawling education_loan_rates_charges:2 URLs
processsing 1/2 : https://www.icicibank.com/personal-banking/loans/education-loan/tax-benefit-calculator
processsing 2/2 : https://www.icicibank.com/personal-banking/loans/education-loan/interest-rates
Crawling education_loan_application:1 URLs
processsing 1/1 : https://www.icicibank.com/personal-banking/products/online-safe-banking/customer-education/nomination-form
Crawling education_loan_govt_schemes:2 URLs
processsing 1/2 : https://www.icicibank.com/personal-banking/loans/education-loan/csis
processsing 2/2 : https://www.icicibank.com/personal-banking/depositor-education-and-awareness-fund
Crawling education_loan_help:7 URLs
processsing 1/7 : https://www.icicibank.com/personal-banking/faq/loan/education-loan-faqs
processsing 2/7 : https://www.icicibank.com/personal-banking/faq/loan/insta-education-loan-faqs
processsing 3/7 : https://www.icicibank.com/personal-banking/faq/accounts/child-education

GOLD LOAN

In [23]:
# LIST OF GOLD RELATED URLS

gold_urls = [u for u in urls if "gold" in u.lower()]

print("\nFiltered URLs (with 'gold'):")
for u in gold_urls:
    print(u)


Filtered URLs (with 'gold'):
https://www.icicibank.com/personal-banking/faq/loan/loan-against-gold-and-gold-ornaments-faqs
https://www.icicibank.com/personal-banking/faq/investment/digital-gold-faqs
https://www.icicibank.com/personal-banking/faq/investment/sovereign-gold-bonds-faqs
https://www.icicibank.com/personal-banking/faq/investment/gold-monetisation-scheme-faqs
https://www.icicibank.com/personal-banking/faq/investment/gold-loan-referral-programme-faqs
https://www.icicibank.com/personal-banking/faq/accounts/gold-plus-savings-account-faqs
https://www.icicibank.com/personal-banking/faq/accounts/gold-privilege-account-faqs
https://www.icicibank.com/personal-banking/cards/consumer-cards1/credit-card/instant_gold-review
https://www.icicibank.com/personal-banking/cards/consumer-cards1/credit-card/instant_gold-moderate
https://www.icicibank.com/personal-banking/cards/commercial-cards/gold-business-card
https://www.icicibank.com/personal-banking/cards/debit-card/debit-cards/smart-shoppe

In [24]:
len(gold_urls)

105

In [25]:
gold_loan_categories = {
    # Gold-backed loans
    "gold_loan_types": [
        "loan-against-gold", "gold-loan", "gold-loan-topup",
        "gold-loan-referral-programme"
    ],

    # Loan interest rates & charges
    "gold_loan_rates_charges": [
        "interest-rate", "interest-rates", "charges", "fees", "calculator"
    ],

    # Loan application process & features
    "gold_loan_application": [
        "apply-now", "key-features", "documentation", "features-and-documents",
        "doorstep-gold-loans", "index"
    ],

    # Repayment & loan protection
    "gold_loan_repayment": [
        "repayment", "repayment-process", "repayment-options", "repayment-faqs"
    ],

    # Investment products (digital/sovereign gold, deposits)
    "gold_investment_products": [
        "digital-gold", "sovereign-gold-bond", "gold-deposits",
        "icici-bank-pure-gold", "gold-monetisation-scheme"
    ],

    # Accounts / savings linked with gold
    "gold_account_products": [
        "gold-plus-saving", "gold-plus-savings-account", "gold-privilege-account",
        "gold-salary-account", "privilege-banking/gold-privilege"
    ],

    # Cards with gold branding
    "gold_cards": [
        "gold-debit-card", "gold-business-card", "instant_gold", "gold-silver-credit-card",
        "smart-shopper-gold", "senior-citizen-gold"
    ],

    # General FAQs / help
    "gold_help": [
        "faq", "important", "grievance-redressal", "contact-us"
    ]
}


In [26]:
# Get your categorized URLs
categorized_gold_loan_urls = categorize_loan_urls(gold_urls, gold_loan_categories)

# Review what you got
for category, urls in categorized_gold_loan_urls.items():
    print(f"\n{category.upper()}: {len(urls)} URLs")
    for url in urls[:10]:
        print(f"  - {url}")
    if len(urls) > 3:
        print(f"  ... and {len(urls) - 3} more")



GOLD_LOAN_TYPES: 29 URLs
  - https://www.icicibank.com/personal-banking/faq/loan/loan-against-gold-and-gold-ornaments-faqs
  - https://www.icicibank.com/personal-banking/faq/investment/gold-loan-referral-programme-faqs
  - https://www.icicibank.com/personal-banking/accounts/savings-account/rewards/auto-gold-loan
  - https://www.icicibank.com/personal-banking/loans/loan-against-gold
  - https://www.icicibank.com/personal-banking/loans/loan-against-gold/apply-now
  - https://www.icicibank.com/personal-banking/loans/loan-against-gold/key-features
  - https://www.icicibank.com/personal-banking/loans/loan-against-gold/gold-loan-referral-programme-terms-and-conditions
  - https://www.icicibank.com/personal-banking/loans/loan-against-gold/gold-loan-referral-programme
  - https://www.icicibank.com/personal-banking/loans/loan-against-gold/documentation
  - https://www.icicibank.com/personal-banking/loans/loan-against-gold-and-gold-ornaments-faqs
  ... and 26 more

GOLD_LOAN_RATES_CHARGES: 0 UR

In [27]:
#classified_urls
classified_urls=0
for key, urls in categorized_gold_loan_urls.items():
    classified_urls+=len(urls)
classified_urls

97

In [30]:
print("Starting Gold Loan Data Collection...")
categorized_gold_loan_urls = categorize_loan_urls(gold_urls, gold_loan_categories)
scraped_data = crawl_loan_comprehensive(categorized_gold_loan_urls)
# rag_chunks = prepare_loan_rag_data(scraped_data)



import json
import os 
save_path = r"D:\icici_rag\data"
os.makedirs(save_path, exist_ok=True)
file_path = os.path.join(save_path, 'loan_raw_data_final.json')
if os.path.exists(file_path):
    with open(file_path, 'r',encoding='utf-8') as f:
        existing_data = json.load(f)
else:
    existing_data=[]
existing_data.append(scraped_data)

# Save back to file
with open(file_path, 'w', encoding='utf-8') as f:
    json.dump(existing_data, f, indent=2, ensure_ascii=False)

print(f"Collected {len(scraped_data)} new documents, total now: {len(existing_data)}")

Starting Gold Loan Data Collection...
Crawling gold_loan_types:29 URLs
processsing 1/29 : https://www.icicibank.com/personal-banking/faq/loan/loan-against-gold-and-gold-ornaments-faqs
processsing 2/29 : https://www.icicibank.com/personal-banking/faq/investment/gold-loan-referral-programme-faqs
processsing 3/29 : https://www.icicibank.com/personal-banking/accounts/savings-account/rewards/auto-gold-loan
processsing 4/29 : https://www.icicibank.com/personal-banking/loans/loan-against-gold
processsing 5/29 : https://www.icicibank.com/personal-banking/loans/loan-against-gold/apply-now
processsing 6/29 : https://www.icicibank.com/personal-banking/loans/loan-against-gold/key-features
Failed: Status 404
processsing 7/29 : https://www.icicibank.com/personal-banking/loans/loan-against-gold/gold-loan-referral-programme-terms-and-conditions
processsing 8/29 : https://www.icicibank.com/personal-banking/loans/loan-against-gold/gold-loan-referral-programme
processsing 9/29 : https://www.icicibank.com