In [226]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin


In [227]:
# Module-level crawl state shared by the cells below.
# data: maps each crawled URL to the list of internal links found on it.
data = dict()
# file_counts: not referenced by any cell visible in this notebook section.
file_counts = dict()
# visited_links: URLs that crawl() has already processed.
visited_links = set()

In [228]:
def get_hrefs(url, timeout=10):
    """Fetch ``url`` and return the absolute URLs referenced by its markup.

    The ``href`` and ``src`` attributes of every ``<a>``, ``<link>``,
    ``<script>`` and ``<img>`` tag are resolved against ``url`` and
    de-duplicated. Fragments (``#section``) are stripped from ``href``
    targets, so in-page anchors collapse onto the page they belong to.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of unique absolute URLs in arbitrary order, or an empty list
        when the request fails (the error is printed, not raised).
    """
    try:
        # Without a timeout a stalled server would block the crawl forever;
        # a timeout surfaces as a RequestException subclass handled below.
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        links = set()
        for tag in soup.find_all(['a', 'link', 'script', 'img']):
            href = tag.get('href')
            src = tag.get('src')

            if href:
                full_url = urljoin(url, href)
                parsed_url = urlparse(full_url)
                if parsed_url.fragment:
                    # Drop the fragment rather than joining it as a relative
                    # path, which would invent a URL that is not on the page.
                    links.add(parsed_url._replace(fragment='').geturl())
                else:
                    links.add(full_url)

            if src:
                links.add(urljoin(url, src))

        return list(links)
    except requests.exceptions.RequestException as e:
        print(f'Error occurred while retrieving {url}: {e}')
        return []

In [229]:
def filter_internal_links(links, domain):
    """Keep only the links that point at ``domain``.

    A link is kept when the network location (``netloc``) of its parsed URL
    equals ``domain`` exactly; the comparison is a plain string equality.

    Args:
        links: Iterable of URL strings.
        domain: Network location to match, e.g. ``'www.example.com'``.

    Returns:
        A new list with the matching links in their original order.
    """
    return [candidate for candidate in links if urlparse(candidate).netloc == domain]

In [230]:
def get_data(url, timeout=10):
    """Fetch ``url`` and return the text of its ``h1``, ``h2`` and ``h3`` headings.

    Headings are grouped by level (all ``h1`` first, then ``h2``, then
    ``h3``), and newline characters are removed from each heading's text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of heading strings, or an empty list when the request fails
        (the error is printed, not raised).
    """
    try:
        # Without a timeout a stalled server would block forever; a timeout
        # surfaces as a RequestException subclass handled below.
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        headings = []
        for level in ('h1', 'h2', 'h3'):
            for tag in soup.find_all(level):
                # str.replace returns a new string, so the cleaned value has
                # to be stored; calling it and dropping the result is a no-op.
                headings.append(tag.get_text().replace('\n', ''))
        return headings

    except requests.exceptions.RequestException as e:
        print(f'Error occurred while retrieving {url}: {e}')
        return []

In [231]:
def crawl(url, threshold, max_visited_links, depth=1):
    """Recursively crawl same-domain links starting at ``url``.

    Each processed page is added to the module-level ``visited_links`` set
    and its internal links are stored in the module-level ``data`` dict under
    the page's URL. Links whose path ends in a static-asset extension
    (images, PDF, JS, CSS) are recorded but not followed.

    Args:
        url: Page to process.
        threshold: Maximum recursion depth; the starting page is depth 1.
        max_visited_links: Stop visiting new pages once this many URLs have
            been processed.
        depth: Current recursion depth (callers normally leave the default).

    Returns:
        None. Results are accumulated in ``visited_links`` and ``data``.
    """
    if depth > threshold:
        return

    if len(visited_links) >= max_visited_links:
        return

    if url in visited_links:
        return

    visited_links.add(url)
    print(f'Processing: {url}')
    links = get_hrefs(url)
    internal_links = filter_internal_links(links, urlparse(url).netloc)

    data[url] = internal_links

    # Every entry needs its leading dot: a bare 'js' would also match pages
    # such as '/nodejs'.
    skipped_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.pdf', '.js', '.css', '.webp')

    for link in internal_links:
        # Test the path only, case-insensitively, so that 'pic.png?v=2' and
        # 'LOGO.PNG' are recognised as assets despite query string or casing.
        if not urlparse(link).path.lower().endswith(skipped_suffixes):
            crawl(link, threshold, max_visited_links, depth + 1)

In [232]:
def print_data(data):
    """Print every URL in ``data`` followed by the entries stored for it.

    For each key the output is a ``url:`` line, a ``data:`` header followed
    by a blank line, one line per entry, and a closing blank line.

    Args:
        data: Mapping of URL to an iterable of printable entries.
    """
    for page_url, entries in data.items():
        print(f'url: {page_url}')
        print('data: \n')
        for entry in entries:
            print(entry)
        print('')

In [233]:
def main(url, threshold, max_visited_links=10):
    """Crawl from ``url`` and print a summary of the pages that were processed.

    Args:
        url: Starting page for the crawl.
        threshold: Maximum crawl depth passed to ``crawl``.
        max_visited_links: Upper bound on the number of pages to process.

    Returns:
        None. Prints the number of crawled pages followed by each URL.
    """
    # Start every run from a clean slate. Otherwise re-running this cell
    # would return immediately (URL already visited) or hit the visited-link
    # cap early, and the summary would include entries from earlier runs.
    # clear() mutates in place so existing references to these objects stay valid.
    visited_links.clear()
    data.clear()

    crawl(url, threshold, max_visited_links)
    print(f"Data Dictionary Size: {len(data)}")
    for crawled_url in data:
        print(crawled_url)

In [234]:
# Run the crawler starting from a Google search URL (query
# "reliance industries ril", tbm=nws) with a depth threshold of 3 and at most
# 50 visited links. This performs live HTTP requests.
main('https://www.google.com/search?q=reliance+industries+ril&sca_esv=4ecf938d57668945&rlz=1C1RXQR_enIN1032IN1032&tbm=nws&prmd=nivmsbtz&sxsrf=ACQVn0_5_MBzeVn8bgvWC_GqwvqHY9Gbxw:1710697576484&ei=aCz3ZcSYHdOZvr0PvLinkAE&start=10&sa=N&ved=2ahUKEwjE7PSK7fuEAxXTjK8BHTzcCRI4FBDy0wN6BAgDEAY&biw=2133&bih=1196&dpr=0.9', 3, 50)

Processing: https://www.google.com/search?q=reliance+industries+ril&sca_esv=4ecf938d57668945&rlz=1C1RXQR_enIN1032IN1032&tbm=nws&prmd=nivmsbtz&sxsrf=ACQVn0_5_MBzeVn8bgvWC_GqwvqHY9Gbxw:1710697576484&ei=aCz3ZcSYHdOZvr0PvLinkAE&start=10&sa=N&ved=2ahUKEwjE7PSK7fuEAxXTjK8BHTzcCRI4FBDy0wN6BAgDEAY&biw=2133&bih=1196&dpr=0.9


Processing: https://www.google.com/url?q=https://economictimes.indiatimes.com/markets/stocks/stock-liveblog/reliance-industries-share-price-today-live-updates-11-mar-2024/liveblog/108379328.cms&sa=U&ved=2ahUKEwiViKz_7fuEAxWzZWwGHfxUDvY4ChDF9AF6BAgAEAI&usg=AOvVaw2h_XHu-tHdctNQnbHPhQsb
Processing: https://www.google.com/url?q=/search%3Fq%3Dreliance%2Bindustries%2Bril%26sca_esv%3D4ecf938d57668945%26rlz%3D1C1RXQR_enIN1032IN1032%26biw%3D2133%26bih%3D1196%26ie%3DUTF-8%26tbm%3Dshop%26source%3Dlnms%26ved%3D1t:200713%26ictx%3D111&opi=89978449&sa=U&ved=0ahUKEwiViKz_7fuEAxWzZWwGHfxUDvY4ChCJoAwICigF&usg=AOvVaw05tGEyP-W7MsqvyxHJt16_
Processing: https://www.google.com/url?q=https://www.businesstoday.in/markets/company-stock/story/reliance-industries-shares-near-record-high-buy-sell-hold-412201-2024-01-08&sa=U&ved=2ahUKEwiViKz_7fuEAxWzZWwGHfxUDvY4ChDF9AF6BAgEEAI&usg=AOvVaw2gAEw8-CRQLXbJ9aI46yrt
Processing: https://www.google.com/search?q=reliance+industries+ril&sca_esv=4ecf938d57668945&rlz=1C1RXQR_en