In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin


In [2]:
# Module-level crawl state shared by the functions defined in the cells below.
# Nothing visible in this notebook clears these containers, so they keep their
# contents for as long as the kernel lives (re-running the crawl cell continues
# from whatever is already stored here).
visited_links = set()   # URLs that crawl() has already started processing
file_counts = {}        # Meant to hold counts of different file types; not referenced by any code visible in this notebook
data = {}               # Maps page URL -> {'headings': [...], 'paragraphs': [...]}; filled in by crawl()
    

In [3]:
def get_hrefs(url, timeout=10):
    """Collect the absolute URLs referenced by the page at ``url``.

    The page is downloaded and every ``<a>``, ``<link>``, ``<script>`` and
    ``<img>`` tag is inspected for ``href`` and ``src`` attributes. Relative
    references are resolved against ``url``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the crawl indefinitely.

    Returns:
        A list of unique absolute URLs in arbitrary order. An empty list is
        returned when the request fails; the error is printed, not raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        links = set()   # A set removes duplicate references

        for tag in soup.find_all(['a', 'link', 'script', 'img']):
            href = tag.get('href')
            src = tag.get('src')

            if href:
                full_url = urljoin(url, href)
                # A fragment only names a position inside a document, so
                # "/page#top" and "/page" are the same resource. Drop it
                # instead of joining it back as if it were a relative path
                # (which used to turn "/page#top" into the bogus "/top").
                links.add(urlparse(full_url)._replace(fragment='').geturl())

            if src:
                links.add(urljoin(url, src))

        return list(links)
    except requests.exceptions.RequestException as e:
        print(f'Error occurred while retrieving {url}: {e}')
        return []

In [4]:
def filter_internal_links(links, domain):
    """Keep only the links that point at ``domain``.

    A link is kept when the network-location part of its parsed URL is
    exactly equal to ``domain`` (so ``www.example.com`` does not match
    ``example.com``). The input order is preserved.
    """
    return [
        candidate
        for candidate in links
        if urlparse(candidate).netloc == domain
    ]

In [5]:
def get_data(url, timeout=10):
    """Download the page at ``url`` and extract its headings and paragraphs.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with two keys:

        * ``'headings'``   - text of every ``<h1>`` longer than 8 characters
        * ``'paragraphs'`` - text of every ``<p>`` longer than 15 characters

        When the request fails the error is printed and the same dict shape
        is returned with both lists empty, so callers can always index the
        result by key.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Very short <h1>/<p> texts are discarded by the length thresholds.
        headings = [
            heading_text
            for heading_text in (tag.get_text() for tag in soup.find_all('h1'))
            if len(heading_text) > 8
        ]
        paragraphs = [
            paragraph_text
            for paragraph_text in (tag.get_text() for tag in soup.find_all('p'))
            if len(paragraph_text) > 15
        ]

        return {
            'headings': headings,
            'paragraphs': paragraphs
        }

    except requests.exceptions.RequestException as e:
        print(f'Error occurred while retrieving {url}: {e}')
        # Same shape as the success case; a bare list here would make
        # result['headings'] raise TypeError in the caller.
        return {'headings': [], 'paragraphs': []}

In [6]:
def crawl(url, threshold, max_visited_links, depth=1):
    """Recursively crawl same-host pages starting from ``url``.

    Every processed URL is added to the module-level ``visited_links`` set,
    and pages that yield both headings and paragraphs are stored in the
    module-level ``data`` dict under their URL.

    Args:
        url: Page to process.
        threshold: Maximum recursion depth; the start page is depth 1.
        max_visited_links: Stop once this many URLs have been visited.
        depth: Current recursion depth (callers normally leave the default).

    Returns:
        None. Results are recorded in ``visited_links`` and ``data``.

    Note:
        Each page is requested twice, once by ``get_hrefs`` and once by
        ``get_data``.
    """
    if depth > threshold or "/api/" in url:
        return

    if len(visited_links) >= max_visited_links:
        return

    if url in visited_links:
        return

    visited_links.add(url)
    print(f'Processing: {url}')
    links = get_hrefs(url)
    internal_links = filter_internal_links(links, urlparse(url).netloc)

    page = get_data(url)
    # Tolerate an empty/falsy result (e.g. a failed download) instead of
    # indexing into it blindly.
    if page and page['headings'] and page['paragraphs']:
        data[url] = page

    # Static assets that are never worth parsing as HTML. Every entry needs
    # its leading dot: a bare 'js' would also match pages such as '/nodejs'.
    skipped_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.pdf', '.js', '.css', '.webp')

    for link in internal_links:
        # Compare against the URL path only so that a query string
        # ('logo.png?v=2') or upper-case extension does not hide the suffix.
        if not urlparse(link).path.lower().endswith(skipped_extensions):
            crawl(link, threshold, max_visited_links, depth + 1)

In [7]:
def print_data(data):
    """Write every stored page to standard output.

    For each URL in ``data`` the URL is printed first, followed by the
    headings list and the paragraphs list (each only when non-empty) and a
    blank separator line.
    """
    for page_url in data:
        record = data[page_url]
        print(f'url: {page_url}')

        found_headings = record["headings"]
        if found_headings:
            print(f'headings: {found_headings}')

        found_paragraphs = record["paragraphs"]
        if found_paragraphs:
            print(f'paragraphs: {found_paragraphs}')

        print()


In [8]:
def m(url, threshold, max_visited_links=10):
    """Entry point: start a crawl at ``url``.

    Args:
        url: Page the crawl starts from.
        threshold: Maximum crawl depth, passed through to ``crawl``.
        max_visited_links: Upper bound on the number of URLs visited.

    Returns:
        None. Results are recorded by ``crawl``.
    """
    # The host is derived inside crawl() for every page, so nothing needs to
    # be computed here.
    crawl(url, threshold, max_visited_links)

In [9]:
# Crawl a Google News search for Reliance Industries (RIL): depth limit 2
# (the search page plus the pages it links to), at most 100 visited URLs.
m('https://news.google.com/search?q=reliance%20industries%20ril&hl=en-IN&gl=IN&ceid=IN%3Aen', 2, 100)

# Alternative start URLs limited to the past 24 hours (uncomment one to use it):
# m('https://news.google.com/search?q=reliance%20industries%20limited%20ril%20when%3A1d&hl=en-IN&gl=IN&ceid=IN%3Aen', 2, 100)
# m('https://www.google.com/search?q=reliance+industries+ril&sca_esv=c1ac07e53b73c3dc&rlz=1C1RXQR_enIN1032IN1032&tbm=nws&prmd=nivmsbtz&sxsrf=ACQVn09X_ir61R3B9Qm49R9w-IyctdIDhw:1710700407558&source=lnt&tbs=qdr:d&sa=X&ved=2ahUKEwji7O_Q9_uEAxXAnK8BHVoMCocQpwV6BAgCEAg&biw=2133&bih=1196&dpr=0.9', 2, 100)

# Report how many pages were stored, then print each of them.
print(f"Data Dictionary Size: {len(data)}")
print_data(data)

Processing: https://news.google.com/search?q=reliance%20industries%20ril&hl=en-IN&gl=IN&ceid=IN%3Aen


Processing: https://news.google.com/articles/CBMipQFodHRwczovL3d3dy5saXZlbWludC5jb20vY29tcGFuaWVzL2NvbXBhbnktcmVzdWx0cy9yZWxpYW5jZS1pbmR1c3RyaWVzLXEzLXJlc3VsdHMtY29tcGFueS1yZXBvcnRzLW5ldC1wcm9maXQtb2YtcnMtMTctMjY1LXJldmVudWUtcmlsLXNoYXJlLXByaWNlLTExNzA1NjY0MzE4MDUzLmh0bWzSAakBaHR0cHM6Ly93d3cubGl2ZW1pbnQuY29tL2NvbXBhbmllcy9jb21wYW55LXJlc3VsdHMvcmVsaWFuY2UtaW5kdXN0cmllcy1xMy1yZXN1bHRzLWNvbXBhbnktcmVwb3J0cy1uZXQtcHJvZml0LW9mLXJzLTE3LTI2NS1yZXZlbnVlLXJpbC1zaGFyZS1wcmljZS9hbXAtMTE3MDU2NjQzMTgwNTMuaHRtbA?hl=en-IN&gl=IN&ceid=IN%3Aen
Processing: https://news.google.com/_/DotsSplashUi/manifest.json
Processing: https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRFp1ZEdvU0JXVnVMVWRDR2dKSlRpZ0FQAQ?hl=en-IN&gl=IN&ceid=IN%3Aen
Processing: https://news.google.com/rss/search?hl=en-IN&gl=IN&ceid=IN%3Aen&oc=11&q=reliance%20industries%20ril


  k = self.parse_starttag(i)


Processing: https://news.google.com/articles/CBMitwFodHRwczovL3d3dy5saXZlbWludC5jb20vY29tcGFuaWVzL2NvbXBhbnktcmVzdWx0cy9yZWxpYW5jZS1xMy1yZXN1bHRzLWxpdmUtdXBkYXRlcy1yZWxpYW5jZS1pbmR1c3RyaWVzLXJlc3VsdHMtMjAyMy1yaWwtcTMtcmVzdWx0cy0xOS1qYW51YXJ5LTIwMjQtcmlsLXNoYXJlLXByaWNlLTExNzA1NTkwNTg3NTk2Lmh0bWzSAbsBaHR0cHM6Ly93d3cubGl2ZW1pbnQuY29tL2NvbXBhbmllcy9jb21wYW55LXJlc3VsdHMvcmVsaWFuY2UtcTMtcmVzdWx0cy1saXZlLXVwZGF0ZXMtcmVsaWFuY2UtaW5kdXN0cmllcy1yZXN1bHRzLTIwMjMtcmlsLXEzLXJlc3VsdHMtMTktamFudWFyeS0yMDI0LXJpbC1zaGFyZS1wcmljZS9hbXAtMTE3MDU1OTA1ODc1OTYuaHRtbA?hl=en-IN&gl=IN&ceid=IN%3Aen
Processing: https://news.google.com/articles/CBMijwFodHRwczovL3d3dy5idXNpbmVzcy1zdGFuZGFyZC5jb20vY29tcGFuaWVzL25ld3MvcmVsaWFuY2UtaW5kdXN0cmllcy1wYXlzLXJzLTI1NC1jcm9yZS1saWNlbnNlLWZlZS10by1tZXRyby10by11c2UtaXRzLW5hbWUtMTIzMTIyMDAwMjI3XzEuaHRtbNIBkwFodHRwczovL3d3dy5idXNpbmVzcy1zdGFuZGFyZC5jb20vYW1wL2NvbXBhbmllcy9uZXdzL3JlbGlhbmNlLWluZHVzdHJpZXMtcGF5cy1ycy0yNTQtY3JvcmUtbGljZW5zZS1mZWUtdG8tbWV0cm8tdG8tdXNlLWl0cy1uYW1lLTE

In [17]:
# Function to filter extracted data
def filter_data(data):
    """Return a cleaned copy of the crawl results.

    Headings of 8 characters or fewer, and headings containing
    "Access Denied", are removed. A page is kept only when at least one
    heading survives and it has at least one paragraph. The input dict is
    not modified.
    """
    kept_pages = {}

    for page_url, record in data.items():
        usable_headings = []
        for title in record['headings']:
            if len(title) <= 8 or "Access Denied" in title:
                continue
            usable_headings.append(title)

        page_paragraphs = list(record['paragraphs'])

        if not (usable_headings and page_paragraphs):
            continue

        kept_pages[page_url] = {
            'headings': usable_headings,
            'paragraphs': page_paragraphs,
        }

    return kept_pages

In [18]:
# Remove short and "Access Denied" headings (dropping pages left without
# headings or paragraphs), then print what remains.
filtered_data = filter_data(data)
print_data(filtered_data)



url: https://news.google.com/articles/CBMipQFodHRwczovL3d3dy5saXZlbWludC5jb20vY29tcGFuaWVzL2NvbXBhbnktcmVzdWx0cy9yZWxpYW5jZS1pbmR1c3RyaWVzLXEzLXJlc3VsdHMtY29tcGFueS1yZXBvcnRzLW5ldC1wcm9maXQtb2YtcnMtMTctMjY1LXJldmVudWUtcmlsLXNoYXJlLXByaWNlLTExNzA1NjY0MzE4MDUzLmh0bWzSAakBaHR0cHM6Ly93d3cubGl2ZW1pbnQuY29tL2NvbXBhbmllcy9jb21wYW55LXJlc3VsdHMvcmVsaWFuY2UtaW5kdXN0cmllcy1xMy1yZXN1bHRzLWNvbXBhbnktcmVwb3J0cy1uZXQtcHJvZml0LW9mLXJzLTE3LTI2NS1yZXZlbnVlLXJpbC1zaGFyZS1wcmljZS9hbXAtMTE3MDU2NjQzMTgwNTMuaHRtbA?hl=en-IN&gl=IN&ceid=IN%3Aen
headings: ['Reliance Q3 results: Net profit up 11% to  ₹19,641 crore; EBITDA up 17% led by oil and gas, retail segments']
paragraphs: ["      Reliance Q3 Results: Reliance Industries Limited reported a 10.9% increase in its net profit to  ₹19,641 crore in the third quarter of FY 23-24. The company's total revenue from operations stood at  ₹2,27,970 crore during the quarter under review.\xa0", ' Reliance Industries Q3 results live updates', "   The giant conglomerate's gr

In [29]:
# Export filtered data to a text file
def export_data(data, path="data.txt"):
    """Write the crawl results to a UTF-8 text file.

    Each page is written as a ``url:`` line, followed by a ``headings:``
    line and a ``paragraphs:`` line (each only when the list is non-empty)
    and a blank separator line. An existing file is overwritten.

    Args:
        data: Mapping of page URL to a dict with ``'headings'`` and
            ``'paragraphs'`` lists.
        path: Destination file. Defaults to ``data.txt`` in the current
            working directory.
    """
    with open(path, 'w', encoding='utf-8') as f:
        for url, h1_data in data.items():
            f.write(f'url: {url}\n')
            if h1_data["headings"]:
                f.write(f'headings: {h1_data["headings"]}\n')
            if h1_data["paragraphs"]:
                f.write(f'paragraphs: {h1_data["paragraphs"]}\n')
            f.write('\n')

In [30]:
# Save the filtered pages to data.txt in the current working directory.
export_data(filtered_data)