In [2]:
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# File name containing the URLs (one URL per line)
file_name = 'bajaj_urls.csv'
# Directory where the output files will be stored
output_directory = 'output_files'

# Create the output directory if it does not exist
os.makedirs(output_directory, exist_ok=True)

# Read URLs from the file, skipping blank lines.
# 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order mark,
# which would otherwise be glued onto the first URL and make it unfetchable.
with open(file_name, 'r', encoding='utf-8-sig') as file:
    stripped_lines = (line.strip() for line in file)
    urls = [candidate for candidate in stripped_lines if candidate]

# Function to fetch and parse a URL
def fetch_and_parse(url, timeout=30):
    """Download ``url`` and return its HTML parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the script forever.

    Returns:
        A ``BeautifulSoup`` object built with the 'html.parser' backend when
        the response status is 200, otherwise ``None``. ``None`` is also
        returned when the request itself fails (connection error, timeout,
        malformed URL), so callers only need to check for a falsy result.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report and continue instead of aborting the whole run on one bad URL
        print(f"Failed to retrieve the webpage. Request error: {exc}")
        return None

    if response.status_code == 200:
        return BeautifulSoup(response.content, 'html.parser')

    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
    return None

# Function to format and extract text, including bold inline
def format_text(element):
    """Return the text of ``element`` with bold runs marked markdown-style.

    The content of every ``<strong>`` / ``<b>`` descendant is replaced by a
    single string of the form ``**text**``. The replacement is done on
    ``element`` itself, so the passed-in tree is modified.

    Args:
        element: A BeautifulSoup tag (must provide ``find_all`` and
            ``get_text``).

    Returns:
        The text of ``element`` with leading and trailing whitespace removed.
    """
    for bold in element.find_all(['strong', 'b']):
        raw = bold.get_text()
        inner = raw.strip()
        if not inner:
            # An empty bold tag would otherwise be rendered as "****"
            continue
        # Keep whitespace that sat inside the tag outside the markers, so that
        # "<b>bold </b>text" becomes "**bold** text" rather than "**bold**text".
        lead = raw[:len(raw) - len(raw.lstrip())]
        trail = raw[len(raw.rstrip()):]
        bold.string = f"{lead}**{inner}**{trail}"
    return element.get_text().strip()

# Function to extract and write content from a page
def extract_content(soup, f_out):
    """Write the headings and paragraphs of a page's main content to ``f_out``.

    A list of CSS selectors is tried in order and the first one that matches
    is used as the content container. Inside it, every h2/h3/h4 is written as
    a "## Heading:" line and every p as a plain line, in document order. If no
    selector matches, a single notice line is written instead.

    Args:
        soup: Parsed page (must provide ``select_one``).
        f_out: Writable text file object receiving the output.
    """
    # Candidate containers, most specific first; the first match wins
    candidate_selectors = (
        'article',            # Most articles are enclosed in <article> tags
        'div.entry-content',  # Common class for WordPress blogs
        'div.post-content',   # Generic class for blog post content
        'div.content',        # Another common class name
        'div.main-content',   # Common for many sites
        'div.container',      # General container
    )

    # map() is lazy, so selectors after the first hit are never evaluated
    lookups = map(soup.select_one, candidate_selectors)
    main_block = next((match for match in lookups if match), None)

    if not main_block:
        f_out.write("Main content container not found.\n")
        return

    heading_tags = ('h2', 'h3', 'h4')
    for node in main_block.find_all(['h2', 'h3', 'h4', 'p']):
        rendered = format_text(node)
        if node.name in heading_tags:
            f_out.write(f"\n## Heading: {rendered}\n")
        elif node.name == 'p':
            f_out.write(f"{rendered}\n")

# Function to extract all pagination links
def get_pagination_links(soup):
    """Collect the distinct link targets found in the page's pagination block.

    Args:
        soup: Parsed page (must provide ``select_one``).

    Returns:
        A list of the non-empty ``href`` values of all ``<a>`` tags inside the
        first ``div.page-links`` element, in document order with duplicates
        removed. The list is empty when the page has no such element.
    """
    pager = soup.select_one('div.page-links')
    if not pager:
        return []

    hrefs = (anchor.get('href') for anchor in pager.find_all('a'))
    # dict.fromkeys keeps the first occurrence of each href, in original order
    return list(dict.fromkeys(href for href in hrefs if href))

# Iterate through each URL and perform the operations
for url in urls:
    # Create a filename based on the URL
    url_filename = url.split('/')[-2] if url.endswith('/') else url.split('/')[-1]
    output_file = os.path.join(output_directory, f"{url_filename}.txt")
    
    with open(output_file, 'w', encoding='utf-8') as f_out:
        f_out.write(f"URL: {url}\n\n")
        
        # Fetch and parse the main page
        soup = fetch_and_parse(url)
        
        if soup:
            # Extract and write content from the main page
            extract_content(soup, f_out)
            
            # Check if there are multiple pages
            pagination_links = get_pagination_links(soup)
            
            # If pagination links are found, fetch and extract content from each page
            for page_url in pagination_links:
                f_out.write(f"\nProcessing additional page: {page_url}\n\n")
                page_soup = fetch_and_parse(page_url)
                if page_soup:
                    extract_content(page_soup, f_out)
        else:
            f_out.write("Failed to parse the main page.\n")


The chromedriver version (114.0.5735.90) detected in PATH at /usr/local/bin/chromedriver might not be compatible with the detected chrome version (130.0.6723.58); currently, chromedriver 130.0.6723.69 is recommended for chrome 130.*, so it is advised to delete the driver in PATH and retry


WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally.
  (unknown error: DevToolsActivePort file doesn't exist)
  (The process started from chrome location /usr/bin/google-chrome is no longer running, so ChromeDriver is assuming that Chrome has crashed.)
Stacktrace:
#0 0x561af61004e3 <unknown>
#1 0x561af5e2fc76 <unknown>
#2 0x561af5e58d78 <unknown>
#3 0x561af5e55029 <unknown>
#4 0x561af5e93ccc <unknown>
#5 0x561af5e9347f <unknown>
#6 0x561af5e8ade3 <unknown>
#7 0x561af5e602dd <unknown>
#8 0x561af5e6134e <unknown>
#9 0x561af60c03e4 <unknown>
#10 0x561af60c43d7 <unknown>
#11 0x561af60ceb20 <unknown>
#12 0x561af60c5023 <unknown>
#13 0x561af60931aa <unknown>
#14 0x561af60e96b8 <unknown>
#15 0x561af60e9847 <unknown>
#16 0x561af60f9243 <unknown>
#17 0x7f94ef2e4609 start_thread
