In [None]:
pip install requests beautifulsoup4




In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def _table_to_frame(table):
    """Convert one ``<table>`` element into a DataFrame of its cell text.

    Header labels are taken from every ``<th>`` inside the table and data
    rows from the ``<td>`` cells of every ``<tr>``; rows without any
    ``<td>`` are skipped.
    """
    column_names = [th.text.strip() for th in table.find_all('th')]

    table_rows = []
    for row in table.find_all('tr'):
        cells = [td.text.strip() for td in row.find_all('td')]
        if cells:
            table_rows.append(cells)

    # pandas raises ValueError when the number of column labels differs from
    # the widest row (colspan, row-header <th> cells, nested tables), so the
    # labels are applied only when the counts line up.
    if column_names and table_rows:
        widest_row = max(len(cells) for cells in table_rows)
        if widest_row == len(column_names):
            return pd.DataFrame(table_rows, columns=column_names)
    return pd.DataFrame(table_rows)


def scrape_web_page(url, timeout=10):
    """Download a web page and extract its main structural content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        A dict with the keys ``'tables'`` (list of DataFrames, one per
        ``<table>``), ``'paragraphs'`` (list of str), ``'headings'`` (dict
        mapping ``'h1'``..``'h6'`` to lists of str), ``'lists'`` (list of
        lists of ``<li>`` text) and ``'links'`` (dict of link text -> href).
        Returns ``None`` after printing an error message if the request
        fails or the server answers with a 4xx/5xx status.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=request_headers, timeout=timeout)
        response.raise_for_status()  # Raises HTTPError for 4xx and 5xx responses
    except requests.exceptions.RequestException as e:
        print(f"Error retrieving the web page: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    data = {}

    # Extracting tables
    data['tables'] = [_table_to_frame(table) for table in soup.find_all('table')]

    # Extracting paragraphs
    data['paragraphs'] = [para.text.strip() for para in soup.find_all('p')]

    # Extracting headings, one entry per level even when the level is unused
    data['headings'] = {
        f'h{level}': [heading.text.strip() for heading in soup.find_all(f'h{level}')]
        for level in range(1, 7)
    }

    # Extracting lists: all <ul> elements first, then all <ol> elements
    lists = []
    for list_tag in ['ul', 'ol']:
        for element in soup.find_all(list_tag):
            lists.append([li.text.strip() for li in element.find_all('li')])
    data['lists'] = lists

    # Extracting links.
    # NOTE: keyed by link text, so anchors sharing the same text (including
    # empty text) collapse to the last href seen.
    data['links'] = {
        link.text.strip(): link['href'] for link in soup.find_all('a', href=True)
    }

    return data

def format_output(data):
    """Render scraped page data as a plain-text report.

    Args:
        data: Dict as produced by ``scrape_web_page``. Recognised keys are
            ``'headings'`` (dict of level -> list of str), ``'paragraphs'``
            (list of str), ``'tables'`` (list of objects with
            ``to_string(index=False)``), ``'lists'`` (list of lists of str)
            and ``'links'`` (dict of text -> href). Missing or empty
            sections are omitted from the report.

    Returns:
        The report as a single string; empty when there is nothing to show.
    """
    output = []

    headings_by_level = data.get('headings') or {}
    # The dict carries an entry for every heading level even when the page
    # has none, so test for actual content rather than dict truthiness.
    if any(headings_by_level.values()):
        output.append("Headings:\n")
        for level, headings in headings_by_level.items():
            if headings:
                output.append(f"{level.upper()}:\n")
                output.extend(f"  - {heading}" for heading in headings)
        output.append("\n")

    paragraphs = data.get('paragraphs')
    if paragraphs:
        output.append("Paragraphs:\n")
        output.extend(f"  - {para}" for para in paragraphs)
        output.append("\n")

    tables = data.get('tables')
    if tables:
        output.append("Tables:\n")
        for i, table in enumerate(tables):
            output.append(f"Table {i + 1}:\n{table.to_string(index=False)}\n")
        output.append("\n")

    lists = data.get('lists')
    if lists:
        output.append("Lists:\n")
        for lst in lists:
            output.extend(f"  - {item}" for item in lst)
        output.append("\n")

    links = data.get('links')
    if links:
        output.append("Links:\n")
        output.extend(f"  - {text}: {href}" for text, href in links.items())

    return "\n".join(output)

def main():
    """Ask for a URL, scrape it, and print the formatted report.

    Nothing is printed here when the scrape yields no data.
    """
    target_url = input("Enter the URL of the website to scrape: ")
    scraped = scrape_web_page(target_url)
    if not scraped:
        return
    print(format_output(scraped))

if __name__ == "__main__":
    main()


Enter the URL of the website to scrape: https://www.amazon.in/?tag=googmantxtmob170-21&ascsubtag=_k_Cj0KCQjwhb60BhClARIsABGGtw-6TqJGvSnlN_UAwv2zx4veEdvHmTiv3ddMM3GS5yDTeuHCNwVzHUUaAmeREALw_wcB_k_
Headings:

H2:

  - Makeup products
  - New looks for the new season
  - Do up your home
  - Smart gadgets by Amazon
  - Value bazaar
  - Work from home essentials
  - Revamp your home in style
  - Innovations from Emerging Indian Brands


Tables:

Table 1:
0
 

Table 2:
                               0    1                                                   2    3                                          4    5                                                                            6
AbeBooksBooks, art& collectibles      Amazon Web ServicesScalable CloudComputing Services                      AudibleDownloadAudio Books                                                       IMDbMovies, TV& Celebrities
                                 None                                                None No