#### Author : Rahul Bhoyar

We will do the basic scraping and explore what are the different tags that are present.

In [1]:
import requests
from bs4 import BeautifulSoup
import re

def clean_text(text):
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text


url = 'https://en.wikipedia.org/wiki/India'
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

     # Extracting paragraphs
    titles = [clean_text(title.text) for title in soup.find_all('title')]
    print(f"Titles: {titles}")
    print("-"*200)

    # Extracting paragraphs
    paragraphs = [clean_text(paragraph.text) for paragraph in soup.find_all('p')]
    print(f"Paragraphs: {paragraphs}")
    print("Total number of paragraphs :", len(paragraphs))
    print("-"*200)

    # Extracting categories
    categories = [clean_text(category.text) for category in soup.find_all('span', {'class': 'mw-headline'})]
    print(f"Categories: {categories}")
    print("-"*200)

    # Extracting references
    references = [clean_text(reference.text) for reference in soup.find_all('span', {'class': 'reference-text'})]
    print(f"References: {references}")
    print("-"*200)

    # Extracting images
    images = [clean_text(image['src']) for image in soup.find_all('img')]
    print(f"Images: {images}")
    print("-"*200)

    # Extracting table data (if available)
    tables = soup.find_all('table')
    for table in tables:
        table_data = [[clean_text(td.text.strip()) for td in row.find_all(['th', 'td'])] for row in table.find_all('tr')]
        print(f"Table Data: {table_data}")
    print("-"*200)

    # Extracting external links
    external_links = [clean_text(link['href']) for link in soup.find_all('a', {'class': 'external text'})]
    print(f"External Links: {external_links}")
    print("-"*200)

    # Extracting see also section
    see_also = [clean_text(link.text) for link in soup.find_all('div', {'class': 'div-col columns column-width'})]
    print(f"See Also: {see_also}")
    print("-"*200)

    # Extracting revision history
    revision_history = [clean_text(revision.text) for revision in soup.find_all('li', {'class': 'history-changed'})]
    print(f"Revision History: {revision_history}")
    print("-"*200)
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


Titles: ['India Wikipedia']
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Paragraphs: ['', 'India officially the Republic of India ISO Bhrat Gaarjya22 is a country in South Asia It is the seventhlargest country by area the most populous country as of June 20232324 and from the time of its independence in 1947 the worlds most populous democracy252627 Bounded by the Indian Ocean on the south the Arabian Sea on the southwest and the Bay of Bengal on the southeast it shares land borders with Pakistan to the westj China Nepal and Bhutan to the north and Bangladesh and Myanmar to the east In the Indian Ocean India is in the vicinity of Sri Lanka and the Maldives its Andaman and Nicobar Islands share a maritime border with Thailand Myanmar and Indonesia', 'Modern humans arrived on the Indian subcontinent from Africa no later than 55000 year