In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import json
import time
import urllib.robotparser
import urllib.parse

def extract_information(url, timeout=10):
    """Download *url* and collect its headings, links and images.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        A ``(headlines, links, images)`` tuple:

        * ``headlines`` - heading texts grouped by level (every ``h1``
          first, then ``h2`` ... ``h6``).
        * ``links`` - ``(href, text)`` pairs for each ``<a>`` that has a
          non-empty ``href``.
        * ``images`` - ``(src, alt)`` pairs for each ``<img>``; either
          value is ``None`` when the attribute is missing.

        ``(None, None, None)`` is returned when the request raises or the
        response status is not 200.
    """
    try:
        page = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as err:
        # Same reporting style as make_request(); keep the failure value
        # identical to the non-200 branch so callers need a single check.
        print(f"Error: {err}")
        return None, None, None

    if page.status_code != 200:
        print(f"Request failed with status code: {page.status_code}")
        return None, None, None

    soup = BeautifulSoup(page.content, 'html.parser')
    headlines = [
        heading.get_text()
        for level in range(1, 7)
        for heading in soup.find_all(f'h{level}')
    ]
    links = [
        (link.get('href'), link.get_text())
        for link in soup.find_all('a')
        if link.get('href')
    ]
    images = [(img.get('src'), img.get('alt')) for img in soup.find_all('img')]
    return headlines, links, images

def save_to_csv(data, filename):
    """Write each row of *data* to *filename* as CSV.

    The file is opened in write mode (an existing file is replaced) with
    UTF-8 encoding; ``newline=''`` leaves line endings to the csv module.
    """
    with open(filename, mode='w', encoding='utf-8', newline='') as out_file:
        row_writer = csv.writer(out_file)
        for row in data:
            row_writer.writerow(row)

def save_to_json(data, filename):
    """Serialize *data* as JSON into *filename*.

    The file is written as UTF-8 with a 4-space indent, and non-ASCII
    characters are stored as-is instead of ``\\uXXXX`` escapes.
    """
    dump_options = {'ensure_ascii': False, 'indent': 4}
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, **dump_options)

def make_request(url, retries=3, timeout=10, delay=2):
    """GET *url*, retrying when the request fails.

    Args:
        url: Address to fetch.
        retries: Number of additional attempts after the first failure,
            so at most ``retries + 1`` requests are sent. Values below
            zero behave like zero.
        timeout: Seconds to wait on each attempt. Without it a stalled
            server would block forever and the retry logic would never run.
        delay: Seconds to sleep between attempts.

    Returns:
        The ``requests.Response`` of the first attempt that succeeds
        (``raise_for_status`` did not raise), or ``None`` if every attempt
        failed.
    """
    attempts = max(retries, 0) + 1
    for attempt in range(attempts):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as err:
            print(f"Error: {err}")
            # Only pause when another attempt is actually coming.
            if attempt < attempts - 1:
                time.sleep(delay)
    return None

def extract_all_links(url, timeout=10):
    """Return ``(href, text)`` pairs for every link on the page at *url*.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server; prevents the call from
            blocking indefinitely.

    Returns:
        A list of ``(href, text)`` tuples for each ``<a>`` with a non-empty
        ``href``. An empty list is returned when the request raises or the
        response status is not 200.
    """
    try:
        page = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as err:
        print(f"Error: {err}")
        return []

    if page.status_code != 200:
        print(f"Request failed with status code: {page.status_code}")
        return []

    soup = BeautifulSoup(page.content, 'html.parser')
    return [
        (link.get('href'), link.get_text())
        for link in soup.find_all('a')
        if link.get('href')
    ]

def search_for_text(url, keyword, timeout=10):
    """Find text nodes on the page at *url* that contain *keyword*.

    The comparison is case-insensitive.

    Args:
        url: Address of the page to fetch.
        keyword: Text to look for.
        timeout: Seconds to wait for the server; prevents the call from
            blocking indefinitely.

    Returns:
        The matching strings as returned by ``find_all``. An empty list is
        returned when the request raises or the response status is not 200.
    """
    try:
        page = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as err:
        print(f"Error: {err}")
        return []

    if page.status_code != 200:
        print(f"Request failed with status code: {page.status_code}")
        return []

    soup = BeautifulSoup(page.content, 'html.parser')
    # html.parser does not invent a <body> for documents that lack one, so
    # soup.body can be None; fall back to searching the whole document
    # instead of raising AttributeError.
    root = soup.body if soup.body is not None else soup
    # Lower-case the keyword once rather than for every string examined.
    needle = keyword.lower()
    return root.find_all(string=lambda text: needle in text.lower())

def use_css_selectors(url, selector, timeout=10):
    """Return the elements on the page at *url* that match a CSS selector.

    Args:
        url: Address of the page to fetch.
        selector: CSS selector passed to ``soup.select``.
        timeout: Seconds to wait for the server; prevents the call from
            blocking indefinitely.

    Returns:
        The result of ``soup.select(selector)``. An empty list is returned
        when the request raises or the response status is not 200.
    """
    try:
        page = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as err:
        print(f"Error: {err}")
        return []

    if page.status_code != 200:
        print(f"Request failed with status code: {page.status_code}")
        return []

    soup = BeautifulSoup(page.content, 'html.parser')
    return soup.select(selector)

def count_elements(url, tag, timeout=10):
    """Count how many *tag* elements appear on the page at *url*.

    Args:
        url: Address of the page to fetch.
        tag: Tag name passed to ``soup.find_all`` (for example ``'p'``).
        timeout: Seconds to wait for the server; prevents the call from
            blocking indefinitely.

    Returns:
        The number of matching elements, or ``0`` when the request raises
        or the response status is not 200.
    """
    try:
        page = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as err:
        print(f"Error: {err}")
        return 0

    if page.status_code != 200:
        print(f"Request failed with status code: {page.status_code}")
        return 0

    soup = BeautifulSoup(page.content, 'html.parser')
    return len(soup.find_all(tag))

def follow_links(url, depth=1, timeout=10, prefix='/wiki'):
    """Collect the site-relative links found on the page at *url*.

    NOTE(review): despite the name, the collected links are not visited.
    ``depth`` only acts as a guard (negative values short-circuit to an
    empty list); confirm whether recursive crawling was intended before
    relying on it.

    Args:
        url: Address of the page to fetch.
        depth: Negative values return ``[]`` without sending a request;
            any other value fetches *url* once.
        timeout: Seconds to wait for the server; prevents the call from
            blocking indefinitely.
        prefix: Only ``href`` values starting with this string are kept.
            Defaults to ``'/wiki'``, the value that was previously
            hard-coded.

    Returns:
        A list of ``href`` strings. An empty list is returned when *depth*
        is negative, the request raises, or the response status is not 200.
    """
    if depth < 0:
        return []

    try:
        page = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as err:
        print(f"Error: {err}")
        return []

    if page.status_code != 200:
        print(f"Request failed with status code: {page.status_code}")
        return []

    soup = BeautifulSoup(page.content, 'html.parser')
    hrefs = (link.get('href') for link in soup.find_all('a'))
    return [href for href in hrefs if href and href.startswith(prefix)]

def can_fetch(url, user_agent='*'):
    """Report whether the site's robots.txt lets *user_agent* fetch *url*.

    The robots.txt location is built from the scheme and host of *url*.
    Any error while parsing the URL or reading robots.txt is printed and
    treated as "not allowed", so the function returns ``False`` in that
    case.
    """
    try:
        parts = urllib.parse.urlparse(url)
        robots_location = f"{parts.scheme}://{parts.netloc}/robots.txt"
        # Passing the URL to the constructor is equivalent to set_url().
        parser = urllib.robotparser.RobotFileParser(robots_location)
        parser.read()
        allowed = parser.can_fetch(user_agent, url)
    except Exception as e:
        print(f"Error checking robots.txt: {e}")
        return False
    else:
        return allowed

# Interactive driver: ask for a URL, check robots.txt, then run each helper
# against the page and print what it finds.
# Surrounding whitespace from a pasted URL would otherwise end up in the request.
url = input("Enter URL: ").strip()

if can_fetch(url):
    print("Allowed to scrape")

    headlines, links, images = extract_information(url)
    print("Headlines:", headlines)
    print("Links:", links)
    print("Images:", images)

    # extract_information() returns (None, None, None) on failure; writing
    # None with csv.writer.writerows would raise TypeError, so skip saving.
    if links is not None:
        save_to_csv(links, 'links.csv')
    if images is not None:
        save_to_json(images, 'images.json')

    page = make_request(url)
    if page:
        print("Request successful")
    else:
        print("Request failed after retries")

    all_links = extract_all_links(url)
    print("All Links:", all_links)

    keyword = input("Enter keyword to search: ")
    results = search_for_text(url, keyword)
    print(f"Occurrences of '{keyword}':", results)

    selector = input("Enter CSS selector to use: ")
    elements = use_css_selectors(url, selector)
    print(f"Elements matching '{selector}':", elements)

    tag = input("Enter HTML tag to count: ")
    count = count_elements(url, tag)
    print(f"Number of '{tag}' elements:", count)

    try:
        depth = int(input("Enter depth to follow links: "))
    except ValueError:
        # Non-numeric input: fall back to follow_links()' default depth
        # instead of aborting the whole run.
        print("Invalid depth; using 1")
        depth = 1
    followed_links = follow_links(url, depth)
    print("Followed Links:", followed_links)
else:
    print("Not allowed to scrape")


Enter URL:  https://www.irctc.co.in/nget/train-search


Error checking robots.txt: <urlopen error [SSL: SSLV3_ALERT_HANDSHAKE_FAILURE] sslv3 alert handshake failure (_ssl.c:1002)>
Not allowed to scrape
