In [None]:
URL = "https://www.hackerrank.com/" # Base URL of the site (HackerRank) whose sitemap data is scraped below

In [None]:
import gzip
import xml.etree.ElementTree as ET
import zlib

import pandas as pd
import requests



class SitemapParser:
    """Collect the page URLs a website advertises through its sitemaps.

    Sitemap locations are discovered from the ``Sitemap:`` lines of the
    site's robots.txt. Each sitemap is then downloaded (gzip-compressed or
    plain XML) and the text of its ``<loc>`` elements is gathered into a
    pandas DataFrame.
    """

    # Browser-like User-Agent sent with every request this class makes.
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    )
    # XPath for the namespaced <loc> elements of the sitemap protocol.
    LOC_PATH = ".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"
    # First two bytes of every gzip stream; used to detect compressed bodies.
    GZIP_MAGIC = b"\x1f\x8b"

    def __init__(self, base_url, timeout=10):
        """
        params:
          base_url: root url of the site whose robots.txt lists the sitemaps
          timeout: seconds to wait for each HTTP request before giving up
        return: None
        """
        self.base_url = base_url
        self.timeout = timeout
        self.sitemaps = []

    @property
    def robots_url(self):
        """
        return: str, the robots.txt url for ``base_url``
        """
        # Strip trailing slashes so "https://host/" does not become
        # "https://host//robots.txt".
        return f"{self.base_url.rstrip('/')}/robots.txt"

    def _get(self, url):
        """Issue a GET with the shared User-Agent and timeout; raise on HTTP errors."""
        response = requests.get(
            url,
            headers={"User-Agent": self.USER_AGENT},
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response

    @classmethod
    def _extract_locs(cls, raw):
        """Return the stripped, non-empty <loc> texts found in raw sitemap bytes."""
        # Only gunzip bodies that really are gzip streams; plain XML sitemaps
        # are valid too and must not be passed to gzip.decompress.
        if raw[:2] == cls.GZIP_MAGIC:
            raw = gzip.decompress(raw)
        # Parsing bytes lets the XML parser honour the document's declared
        # encoding.
        root = ET.fromstring(raw)
        urls = []
        for elem in root.findall(cls.LOC_PATH):
            text = (elem.text or "").strip()
            if text:
                urls.append(text)
        return urls

    def fetch_robots_txt(self):
        """
        params: None
        return: str with the robots.txt body, or None if the request failed
        """
        try:
            response = self._get(self.robots_url)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching robots.txt: {e}")
            return None
        return response.text

    def extract_sitemaps(self, robots_content):
        """
        params:
            robots_content: text of a robots.txt file (may be None or empty)
        return: list of sitemap urls, in the order they appear
        """
        sitemaps = []
        if not robots_content:
            return sitemaps
        # splitlines() copes with both "\n" and "\r\n" line endings.
        for line in robots_content.splitlines():
            # Split on the first colon only: the url itself contains colons,
            # and the space after "Sitemap:" is optional.
            field, sep, value = line.partition(":")
            if not sep or field.strip().lower() != "sitemap":
                continue
            sitemap_url = value.strip()
            if sitemap_url:
                sitemaps.append(sitemap_url)
        return sitemaps

    def parse_sitemap(self, sitemap_url):
        """
        params:
          sitemap_url: url of a sitemap XML document (gzip-compressed or plain)
        return: list of the urls in the sitemap's <loc> elements; an empty
                list if the download, decompression or XML parsing fails
        """
        try:
            response = self._get(sitemap_url)
        except requests.exceptions.RequestException as e:
            print(f"Error parsing sitemap: {e}")
            return []
        try:
            return self._extract_locs(response.content)
        except (OSError, EOFError, zlib.error, ET.ParseError) as e:
            # Corrupt gzip data or malformed XML: report and skip this
            # sitemap rather than abort the whole crawl.
            print(f"Error parsing sitemap: {e}")
            return []

    def parse_all_sitemaps(self):
        """
        params: None
        return: data frame with a single 'URL' column holding every url found
                in the sitemaps listed by the site's robots.txt
        """
        robots_content = self.fetch_robots_txt()
        self.sitemaps = self.extract_sitemaps(robots_content)

        all_urls = []
        for sitemap_url in self.sitemaps:
            all_urls.extend(self.parse_sitemap(sitemap_url))

        return pd.DataFrame(all_urls, columns=['URL'])

# Example usage: build a parser for the configured site and gather every
# URL listed in its sitemaps.
website_url = URL
sitemap_parser = SitemapParser(base_url=website_url)
result_df = sitemap_parser.parse_all_sitemaps()

# Show the collected URLs.
print(result_df)


                                                  URL
0                         https://www.hackerrank.com/
1                         https://www.hackerrank.com/
2                https://www.hackerrank.com/dashboard
3                 https://www.hackerrank.com/contests
4              https://www.hackerrank.com/jobs/search
5              https://www.hackerrank.com/leaderboard
6                 https://www.hackerrank.com/calendar
7                  https://www.hackerrank.com/scoring
8              https://www.hackerrank.com/environment
9                      https://www.hackerrank.com/faq
10                 https://www.hackerrank.com/aboutus
11                 https://www.hackerrank.com/careers
12        https://www.hackerrank.com/terms-of-service
13                 https://www.hackerrank.com/privacy
14               https://www.hackerrank.com/interview
15         https://www.hackerrank.com/support/feature
16               https://www.hackerrank.com/companies
17                    https:

In [None]:
# Preview the first rows of the collected URL table.
result_df.head()

Unnamed: 0,URL
0,https://www.hackerrank.com/
1,https://www.hackerrank.com/
2,https://www.hackerrank.com/dashboard
3,https://www.hackerrank.com/contests
4,https://www.hackerrank.com/jobs/search


In my code, I first request the HackerRank website's robots.txt to obtain its sitemap URLs. I then fetch each sitemap URL using the requests module; because the sitemap data is gzip-compressed, I decompress it, parse the XML, and convert the extracted URLs into a DataFrame.
As for how the above dataset can be used: the DataFrame of HackerRank URLs lets us pick the right page to scrape for daily updates on HackerRank contests, job listings, and so on. For example, for daily contest activity we can fetch "https://www.hackerrank.com/contests". We can then parse these pages further to extract page-specific data, such as which contests are ongoing and which are the most effective; this would be helpful for a job search, since it is the kind of information recruiters see.