In [1]:
#importing libraries
import json
import os
import re
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [2]:
# Resolve the working directory; the scraped files are written beneath it.
current_directory = os.getcwd()

# Create the output directory if it does not already exist.
output_directory = os.path.join(current_directory, "Output")
os.makedirs(output_directory, exist_ok=True)

# A-Z index page of the help centre; starting point for the scrape.
url = "https://www.cic.gc.ca/english/helpcentre/index-a-z-can.asp"

# Use a timeout so an unresponsive server cannot hang the kernel, and fail
# fast on an HTTP error status instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

In [4]:
# Front page: every topic is an <h2> heading that wraps a link.
section = soup.find("section", class_="container")
h2_elements = section.find_all('h2')

# Keep only the headings that actually contain an anchor.
topic_anchors = [
    anchor
    for anchor in (heading.find('a') for heading in h2_elements)
    if anchor
]

# Parallel lists: link_titles[i] is the visible text of links[i].
link_titles = [anchor.text for anchor in topic_anchors]
links = [
    f"https://www.cic.gc.ca/english/helpcentre/{anchor['href']}"
    for anchor in topic_anchors
]
print(link_titles[:3], links[:3])

['Access to Information and Privacy', 'Adoption', 'Application status'] ['https://www.cic.gc.ca/english/helpcentre/results-by-topic.asp?top=1', 'https://www.cic.gc.ca/english/helpcentre/results-by-topic.asp?top=2', 'https://www.cic.gc.ca/english/helpcentre/results-by-topic.asp?top=3']


In [None]:
#scraping the subpages
# For every topic page in `links`, follow each link found in the second
# 'clearfix' div, extract the text of the <div itemprop="text"> element and
# save it to `output_directory` as both a .txt and a .json file.
#
# Side effects: fills `sublinks` / `sublink_titles` (parallel lists) and
# writes two files per successfully scraped page. Failures on a single page
# are reported and skipped so one bad link does not stop the whole run.
BASE_URL = "https://www.cic.gc.ca/english/helpcentre/"
REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely without one

sublinks = []
sublink_titles = []
for link in links:
    # Distinct names (topic_*) keep this cell from overwriting the `soup`,
    # `response` and `section` objects that the front-page cell relies on.
    try:
        topic_response = requests.get(link, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        continue
    print(topic_response.status_code)

    topic_soup = BeautifulSoup(topic_response.text, 'lxml')
    topic_sections = topic_soup.find_all('div', class_='clearfix')
    if len(topic_sections) < 2:
        # The link list is read from the second 'clearfix' div; without it
        # there is nothing to follow on this page.
        print(f"An error occurred: no link list found on {link}")
        continue

    # href=True skips anchors that carry no href attribute at all.
    for a_element in topic_sections[1].find_all('a', href=True):
        sublink_title = a_element.text
        # urljoin resolves relative hrefs against the help-centre base and
        # leaves absolute hrefs untouched, so the URL that is requested and
        # the URL recorded as 'Source' are always the same valid address.
        sublink_url = urljoin(BASE_URL, a_element['href'])
        # Replace characters that are not allowed in file names.
        sanitized_title = re.sub(r'[<>:"/\\|?*]', '_', sublink_title)
        sublinks.append(sublink_url)
        sublink_titles.append(sublink_title)
        print(sublink_title)

        try:
            sub_response = requests.get(sublink_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"An error occurred: {e}")
            continue

        sub_soup = BeautifulSoup(sub_response.text, 'lxml')
        sub_section = sub_soup.find('div', itemprop='text')
        if sub_section is None:
            # Page has no content container; nothing to save.
            continue

        text_data = sub_section.text
        json_data = {
            'Title' : sublink_title,
            'Source' : sublink_url,
            'Content' : text_data,
            'DateOfScrapping' : datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }

        try:
            txt_file_path = os.path.join(output_directory, f'{sanitized_title}.txt')
            json_file_path = os.path.join(output_directory, f'{sanitized_title}.json')

            with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
                txt_file.write(text_data)

            with open(json_file_path, 'w', encoding='utf-8') as json_file:
                json.dump(json_data, json_file, indent=4)
        except FileNotFoundError as e:
            print(f"Error: {e}")
        except Exception as e:
            print(f"An error occurred: {e}")
