# Scraping Express Entry – Check your application status: https://www.canada.ca/en/immigration-refugees-citizenship/services/immigrate-canada/express-entry/apply-permanent-residence/check-your-status.html

In [26]:
#importing libraries
import os
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
import re

In [27]:
# Browser-like request headers sent along with the page request.
new_headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
    }

url = "https://www.canada.ca/en/immigration-refugees-citizenship/services/immigrate-canada/express-entry/apply-permanent-residence/check-your-status.html"

# requests has no default timeout; without one, a stalled connection would
# block this cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

response = requests.get(url, headers=new_headers, timeout=REQUEST_TIMEOUT_SECONDS)
print(response.status_code)
# Stop here on a 4xx/5xx answer instead of parsing an error page and failing
# later with an unrelated error in the parsing cells.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')


200


In [28]:
# Collect every <div> carrying the generic-content class string used on the page.
CONTENT_DIV_CLASS = 'mwsgeneric-base-html parbase section'
section = soup.find_all('div', class_=CONTENT_DIV_CLASS)

# The second match (index 1) is the block that starts with
# "Check your application status online" (see the printed output below).
status_block = section[1]

# Keep only headings, paragraphs and lists.
# NOTE(review): class_='' presumably limits the result to tags without a
# class attribute - confirm against the BeautifulSoup documentation.
WANTED_TAG_NAMES = ['h2', 'h3', 'p', 'ul']
tags = status_block.find_all(WANTED_TAG_NAMES, class_= '')
print(tags)

[<h2>Check your application status online</h2>, <p><strong>Starting February 8, 2023</strong>, use the <a href="https://ircc-tracker-suivi.apps.cic.gc.ca/en" target="_blank">application status tracker<span class="wb-inv">(opens in a new tab)</span></a> <span aria-hidden="true" class="glyphicon glyphicon-new-window"></span> to get the most up-to-date and detailed information about the status of your application.</p>, <p>You have <strong>60 days</strong> after you’ve been invited to apply to submit your application for permanent residence.</p>, <p>Once you have submitted your application, you can check its status in <a href="/en/immigration-refugees-citizenship/services/application/account.html">your account</a>.</p>, <ul>
<li>Go to “View the applications you submitted”</li>
<li>Click “Application status and messages”</li>
</ul>, <p>On the “Application status and messages” page, you’re able to see</p>, <ul>
<li>the overall status of your application</li>
<li>the current status of each ap

In [30]:
# Group the text of <p>/<ul> tags under the heading that precedes them.
#
# Result: text_data is a list of {'subheading': str, 'content': str} dicts,
# one per tracked heading, in the order the headings occur in `tags`.
#
# Content is written to the entry that is currently open rather than to a
# fixed list index, so a missing or reordered heading can no longer send text
# to the wrong section or raise IndexError.

# <h2> headings that open a section; any other <h2> ends the scrape.
TRACKED_H2_HEADINGS = ('Check your application status online', 'Processing times')
# <h3> headings that open a section; any other <h3> is ignored and its
# paragraphs keep flowing into the section that is already open.
TRACKED_H3_HEADINGS = ('Causes of processing delays', 'Keep your information up to date')

text_data = []
current_entry = None  # entry receiving content; None until a tracked heading is seen
for tag in tags:
    if tag.name == 'h2':
        if tag.text not in TRACKED_H2_HEADINGS:
            # First untracked <h2> marks the end of the part we collect.
            break
        current_entry = {
            'subheading' : tag.text,
            'content' : ''
        }
        text_data.append(current_entry)
    elif tag.name == 'h3':
        if tag.text in TRACKED_H3_HEADINGS:
            current_entry = {
                'subheading' : tag.text,
                'content' : ''
            }
            text_data.append(current_entry)
    elif tag.name in ('p', 'ul'):
        # Text found before the first tracked heading is skipped.
        if current_entry is not None:
            # Text is concatenated as-is; no separator is inserted between tags.
            current_entry['content'] += tag.text

# Export the scraped sections as a JSON file inside ./Output.

# Output goes next to wherever the notebook is being run from.
current_directory = os.getcwd()

# Create the output directory if it does not exist yet.
output_directory = os.path.join(current_directory, "Output")
os.makedirs(output_directory, exist_ok=True)

json_data = {
    "Title": "Express Entry: Check your application status",
    "Source(link)": url,
    # Timestamp from datetime.now(): local time, no timezone information.
    "Date": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    "subheadings": text_data
}

json_file_path = os.path.join(output_directory, 'Express Entry Check your application status.json')

with open(json_file_path, 'w', encoding='utf-8') as json_file:
    # ensure_ascii=False writes the page's typographic quotes as real UTF-8
    # characters instead of \uXXXX escapes; the parsed JSON is the same.
    json.dump(json_data, json_file, indent=4, ensure_ascii=False)