In [None]:
import os
import re

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

In [None]:
# Login credentials are read from the environment (None when a variable is unset)
USERNAME, PASSWORD = (
    os.environ.get(var_name)
    for var_name in ('GCCAMPUS_USERNAME', 'GCCAMPUS_PASSWORD')
)

# Toggle: True scrapes the FR descriptions, False the EN ones
FRENCH = False

In [None]:
# Start the Selenium-driven Chrome session; `browser` is reused by the cells below
browser = webdriver.Chrome()

In [None]:
# Navigate to GCcampus and log in.
# Fail fast with a readable message when the credentials are missing, instead
# of letting Selenium fail later on a None value passed to send_keys().
if not USERNAME or not PASSWORD:
    raise RuntimeError(
        'Missing credentials: set the GCCAMPUS_USERNAME and GCCAMPUS_PASSWORD '
        'environment variables before running this notebook.'
    )

# The login page differs by language
if FRENCH:
    main_url = 'https://idp.csps-efpc.gc.ca/idp/login-fr.jsp'
else:
    main_url = 'https://idp.csps-efpc.gc.ca/idp/Authn/UserPassword'

browser.get(main_url)
# find_element(By.<strategy>, value) is used because the find_element_by_*
# helpers were removed in Selenium 4.3; this form also works on Selenium 3.
browser.find_element(By.ID, 'j_username').send_keys(USERNAME)
browser.find_element(By.ID, 'j_password').send_keys(PASSWORD)
# Tick the 'cbPrivacy' checkbox (presumably the privacy-notice acknowledgement
# required by the form -- confirm against the page) before submitting.
browser.find_element(By.ID, 'cbPrivacy').click()
browser.find_element(By.XPATH, "//button[@type='submit']").click()

In [None]:
%%time
# Loop through catalogue and get all links to courses
if FRENCH:
    list_url = 'https://learn-apprendre.csps-efpc.gc.ca/application/fr/courses-solr?page='
else:
    list_url = 'https://learn-apprendre.csps-efpc.gc.ca/application/en/courses-solr?page='

course_links = []
for i in range(1): # Limit of ?page=41 in EN (therefore use range(42)); ?page=41 in FR
    browser.get(list_url + str(i))
    mars = browser.find_elements_by_css_selector('.field-items a')
    for elem in mars:
        course_links.append(elem.get_attribute('href'))

In [None]:
# Course-code matcher: one letter followed by three digits, optionally
# followed by a " - MODULE <digit>" suffix
course_code_pattern = r'[a-zA-Z]{1}\d{3}(?:\s{1}-\s{1}MODULE\s{1}\d{1})?'
regex = re.compile(course_code_pattern)

In [None]:
%%time
# For each link in 'course_links', navigate to page, grab course description (HTML
# tags included), search for course code, and save to 'desc_dict'
desc_dict = {}
for link in course_links:
    browser.get(link)
    # Grab description
    desc = browser.find_elements_by_css_selector('.field-item[property="content:encoded"]')[0].get_attribute('innerHTML')
    # Grab title and extract course code
    title = browser.find_elements_by_css_selector('.page-title')[0].get_attribute('innerHTML')
    title_search = regex.findall(title)
    pkey = title_search[0] if title_search else link
    desc_dict[pkey] = desc

In [None]:
# Write 'desc_dict' out as a CSV (one row per key), named after the scraped language
lang_suffix = 'fr' if FRENCH else 'en'
csv_name = 'scraped_{0}.csv'.format(lang_suffix)
df = pd.DataFrame.from_dict(desc_dict, orient='index')
df.to_csv(csv_name, sep=',', encoding='utf-8')