In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from time import time

from bs4 import BeautifulSoup
import re

In [2]:
# ------------------------------------------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------------------------------------------
# Define a function to start chrome with a given link address
def startBrowser(link, headless=False):
    '''
    -------------
    DESCRIPTION:
    -------------
    This function starts a chrome browser window with Selenium, navigates to `link`
    and maximizes the window.


    -------
    INPUT:
    -------
    It contains 2 parameters:
    1. link
    2. headless

    link: takes a string which is the browser link which you want to open in the chrome window.

    headless: Takes a boolean value. Set by default to `False`, it gives the option of opening the chrome browser:
            a. With Graphical User Interface (headless=False), using Selenium's default Chrome setup, or
            b. Without GUI (headless=True), in incognito mode, ignoring certificate errors, with the
               driver binary installed through webdriver_manager.

            Note: Without GUI all the capabilities of scraping are still available as With GUI.

    -------
    OUTPUT:
    -------

    The output of the function will be a driver that can be used for giving commands to Selenium.
    Preferably label the output variable for this function as `driver`.

    If navigating to `link` or maximizing the window fails, the browser that was just started
    is quit and the original exception is re-raised, so no Chrome process is left behind.

    -----------------
    Example Use Case:
    -----------------

    # Starts a browser window with GUI
    driver = startBrowser(link="https://www.leetcode.com")

    # Starts a browser window without GUI
    driver = startBrowser(link="https://www.leetcode.com", headless=True)

    '''

    if headless:

        # Print information for script about the task being undertaken
        print("-> Starting Chrome Headless")

        # Set up options to start chrome with
        settings = webdriver.ChromeOptions()

        # Ignore certificate errors from chrome
        settings.add_argument('--ignore-certificate-errors')

        # Exclude Chrome's 'enable-logging' switch (commonly done to cut console log noise)
        settings.add_experimental_option('excludeSwitches', ['enable-logging'])

        # Open the browser in Incognito
        settings.add_argument('--incognito')

        # Open the browser headless
        settings.add_argument('--headless')

        # Set a driver for the chrome startup
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=settings)

        ready_message = '-> Headless Browser Opened and Ready to Use.'

    else:

        # Print information for script about the task being undertaken
        print("-> Starting Chrome")

        # Set a driver for the chrome startup
        driver = webdriver.Chrome()

        ready_message = "-> Browser Ready to Use."

    try:
        # Make the driver navigate to the link provided in function input
        driver.get(link)

        # Maximize the window for easier work
        driver.maximize_window()
    except BaseException:
        # The caller never receives the driver on failure (including a kernel interrupt),
        # so quit it here instead of leaking the browser process
        driver.quit()
        raise

    # Print message about browser being ready to use
    print(ready_message)

    # Return the driver which can be used for the operating the browser through Selenium in later code
    return driver

# ------------------------------------------------------------------------------------------------------------------------------------
# ---------------------------------------------------------------------------------------------------------------------------------
# driver.quit() :
# The quit() method quits the driver, closing every associated window. 

# driver.close() : 
# The close() method closes the currently focused window, quitting the driver if the current window is the only open window.
# ---------------------------------------------------------------------------------------------------------------------------------


def quitBrowser(driver_name):
    '''
    -------------
    DESCRIPTION:
    -------------
    Quits the Selenium driver, announcing the step before and after on standard output.


    -------
    INPUT:
    -------
    driver_name: the driver object to quit, e.g. the value returned by `startBrowser()`.

    -------
    OUTPUT:
    -------

    Nothing is returned. `quit()` is called on the given driver, and the messages
    "-> Quitting Browser" and "-> Browser Quit" are printed around that call.


    -----------------
    Example Use Case:
    -----------------

    driver = startBrowser(link="https://www.leetcode.com", headless=False)

    quitBrowser(driver_name=driver)
    '''
    # Announce the step, quit the driver, then confirm
    print("-> Quitting Browser")
    driver_name.quit()
    print("-> Browser Quit")


def closeBrowser(driver_name):
    '''
    -------------
    DESCRIPTION:
    -------------
    Closes the browser window/tab the driver is currently focused on, announcing the
    step before and after on standard output.


    -------
    INPUT:
    -------
    driver_name: the driver object to act on, e.g. the value returned by `startBrowser()`.

    -------
    OUTPUT:
    -------

    Nothing is returned. `close()` is called on the given driver, and the messages
    "-> Closing Browser" and "-> Browser Closed" are printed around that call.


    -----------------
    Example Use Case:
    -----------------

    driver = startBrowser(link="https://www.leetcode.com", headless=False)

    closeBrowser(driver_name=driver)
    '''
    # Announce the step, close the focused window, then confirm
    print("-> Closing Browser")
    driver_name.close()
    print("-> Browser Closed")


# ------------------------------------------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------------------------------------------

# Initiate the Beautiful Soup parser for the page currently loaded in the driver
def initiateParser(driver_name):
    """

    -------------
    DESCRIPTION:
    -------------
    Builds a Beautiful Soup parser from the driver's current page source.


    -------
    INPUT:
    -------
    driver_name: the driver object whose `page_source` is parsed, e.g. the value
    returned by `startBrowser()`.

    -------
    OUTPUT:
    -------

    A `BeautifulSoup` object created with the 'html.parser' backend. It is built from the
    page source as it is when this function is called, so call it again after navigating.


    -----------------
    Example Use Case:
    -----------------

    driver = startBrowser(link="https://www.leetcode.com", headless=False)

    parser = initiateParser(driver_name=driver)

    """
    # Parse the HTML the driver currently holds and hand the parser back
    return BeautifulSoup(driver_name.page_source, 'html.parser')



In [3]:
def checkLastPage(parser):

    """
    -------------
    DESCRIPTION:
    -------------
    Finds the highest page number shown in the pagination buttons of the parsed page.

    -------
    INPUT:
    -------
    parser: the result of `initiateParser()` for the page that holds the pagination buttons.

    -------
    OUTPUT:
    -------
    The last page number as an integer. It is also printed.

    Raises IndexError when fewer than two pagination buttons are found, or when the
    second-to-last button contains no number.

    Libraries used: Beautiful Soup, Regex
    """

    # Find the buttons at the bottom of the page which contains the total number of pages
    page_list = parser.find_all('button', class_='flex items-center justify-center px-3 h-8 rounded select-none focus:outline-none bg-fill-3 dark:bg-dark-fill-3 text-label-2 dark:text-dark-label-2 hover:bg-fill-2 dark:hover:bg-dark-fill-2')

    # The last numbered button is the second-to-last one, because the final button
    # is just a button by itself without a page number
    if len(page_list) < 2:
        raise IndexError(
            f"Expected at least 2 pagination buttons but found {len(page_list)}"
        )
    last_page = page_list[-2]

    # Read the numbers from the button's text rather than from a fixed-width slice of its
    # HTML, so page numbers of any length are read in full
    numbers = re.findall(r'\d+', last_page.get_text())
    if not numbers:
        raise IndexError("The second-to-last pagination button does not contain a page number")

    # Take the last number found in the button text
    last_number = int(numbers[-1])

    # Print the highest page number
    print(f"The last page number is {last_number}")

    # Return the last page number so it can be used later
    return last_number

In [4]:
def questionList(parser):
    '''
    -------
    INPUT:
    -------

    This function takes the result of the initiateParser() function and uses it to find all the tables on the page
    (div elements with role='rowgroup').
    It picks the table with the highest length (`len(table)`); on a tie the first such table wins.
    This table will be the one with all the questions for later parsing.
    -----------------------------------------------------------------------------------------------------------------

    -------
    OUTPUT:
    -------

    A list of the div elements with role='row' found inside the largest table, capped at that
    table's length. An empty list is returned when the page has no tables.


    -----------------
    Example Use Case:
    -----------------

    current_parser = initiateParser(driver)

    q_list = questionList(current_parser)

    '''

    # Find all the tables on the page (defined as role = 'rowgroup')
    q_table = parser.find_all('div', role='rowgroup')

    # Nothing to pick from when the page has no tables
    if not q_table:
        return []

    # Pick the table with the highest length
    largest_table = max(q_table, key=len)

    # The list of questions is in that table; keep at most as many rows as its length
    q_list = largest_table.find_all('div', role='row')[:len(largest_table)]

    # Return Question List
    return q_list

In [5]:
def index_question_difficulty(question_list):
    '''
    -------
    INPUT:
    -------

    This function takes the result of the questionList() function: an iterable of question rows.
    For each row it reads the text of the link in the second cell (expected in the form
    "<index>. <name>") and the text of the span in the fifth cell (the difficulty).
    -----------------------------------------------------------------------------------------------------------------

    -------
    OUTPUT:
    -------

    The output of the function will be the 3 lists, in row order:
    1. Index list (integers, the text before the first period)
    2. Question name list (the text after the first period, stripped of surrounding spaces)
    3. Question difficulty list

    An IndexError is raised for a row whose link text contains no period or which has
    fewer than five cells.

    -----------------
    Example Use Case:
    -----------------

    q_list = questionList(current_parser)

    index_list, question_list, difficulty_list = index_question_difficulty(question_list = q_list)

    '''
    # Regex patterns to match everything before and after the first period
    pattern_before = r'^(.*?)\.'
    pattern_after = r'\.(.*)'

    # Result lists; the names list is kept separate from the `question_list` input
    index_list = []
    name_list = []
    difficulty_list = []

    # Iterate over the rows passed in by the caller, not over a notebook global
    for question in question_list:

        # Find all the cells within the row
        cells = question.find_all('div', role='cell')

        # The second cell holds the link whose text carries the index and name
        title_text = cells[1].find('a').text

        # The fifth cell holds the span with the difficulty
        difficulty_text = cells[4].find('span').text

        # Text before the first period, as an integer
        index_list.append(int(re.findall(pattern_before, title_text)[0]))

        # Text after the first period, without leading and trailing spaces
        name_list.append(re.findall(pattern_after, title_text)[0].strip())

        difficulty_list.append(difficulty_text)

    # Return the results
    return index_list, name_list, difficulty_list



---
Testing Application

---

In [None]:
# Clean up a browser left over from an earlier run; skipped on a fresh kernel where no driver exists yet.
# quitBrowser() requires the driver as its argument.
if 'driver' in globals():
    quitBrowser(driver)

In [6]:
# Start Chrome with its GUI and load the LeetCode landing page
driver = startBrowser(link="https://www.leetcode.com/", headless=False)

-> Starting Chrome
-> Browser Ready to Use.


In [7]:
# Timeout in seconds to allow page to load completely
delay = 5

# Links to click, in order, to get from the landing page to the problems page
navigation_locators = [
    # The "Get Started" link to get into the Overall Questions page
    (By.CLASS_NAME, "link"),
    # The "Problems" link to get into the problems page
    (By.XPATH, '//*[@id="leetcode-navbar"]/div[1]/ul/li[2]/a'),
]

try:
    for locator in navigation_locators:
        # Wait until the link can actually be clicked, then click it
        WebDriverWait(driver, delay).until(EC.element_to_be_clickable(locator)).click()
except Exception:
    # Close the browser, then surface the failure so later cells do not run against a dead session
    quitBrowser(driver)
    raise


In [8]:
# Parse the page the driver is currently showing
parser = initiateParser(driver_name=driver)

In [9]:
# Read the highest page number from the pagination buttons of the parsed page
last_page = checkLastPage(parser=parser)

The last page number is 67


In [10]:
# Collect the question rows from the parsed page
q_list = questionList(parser=parser)

In [11]:
# Split the question rows into index numbers, names and difficulty labels
index_list, question_list, difficulty_list = index_question_difficulty(q_list)

In [12]:
# Display the difficulty labels returned by index_question_difficulty()
difficulty_list


['Hard',
 'Easy',
 'Medium',
 'Medium',
 'Hard',
 'Medium',
 'Medium',
 'Medium',
 'Medium',
 'Easy',
 'Hard',
 'Medium',
 'Medium',
 'Easy',
 'Easy',
 'Medium',
 'Medium',
 'Medium',
 'Medium',
 'Medium',
 'Easy',
 'Easy',
 'Medium',
 'Hard',
 'Medium',
 'Hard',
 'Easy',
 'Easy',
 'Easy',
 'Medium',
 'Hard',
 'Medium',
 'Hard',
 'Medium',
 'Medium',
 'Easy',
 'Medium',
 'Hard',
 'Medium',
 'Medium',
 'Medium',
 'Hard',
 'Hard',
 'Medium',
 'Hard',
 'Medium',
 'Medium',
 'Medium',
 'Medium',
 'Medium',
 'Medium']

In [13]:
# Display the question names returned by index_question_difficulty()
question_list

['Parsing A Boolean Expression',
 'Two Sum',
 'Add Two Numbers',
 'Longest Substring Without Repeating Characters',
 'Median of Two Sorted Arrays',
 'Longest Palindromic Substring',
 'Zigzag Conversion',
 'Reverse Integer',
 'String to Integer (atoi)',
 'Palindrome Number',
 'Regular Expression Matching',
 'Container With Most Water',
 'Integer to Roman',
 'Roman to Integer',
 'Longest Common Prefix',
 '3Sum',
 '3Sum Closest',
 'Letter Combinations of a Phone Number',
 '4Sum',
 'Remove Nth Node From End of List',
 'Valid Parentheses',
 'Merge Two Sorted Lists',
 'Generate Parentheses',
 'Merge k Sorted Lists',
 'Swap Nodes in Pairs',
 'Reverse Nodes in k-Group',
 'Remove Duplicates from Sorted Array',
 'Remove Element',
 'Find the Index of the First Occurrence in a String',
 'Divide Two Integers',
 'Substring with Concatenation of All Words',
 'Next Permutation',
 'Longest Valid Parentheses',
 'Search in Rotated Sorted Array',
 'Find First and Last Position of Element in Sorted Array',

In [14]:
# Display the question index numbers returned by index_question_difficulty()
index_list

[1106,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50]

In [15]:
# Quit the driver, closing every window associated with it
driver.quit()


In [None]:
# driver = startBrowser("https://www.leetcode.com/", headless=True)


In [33]:
# Get to next page
# NOTE(review): in file order this cell follows driver.quit() and the restart cell is commented out;
# it needs a live `driver` and the `delay` defined in an earlier cell.
try:
    # Wait until the pagination button can actually be clicked, then click it
    WebDriverWait(driver, delay).until(
        EC.element_to_be_clickable((By.XPATH,'//*[@id="__next"]/div[1]/div[4]/div[2]/div[1]/div[4]/div[3]/nav/button[10]'))).click()
except Exception:
    # Close the browser, then surface the failure instead of hiding it
    quitBrowser(driver)
    raise

In [None]:
# Print the title of the page the driver is currently on
# NOTE(review): in file order this runs after driver.quit() with the restart cell commented out — confirm a live session exists
print(driver.title)

In [11]:
# Find the element named "q", type a query into it and submit it with the RETURN key
# NOTE(review): the "q" field and the query text look unrelated to the LeetCode pages used above — confirm this cell is still needed
search = driver.find_element(By.NAME ,"q")
search.send_keys("scotiabank canada")
search.send_keys(Keys.RETURN)

In [None]:
# Timeout in seconds
delay = 5

try:
    # Wait for the element with the ID "rso" to be present and keep a handle on it
    main = WebDriverWait(driver, delay).until(
      EC.presence_of_element_located((By.ID, "rso"))
    )
except Exception:
    # Quit the browser, then surface the failure instead of continuing with a dead session
    driver.quit()
    raise

# `main` already holds the located element; the former locator-less driver.find_element()
# call that overwrote it has been removed.


In [15]:
# Quit the driver, closing every window associated with it
driver.quit()