In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from time import time

from bs4 import BeautifulSoup
import re

In [2]:
# Define a function to start chrome with a given link address
def startBrowser(link, headless=False):
    '''
    Start a Chrome browser with Selenium. 

    This function takes a link or url and opens that in a browser with Selenium. 
    
    It has the option of opening the browser as headless (which means that there is no Graphical User Interface but all 
    capabilities are available therein.)

    
    '''

    # Open Browser in the background, with no display
    if headless == True:

        # Print information for script about the task being undertaken
        print("-> Starting Chrome Headless")


        # Set up options to start chrome with
        settings = webdriver.ChromeOptions()

        # Ignore certificate errors from chrome
        settings.add_argument('--ignore-certificate-errors')

        #  Allow any type-safe setter from Chrome which are currently not encoded from Selenium side 
        settings.add_experimental_option('excludeSwitches', ['enable-logging'])

        # Open the browser in Incognito
        settings.add_argument('--incognito')

        # Open the browser headless
        settings.add_argument('--headless')

        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=settings) 
        
        driver.get(link)

        print('-> Headless browser opened and ready to use')

        return driver


    else:

        # Print information for script about the task being undertaken
        print("-> Starting Chrome")

        # Set a driver for the chrome startup
        driver = webdriver.Chrome()

        # Make the driver navigate to the link provided in function input
        driver.get(link)

        # Maximize the window for easier work
        driver.maximize_window()

        # Print message about browser being ready to use
        print("-> Browser Ready to Use")

        # Return the driver which can be used for the operating the browser through Selenium in later code
        return driver


# ---------------------------------------------------------------------------------------------------------------------------------
# driver.quit() :
# The quit() method quits the driver, closing every associated window. 

# driver.close() : 
# The close() method closes the currently focused window, quitting the driver if the current window is the only open window.
# ---------------------------------------------------------------------------------------------------------------------------------


def quitBrowser():

    # Print Process
    print("-> Quitting Browser")

    # Quit the browser window
    driver.quit()

    # Print Process
    print("-> Browser Quit")


def closeBrowser():

    # Print Process
    print("-> Closing Browser")

    # Quit the browser window
    driver.close()

    # Print Process
    print("-> Browser Closed")



In [3]:
def initiateParser(driver):
    """
    Initiates the Parser for Beautiful Soup
    
    """

    # Get the html page source from the driver
    html_page = driver.page_source

    # Use Beautiful Soup to parse through the html page
    parser = BeautifulSoup(html_page, 'html.parser')

    # Return the Parser
    return parser



In [4]:
def checkLastPage(parser):

    """
    Check the total number of pages at the bottom of the main page. 
    This function uses the driver and Beautiful Soup to find the total number of pages with questions in it for leetcode. 

    Libraries used: Beautiful Soup, Regex, Selenium
    """

    # Find the buttons at the bottom of the page which contains the total number of pages
    page_list = parser.find_all('button', class_='flex items-center justify-center px-3 h-8 rounded select-none focus:outline-none bg-fill-3 dark:bg-dark-fill-3 text-label-2 dark:text-dark-label-2 hover:bg-fill-2 dark:hover:bg-dark-fill-2')

    # Find the last page with a number associated with it.
    # Note this will be the second last page as the last page is just a button by itself
    last_page = page_list[-2]

    # Extract the last 12 digits of the page which should ensure all 3 digit numbers showing up
    number = str(last_page)[-13:]

    # Use Regex to find all the numbers in the last 13 digits. 
    # Note: Taking -1 so as to ensure that you take the last number associated with the page
    last_number = re.findall(r'\d+', number)[-1]

    # Print the highest page number
    print(f"The last page number is {int(last_number)}")

    # Return the last page number so it can be used later
    return int(last_number)

In [8]:
def questionList(parser):
    '''
    -------
    INPUT:
    -------

    This function takes the result of the initiateParser() function and uses it to find all the tables on the page. 
    It parses through all the tables and finds the table with the most rows within it. 
    This table will the one with all the questions for later parsing. 
    -----------------------------------------------------------------------------------------------------------------

    -------
    OUTPUT:
    -------

    The output of the function will be the html output for the table with all the questions within it

    
    -----------------
    Example Use Case:
    -----------------

    current_parser = initiateParser(driver)

    q_list = questionList(current_parser)

    '''

    # Find all the tables on the page (defined as role = 'rowgroup')
    q_table = parser.find_all('div', role='rowgroup')

    # Initialise an empty list to store information later
    lengths = []

    # Start a counter
    i = 0

    # For each table found on the page
    for table in range(len(q_table)):

        # Start a minimum length of table counter as 0
        length = 0

        # Check the length of the table
        new_length = len(q_table[i])

        # If the current table length is higher than previous table length
        if new_length > length:

            # Add the length of the new table in the list created previously
            lengths.append(new_length)

            # The new base length is now the length of the current highest table
            length = new_length

        # Increase the counter by 1 to check next table index
        i +=1

    # Extract the table with the highest length (which will be in the last place of the list hence -1)
    rows_in_table = lengths[-1]

    # Find the index of the table with the highest length of rows. Subtract 1 as it is 0 indexed
    table_index = i -1 

    # The list of questions will be in the table with the highest length, which is extracted here. 
    q_list = q_table[table_index].find_all('div', role='row')[0:rows_in_table]

    # Return Question List
    return q_list

---
Testing Application

---

In [25]:
quitBrowser()

-> Quitting Browser
-> Browser Quit


In [5]:
# Open Leetcode
driver = startBrowser("https://leetcode.com/", headless=False)

-> Starting Chrome
-> Browser Ready to Use


In [6]:
# Timeout in seconds to allow page to load completely
delay = 5 


# Click the "Get Started" link to get into the Overall Questions page
try:
    # Wait for the element with the ID of wrapper
    element = WebDriverWait(driver, delay).until(
      EC.presence_of_element_located((By.CLASS_NAME, "link"))
    )
    element.click()
except:
    quitBrowser()



# Click the "Problems" link to get into the problems page
try:
    # Wait for the element with the ID of wrapper
    element = WebDriverWait(driver, delay).until(
        EC.presence_of_element_located((By.XPATH,'//*[@id="leetcode-navbar"]/div[1]/ul/li[2]/a'))).click()
except:
    quitBrowser()


In [7]:
# Use the driver to initiate the parse
parser = initiateParser(driver)

In [8]:
# See how many pages of problems are in the set
last_page = checkLastPage(parser)

The last page number is 67


In [10]:
q_list = questionList(parser)

In [16]:
def index_question_difficulty(question_list):
    '''
    -------
    INPUT:
    -------

    This function takes the result of the questionList() function and uses it to find all the questions (and related information) on the page. 
    It parses through all the questions and finds the index number, question name and difficulty level of the questions.  
    -----------------------------------------------------------------------------------------------------------------

    -------
    OUTPUT:
    -------

    The output of the function will be the  3 lists:
    1. Index list
    2. Question name list
    3. Question difficulty list
    
    -----------------
    Example Use Case:
    -----------------

    q_list = questionList(current_parser)

    index_list, question_list, difficulty_list = index_question_difficulty(question_list = q_list)

    '''
    # Extracting Question number and Question string from the q_list. 

    # Create an empty dictionary
    question_list = []
    index_list = []
    difficulty_list = []

    # For row in the question list
    for question in q_list:

        # Find all the rows within the question list
        row = question.find_all('div', role='cell')

        # Isolates the section where name and index will be found 
        row_info = row[1].find('a').text 

        # Isolates the section where difficulty will be found
        row_info2 = row[4].find('span').text 


        # Create Regex patterns to match everything before and after the first period
        pattern_before = r'^(.*?)\.'
        pattern_after = r'\.(.*)'

        # Find all matches before a period using regex
        matches_before = re.findall(pattern_before, row_info)

        # Turn the result into an integer
        matches_before = int(matches_before[0])

        # Find all matches after a period using regex
        matches_after = re.findall(pattern_after, row_info)

        # Remove all the leading and trailing spaces from the result
        matches_after = matches_after[0].strip()

        # Add results to lists
        index_list.append(matches_before)
        question_list.append(matches_after)
        difficulty_list.append(row_info2)

    # Return the results
    return index_list, question_list, difficulty_list



In [18]:
index_list, question_list, difficulty_list = index_question_difficulty(question_list=q_list)

In [5]:
def dataScraper():
    
    last = checkLastPage(driver)

    delay = 5
    
    try:    
        # Wait for the element with the ID of wrapper
        element = WebDriverWait(driver, delay).until(
        EC.presence_of_element_located((By.CLASS_NAME, "link"))
        )
        element.click()

    except:
        


SyntaxError: incomplete input (3275129531.py, line 15)

In [10]:
import numpy as np
import pandas as pd


# Create lists for storing scraped data
titleList = []
solutionList = []
acceptanceList = []
difficultyList = []

# Element for each row of table
# <div role="row" class="odd:bg-layer-1 even:bg-overlay-1 dark:odd:bg-dark-layer-bg dark:even:bg-dark-fill-4" style="display: flex; flex: 1 0 auto; min-width: 0px;"><div role="cell" class="mx-2 flex items-center py-[11px]" style="box-sizing: border-box; flex: 52 0 auto; min-width: 0px; width: 52px;"></div><div role="cell" class="mx-2 flex items-center py-[11px]" style="box-sizing: border-box; flex: 260 0 auto; min-width: 0px; width: 260px;"><div class="max-w-[302px] flex items-center overflow-hidden"><div class="overflow-hidden"><div class="flex items-center"><div class="truncate"><a href="/problems/two-sum" class="h-5 hover:text-blue-s dark:hover:text-dark-blue-s">1. Two Sum</a></div></div></div></div></div><div role="cell" class="mx-2 flex items-center py-[11px]" style="box-sizing: border-box; flex: 54 0 auto; min-width: 0px; width: 54px;"><a aria-label="solution" href="/problems/two-sum/solution" class="truncate"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="1em" height="1em" fill="currentColor" class="text-purple dark:text-dark-purple h-5 w-5"><path d="M10 15.464v-3.927a.8.8 0 011.259-.656l2.805 1.964a.8.8 0 010 1.31l-2.805 1.964A.8.8 0 0110 15.464z"></path><path d="M7 4a1 1 0 00-1 1v14a1 1 0 001 1h10a1 1 0 001-1V9h-3a2 2 0 01-2-2V4H7zm8 .6V7h1.92L15 4.6zM4 5a3 3 0 013-3h7.039a3 3 0 012.342 1.126l2.962 3.701A3 3 0 0120 8.702V19a3 3 0 01-3 3H7a3 3 0 01-3-3V5z"></path></svg></a></div><div role="cell" class="mx-2 flex items-center py-[11px]" style="box-sizing: border-box; flex: 100 0 auto; min-width: 0px; width: 100px;"><span>53.9%</span></div><div role="cell" class="mx-2 flex items-center py-[11px]" style="box-sizing: border-box; flex: 84 0 auto; min-width: 0px; width: 84px;"><span class="text-olive dark:text-dark-olive">Easy</span></div><div role="cell" class="mx-2 flex items-center py-[11px]" style="box-sizing: border-box; flex: 84 0 auto; min-width: 0px; width: 84px;"><a class="flex h-full w-full flex-row items-center" href="/subscribe/?ref=ps_fq" data-state="closed"><div class="flex h-full w-full flex-row items-center"><span class="h-2 flex-1 rounded-l-lg bg-fill-3 dark:bg-dark-fill-3"></span><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="1em" height="1em" fill="currentColor" class="text-gray-5 dark:text-gray-7 -mt-1.5 h-5 w-5"><path fill-rule="evenodd" d="M7 8v2H6a3 3 0 00-3 3v6a3 3 0 003 3h12a3 3 0 003-3v-6a3 3 0 00-3-3h-1V8A5 5 0 007 8zm8 0v2H9V8a3 3 0 116 0zm-3 6a2 2 0 100 4 2 2 0 000-4z" clip-rule="evenodd"></path></svg><span class="h-2 flex-1 rounded-r-lg bg-fill-3 dark:bg-dark-fill-3"></span></div></a></div></div>"



In [22]:
driver.quit()


In [11]:
# driver = startBrowser("https://leetcode.com/", headless=True)


-> Starting Chrome Headless
-> Headless browser opened and ready to use


In [10]:
print(driver.title)

Google


In [11]:
search = driver.find_element(By.NAME ,"q")
search.send_keys("scotiabank canada")
search.send_keys(Keys.RETURN)

In [13]:
# Timeout in seconds
delay = 5 

try:
    # Wait for the element with the ID of wrapper
    main = WebDriverWait(driver, delay).until(
      EC.presence_of_element_located((By.ID, "rso"))
    )
    

except:
    driver.quit()


# 

main = driver.find_element()


AttributeError: type object 'By' has no attribute 'name'

In [15]:
driver.quit()