In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from time import time

from bs4 import BeautifulSoup
import re

In [2]:
# Define a function to start chrome with a given link address
def startBrowser(link, headless=False):
    '''
    Start a Chrome browser with Selenium. 

    This function takes a link or url and opens that in a browser with Selenium. 
    
    It has the option of opening the browser as headless (which means that there is no Graphical User Interface but all 
    capabilities are available therein.)

    
    '''

    # Open Browser in the background, with no display
    if headless == True:

        # Print information for script about the task being undertaken
        print("-> Starting Chrome Headless")


        # Set up options to start chrome with
        settings = webdriver.ChromeOptions()

        # Ignore certificate errors from chrome
        settings.add_argument('--ignore-certificate-errors')

        #  Allow any type-safe setter from Chrome which are currently not encoded from Selenium side 
        settings.add_experimental_option('excludeSwitches', ['enable-logging'])

        # Open the browser in Incognito
        settings.add_argument('--incognito')

        # Open the browser headless
        settings.add_argument('--headless')

        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=settings) 
        
        driver.get(link)

        print('-> Headless browser opened and ready to use')

        return driver


    else:

        # Print information for script about the task being undertaken
        print("-> Starting Chrome")

        # Set a driver for the chrome startup
        driver = webdriver.Chrome()

        # Make the driver navigate to the link provided in function input
        driver.get(link)

        # Maximize the window for easier work
        driver.maximize_window()

        # Print message about browser being ready to use
        print("-> Browser Ready to Use")

        # Return the driver which can be used for the operating the browser through Selenium in later code
        return driver


# ---------------------------------------------------------------------------------------------------------------------------------
# driver.quit() :
# The quit() method quits the driver, closing every associated window. 

# driver.close() : 
# The close() method closes the currently focused window, quitting the driver if the current window is the only open window.
# ---------------------------------------------------------------------------------------------------------------------------------


def quitBrowser():

    # Print Process
    print("-> Quitting Browser")

    # Quit the browser window
    driver.quit()

    # Print Process
    print("-> Browser Quit")


def closeBrowser():

    # Print Process
    print("-> Closing Browser")

    # Quit the browser window
    driver.close()

    # Print Process
    print("-> Browser Closed")



In [3]:
def initiateParser(driver):
    """
    Initiates the Parser for Beautiful Soup
    
    """

    # Get the html page source from the driver
    html_page = driver.page_source

    # Use Beautiful Soup to parse through the html page
    parser = BeautifulSoup(html_page, 'html.parser')

    # Return the Parser
    return parser



In [4]:
def checkLastPage(parser):

    """
    Check the total number of pages at the bottom of the main page. 
    This function uses the driver and Beautiful Soup to find the total number of pages with questions in it for leetcode. 

    Libraries used: Beautiful Soup, Regex, Selenium
    """

    # Find the buttons at the bottom of the page which contains the total number of pages
    page_list = parser.find_all('button', class_='flex items-center justify-center px-3 h-8 rounded select-none focus:outline-none bg-fill-3 dark:bg-dark-fill-3 text-label-2 dark:text-dark-label-2 hover:bg-fill-2 dark:hover:bg-dark-fill-2')

    # Find the last page with a number associated with it.
    # Note this will be the second last page as the last page is just a button by itself
    last_page = page_list[-2]

    # Extract the last 12 digits of the page which should ensure all 3 digit numbers showing up
    number = str(last_page)[-13:]

    # Use Regex to find all the numbers in the last 13 digits. 
    # Note: Taking -1 so as to ensure that you take the last number associated with the page
    last_number = re.findall(r'\d+', number)[-1]

    # Print the highest page number
    print(f"The last page number is {int(last_number)}")

    # Return the last page number so it can be used later
    return int(last_number)

In [5]:
def dataScraper():
    
    last = checkLastPage(driver)

    delay = 5
    
    try:    
        # Wait for the element with the ID of wrapper
        element = WebDriverWait(driver, delay).until(
        EC.presence_of_element_located((By.CLASS_NAME, "link"))
        )
        element.click()

    except:
        


SyntaxError: incomplete input (3275129531.py, line 15)

---
Testing Application

---

In [21]:
quitBrowser()

-> Quitting Browser
-> Browser Quit


In [5]:
# Open Leetcode
driver = startBrowser("https://leetcode.com/", headless=False)

-> Starting Chrome
-> Browser Ready to Use


In [6]:
# Timeout in seconds to allow page to load completely
delay = 5 


# Click the "Get Started" link to get into the Overall Questions page
try:
    # Wait for the element with the ID of wrapper
    element = WebDriverWait(driver, delay).until(
      EC.presence_of_element_located((By.CLASS_NAME, "link"))
    )
    element.click()
except:
    quitBrowser()



# Click the "Problems" link to get into the problems page
try:
    # Wait for the element with the ID of wrapper
    element = WebDriverWait(driver, delay).until(
        EC.presence_of_element_located((By.XPATH,'//*[@id="leetcode-navbar"]/div[1]/ul/li[2]/a'))).click()
except:
    quitBrowser()


In [7]:
# Use the driver to initiate the parse
parser = initiateParser(driver)

In [8]:
# See how many pages of problems are in the set
last_page = checkLastPage(parser)

The last page number is 67


In [20]:
# Checking the table lengths on the page to find the longest table, as that will contain the questions. 
q_table = parser.find_all('div', role='rowgroup')
print(f'The total number of tables on the page are {len(q_table)}.')


lengths = []
i = 0
for table in range(len(q_table)):
    print(f'Table {i} has {len(q_table[i])} rows')
    lengths.append(len(q_table[i]))
    i+=1

for i, j in enumerate(lengths):
    print(i, j)
    


The total number of tables on the page are 3.
Table 0 has 0 rows
Table 1 has 0 rows
Table 2 has 51 rows
0 0
1 0
2 51


In [46]:
#TESTING whether q_test1 contains questions
print(q_test1[1])

q_list = q_table[2].find_all('div', role='row')[1:52]




for question in q_list:
    row = question.find_all('div', role='cell')
    row_info = row[1].find('a').text
    print(row_info)

<div class="odd:bg-layer-1 even:bg-overlay-1 dark:odd:bg-dark-layer-bg dark:even:bg-dark-fill-4" role="row" style="display: flex; flex: 1 0 auto; min-width: 0px;"><div class="mx-2 flex items-center py-[11px]" role="cell" style="box-sizing: border-box; flex: 52 0 auto; min-width: 0px; width: 52px;"></div><div class="mx-2 flex items-center py-[11px]" role="cell" style="box-sizing: border-box; flex: 260 0 auto; min-width: 0px; width: 260px;"><div class="max-w-[302px] flex items-center overflow-hidden"><div class="overflow-hidden"><div class="flex items-center"><div class="truncate"><a class="h-5 hover:text-blue-s dark:hover:text-dark-blue-s" href="/problems/two-sum">1. Two Sum</a></div></div></div></div></div><div class="mx-2 flex items-center py-[11px]" role="cell" style="box-sizing: border-box; flex: 54 0 auto; min-width: 0px; width: 54px;"><a aria-label="solution" class="truncate" href="/problems/two-sum/solution"><svg class="text-purple dark:text-dark-purple h-5 w-5" fill="currentColo

In [25]:
print(q_test[0])

<div role="row" style="display: flex; flex: 1 0 auto; min-width: 0px;"><div class="mx-2 py-[11px] font-normal text-label-3 dark:text-dark-label-3" colspan="1" role="columnheader" style="box-sizing: border-box; flex: 52 0 auto; min-width: 0px; width: 52px;"><div class="flex items-center justify-between"><div class="overflow-hidden text-ellipsis">Status</div></div></div><div class="mx-2 py-[11px] font-normal text-label-3 dark:text-dark-label-3 hover:text-gray-7 dark:hover:text-dark-gray-7 group" colspan="1" role="columnheader" style="box-sizing: border-box; flex: 260 0 auto; min-width: 0px; width: 260px; cursor: pointer;"><div class="flex items-center justify-between"><div class="overflow-hidden text-ellipsis">Title</div><span class="text-gray-5 dark:text-dark-gray-5 ml-2 h-3.5 w-3.5 group-hover:text-gray-7 dark:group-hover:text-dark-gray-7"><svg fill="currentColor" height="1em" viewbox="0 0 24 24" width="1em" xmlns="http://www.w3.org/2000/svg"><path d="M18.695 9.378L12.83 3.769a1.137 1.

In [14]:

q_test = parser.find_all('div', role='row')
len(q_test)


52

In [11]:
# Check the total number of questions in the table
q_list = parser.find_all('div', role='row')[1:52]


In [10]:
import numpy as np
import pandas as pd


# Create lists for storing scraped data
titleList = []
solutionList = []
acceptanceList = []
difficultyList = []

# Element for each row of table
# <div role="row" class="odd:bg-layer-1 even:bg-overlay-1 dark:odd:bg-dark-layer-bg dark:even:bg-dark-fill-4" style="display: flex; flex: 1 0 auto; min-width: 0px;"><div role="cell" class="mx-2 flex items-center py-[11px]" style="box-sizing: border-box; flex: 52 0 auto; min-width: 0px; width: 52px;"></div><div role="cell" class="mx-2 flex items-center py-[11px]" style="box-sizing: border-box; flex: 260 0 auto; min-width: 0px; width: 260px;"><div class="max-w-[302px] flex items-center overflow-hidden"><div class="overflow-hidden"><div class="flex items-center"><div class="truncate"><a href="/problems/two-sum" class="h-5 hover:text-blue-s dark:hover:text-dark-blue-s">1. Two Sum</a></div></div></div></div></div><div role="cell" class="mx-2 flex items-center py-[11px]" style="box-sizing: border-box; flex: 54 0 auto; min-width: 0px; width: 54px;"><a aria-label="solution" href="/problems/two-sum/solution" class="truncate"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="1em" height="1em" fill="currentColor" class="text-purple dark:text-dark-purple h-5 w-5"><path d="M10 15.464v-3.927a.8.8 0 011.259-.656l2.805 1.964a.8.8 0 010 1.31l-2.805 1.964A.8.8 0 0110 15.464z"></path><path d="M7 4a1 1 0 00-1 1v14a1 1 0 001 1h10a1 1 0 001-1V9h-3a2 2 0 01-2-2V4H7zm8 .6V7h1.92L15 4.6zM4 5a3 3 0 013-3h7.039a3 3 0 012.342 1.126l2.962 3.701A3 3 0 0120 8.702V19a3 3 0 01-3 3H7a3 3 0 01-3-3V5z"></path></svg></a></div><div role="cell" class="mx-2 flex items-center py-[11px]" style="box-sizing: border-box; flex: 100 0 auto; min-width: 0px; width: 100px;"><span>53.9%</span></div><div role="cell" class="mx-2 flex items-center py-[11px]" style="box-sizing: border-box; flex: 84 0 auto; min-width: 0px; width: 84px;"><span class="text-olive dark:text-dark-olive">Easy</span></div><div role="cell" class="mx-2 flex items-center py-[11px]" style="box-sizing: border-box; flex: 84 0 auto; min-width: 0px; width: 84px;"><a class="flex h-full w-full flex-row items-center" href="/subscribe/?ref=ps_fq" data-state="closed"><div class="flex h-full w-full flex-row items-center"><span class="h-2 flex-1 rounded-l-lg bg-fill-3 dark:bg-dark-fill-3"></span><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="1em" height="1em" fill="currentColor" class="text-gray-5 dark:text-gray-7 -mt-1.5 h-5 w-5"><path fill-rule="evenodd" d="M7 8v2H6a3 3 0 00-3 3v6a3 3 0 003 3h12a3 3 0 003-3v-6a3 3 0 00-3-3h-1V8A5 5 0 007 8zm8 0v2H9V8a3 3 0 116 0zm-3 6a2 2 0 100 4 2 2 0 000-4z" clip-rule="evenodd"></path></svg><span class="h-2 flex-1 rounded-r-lg bg-fill-3 dark:bg-dark-fill-3"></span></div></a></div></div>"



In [87]:
q_list = parser.find_all('div', role='row')[1:52]




for question in q_list:
    row = question.find_all('div', role='cell')
    row_info = row[1].find('a').text
    print(row_info)

1963. Minimum Number of Swaps to Make the String Balanced
1. Two Sum
2. Add Two Numbers
3. Longest Substring Without Repeating Characters
4. Median of Two Sorted Arrays
5. Longest Palindromic Substring
6. Zigzag Conversion
7. Reverse Integer
8. String to Integer (atoi)
9. Palindrome Number
10. Regular Expression Matching
11. Container With Most Water
12. Integer to Roman
13. Roman to Integer
14. Longest Common Prefix
15. 3Sum
16. 3Sum Closest
17. Letter Combinations of a Phone Number
18. 4Sum
19. Remove Nth Node From End of List
20. Valid Parentheses
21. Merge Two Sorted Lists
22. Generate Parentheses
23. Merge k Sorted Lists
24. Swap Nodes in Pairs
25. Reverse Nodes in k-Group
26. Remove Duplicates from Sorted Array
27. Remove Element
28. Find the Index of the First Occurrence in a String
29. Divide Two Integers
30. Substring with Concatenation of All Words
31. Next Permutation
32. Longest Valid Parentheses
33. Search in Rotated Sorted Array
34. Find First and Last Position of Element

In [21]:
# Find the last page number to see the number of pages to scrape in total

# Find the buttons at the bottom of the page which contains the total number of pages
page_list = parser.find_all('button', class_='flex items-center justify-center px-3 h-8 rounded select-none focus:outline-none bg-fill-3 dark:bg-dark-fill-3 text-label-2 dark:text-dark-label-2 hover:bg-fill-2 dark:hover:bg-dark-fill-2')

# Find the last page with a number associated with it.
# Note this will be the second last page as the last page is just a button by itself
last_page = page_list[-2]

# Extract the last 12 digits of the page which should ensure all 3 digit numbers showing up
number = str(last_page)[-13:]

# Use Regex to find all the numbers in the last 13 digits. 
# Note: Taking -1 so as to ensure that you take the last number associated with the page
last_number = re.findall(r'\d+', number)[-1]

# Print the highest page number
print(int(last_number))

67


In [22]:
# Find using multiple attributes
parser.find('div', attrs={'role': 'row', 'class':'odd:bg-layer-1 even:bg-overlay-1 dark:odd:bg-dark-layer-bg dark:even:bg-dark-fill-4'})

<div class="odd:bg-layer-1 even:bg-overlay-1 dark:odd:bg-dark-layer-bg dark:even:bg-dark-fill-4" role="row" style="display: flex; flex: 1 0 auto; min-width: 0px;"><div class="mx-2 flex items-center py-[11px]" role="cell" style="box-sizing: border-box; flex: 52 0 auto; min-width: 0px; width: 52px;"><a href="/problems/minimum-string-length-after-removing-substrings/?envType=daily-question&amp;envId=2024-10-07"><svg class="h-[18px] w-[18px] cursor-pointer text-green-s dark:text-dark-green-s" fill="currentColor" height="1em" viewbox="0 0 24 24" width="1em" xmlns="http://www.w3.org/2000/svg"><path clip-rule="evenodd" d="M19 11.063V7h-2v1a1 1 0 11-2 0V7H9v1a1 1 0 01-2 0V7H5v4.063h14zm0 2H5V19h14v-5.938zM9 5h6V4a1 1 0 112 0v1h2a2 2 0 012 2v12a2 2 0 01-2 2H5a2 2 0 01-2-2V7a2 2 0 012-2h2V4a1 1 0 012 0v1z" fill-rule="evenodd"></path></svg></a></div><div class="mx-2 flex items-center py-[11px]" role="cell" style="box-sizing: border-box; flex: 260 0 auto; min-width: 0px; width: 260px;"><div class=

In [23]:
driver.quit()


In [11]:
# driver = startBrowser("https://leetcode.com/", headless=True)


-> Starting Chrome Headless
-> Headless browser opened and ready to use


In [10]:
print(driver.title)

Google


In [11]:
search = driver.find_element(By.NAME ,"q")
search.send_keys("scotiabank canada")
search.send_keys(Keys.RETURN)

In [13]:
# Timeout in seconds
delay = 5 

try:
    # Wait for the element with the ID of wrapper
    main = WebDriverWait(driver, delay).until(
      EC.presence_of_element_located((By.ID, "rso"))
    )
    

except:
    driver.quit()


# 

main = driver.find_element()


AttributeError: type object 'By' has no attribute 'name'

In [15]:
driver.quit()