## Web Scraping using Selenium webdriver and webdriver_manager

In [1]:
# Install Selenium -
!pip install selenium

Collecting selenium
  Downloading selenium-4.23.1-py3-none-any.whl (9.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.0-py3-none-any.whl (475 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.7/475.7 kB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h

To set up the WebDriver you need to install the suitable version of your preferred browser manually on your local machine. But if you don't want to do that, you can use the <b>webdriver_manager</b> package.<br>
At First you need to install the package:

In [2]:
# Install webdriver manager
!pip install webdriver_manager

Collecting webdriver_manager
  Downloading webdriver_manager-4.0.1-py2.py3-none-any.whl (27 kB)
Collecting python-dotenv (from webdriver_manager)
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver_manager
Successfully installed python-dotenv-1.0.1 webdriver_manager-4.0.1


Then import selenium web driver and the manager of your preferred browser:

In [3]:
import pandas as pd
import numpy as np

# Import selenium webdriver and
from selenium import webdriver
from selenium.webdriver.common.by import By

# for Selenium 3 you need to import the service by uncommenting the following line,
# but for Selenium 4 you don't have to
#from selenium.webdriver.chrome.service import Service

# Import the webdriver manager of your browser, ie: chrome
from webdriver_manager.chrome import ChromeDriverManager

<b>If you want learn more about how to set-up and use webdriver_manager, you can visit [this page](https://pypi.org/project/webdriver-manager/) </b> <br>

Now you are ready to set up the WebDriver and start scraping: <br><br>
Let's say that we want to scrape the following web page: https://www.scrapethissite.com/pages/forms <br>
this page contains data about Hockey Teams in USA, which is taken from a database of NHL team stats since 1990 to 2011.

In [None]:
# Set up the WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Define the URL that the driver will get
# the URL that contains the data you are going to scrape
driver.get("https://www.scrapethissite.com/pages/forms")

# Search form submission,
# let's suppose that We want to use the input search field to filter the results of the data table
# ie: get all data where the team name is 'Los Angeles Kings'
# At first, Find the element of the input field
q = driver.find_element(By.ID, "q")

# Then, Fill the input with your search keyword
q.send_keys("Los Angeles Kings")

# Then, submit the form to execute the filter process and get the results
q.submit()

# Print all the filtered table values
table = driver.find_element(By.CLASS_NAME, "table")
print(table.text)

# Or, you can iterate the output data and save them
datatable = []

# Find all teams data from the filtered table
teams = table.find_elements(By.CLASS_NAME, "team")

# Iterate all the teams to get the wanted data, then append the results to the datatable
for team in teams:
    team_name = team.find_element(By.CLASS_NAME, "name").text.strip()
    year = team.find_element(By.CLASS_NAME, "year").text.strip()
    wins = team.find_element(By.CLASS_NAME, "wins").text.strip()
    losses = team.find_element(By.CLASS_NAME, "losses").text.strip()

    datatable.append({
        'team_name': team_name,
        'year': year,
        'wins': wins,
        'losses': losses
    })


# After finishing our scraping and getting all the wanted data, we must quit the driver
driver.quit()


Team Name Year Wins Losses OT Losses Win % Goals For (GF) Goals Against (GA) + / -
Los Angeles Kings 1990 46 24 0.575 340 254 86
Los Angeles Kings 1991 35 31 0.438 287 296 -9
Los Angeles Kings 1992 39 35 0.464 338 340 -2
Los Angeles Kings 1993 27 45 0.321 294 322 -28
Los Angeles Kings 1994 16 23 0.333 142 174 -32
Los Angeles Kings 1995 24 40 0.293 256 302 -46
Los Angeles Kings 1996 28 43 0.341 214 268 -54
Los Angeles Kings 1997 38 33 0.463 227 225 2
Los Angeles Kings 1998 32 45 0.39 189 222 -33
Los Angeles Kings 1999 39 27 4 0.476 245 228 17
Los Angeles Kings 2000 38 28 3 0.463 252 228 24
Los Angeles Kings 2001 40 27 4 0.488 214 190 24
Los Angeles Kings 2002 33 37 6 0.402 203 221 -18
Los Angeles Kings 2003 28 29 9 0.341 205 217 -12
Los Angeles Kings 2005 42 35 5 0.512 249 270 -21
Los Angeles Kings 2006 27 41 14 0.329 227 283 -56
Los Angeles Kings 2007 32 43 7 0.39 231 266 -35
Los Angeles Kings 2008 34 37 11 0.415 207 234 -27
Los Angeles Kings 2009 46 27 9 0.561 241 219 22
Los Angeles K

In [None]:
# Save the data into a Dataframe, and explore it
teams_filtered = pd.DataFrame(datatable)
teams_filtered

Unnamed: 0,team_name,year,wins,losses
0,Los Angeles Kings,1990,46,24
1,Los Angeles Kings,1991,35,31
2,Los Angeles Kings,1992,39,35
3,Los Angeles Kings,1993,27,45
4,Los Angeles Kings,1994,16,23
5,Los Angeles Kings,1995,24,40
6,Los Angeles Kings,1996,28,43
7,Los Angeles Kings,1997,38,33
8,Los Angeles Kings,1998,32,45
9,Los Angeles Kings,1999,39,27


<b>What if I want to get all the table data without any filters?</b> Here we can see that the data is rendered over several pages within the table which is called pagination, each page table contains 25 records. Getting data from all pages might be more complex, but we can do it like the following:

In [None]:
def scrape_data():
    """
    Scrape data from a specific URL.

    Parameters:
    None

    Returns:
    data
    """
    # Set up the WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    # Define the URL that the driver will get
    # the URL that contains the data you are going to scrape
    driver.get("https://www.scrapethissite.com/pages/forms")

    data = []

    # Fing the 'pagination' element that contains the pages
    pagination = driver.find_elements(By.CSS_SELECTOR, "ul.pagination>li")

    # By default the first rendered page has page number 1
    page_number = 1

    # Etirate through each page in the pagination element to get the its data
    for page in pagination:

        try:

            # Page numbers are clickable, so we need to click the page number to render its data
            # to do that in code, we need first to find the'href' attribute for the page number
            # then click it using code to render the data
            page = driver.find_element(By.CSS_SELECTOR, f"a[href='/pages/forms/?page_num={page_number}']")
            page.click()

            # Find all teams data from the rendered table
            teams = driver.find_elements(By.CLASS_NAME, "team")

            # Iterate all the teams to get the wanted data, then append the results to the 'data'
            for team in teams:
                team_name = team.find_element(By.CLASS_NAME, "name").text.strip()
                year = team.find_element(By.CLASS_NAME, "year").text.strip()
                wins = team.find_element(By.CLASS_NAME, "wins").text.strip()
                losses = team.find_element(By.CLASS_NAME, "losses").text.strip()

                data.append({
                    'team_name': team_name,
                    'year': year,
                    'wins': wins,
                    'losses': losses
                })
        except:
            # If the page number does not exist in the pagination, then don't raise an exception and continue
            continue

        # Set the next page number
        page_number = page_number + 1

    # After finishing our scraping and getting all the wanted data, we must quit the driver
    driver.quit()

    # return the output data
    return data

In [None]:
# Call the scraper function
scraped_data = scrape_data()

In [None]:
# Save the data into a Dataframe, and explore it
teams_data = pd.DataFrame(scraped_data)
teams_data

Unnamed: 0,team_name,year,wins,losses
0,Boston Bruins,1990,44,24
1,Buffalo Sabres,1990,31,30
2,Calgary Flames,1990,46,26
3,Chicago Blackhawks,1990,49,23
4,Detroit Red Wings,1990,34,38
...,...,...,...,...
577,Tampa Bay Lightning,2011,38,36
578,Toronto Maple Leafs,2011,35,37
579,Vancouver Canucks,2011,51,22
580,Washington Capitals,2011,42,32


In [None]:
# explore the scraped data
teams_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 582 entries, 0 to 581
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   team_name  582 non-null    object
 1   year       582 non-null    object
 2   wins       582 non-null    object
 3   losses     582 non-null    object
dtypes: object(4)
memory usage: 18.3+ KB
