# Project Final Web Scraping Data
Alex Mueggenberg and Sebastian Rios

Webscraping code for the overall clubs performance from the website: https://www.premierleague.com/tables?co=1&se=418&ha=-1

In [22]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time


driver = webdriver.Chrome()  
url = "https://www.premierleague.com/tables?co=1&se=418&ha=-1"
driver.get(url)

time.sleep(5)  


rows = driver.find_elements(By.CSS_SELECTOR, "tbody tr")
team_names = []
played = []
won = []
drawn = []
lost = []
goals_for = []
goals_against = []
goal_difference = []
points = []

for row in rows:
    cols = row.find_elements(By.TAG_NAME, "td")
    if len(cols) >= 10: 
        team_names.append(cols[1].text.strip())
        played.append(cols[2].text.strip())
        won.append(cols[3].text.strip())
        drawn.append(cols[4].text.strip())
        lost.append(cols[5].text.strip())
        goals_for.append(cols[6].text.strip())
        goals_against.append(cols[7].text.strip())
        goal_difference.append(cols[8].text.strip())
        points.append(cols[9].text.strip())
driver.quit()

df = pd.DataFrame({
    "Team": team_names,
    "Played": played,
    "Won": won,
    "Drawn": drawn,
    "Lost": lost,
    "Goals For": goals_for,
    "Goals Against": goals_against,
    "Goal Difference": goal_difference,
    "Points": points
})
df.to_csv("premier_league_table.csv", index=False)
print(df)


                 Team Played Won Drawn Lost Goals For Goals Against  \
0     Manchester City     38  29     6    3        99            26   
1           Liverpool     38  28     8    2        94            26   
2             Chelsea     38  21    11    6        76            33   
3   Tottenham Hotspur     38  22     5   11        69            40   
4             Arsenal     38  22     3   13        61            48   
..                ...    ...  ..   ...  ...       ...           ...   
67                                                                    
68                                                                    
69                                                                    
70                                                                    
71                                                                    

   Goal Difference Points  
0               73     93  
1               68     92  
2               43     74  
3               29     71  
4      

In [23]:
df.head(20)

Unnamed: 0,Team,Played,Won,Drawn,Lost,Goals For,Goals Against,Goal Difference,Points
0,Manchester City,38,29,6,3,99,26,73,93
1,Liverpool,38,28,8,2,94,26,68,92
2,Chelsea,38,21,11,6,76,33,43,74
3,Tottenham Hotspur,38,22,5,11,69,40,29,71
4,Arsenal,38,22,3,13,61,48,13,69
5,Manchester United,38,16,10,12,57,57,0,58
6,West Ham United,38,16,8,14,60,51,9,56
7,Leicester City,38,14,10,14,62,59,3,52
8,Brighton And Hove Albion,38,12,15,11,42,44,-2,51
9,Wolverhampton Wanderers,38,15,6,17,38,43,-5,51


In [24]:
df['Team'] = df['Team'].astype(str).str.replace(r'\s+', ' ', regex=True).str.strip()
df_teams = df[df['Team'] != ""]

In [25]:
df_teams

Unnamed: 0,Team,Played,Won,Drawn,Lost,Goals For,Goals Against,Goal Difference,Points
0,Manchester City,38,29,6,3,99,26,73,93
1,Liverpool,38,28,8,2,94,26,68,92
2,Chelsea,38,21,11,6,76,33,43,74
3,Tottenham Hotspur,38,22,5,11,69,40,29,71
4,Arsenal,38,22,3,13,61,48,13,69
5,Manchester United,38,16,10,12,57,57,0,58
6,West Ham United,38,16,8,14,60,51,9,56
7,Leicester City,38,14,10,14,62,59,3,52
8,Brighton And Hove Albion,38,12,15,11,42,44,-2,51
9,Wolverhampton Wanderers,38,15,6,17,38,43,-5,51


In [26]:
df_teams.to_csv("premier_league_table.csv", index=False)


Second Web Scraping which contains all of the players Salaries(weekly and annual)

In [28]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Setup
options = Options()
options.headless = False
driver = webdriver.Chrome(options=options)
driver.get("https://www.capology.com/uk/premier-league/salaries/2021-2022/")
wait = WebDriverWait(driver, 10)
time.sleep(5)

driver.execute_script("""
    const sticky = document.querySelector('.nav-sticky');
    if (sticky) sticky.style.display = 'none';
""")

all_data = []

# Find how many pages exist
pagination_links = driver.find_elements(By.CSS_SELECTOR, "ul.pagination li a.page-link")
page_numbers = [int(link.text) for link in pagination_links if link.text.isdigit()]
total_pages = max(page_numbers)
print(f" Total pages found: {total_pages}")

for page_num in range(1, total_pages + 1):
    print(f"Scraping page {page_num}...")

    # Wait for table to load
    wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "table tbody tr")))
    rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")

    for row in rows:
        cols = row.find_elements(By.TAG_NAME, "td")
        if len(cols) >= 6:
            player = cols[0].text.strip()
            club = cols[1].text.strip()
            weekly_salary = cols[3].text.strip()
            annual_salary = cols[4].text.strip()
            all_data.append([player, club, weekly_salary, annual_salary])

    # Don't click after last page
    if page_num < total_pages:
        try:
            next_page_btn = wait.until(EC.presence_of_element_located(
                (By.XPATH, f"//ul[contains(@class,'pagination')]//a[text()='{page_num + 1}']")
            ))
            driver.execute_script("arguments[0].scrollIntoView(true);", next_page_btn)
            time.sleep(1)
            driver.execute_script("arguments[0].click();", next_page_btn)
            time.sleep(4)
        except Exception as e:
            print(f" Could not click page {page_num + 1}: {e}")
            break

driver.quit()

# Save CSV
dfs = pd.DataFrame(all_data, columns=["Player", "Weekly Salary", "Annual Salary", "Position"])
dfs.to_csv("premier_league_salaries_2021_2022.csv", index=False)
print(" All done! Data from all pages saved.")

 Total pages found: 23
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...


InvalidSessionIdException: Message: invalid session id: session deleted as the browser has closed the connection
from disconnected: Unable to receive message from renderer
  (Session info: chrome=136.0.7103.93)
Stacktrace:
0   chromedriver                        0x00000001008578b8 chromedriver + 5986488
1   chromedriver                        0x000000010084e9ea chromedriver + 5949930
2   chromedriver                        0x0000000100307600 chromedriver + 415232
3   chromedriver                        0x00000001002ef0f5 chromedriver + 315637
4   chromedriver                        0x00000001002eee24 chromedriver + 314916
5   chromedriver                        0x00000001002ecccd chromedriver + 306381
6   chromedriver                        0x00000001002ed5ff chromedriver + 308735
7   chromedriver                        0x00000001002fc2d3 chromedriver + 369363
8   chromedriver                        0x0000000100315ddd chromedriver + 474589
9   chromedriver                        0x000000010031c4bb chromedriver + 500923
10  chromedriver                        0x00000001002edcbb chromedriver + 310459
11  chromedriver                        0x000000010031551e chromedriver + 472350
12  chromedriver                        0x00000001003a645e chromedriver + 1066078
13  chromedriver                        0x000000010037f0e3 chromedriver + 905443
14  chromedriver                        0x000000010034b61d chromedriver + 693789
15  chromedriver                        0x000000010034c281 chromedriver + 696961
16  chromedriver                        0x0000000100815580 chromedriver + 5715328
17  chromedriver                        0x0000000100819445 chromedriver + 5731397
18  chromedriver                        0x00000001007efed0 chromedriver + 5562064
19  chromedriver                        0x0000000100819e7b chromedriver + 5734011
20  chromedriver                        0x00000001007dede4 chromedriver + 5492196
21  chromedriver                        0x000000010083ca98 chromedriver + 5876376
22  chromedriver                        0x000000010083cc60 chromedriver + 5876832
23  chromedriver                        0x000000010084e5b1 chromedriver + 5948849
24  libsystem_pthread.dylib             0x00007ff80a7a5df1 _pthread_start + 99
25  libsystem_pthread.dylib             0x00007ff80a7a1857 thread_start + 15


In [None]:
display(dfs.head(476))

In [None]:
df.replace('', pd.NA, inplace=True)  
df.dropna(how='all', inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
#final checking of the data and the data was already created into a csv file
df