#### For each match, for all players, we want: team name, player name, position, rating, minutes played, stats
#### Also get date of match. With this, we can use date of birth to find the age. Also useful for tracking rating before and after certain dates
#### The dataframe could look like:

#### Match date | Season | Team name | Player's name | Player's d.o.b. | Player's position | Player's rating | Minutes played | Other player stats (many columns)

In [24]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Initialize the WebDriver with custom options
def initialize_driver():
    chrome_options = ChromeOptions()
    # Set the page load strategy to 'none'
    chrome_options.page_load_strategy = 'none'
    
    return webdriver.Chrome(options=chrome_options)

driver = initialize_driver()

def navigate_to_page(url):
    """Navigate to the given URL without waiting for the full page load."""
    driver.get(url)
    time.sleep(2)  # Short initial wait to start loading the page

def scroll_to_matches():
    """Scroll down to the Matches section of the page."""
    try:
        # Scroll to the estimated position to bring the Matches section into view
        scroll_position = 1700  # Adjusted value for the Matches section
        driver.execute_script(f"window.scrollTo(0, {scroll_position});")
        print(f"Scrolled to position: {scroll_position}")
    except Exception as e:
        print(f"An error occurred while scrolling: {e}")

def collect_match_ids(match_ids, round_number):
    """Collect unique match IDs from the currently displayed Matches section."""
    try:
        # Ensure the elements are fully loaded
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a[href*="/football/match/"]'))
        )
        # Second scroll to ensure all matches are visible
        scroll_to_matches()
        time.sleep(1)  # Allow time for full rendering after scroll

        match_links = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/football/match/"]')
        print(f"Round {round_number}: Match links located (count: {len(match_links)}).")

        if len(match_links) < 10:
            print(f"Warning: Less than 10 matches found for round {round_number}, possible loading issue.")

        for match in match_links:
            match_url = match.get_attribute('href')
            match_id = match_url.split('#id:')[-1]
            if match_id not in match_ids:
                match_ids.append(match_id)
                print(f"Found match: {match_url} with Match ID: {match_id}")
    except Exception as e:
        print(f"Error occurred during match collection: {e}")

def find_and_click_left_arrow():
    """Find and click the left arrow button to navigate to the previous round."""
    try:
        left_arrow = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, "button.Button.iCnTrv[style*='visibility: visible;']"))
        )
        if left_arrow.is_displayed() and left_arrow.is_enabled():
            driver.execute_script("arguments[0].click();", left_arrow)
            print("Clicked on the left arrow button.")
            return True
    except Exception as e:
        print(f"Failed to click the left arrow button: {e}")
    return False

def scrape_premier_league_matches():
    """Main function to scrape match IDs from the Premier League page."""
    match_ids = []
    total_rounds = 38  # Number of rounds in the Premier League
    round_click_count = 0

    # Navigate to the Premier League page for the desired season
    navigate_to_page("https://www.sofascore.com/tournament/football/england/premier-league/17#id:52186")
    scroll_to_matches()

    # Collect match IDs for the initial round
    collect_match_ids(match_ids, round_click_count + 1)

    while round_click_count < (total_rounds - 1):  # Loop for 37 left arrow clicks
        try:
            # Attempt to click the left arrow button
            if not find_and_click_left_arrow():
                print("Left arrow button is no longer interactable. Ending loop.")
                break

            round_click_count += 1
            print(f"Total round navigations (left arrow clicks): {round_click_count}")

            # Wait briefly to allow the page to load partially
            time.sleep(3)  # Wait time increased to 3 seconds to allow more content to load

            # Collect match IDs for the current round
            collect_match_ids(match_ids, round_click_count + 1)
            
        except Exception as e:
            print(f"An error occurred during scraping: {e}")
            break

    return match_ids, round_click_count

# Example usage
collected_match_ids, round_click_count = scrape_premier_league_matches()
print("Collected Unique Match IDs:", collected_match_ids)
print(f"Total round navigations (left arrow clicks): {round_click_count}")

# Close the browser after scraping
driver.quit()

Scrolled to position: 1700
Scrolled to position: 1700
Round 1: Match links located (count: 31).
Found match: https://www.sofascore.com/football/match/west-ham-united-manchester-city/rM#id:11352571 with Match ID: 11352571
Found match: https://www.sofascore.com/football/match/everton-arsenal/RY#id:11352546 with Match ID: 11352546
Found match: https://www.sofascore.com/football/match/brentford-newcastle-united/Osab#id:11352549 with Match ID: 11352549
Found match: https://www.sofascore.com/football/match/manchester-united-brighton-and-hove-albion/FsK#id:11352552 with Match ID: 11352552
Found match: https://www.sofascore.com/football/match/nottingham-forest-burnley/gso#id:11352555 with Match ID: 11352555
Found match: https://www.sofascore.com/football/match/bournemouth-chelsea/Nskb#id:11352558 with Match ID: 11352558
Found match: https://www.sofascore.com/football/match/aston-villa-crystal-palace/hP#id:11352562 with Match ID: 11352562
Found match: https://www.sofascore.com/football/match/li

In [9]:
#Should be 10*38=380
len(collected_match_ids)

373

In [1]:
import http.client, json
from urllib.parse import urlparse
import requests

In [5]:
#Checking the json with the data
match_id=collected_match_ids[1]

url = "https://www.sofascore.com/api/v1/event/"+match_id+"/lineups"
parsed_url = urlparse(url)
conn = http.client.HTTPSConnection(parsed_url.netloc)
conn.request("GET",parsed_url.path)
res = conn.getresponse()
data = res.read()
jsondata = json.loads(data.decode("utf-8"))
jsondata

NameError: name 'collected_match_ids' is not defined

In [10]:
#Decoding date of birth of player
from datetime import datetime
import time

match_id = collected_match_ids[1]

url = "https://www.sofascore.com/api/v1/event/"+match_id+"/lineups"
parsed_url = urlparse(url)
conn = http.client.HTTPSConnection(parsed_url.netloc)
conn.request("GET",parsed_url.path)
res = conn.getresponse()
data = res.read()
jsondata = json.loads(data.decode("utf-8"))

timestamp = jsondata['home']['players'][0]['player']['dateOfBirthTimestamp'] #699667200
date_object = datetime.fromtimestamp(timestamp)
print("Datetime:", date_object)

Datetime: 1995-09-14 17:00:00


In [6]:
#Just checking if all matches collected are from the correct season and tournament
for match_id in collected_match_ids:
    url = "https://www.sofascore.com/api/v1/event/"+match_id
    parsed_url = urlparse(url)
    conn = http.client.HTTPSConnection(parsed_url.netloc)
    conn.request("GET",parsed_url.path)
    res = conn.getresponse()
    data = res.read()
    jsondata = json.loads(data.decode("utf-8"))
    if jsondata['event']['season']['name'] != 'Premier League 23/24':
        print(jsondata['event']['season']['name'])

In [7]:
#Obtaining the ratings of home players
match_id = collected_match_ids[1]

url = "https://www.sofascore.com/api/v1/event/"+match_id+"/lineups"
parsed_url = urlparse(url)
conn = http.client.HTTPSConnection(parsed_url.netloc)
conn.request("GET",parsed_url.path)
res = conn.getresponse()
data = res.read()
jsondata = json.loads(data.decode("utf-8"))

for player in jsondata['home']['players']:
    player_name = player['player']['name']
    if 'rating' not in player['statistics']:
        print(player_name)
    else:
        player_rating = player['statistics']['rating']
        print(player_name,player_rating)

David Raya 6.6
Ben White 6.8
William Saliba 7.1
Gabriel Magalhães 7
Takehiro Tomiyasu 7.7
Martin Ødegaard 8.5
Thomas Partey 7
Declan Rice 7.2
Gabriel Martinelli 8
Kai Havertz 7.3
Leandro Trossard 6.7
Oleksandr Zinchenko 6.9
Jurriën Timber 6.7
Emile Smith Rowe 7.1
Gabriel Jesus 7.2
Aaron Ramsdale
Jakub Kiwior
Fábio Vieira
Jorginho
Edward Nketiah


In [19]:
import get_data.Scrapers.sofascore_scraper as sofascraper
import importlib
importlib.reload(sofascraper)

<module 'get_data.Scrapers.sofascore_scraper' from '/home/cody/Documents/DataSciBC/EPLTransfer/get_data/Scrapers/sofascore_scraper.py'>

In [20]:
ids = sofascraper.get_all_ids()
ids.collect_and_save()


country:   0%|          | 0/2 [00:00<?, ?it/s]

leagues:   0%|          | 0/1 [00:00<?, ?it/s]

season_ids:   0%|          | 0/2 [00:00<?, ?it/s]

before += []
after += ['11352571', '11352546', '11352546', '11352549', '11352552', '11352555', '11352558', '11352562', '11352565', '11352568', '11352571', '11352574', '11352546', '11352546', '11352352', '11352401', '11352562', '11352362', '12190336', '11352616', '11352638', '11352616', '11352392', '11352340', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028']
['11352571', '11352546', '11352546', '11352549', '11352552', '11352555', '11352558', '11352562', '11352565', '11352568', '11352571', '11352574', '11352546', '11352546', '11352352', '11352401', '11352562', '11352362', '12190336', '11352616', '11352638', '11352616', '11352392', '11352340', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028']
before += ['11352571', '11352546', '11352546', '11352549', '11352552', '11352555', '11352558', '11352562', '11352565', '11352568', '11352571', '11352574', '11352546', '11352546', '11352352', '11352401', '11352562', '11352362', '1219

leagues:   0%|          | 0/1 [00:00<?, ?it/s]

season_ids:   0%|          | 0/1 [00:00<?, ?it/s]

before += []
after += ['11368620', '11368617', '11368614', '11368625', '11368623', '11368639', '11368622', '11368620', '11368618', '11368615', '11368632', '11368617', '11368617', '11368617', '11368689', '11369335', '11368619', '11368616', '11369390', '11369473', '11368616', '11368658', '11369380', '11368722', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028']
['11368620', '11368617', '11368614', '11368625', '11368623', '11368639', '11368622', '11368620', '11368618', '11368615', '11368632', '11368617', '11368617', '11368617', '11368689', '11369335', '11368619', '11368616', '11369390', '11369473', '11368616', '11368658', '11369380', '11368722', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028']
before += ['11368620', '11368617', '11368614', '11368625', '11368623', '11368639', '11368622', '11368620', '11368618', '11368615', '11368632', '11368617', '11368617', '11368617', '11368689', '11369335', '11368619', '11368616', '1136

In [69]:
ids.clean_ids()

  0%|          | 0/382 [00:00<?, ?it/s]

removed the elements [], leaving 382 left over


In [9]:
print(ids.list_match_ids())
print(len(set(ids.list_match_ids())))

['11352571', '11352546', '11352549', '11352552', '11352555', '11352558', '11352562', '11352565', '11352568', '11352574', '11352352', '11352401', '11352362', '12190336', '11352616', '11352638', '11352392', '11352340', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '11352529', '11352519', '11352527', '11352536', '11352542', '11352513', '11352516', '11352539', '11352532', '11352523', '11352503', '11352482', '11352485', '11352491', '11352510', '11352507', '11352488', '11352494', '11352500', '12226495', '11352476', '11352460', '11352463', '11352467', '11352479', '11352457', '11352454', '11352451', '11352472', '11352470', '11352434', '11352441', '11352448', '11352429', '11352420', '11352425', '11352432', '12240634', '12240570', '12240571', '11352410', '11352394', '11352397', '11352406', '11352413', '11352387', '11352403', '11352416', '11352390', '11352355', '11352364', '11352368', '11352371', '11352384', '11352359', '11352374', '11352378', '11352381', '11

In [10]:
data = sofascraper.get_all_data()
data.data.head()

main loop:   0%|          | 0/387 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [14]:
data.data

Unnamed: 0,name,shortName,position,height,dateOfBirthTimestamp,team,date,league,season,minutesPlayed,...,hitWoodwork,bigChanceCreated,bigChanceMissed,shotOffTarget,onTargetScoringAttempt,goals,expectedGoals,wasFouled,fouls,totalOffside
0,Stefan Ortega,S. Ortega,G,186.0,1992-11-05 16:00:00,Manchester City,2024-05-19 08:00:00,Premier League,23/24,90,...,0,0,0,0,0,0,0.0000,0,0,0
1,Kyle Walker,K. Walker,D,183.0,1990-05-27 17:00:00,Manchester City,2024-05-19 08:00:00,Premier League,23/24,90,...,0,0,0,0,0,0,0.0000,0,0,0
2,Rúben Dias,R. Dias,D,188.0,1997-05-13 17:00:00,Manchester City,2024-05-19 08:00:00,Premier League,23/24,90,...,0,0,0,1,0,0,0.2179,0,0,0
3,Manuel Akanji,M. Akanji,D,187.0,1995-07-18 17:00:00,Manchester City,2024-05-19 08:00:00,Premier League,23/24,71,...,0,0,0,0,1,0,0.1096,0,0,0
4,Joško Gvardiol,J. Gvardiol,D,186.0,2002-01-22 16:00:00,Manchester City,2024-05-19 08:00:00,Premier League,23/24,90,...,0,0,0,1,0,0,0.0920,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15175,Daniel Bentley,D. Bentley,G,188.0,1993-07-12 17:00:00,Wolverhampton,2023-08-14 12:00:00,Premier League,23/24,0,...,0,0,0,0,0,0,0.0000,0,0,0
15176,Toti Gomes,T. Gomes,D,187.0,1999-01-15 16:00:00,Wolverhampton,2023-08-14 12:00:00,Premier League,23/24,0,...,0,0,0,0,0,0,0.0000,0,0,0
15177,Matt Doherty,M. Doherty,M,182.0,1992-01-15 16:00:00,Wolverhampton,2023-08-14 12:00:00,Premier League,23/24,0,...,0,0,0,0,0,0,0.0000,0,0,0
15178,Boubacar Traoré,B. Traoré,M,183.0,2001-08-19 17:00:00,Wolverhampton,2023-08-14 12:00:00,Premier League,23/24,0,...,0,0,0,0,0,0,0.0000,0,0,0
