#### For each match, for all players, we want: team name, player name, position, rating, minutes played, stats
#### Also get date of match. With this, we can use date of birth to find the age. Also useful for tracking rating before and after certain dates
#### The dataframe could look like:

#### Match date | League | Season | Team name | Player's name | Player's d.o.b. | Player's position | Player's rating | Minutes played | Other player stats (many columns)

In [24]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Initialize the WebDriver with custom options
def initialize_driver():
    chrome_options = ChromeOptions()
    # Set the page load strategy to 'none'
    chrome_options.page_load_strategy = 'none'
    
    return webdriver.Chrome(options=chrome_options)

driver = initialize_driver()

def navigate_to_page(url):
    """Navigate to the given URL without waiting for the full page load."""
    driver.get(url)
    time.sleep(2)  # Short initial wait to start loading the page

def scroll_to_matches():
    """Scroll down to the Matches section of the page."""
    try:
        # Scroll to the estimated position to bring the Matches section into view
        scroll_position = 1700  # Adjusted value for the Matches section
        driver.execute_script(f"window.scrollTo(0, {scroll_position});")
        print(f"Scrolled to position: {scroll_position}")
    except Exception as e:
        print(f"An error occurred while scrolling: {e}")

def collect_match_ids(match_ids, round_number):
    """Collect unique match IDs from the currently displayed Matches section."""
    try:
        # Ensure the elements are fully loaded
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a[href*="/football/match/"]'))
        )
        # Second scroll to ensure all matches are visible
        scroll_to_matches()
        time.sleep(1)  # Allow time for full rendering after scroll

        match_links = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/football/match/"]')
        print(f"Round {round_number}: Match links located (count: {len(match_links)}).")

        if len(match_links) < 10:
            print(f"Warning: Less than 10 matches found for round {round_number}, possible loading issue.")

        for match in match_links:
            match_url = match.get_attribute('href')
            match_id = match_url.split('#id:')[-1]
            if match_id not in match_ids:
                match_ids.append(match_id)
                print(f"Found match: {match_url} with Match ID: {match_id}")
    except Exception as e:
        print(f"Error occurred during match collection: {e}")

def find_and_click_left_arrow():
    """Find and click the left arrow button to navigate to the previous round."""
    try:
        left_arrow = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, "button.Button.iCnTrv[style*='visibility: visible;']"))
        )
        if left_arrow.is_displayed() and left_arrow.is_enabled():
            driver.execute_script("arguments[0].click();", left_arrow)
            print("Clicked on the left arrow button.")
            return True
    except Exception as e:
        print(f"Failed to click the left arrow button: {e}")
    return False

def scrape_premier_league_matches():
    """Main function to scrape match IDs from the Premier League page."""
    match_ids = []
    total_rounds = 38  # Number of rounds in the Premier League
    round_click_count = 0

    # Navigate to the Premier League page for the desired season
    navigate_to_page("https://www.sofascore.com/tournament/football/england/premier-league/17#id:52186")
    scroll_to_matches()

    # Collect match IDs for the initial round
    collect_match_ids(match_ids, round_click_count + 1)

    while round_click_count < (total_rounds - 1):  # Loop for 37 left arrow clicks
        try:
            # Attempt to click the left arrow button
            if not find_and_click_left_arrow():
                print("Left arrow button is no longer interactable. Ending loop.")
                break

            round_click_count += 1
            print(f"Total round navigations (left arrow clicks): {round_click_count}")

            # Wait briefly to allow the page to load partially
            time.sleep(3)  # Wait time increased to 3 seconds to allow more content to load

            # Collect match IDs for the current round
            collect_match_ids(match_ids, round_click_count + 1)
            
        except Exception as e:
            print(f"An error occurred during scraping: {e}")
            break

    return match_ids, round_click_count

# Example usage
collected_match_ids, round_click_count = scrape_premier_league_matches()
print("Collected Unique Match IDs:", collected_match_ids)
print(f"Total round navigations (left arrow clicks): {round_click_count}")

# Close the browser after scraping
driver.quit()

Scrolled to position: 1700
Scrolled to position: 1700
Round 1: Match links located (count: 31).
Found match: https://www.sofascore.com/football/match/west-ham-united-manchester-city/rM#id:11352571 with Match ID: 11352571
Found match: https://www.sofascore.com/football/match/everton-arsenal/RY#id:11352546 with Match ID: 11352546
Found match: https://www.sofascore.com/football/match/brentford-newcastle-united/Osab#id:11352549 with Match ID: 11352549
Found match: https://www.sofascore.com/football/match/manchester-united-brighton-and-hove-albion/FsK#id:11352552 with Match ID: 11352552
Found match: https://www.sofascore.com/football/match/nottingham-forest-burnley/gso#id:11352555 with Match ID: 11352555
Found match: https://www.sofascore.com/football/match/bournemouth-chelsea/Nskb#id:11352558 with Match ID: 11352558
Found match: https://www.sofascore.com/football/match/aston-villa-crystal-palace/hP#id:11352562 with Match ID: 11352562
Found match: https://www.sofascore.com/football/match/li

In [9]:
#Should be 10*38=380
len(collected_match_ids)

373

In [57]:
import http.client, json
from urllib.parse import urlparse
import requests

In [5]:
#Checking the json with the data
match_id=collected_match_ids[1]

url = "https://www.sofascore.com/api/v1/event/"+match_id+"/lineups"
parsed_url = urlparse(url)
conn = http.client.HTTPSConnection(parsed_url.netloc)
conn.request("GET",parsed_url.path)
res = conn.getresponse()
data = res.read()
jsondata = json.loads(data.decode("utf-8"))
jsondata

NameError: name 'collected_match_ids' is not defined

In [10]:
#Decoding date of birth of player
from datetime import datetime
import time

match_id = collected_match_ids[1]

url = "https://www.sofascore.com/api/v1/event/"+match_id+"/lineups"
parsed_url = urlparse(url)
conn = http.client.HTTPSConnection(parsed_url.netloc)
conn.request("GET",parsed_url.path)
res = conn.getresponse()
data = res.read()
jsondata = json.loads(data.decode("utf-8"))

timestamp = jsondata['home']['players'][0]['player']['dateOfBirthTimestamp'] #699667200
date_object = datetime.fromtimestamp(timestamp)
print("Datetime:", date_object)

Datetime: 1995-09-14 17:00:00


In [6]:
#Just checking if all matches collected are from the correct season and tournament
for match_id in collected_match_ids:
    url = "https://www.sofascore.com/api/v1/event/"+match_id
    parsed_url = urlparse(url)
    conn = http.client.HTTPSConnection(parsed_url.netloc)
    conn.request("GET",parsed_url.path)
    res = conn.getresponse()
    data = res.read()
    jsondata = json.loads(data.decode("utf-8"))
    if jsondata['event']['season']['name'] != 'Premier League 23/24':
        print(jsondata['event']['season']['name'])

In [7]:
#Obtaining the ratings of home players
match_id = collected_match_ids[1]

url = "https://www.sofascore.com/api/v1/event/"+match_id+"/lineups"
parsed_url = urlparse(url)
conn = http.client.HTTPSConnection(parsed_url.netloc)
conn.request("GET",parsed_url.path)
res = conn.getresponse()
data = res.read()
jsondata = json.loads(data.decode("utf-8"))

for player in jsondata['home']['players']:
    player_name = player['player']['name']
    if 'rating' not in player['statistics']:
        print(player_name)
    else:
        player_rating = player['statistics']['rating']
        print(player_name,player_rating)

David Raya 6.6
Ben White 6.8
William Saliba 7.1
Gabriel Magalhães 7
Takehiro Tomiyasu 7.7
Martin Ødegaard 8.5
Thomas Partey 7
Declan Rice 7.2
Gabriel Martinelli 8
Kai Havertz 7.3
Leandro Trossard 6.7
Oleksandr Zinchenko 6.9
Jurriën Timber 6.7
Emile Smith Rowe 7.1
Gabriel Jesus 7.2
Aaron Ramsdale
Jakub Kiwior
Fábio Vieira
Jorginho
Edward Nketiah


In [1]:
import get_data.Scrapers.sofascore_scraper as sofascraper
import importlib
importlib.reload(sofascraper)

<module 'get_data.Scrapers.sofascore_scraper' from '/home/rafael/Score-data/get_data/Scrapers/sofascore_scraper.py'>

In [21]:
ids = sofascraper.get_all_ids()
ids.collect_and_save()


country:   0%|          | 0/9 [00:00<?, ?it/s]

leagues:   0%|          | 0/2 [00:00<?, ?it/s]

season_ids:   0%|          | 0/11 [00:00<?, ?it/s]

['12437015', '12437022', '12436536', '12436559', '12436538', '12436548', '12436540', '12436541', '12436556', '12436549', '12436557', '12436542', '12437038', '12437038', '12437038', '12436989', '12437004', '12436985', '12436908', '12436904', '12436898', '12436927', '12436985', '12436898', '12436920', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12436585', '12436577', '12436586', '12436590', '12436582', '12436592', '12436579', '12436534', '12436555', '12436533']
['12437015', '12437022', '12436536', '12436559', '12436538', '12436548', '12436540', '12436541', '12436556', '12436549', '12436557', '12436542', '12437038', '12437038', '12437038', '12436989', '12437004', '12436985', '12436908', '12436904', '12436898', '12436927', '12436985', '12436898', '12436920', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12436585', '12436577', '12436586', '12436590', '12436582', '12436592', '12436579', '12436534', '12436555', '1

season_ids:   0%|          | 0/11 [00:00<?, ?it/s]

['12468964', '12468964', '12465202', '12465206', '12465213', '12465209', '12465203', '12465216', '12465204', '12465212', '12465237', '12465236', '12465238', '12465229', '12468956', '12468956', '12468958', '12468970', '12469009', '12468993', '12469691', '12468989', '12468961', '12469053', '12469038', '12469053', '12469043', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12465233', '12465235', '12465241', '12465222', '12465228', '12465230', '12465240', '12465227', '12465231', '12465225', '12465226', '12465224']
['12468964', '12468964', '12465202', '12465206', '12465213', '12465209', '12465203', '12465216', '12465204', '12465212', '12465237', '12465236', '12465238', '12465229', '12468956', '12468956', '12468958', '12468970', '12469009', '12468993', '12469691', '12468989', '12468961', '12469053', '12469038', '12469053', '12469043', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12465233', '12465235', '12465241', '1

leagues:   0%|          | 0/1 [00:00<?, ?it/s]

season_ids:   0%|          | 0/11 [00:00<?, ?it/s]

['12437815', '12437813', '12437503', '12437507', '12437663', '12437516', '12437509', '12437511', '12437508', '12437510', '12437506', '12437512', '12437855', '12437855', '12437843', '12437687', '12437815', '12778688', '12437721', '12437808', '12437792', '12437777', '12437808', '12437730', '12437702', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12437493', '12437489', '12437483', '12437497', '12437490', '12437505', '12437578', '12437486', '12437499', '12437496']
['12437815', '12437813', '12437503', '12437507', '12437663', '12437516', '12437509', '12437511', '12437508', '12437510', '12437506', '12437512', '12437855', '12437855', '12437843', '12437687', '12437815', '12778688', '12437721', '12437808', '12437792', '12437777', '12437808', '12437730', '12437702', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12437493', '12437489', '12437483', '12437497', '12437490', '12437505', '12437578', '12437486', '12437499', '1

leagues:   0%|          | 0/1 [00:00<?, ?it/s]

season_ids:   0%|          | 0/11 [00:00<?, ?it/s]

['12499319', '12501544', '12501543', '12501501', '12501519', '12501517', '12501503', '12501502', '12501508', '12501527', '12501507', '12499326', '12499326', '12499330', '12499315', '12499293', '12499319', '12499256', '12499270', '12499273', '12499266', '12499285', '12499317', '12499257', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12501534', '12501538', '12501535', '12501536', '12501541', '12501540', '12501542', '12501539', '12501537', '12501545']
['12499319', '12501544', '12501543', '12501501', '12501519', '12501517', '12501503', '12501502', '12501508', '12501527', '12501507', '12499326', '12499326', '12499330', '12499315', '12499293', '12499319', '12499256', '12499270', '12499273', '12499266', '12499285', '12499317', '12499257', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12501534', '12501538', '12501535', '12501536', '12501541', '12501540', '12501542', '12501539', '12501537', '12501545', '12501528', '1

leagues:   0%|          | 0/1 [00:00<?, ?it/s]

season_ids:   0%|          | 0/11 [00:00<?, ?it/s]

['12499656', '12499660', '12499397', '12499435', '12499542', '12499447', '12499425', '12499456', '12499539', '12499412', '12499543', '12499703', '12499703', '12499684', '12499604', '12499406', '12499588', '12499648', '12499624', '12499600', '12499631', '12499633', '12499595', '12499627', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12499535', '12499525', '12499530', '12499528', '12499538', '12499532', '12499526', '12499536', '12499534']
['12499656', '12499660', '12499397', '12499435', '12499542', '12499447', '12499425', '12499456', '12499539', '12499412', '12499543', '12499703', '12499703', '12499684', '12499604', '12499406', '12499588', '12499648', '12499624', '12499600', '12499631', '12499633', '12499595', '12499627', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12499535', '12499525', '12499530', '12499528', '12499538', '12499532', '12499526', '12499536', '12499534', '12499513', '12499522', '12499519', '1

leagues:   0%|          | 0/1 [00:00<?, ?it/s]

season_ids:   0%|          | 0/10 [00:00<?, ?it/s]

['12451029', '12451019', '12450189', '12450272', '12450187', '12450192', '12450190', '12450275', '12450181', '12450176', '12450273', '12451048', '12451048', '12451044', '12450982', '12451010', '12450874', '12451023', '12451002', '12450873', '12450995', '12451002', '12450884', '12450868', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12450186', '12450182', '12450178', '12450199', '12450276', '12450184', '12450172', '12450269', '12450191']
['12451029', '12451019', '12450189', '12450272', '12450187', '12450192', '12450190', '12450275', '12450181', '12450176', '12450273', '12451048', '12451048', '12451044', '12450982', '12451010', '12450874', '12451023', '12451002', '12450873', '12450995', '12451002', '12450884', '12450868', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12450186', '12450182', '12450178', '12450199', '12450276', '12450184', '12450172', '12450269', '12450191', '12450274', '12450228', '12450271', '1

leagues:   0%|          | 0/1 [00:00<?, ?it/s]

season_ids:   0%|          | 0/11 [00:00<?, ?it/s]

['12514044', '12514041', '12513667', '12513644', '12513678', '12513668', '12513641', '12513640', '12513654', '12513681', '12513675', '12514054', '12514054', '12514054', '12513994', '12513991', '12514014', '12513987', '12514029', '12514020', '12514032', '12514043', '12513994', '12513997', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12513677', '12513679', '12513669', '12513676', '12513683', '12513672', '12513674', '12513670', '12513673']
['12514044', '12514041', '12513667', '12513644', '12513678', '12513668', '12513641', '12513640', '12513654', '12513681', '12513675', '12514054', '12514054', '12514054', '12513994', '12513991', '12514014', '12513987', '12514029', '12514020', '12514032', '12514043', '12513994', '12513997', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12513677', '12513679', '12513669', '12513676', '12513683', '12513672', '12513674', '12513670', '12513673', '12513671', '12513664', '12513653', '1

leagues:   0%|          | 0/1 [00:00<?, ?it/s]

season_ids:   0%|          | 0/5 [00:00<?, ?it/s]

['12421255', '12421258', '12421260', '12421257', '12421262', '12421261', '12421256', '12421259', '12421104', '12421104', '12421093', '12421090', '12421000', '12421089', '12421070', '12421009', '12421049', '12421050', '12420981', '12421063', '12421091', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12421250', '12421254', '12421252', '12421249', '12421253', '12421248', '12421247', '12421251']
['12421255', '12421258', '12421260', '12421257', '12421262', '12421261', '12421256', '12421259', '12421104', '12421104', '12421093', '12421090', '12421000', '12421089', '12421070', '12421009', '12421049', '12421050', '12420981', '12421063', '12421091', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12421250', '12421254', '12421252', '12421249', '12421253', '12421248', '12421247', '12421251', '12421243', '12421240', '12421242', '12421244', '12421246', '12421239', '12421245', '12421241']
['12421255', '12421258', '12421260', '

leagues:   0%|          | 0/1 [00:00<?, ?it/s]

season_ids:   0%|          | 0/6 [00:00<?, ?it/s]

['12407880', '12407881', '12407882', '12407885', '12407883', '12407884', '12407821', '12407821', '12407822', '12407773', '12407785', '12407790', '12407807', '12407785', '12407775', '12407782', '12407816', '12407803', '12407795', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12407877', '12407879', '12407874', '12407878', '12407876', '12407875']
['12407880', '12407881', '12407882', '12407885', '12407883', '12407884', '12407821', '12407821', '12407822', '12407773', '12407785', '12407790', '12407807', '12407785', '12407775', '12407782', '12407816', '12407803', '12407795', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12407877', '12407879', '12407874', '12407878', '12407876', '12407875', '12407870', '12407868', '12407869', '12407871', '12407872', '12407873']
['12407880', '12407881', '12407882', '12407885', '12407883', '12407884', '12407821', '12407821', '12407822', '12407773', '12407785', '12407790', '12407807', '

leagues:   0%|          | 0/1 [00:00<?, ?it/s]

season_ids:   0%|          | 0/10 [00:00<?, ?it/s]

['12448373', '12448374', '12441769', '12441690', '12441768', '12441767', '12441770', '12441682', '12441692', '12441688', '12441771', '12448379', '12448379', '12448382', '12448342', '12448333', '12448360', '12448342', '12448277', '12448313', '12448358', '12448247', '12448342', '12448262', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12441702', '12441707', '12441705', '12441708', '12441704', '12441709', '12441710', '12441711', '12441760']
['12448373', '12448374', '12441769', '12441690', '12441768', '12441767', '12441770', '12441682', '12441692', '12441688', '12441771', '12448379', '12448379', '12448382', '12448342', '12448333', '12448360', '12448342', '12448277', '12448313', '12448358', '12448247', '12448342', '12448262', '12437842', '12437034', '12437026', '12437038', '12437035', '12499684', '12437028', '12441702', '12441707', '12441705', '12441708', '12441704', '12441709', '12441710', '12441711', '12441760', '12441698', '12441706', '12441696', '1

In [69]:
# ids.clean_ids() #We are no longer cleaning like this

  0%|          | 0/382 [00:00<?, ?it/s]

removed the elements [], leaving 382 left over


In [50]:
print(ids.list_match_ids())
print(len(ids.list_match_ids()))

['12437058', '12436545', '12436958', '12437023', '12436610', '12436556', '12436931', '12436509', '12436933', '12437015', '12436591', '12436460', '12436576', '12436996', '12436994', '12436592', '12436447', '12436871', '12436536', '12436889', '12436952', '12436481', '12436937', '12436968', '12436912', '12437034', '12437057', '12436907', '12436882', '12436563', '12436951', '12436946', '12436944', '12436597', '12436546', '12436486', '12436580', '12437024', '12437056', '12437060', '12436560', '12436507', '12437003', '12437059', '12437020', '12436893', '12436526', '12436438', '12437049', '12436899', '12436590', '12436572', '12436494', '12437048', '12436437', '12437043', '12436515', '12436552', '12436978', '12437039', '12437047', '12436564', '12436522', '12436555', '12437035', '12436969', '12436974', '12436964', '12436559', '12436948', '12436553', '12436887', '12436914', '12436909', '12436894', '12436477', '12436898', '12436993', '12436595', '12436880', '12436517', '12436469', '12436543', '12

In [45]:
import pandas as pd
import os

cur_dir=os.getcwd()
json_path=os.path.join(cur_dir,'pre_data','sofascore','match_ids.json')

mids=pd.read_json(json_path)

In [46]:
import numpy as np

# Ensure mids is a dictionary and filter out any NaN values
if isinstance(mids, pd.DataFrame):
    mids = mids.to_dict()

results = []

# Traverse the JSON structure safely, ignoring NaN values
for country, tournaments in mids.items():
    if not isinstance(tournaments, dict) or pd.isna(tournaments):
        continue  # Skip invalid or NaN values

    for tournament, seasons in tournaments.items():
        if not isinstance(seasons, dict) or pd.isna(seasons):
            continue

        for season, details in seasons.items():
            if not isinstance(details, dict) or pd.isna(details):
                continue

            # Extract match IDs and the matches value
            match_ids = details.get("match_ids", [])
            match_count = len(match_ids)
            total_matches = details.get("matches", 0)

            # Check for duplicate match IDs
            duplicate_count = len(match_ids) - len(set(match_ids))

            results.append([tournament, season, match_count, total_matches, duplicate_count])

# Convert results to DataFrame
df = pd.DataFrame(results, columns=["Tournament", "Season", "Match ID Count", "Matches", "Duplicate Matches"])

# Display the DataFrame
print(df)

# Identify mismatches between Match ID Count and Matches
mismatches = df[df["Match ID Count"] != df["Matches"]]
if not mismatches.empty:
    print("\nMismatches found:")
    print(mismatches)
else:
    print("\nAll match counts are consistent.")


        Tournament Season  Match ID Count  Matches  Duplicate Matches
0   premier-league  24/25             382      380                  0
1   premier-league  23/24             387      380                  0
2   premier-league  22/23             409      380                  0
3   premier-league  21/22             422      380                  0
4   premier-league  20/21             399      380                  0
..             ...    ...             ...      ...                ...
92      eredivisie  19/20             319      306                  0
93      eredivisie  18/19             320      312                  0
94      eredivisie  17/18             321      312                  0
95      eredivisie  16/17             319      312                  0
96      eredivisie  15/16             324      312                  0

[97 rows x 5 columns]

Mismatches found:
        Tournament Season  Match ID Count  Matches  Duplicate Matches
0   premier-league  24/25             382      3

In [44]:
df

Unnamed: 0,Tournament,Season,Match ID Count,Matches,Duplicate Matches
0,premier-league,24/25,382,380,0
1,premier-league,23/24,387,380,0
2,premier-league,22/23,409,380,0
3,premier-league,21/22,422,380,0
4,premier-league,20/21,399,380,0
...,...,...,...,...,...
92,eredivisie,19/20,319,306,0
93,eredivisie,18/19,320,312,0
94,eredivisie,17/18,321,312,0
95,eredivisie,16/17,319,312,0


In [49]:
unique_match_ids = set()

# Traverse the data to collect unique match IDs
def collect_match_ids(data):
    if isinstance(data, dict):
        for key, value in data.items():
            if key == "match_ids":
                unique_match_ids.update(value)
            else:
                collect_match_ids(value)
    elif isinstance(data, list):
        for item in data:
            collect_match_ids(item)

# Start collecting match IDs from the 'mids' variable
collect_match_ids(mids)

# Convert the set back to a list
unique_match_ids_list = list(unique_match_ids)

# Create a new JSON structure to save
new_data = {
    "unique_match_ids": unique_match_ids_list
}

# Write the unique match IDs to a new JSON file
with open('unique_match_ids.json', 'w') as file:
    json.dump(new_data, file, indent=4)

print("Unique match IDs have been saved to 'unique_match_ids.json'.")

Unique match IDs have been saved to 'unique_match_ids.json'.


In [2]:
data = sofascraper.get_all_data() #This will end up getting ~700 repeated matches, but if we removed repeated rows after it should be fine

main loop:   0%|          | 0/35969 [00:00<?, ?it/s]

Failed to fetch data for match ID: 12437058, Status: 404
Failed to fetch data for match ID: 12436545, Status: 404
Failed to fetch data for match ID: 12436958, Status: 404
Failed to fetch data for match ID: 12437023, Status: 404
Failed to fetch data for match ID: 12436610, Status: 404
Failed to fetch data for match ID: 12436556, Status: 404
Failed to fetch data for match ID: 12436509, Status: 404
Failed to fetch data for match ID: 12436591, Status: 404
Failed to fetch data for match ID: 12436460, Status: 404
Failed to fetch data for match ID: 12436576, Status: 404
Failed to fetch data for match ID: 12436996, Status: 404
Failed to fetch data for match ID: 12436994, Status: 404
Failed to fetch data for match ID: 12436592, Status: 404
Failed to fetch data for match ID: 12436447, Status: 404
Failed to fetch data for match ID: 12436536, Status: 404
Failed to fetch data for match ID: 12436889, Status: 404
Failed to fetch data for match ID: 12436952, Status: 404
Failed to fetch data for match 

In [59]:
#Checking why some match ids are not returning any data

url = "https://www.sofascore.com/api/v1/event/12437058"
parsed_url = urlparse(url)
conn = http.client.HTTPSConnection(parsed_url.netloc)
conn.request("GET",parsed_url.path)
res = conn.getresponse()
data = res.read()
jsondata = json.loads(data.decode("utf-8"))
jsondata #we can see it is a match that has not been played yet

{'event': {'tournament': {'name': 'Premier League',
   'slug': 'premier-league',
   'category': {'name': 'England',
    'slug': 'england',
    'sport': {'name': 'Football', 'slug': 'football', 'id': 1},
    'id': 1,
    'country': {'alpha2': 'EN',
     'alpha3': 'ENG',
     'name': 'England',
     'slug': 'england'},
    'flag': 'england',
    'alpha2': 'EN'},
   'uniqueTournament': {'name': 'Premier League',
    'slug': 'premier-league',
    'primaryColorHex': '#3c1c5a',
    'secondaryColorHex': '#f80158',
    'category': {'name': 'England',
     'slug': 'england',
     'sport': {'name': 'Football', 'slug': 'football', 'id': 1},
     'id': 1,
     'country': {'alpha2': 'EN',
      'alpha3': 'ENG',
      'name': 'England',
      'slug': 'england'},
     'flag': 'england',
     'alpha2': 'EN'},
    'userCount': 1912307,
    'id': 17,
    'country': {},
    'hasPerformanceGraphFeature': True,
    'hasEventPlayerStatistics': True,
    'displayInverseHomeAwayTeams': False},
   'priority': 

In [3]:
data.data.head()

Unnamed: 0,name,shortName,position,height,dateOfBirthTimestamp,team,date,league,season,minutesPlayed,...,hitWoodwork,bigChanceCreated,bigChanceMissed,shotOffTarget,onTargetScoringAttempt,goals,expectedGoals,wasFouled,fouls,totalOffside
0,Mark Travers,M. Travers,G,191.0,1999-05-17 17:00:00,Bournemouth,2024-09-14 12:00:00,Premier League,24/25,90,...,0,0,0,0,0,0,0.0,0,0,0
1,Adam Smith,A. Smith,D,180.0,1991-04-28 17:00:00,Bournemouth,2024-09-14 12:00:00,Premier League,24/25,75,...,0,0,0,0,0,0,0.0,0,0,0
2,Illia Zabarnyi,I. Zabarnyi,D,189.0,2002-08-31 17:00:00,Bournemouth,2024-09-14 12:00:00,Premier League,24/25,90,...,0,0,0,0,0,0,0.0,0,0,0
3,Marcos Senesi,M. Senesi,D,184.0,1997-05-09 17:00:00,Bournemouth,2024-09-14 12:00:00,Premier League,24/25,90,...,0,0,0,0,0,0,0.0,0,1,0
4,Miloš Kerkez,M. Kerkez,D,176.0,2003-11-06 16:00:00,Bournemouth,2024-09-14 12:00:00,Premier League,24/25,90,...,0,0,0,1,0,0,0.0678,0,1,0


In [5]:
data.data

Unnamed: 0,name,shortName,position,height,dateOfBirthTimestamp,team,date,league,season,minutesPlayed,...,hitWoodwork,bigChanceCreated,bigChanceMissed,shotOffTarget,onTargetScoringAttempt,goals,expectedGoals,wasFouled,fouls,totalOffside
0,Mark Travers,M. Travers,G,191.0,1999-05-17 17:00:00,Bournemouth,2024-09-14 12:00:00,Premier League,24/25,90,...,0,0,0,0,0,0,0.0000,0,0,0
1,Adam Smith,A. Smith,D,180.0,1991-04-28 17:00:00,Bournemouth,2024-09-14 12:00:00,Premier League,24/25,75,...,0,0,0,0,0,0,0.0000,0,0,0
2,Illia Zabarnyi,I. Zabarnyi,D,189.0,2002-08-31 17:00:00,Bournemouth,2024-09-14 12:00:00,Premier League,24/25,90,...,0,0,0,0,0,0,0.0000,0,0,0
3,Marcos Senesi,M. Senesi,D,184.0,1997-05-09 17:00:00,Bournemouth,2024-09-14 12:00:00,Premier League,24/25,90,...,0,0,0,0,0,0,0.0000,0,1,0
4,Miloš Kerkez,M. Kerkez,D,176.0,2003-11-06 16:00:00,Bournemouth,2024-09-14 12:00:00,Premier League,24/25,90,...,0,0,0,1,0,0,0.0678,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1284463,Lucas Bijker,L. Bijker,D,175.0,1993-03-03 16:00:00,SC Heerenveen,2015-12-05 11:45:00,Eredivisie,15/16,17,...,0,0,0,0,0,0,0.0000,1,0,0
1284464,Maarten De Fockert,M. D. Fockert,G,189.0,1995-02-19 16:00:00,SC Heerenveen,2015-12-05 11:45:00,Eredivisie,15/16,0,...,0,0,0,0,0,0,0.0000,0,0,0
1284465,Jordy Buijs,J. Buijs,D,185.0,1988-12-27 16:00:00,SC Heerenveen,2015-12-05 11:45:00,Eredivisie,15/16,0,...,0,0,0,0,0,0,0.0000,0,0,0
1284466,Luka Zahović,L. Zahović,F,177.0,1995-11-14 16:00:00,SC Heerenveen,2015-12-05 11:45:00,Eredivisie,15/16,0,...,0,0,0,0,0,0,0.0000,0,0,0


In [6]:
len(data.data)

1284468