In [2]:
import pandas as pd
import requests
from io import StringIO
from bs4 import BeautifulSoup

In [None]:
# function for scraping
def get_wc_data(year: int, timeout: float = 30) -> pd.DataFrame:
    '''Gets squad data by World Cup year from Wikipedia.

    Downloads the "<year>_FIFA_World_Cup_squads" page, walks its ``h3``
    headings in document order and parses the table that follows each heading
    for as long as that table's first header cell contains ``'No.'``. Scanning
    stops at the first heading whose table does not match.

    Args:
        year (int): The World Cup year for which to extract the data.
        timeout (float): Seconds to wait for the HTTP request before giving up.
            Defaults to 30.

    Returns:
        pd.DataFrame: The rows of every squad table found, concatenated with a
            fresh integer index. Each row keeps the table's own columns and
            gains ``Country`` (the text of the heading that precedes the
            table) and ``Year`` (the ``year`` argument).

    Raises:
        requests.HTTPError: If the server answers with an error status
            (e.g. there is no squads page for ``year``).
        requests.Timeout: If the server does not answer within ``timeout``.
        ValueError: If no squad table is found on the page.

    '''
    url = f'https://en.wikipedia.org/wiki/{year}_FIFA_World_Cup_squads'
    # A browser-like User-Agent is sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/120.0.0.0 Safari/537.36'
    }
    # Without a timeout requests may wait forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as squad data.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    squad_frames = []
    for heading in soup.find_all('h3'):
        table = heading.find_next('table')
        # Look for the header cell *inside* this table only; find_next() would
        # fall through to a later table when this one has no <th>.
        first_header = table.find('th') if table is not None else None
        if first_header is None or 'No.' not in first_header.get_text():
            # First heading that is not followed by a squad table: stop here.
            break

        country_data = pd.read_html(StringIO(str(table)))[0]
        country_data['Country'] = heading.get_text().strip()
        country_data['Year'] = year
        squad_frames.append(country_data)

    if not squad_frames:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f'No squad tables found at {url}')

    return pd.concat(squad_frames, ignore_index=True)

In [20]:
# Scrape every squad of the 2022 tournament and preview the first five rows.
all_2022_data = get_wc_data(year=2022)
all_2022_data.head(n=5)

Unnamed: 0,No.,Pos.,Player,Date of birth (age),Caps,Goals,Club,Country,Year
0,1,GK,Hernán Galíndez,30 March 1987 (aged 35),12,0,Aucas,Ecuador,2022
1,2,DF,Félix Torres,11 January 1997 (aged 25),17,2,Santos Laguna,Ecuador,2022
2,3,DF,Piero Hincapié,9 January 2002 (aged 20),21,1,Bayer Leverkusen,Ecuador,2022
3,4,DF,Robert Arboleda,22 October 1991 (aged 31),33,2,São Paulo,Ecuador,2022
4,5,MF,José Cifuentes,12 March 1999 (aged 23),11,0,Los Angeles FC,Ecuador,2022
