## Wikipedia FIFA World Cup Squad Scraping
This notebook pulls FIFA World Cup squad information from the Wikipedia squad pages.

The pages were accessed on 3 November 2022.

It then tidies up the resulting data and saves it to a file, ready to be uploaded to the SQLite database.

In [166]:
from bs4 import BeautifulSoup
import requests
import os
import re
import numpy as np
import pandas as pd
import re as re
import itertools
from IPython.display import clear_output


import warnings
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=FutureWarning)


In [63]:
# Tournament years to scrape, newest first: every fourth year from 2018
# back to 1994, kept as strings because they are spliced into page URLs.
years = [str(year) for year in range(2018, 1990, -4)]

In [3]:
# Preview the squads-page URL that will be requested for each year.
squad_urls = [f'https://en.wikipedia.org/wiki/{y}_FIFA_World_Cup_squads' for y in years]
for url in squad_urls:
    print(url)

https://en.wikipedia.org/wiki/2018_FIFA_World_Cup_squads
https://en.wikipedia.org/wiki/2014_FIFA_World_Cup_squads
https://en.wikipedia.org/wiki/2010_FIFA_World_Cup_squads
https://en.wikipedia.org/wiki/2006_FIFA_World_Cup_squads
https://en.wikipedia.org/wiki/2002_FIFA_World_Cup_squads
https://en.wikipedia.org/wiki/1998_FIFA_World_Cup_squads
https://en.wikipedia.org/wiki/1994_FIFA_World_Cup_squads


In [75]:
# Build the (year, country) lookup table from the <h3> headings of each
# year's squads page. The first `n` headings on a page are taken to be the
# participating nations, in page order.
country_records = []
for y in years:
    #only 24 nations in 1994; every other year scraped here uses 32
    n = 24 if y == '1994' else 32

    url = f'https://en.wikipedia.org/wiki/{y}_FIFA_World_Cup_squads'
    html = requests.get(url, timeout=30)
    # Stop on an HTTP error instead of silently parsing an error page.
    html.raise_for_status()
    soup = BeautifulSoup(html.text, 'html.parser')
    headings = soup.find_all('h3')
    if len(headings) < n:
        # A short page would otherwise yield a silently truncated country list.
        raise ValueError(f'expected at least {n} <h3> headings on {url}, found {len(headings)}')

    for heading in headings[:n]:
        # Drop the "[edit]" link text that can trail a section heading.
        country_name = heading.text.replace('[edit]', '').strip()
        country_records.append({'year': y,
                                'country': country_name})

# Explicit columns keep the frame well-formed even when nothing was scraped.
countries = pd.DataFrame(country_records, columns=['year', 'country'])


In [115]:
# Parsed pages keyed by URL, so repeated calls for the same page (one per
# country) download it only once. Call `_SOUP_CACHE.clear()` to force a refetch.
_SOUP_CACHE = {}


def _fetch_soup(url):
    """Download and parse `url`, reusing the parsed page on repeat calls."""
    if url not in _SOUP_CACHE:
        page = requests.get(url, timeout=30)
        # Raise on an HTTP error rather than parsing an error page.
        page.raise_for_status()
        _SOUP_CACHE[url] = BeautifulSoup(page.text, 'html.parser')
    return _SOUP_CACHE[url]


def get_table(url, n):
    """Return the cell text of the n-th <table> on the page at `url`.

    Parameters
    ----------
    url : str
        Address of the HTML page to read.
    n : int
        Index of the table among all <table> elements on the page.

    Returns
    -------
    list[list[str]]
        One list per table row, holding the text of that row's <td> cells
        with newline characters removed. Rows without any <td> cell are
        skipped. Only <td> cells are collected, so anything marked up as
        <th> is not included.

    Raises
    ------
    IndexError
        If the page has no table at index `n`.
    requests.HTTPError
        If the page request returns an error status.
    """
    tables = _fetch_soup(url).find_all('table')
    try:
        table = tables[n]
    except IndexError:
        raise IndexError(
            f'table index {n} out of range: {url} has {len(tables)} tables'
        ) from None

    rows = []
    for tr in table.find_all('tr'):
        cells = [td.text.replace('\n', '') for td in tr.find_all('td')]
        if cells:
            rows.append(cells)

    return rows

In [168]:
#Scrape every squad table: one table per (year, country) row of `countries`.
#The i-th country listed for a year is read from the i-th <table> on that
#year's page, so `i` restarts at 0 whenever the year changes.
#NOTE(review): this relies on heading order matching table order - confirm per page.

frames = []
i = 0
previous_year = None
for row in countries.itertuples(index=False):
    y, c = row.year, row.country
    i = i + 1 if y == previous_year else 0
    previous_year = y
    # Show only the squad currently being fetched as a progress indicator.
    clear_output(wait=True)
    print(y + ' ' + c)

    url = f'https://en.wikipedia.org/wiki/{y}_FIFA_World_Cup_squads'
    frames.append(pd.DataFrame(get_table(url, i)).assign(year=y, country=c))

# Concatenate once at the end instead of growing the frame inside the loop.
squad_data = pd.concat(frames, ignore_index=True, sort=False) if frames else pd.DataFrame()

squad_data

1994 Saudi Arabia


Unnamed: 0,0,1,2,3,4,5,year,country
0,1,1GK,(1973-01-15)15 January 1973 (aged 45),158,0,Al Taawoun,2018,Egypt
1,2,2DF,(1989-01-01)1 January 1989 (aged 29),21,1,West Bromwich Albion,2018,Egypt
2,3,2DF,(1987-09-09)9 September 1987 (aged 30),78,2,Aston Villa,2018,Egypt
3,4,3MF,(1992-01-30)30 January 1992 (aged 26),24,0,Los Angeles FC,2018,Egypt
4,5,3MF,(1991-09-10)10 September 1991 (aged 26),5,0,Wigan Athletic,2018,Egypt
...,...,...,...,...,...,...,...,...
4908,18,2DF,(1968-09-24)24 September 1968 (aged 25),1,Al Shabab,,1994,Saudi Arabia
4909,19,3MF,(1967-04-19)19 April 1967 (aged 27),0,Al Ahli,,1994,Saudi Arabia
4910,20,4FW,(1972-10-08)8 October 1972 (aged 21),7,Ohod,,1994,Saudi Arabia
4911,21,1GK,(1973-10-15)15 October 1973 (aged 20),0,Al Qadsiah,,1994,Saudi Arabia


In [169]:
#rename columns
# The scraped cells arrive under positional labels 0-5; `year` and `country`
# are already named. `rename` returns a new frame (no `inplace`, which
# `set_axis` no longer accepts in pandas 2.x) and is a no-op when the cell
# is re-run after the columns have been renamed.
squad_data = squad_data.rename(columns={
    0: 'number',
    1: 'position',
    2: 'age_all',
    3: 'caps',
    4: 'goals',
    5: 'team',
})
squad_data.head()

Unnamed: 0,number,position,age_all,caps,goals,team,year,country
0,1,1GK,(1973-01-15)15 January 1973 (aged 45),158,0,Al Taawoun,2018,Egypt
1,2,2DF,(1989-01-01)1 January 1989 (aged 29),21,1,West Bromwich Albion,2018,Egypt
2,3,2DF,(1987-09-09)9 September 1987 (aged 30),78,2,Aston Villa,2018,Egypt
3,4,3MF,(1992-01-30)30 January 1992 (aged 26),24,0,Los Angeles FC,2018,Egypt
4,5,3MF,(1991-09-10)10 September 1991 (aged 26),5,0,Wigan Athletic,2018,Egypt


In [183]:
#split out the age_all column
# age_all looks like "(1973-01-15)15 January 1973 (aged 45)".
# DOB is the text inside the first pair of parentheses.
squad_data['DOB'] = squad_data['age_all'].str.extract(r'\(([^)]*)\)', expand=False)
# age is the number inside "(aged NN)"; \s+ accepts any whitespace between
# the word and the number. astype(int) raises if any row failed to parse.
squad_data['age'] = (
    squad_data['age_all']
    .str.extract(r'\(aged\s+(\d+)\)', expand=False)
    .astype(int)
)
#remove number from position column
squad_data['position'] = squad_data['position'].str.replace(r'\d+', '', regex=True)
squad_data.head()

Unnamed: 0,number,position,age_all,caps,goals,team,year,country,DOB,age
0,1,GK,(1973-01-15)15 January 1973 (aged 45),158,0,Al Taawoun,2018,Egypt,1973-01-15,45
1,2,DF,(1989-01-01)1 January 1989 (aged 29),21,1,West Bromwich Albion,2018,Egypt,1989-01-01,29
2,3,DF,(1987-09-09)9 September 1987 (aged 30),78,2,Aston Villa,2018,Egypt,1987-09-09,30
3,4,MF,(1992-01-30)30 January 1992 (aged 26),24,0,Los Angeles FC,2018,Egypt,1992-01-30,26
4,5,MF,(1991-09-10)10 September 1991 (aged 26),5,0,Wigan Athletic,2018,Egypt,1991-09-10,26


In [None]:
#move the team data to team, if the data frame was missing goals
# Works directly on squad_data so the cell runs on a fresh kernel; it no
# longer refers to `test`, which is only created in a later cell.
# Where `team` is missing, the value sitting in `goals` is moved across.
squad_data['team'] = squad_data['team'].fillna(squad_data['goals'])
# Anything non-numeric left in `goals` (the values just moved) becomes 0.
squad_data['goals'] = (
    pd.to_numeric(squad_data['goals'], errors='coerce')
    .fillna(0)
    .astype(int)
)

In [219]:
# `test` starts out as a second name for squad_data (not a copy), so the
# team/goals repairs below are applied to squad_data as well.
test = squad_data
# Where `team` is missing, the value sitting in `goals` is moved across.
test['team'] = test['team'].fillna(test['goals'])
# Coerce goals to numbers; non-numeric entries (the values just moved) become 0.
test['goals'] = pd.to_numeric(test['goals'], errors='coerce').fillna(0).astype(int)
# Fill every remaining NaN with 0. This returns a new frame, so from here on
# `test` is a separate object from squad_data.
test = test.fillna(0)

test

Unnamed: 0,number,position,age_all,caps,goals,team,year,country,DOB,age
0,1,GK,(1973-01-15)15 January 1973 (aged 45),158,0,Al Taawoun,2018,Egypt,1973-01-15,45
1,2,DF,(1989-01-01)1 January 1989 (aged 29),21,1,West Bromwich Albion,2018,Egypt,1989-01-01,29
2,3,DF,(1987-09-09)9 September 1987 (aged 30),78,2,Aston Villa,2018,Egypt,1987-09-09,30
3,4,MF,(1992-01-30)30 January 1992 (aged 26),24,0,Los Angeles FC,2018,Egypt,1992-01-30,26
4,5,MF,(1991-09-10)10 September 1991 (aged 26),5,0,Wigan Athletic,2018,Egypt,1991-09-10,26
...,...,...,...,...,...,...,...,...,...,...
4908,18,DF,(1968-09-24)24 September 1968 (aged 25),1,0,Al Shabab,1994,Saudi Arabia,1968-09-24,25
4909,19,MF,(1967-04-19)19 April 1967 (aged 27),0,0,Al Ahli,1994,Saudi Arabia,1967-04-19,27
4910,20,FW,(1972-10-08)8 October 1972 (aged 21),7,0,Ohod,1994,Saudi Arabia,1972-10-08,21
4911,21,GK,(1973-10-15)15 October 1973 (aged 20),0,0,Al Qadsiah,1994,Saudi Arabia,1973-10-15,20


In [185]:
# Save the cleaned frame (`test`) rather than squad_data, whose remaining
# missing values were never filled. Create the output folder first so the
# export does not fail when ./raw_data is missing.
os.makedirs('./raw_data', exist_ok=True)
test.to_csv('./raw_data/squad_data.csv', index = False)