## 1. Web Scraping Player Data from FUTBIN

FUTBIN is a popular website and mobile app that provides tools and resources for players of the Ultimate Team mode in FIFA 23 and other versions of the game. The website offers a comprehensive database of player cards, including their stats, ratings, and market values, as well as real-time updates on the prices of different cards in the game's virtual marketplace.

### Import Libraries

In [1]:
import pandas as pd

import requests
from bs4 import BeautifulSoup

from time import sleep

### Web Scraping Player Data from Page 1

In [2]:
# Define url and user-agent
url = 'https://www.futbin.com/players?page=1'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'

# Set the user agent string in the headers
headers = {
    'User-Agent': user_agent
}

# Make the request using the headers; the timeout stops the cell from hanging
# indefinitely if the server never responds
response = requests.get(url, headers=headers, timeout=30)

# Fail here on an HTTP error status instead of parsing an error page as if it
# were the players table
response.raise_for_status()

# Parse the HTML content using Beautiful Soup
soup = BeautifulSoup(response.content, 'html.parser')

In [3]:
# Find stats for an example player: take the first <tr> element found on the page
example = soup.find_all('tr')[0]

# Extract the whitespace-stripped text of every <td> cell in the example row
example_stats = [col.text.strip() for col in example.find_all('td')]
# Bare expression on the last line so the notebook displays the list
example_stats


['',
 'Pelé',
 '98',
 'CAM\nCF,ST',
 'Icon\n\n\r\n                                                    Explosive',
 '4.05M \n\n\n                                                                2.51%',
 '5',
 '4',
 'H \\ M',
 '95',
 '96',
 '93',
 '96',
 '60',
 '76',
 '173cm | 5\'8"\n  Unique  (70kg)',
 '3782',
 '516',
 '2513']

This data is missing the player's team, country and league, which are in the form of hyperlinks. Therefore, I will build a function to extract these values.

In [4]:
def get_loc_attrs(row):
    """Return the link titles that hold a player's team, nation and league.

    Args:
        row: A table row element (as returned by Beautiful Soup) for one player.

    Returns:
        A list with the 'data-original-title' attribute of the second, third
        and fourth links inside the row's second cell. A link without that
        attribute contributes None; the list is shorter if fewer links exist.
    """
    # The second <td> is the cell that contains the hyperlinks
    name_cell = row.find_all('td')[1]
    # Skip the first link and keep the next three
    loc_links = name_cell.find_all('a')[1:4]
    return [anchor.get('data-original-title') for anchor in loc_links]
    
# Test on example row; the notebook displays the returned list
get_loc_attrs(example)

['FUT ICONS', 'Brazil', 'Icons']

I then build a function to extract the table headers:

In [5]:
def get_table_headers(soup):
    """Return the column names for the players table on a parsed page.

    Args:
        soup: Parsed page (Beautiful Soup object) containing the table headers.

    Returns:
        A list starting with 'Name', followed by the title of every <th> from
        the third one onwards, and ending with 'Team', 'Nation' and 'League'.
    """
    # Start from the third header as the first two are in a different format
    raw_titles = [
        th.find('a').get('data-original-title')
        for th in soup.find_all('th')[2:]
    ]

    # Titles that begin with 'Order By' lose that prefix and the character after it
    cleaned_titles = [
        title[9:] if title[:8] == 'Order By' else title
        for title in raw_titles
    ]

    # 'Name' and the three location attributes are not picked up from the <th> titles
    return ['Name'] + cleaned_titles + ['Team', 'Nation', 'League']

# Preview the column names generated from the parsed page
get_table_headers(soup)

['Name',
 'Rating',
 'Position',
 'Version',
 'Price',
 'Skills',
 'Weak Foot',
 'Attack \\ Defense',
 'Pace',
 'Shooting',
 'Passing',
 'Dribbling',
 'Defending',
 'Physicality',
 'Height',
 'Popularity',
 'Base Stats',
 'In Game Stats',
 'Team',
 'Nation',
 'League']

Next I build a function to create a dataframe of player attributes from the current page:

In [6]:
def players_df_from_onepage(soup):
    """Build a DataFrame of player attributes from one parsed players page.

    Args:
        soup: Parsed page (Beautiful Soup object) containing the players table.

    Returns:
        A pandas DataFrame with one row for every <tr> that has more than one
        <td>. Column names come from `get_table_headers`, with the work-rate
        column renamed so that its meaning is explicit.
    """
    # Collect one list of attributes per player
    all_players = []

    for row in soup.find_all('tr'):
        cols = row.find_all('td')
        # Skip rows with fewer than 2 columns (blank lines or ads)
        if len(cols) < 2:
            continue

        # Team, nation and league only exist as link titles, not as cell text
        loc_attrs = get_loc_attrs(row)
        # Ignore the first column, which is blank
        other_attrs = [col.text.strip() for col in cols[1:]]
        all_players.append(other_attrs + loc_attrs)

    # Get the table headers from the page
    headers = get_table_headers(soup)

    # Convert the list of player attributes to a pandas dataframe
    df = pd.DataFrame(all_players, columns=headers)

    # Rename the work-rate column. The backslash is escaped explicitly because
    # a backslash followed by a space is an invalid escape sequence in a
    # normal string literal; the resulting column names are unchanged.
    df.rename(
        columns={'Attack \\ Defense': 'Work Rate (Attack \\ Defense)'},
        inplace=True,
    )

    # Return the dataframe containing all the player attributes
    return df


In [7]:
# Build the DataFrame for the parsed page and preview its first rows
df = players_df_from_onepage(soup)
df.head()

Unnamed: 0,Name,Rating,Position,Version,Price,Skills,Weak Foot,Work Rate (Attack \ Defense),Pace,Shooting,...,Dribbling,Defending,Physicality,Height,Popularity,Base Stats,In Game Stats,Team,Nation,League
0,Pelé,98,"CAM\nCF,ST",Icon\n\n\r\n ...,4.05M \n\n\n ...,5,4,H \ M,95,96,...,96,60,76,"173cm | 5'8""\n Unique (70kg)",3782,516,2513,FUT ICONS,Brazil,Icons
1,Lionel Messi,98,RW\nRM,TOTY\n\n\r\n ...,5.1M \n\n\n ...,4,4,H \ L,93,98,...,99,40,77,"169cm | 5'7""\n Messi (67kg)",5398,504,2469,Paris SG,Argentina,Ligue 1
2,Karim Benzema,97,CF\nST,TOTY\n\n\r\n ...,2.38M \n\n\n ...,4,5,H \ M,92,97,...,94,45,90,"185cm | 6'1""\n Average (81kg)",4363,508,2449,Real Madrid,France,LaLiga Santander
3,Kylian Mbappé,97,"ST\nCF,LW",TOTY\n\n\r\n ...,12M \n\n\n ...,5,4,H \ L,99,96,...,98,44,87,"182cm | 6'0""\n Unique (73kg)",3433,512,2455,Paris SG,France,Ligue 1
4,Kevin De Bruyne,97,CM\nCAM,TOTY\n\n\r\n ...,2.46M \n\n\n ...,4,5,H \ H,85,94,...,94,81,86,"181cm | 5'11""\n Unique (70kg)",2252,538,2593,Manchester City,Belgium,Premier League


### Web Scraping Player Data from All Pages

In [8]:
# Find the last page number by web scraping the page navigation buttons below the table.
# The second-to-last 'page-item' entry is read; presumably the final entry is a
# "next page" button rather than a number — confirm against the page markup.
last_page = int(soup.find_all('li', class_='page-item')[-2].find('a').text) 
print(last_page)

698


First I combine the steps from the previous section to create a function that creates a DataFrame for players on any given page:

In [9]:
def player_df_from_anypage(page_number):
    """Download one page of the FUTBIN players table and return it as a DataFrame.

    Args:
        page_number: Number of the page to request (used as the `page` query
            value in the URL).

    Returns:
        The DataFrame built by `players_df_from_onepage` for that page.

    Raises:
        requests.exceptions.RequestException: if the request fails, times out
            or returns an HTTP error status.
    """
    # Define url and user-agent
    url = f'https://www.futbin.com/players?page={page_number}'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'

    # Set the user agent string in the headers
    headers = {
        'User-Agent': user_agent
    }

    # Make the request using the headers; the timeout prevents a stalled
    # connection from blocking the scrape forever
    response = requests.get(url, headers=headers, timeout=30)

    # Raise on an HTTP error status so an error page is never parsed as if it
    # were a page of player data
    response.raise_for_status()

    # Parse the HTML content using Beautiful Soup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Call the previously defined function to create a DataFrame
    return players_df_from_onepage(soup)

Next, I'll create a new function that iterates through each page of data, calls the `player_df_from_anypage` function for each page, and combines the resulting DataFrames to create a complete dataset.

In [10]:
def get_dataset():
    """Scrape every page of the FUTBIN players table into a single DataFrame.

    Page 1 is requested first to read the last page number from the page
    navigation buttons. Every page from 1 to that number is then fetched with
    `player_df_from_anypage`.

    A page whose request raises a requests exception is retried after a
    30 second pause. If it still fails on the final attempt, scraping stops
    and the rows collected so far are returned.

    Returns:
        A pandas DataFrame with the rows of every page scraped, in page order.
        Each page keeps its own row index, so index values repeat.

    Raises:
        requests.exceptions.RequestException: if the initial request for
            page 1 fails, times out or returns an HTTP error status.
    """
    max_attempts = 3  # Tries per page before giving up

    # Define url and user-agent
    url = 'https://www.futbin.com/players?page=1'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
    # Set the user agent string in the headers
    headers = {
        'User-Agent': user_agent
    }
    # Make the request using the headers (with a timeout so it cannot hang)
    response = requests.get(url, headers=headers, timeout=30)
    # Stop immediately on an HTTP error instead of failing later while
    # looking for the navigation buttons in an error page
    response.raise_for_status()
    # Parse the HTML content using Beautiful Soup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Get the last page number
    last_page = int(soup.find_all('li', class_='page-item')[-2].find('a').text)

    # Collect the per-page frames and concatenate once at the end; calling
    # pd.concat inside the loop would re-copy every earlier row on each page
    page_frames = []
    rows_scraped = 0
    completed = True

    # Loop through all pages to get all player data for that page
    for i in range(1, last_page + 1):
        df_i = None
        for attempt in range(1, max_attempts + 1):
            try:
                df_i = player_df_from_anypage(i)
                break
            except requests.exceptions.RequestException as err:
                print(err)
                # Sleep for 30 seconds before trying the same page again
                if attempt < max_attempts:
                    sleep(30)

        if df_i is None:
            # Give up but keep what was scraped, so the caller still gets data
            print(f'Page {i} failed after {max_attempts} attempts, stopping early')
            completed = False
            break

        page_frames.append(df_i)
        rows_scraped += df_i.shape[0]

        # Sleep for 15 seconds every six requests to avoid too many requests
        if i % 6 == 0:
            sleep(15)
        if i % 50 == 0:
            print(f'Page {i} scraped, {rows_scraped} rows added')

    # Only report success when the loop really reached the last page
    if completed:
        print('All pages scraped')

    # pd.concat raises on an empty list, so return an empty frame in that case
    if not page_frames:
        return pd.DataFrame()
    return pd.concat(page_frames)

In [11]:
# Scrape every page; this takes a while because get_dataset pauses between batches of requests
player_dataset = get_dataset()

Page 50 scraped, 1500 rows added
Page 100 scraped, 3000 rows added
Page 150 scraped, 4500 rows added
Page 200 scraped, 6000 rows added
Page 250 scraped, 7500 rows added
Page 300 scraped, 9000 rows added
Page 350 scraped, 10500 rows added
Page 400 scraped, 12000 rows added
Page 450 scraped, 13500 rows added
Page 500 scraped, 15000 rows added
Page 550 scraped, 16500 rows added
Page 600 scraped, 18000 rows added
Page 650 scraped, 19500 rows added
All pages scraped


In [12]:
# Build the output path with pathlib so it works on any operating system
# (a hard-coded backslash is only a path separator on Windows)
from pathlib import Path

output_path = Path('datasets') / 'player_dataset.csv'
# Create the folder first: to_csv does not create missing directories
output_path.parent.mkdir(parents=True, exist_ok=True)
player_dataset.to_csv(output_path, index=False)