In [1]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import re

In [2]:
# extracting player urls
#
# Builds `player_urls`, a dict mapping each player's displayed name to the
# relative URL ('/player/...') of that player's page. Because the displayed
# name is the key, two players listed under the same name overwrite each other.

url_main = 'https://sofifa.com'
player_urls = {}
# NOTE: range(0, 10, 60) yields only offset 0, so exactly one listing page is
# fetched; raise the stop value to crawl further pages.
for n in range(0, 10, 60):
    # A timeout keeps the cell from hanging forever on a stalled connection.
    page = requests.get(url_main + '/players?offset=' + str(n), timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    page.raise_for_status()
    soup = bs(page.content, 'lxml')
    table = soup.find('table', {'class': 'table table-hover persist-area'})
    table_body = table.find('tbody') if table is not None else None
    if table_body is None:
        # Without the listing table there is nothing to extract for this page.
        raise RuntimeError('player table not found at offset ' + str(n))
    for link in table_body.find_all('a'):
        # Anchors without an href are skipped rather than raising KeyError.
        href = link.get('href', '')
        if href.startswith('/player/'):
            player_urls[link.text] = href

In [3]:
# extracting player attributes
#
# For every entry in `player_urls`, download the player's page and collect
#   * the auxiliary data named in `aux_attr_list`, and
#   * one rating per name in `attr_list`, in that order.
# The result is stored in `player_attr[player_name]` as
# `aux_attr_data + attr_ratings`. Players whose page cannot be fetched or whose
# data is incomplete are skipped with a printed notice.

player_attr = {}
st, mid, df, gk = 'Striker', 'Midfielder', 'Defender', 'GoalKeeper'
# Maps a position code found on the page to a coarse player category.
player_category_map = {'LW':st, 'ST':st, 'RW':st, 'LF':st, 'CF':st, 'RF':st,
                      'CAM':mid, 'LM':mid, 'CM':mid, 'RM':mid, 'CDM':mid,
                      'LWB':df, 'LB':df, 'CB':df, 'RB':df, 'RWB':df,
                      'GK': gk}
# CSS classes of the <div> elements searched for the auxiliary data.
content_aux_list = ['meta', 'column col-4 text-center']

attr_list = ['Crossing', 'Finishing', 'Heading Accuracy', 'Short Passing', 'Volleys', 'Dribbling', 'Curve', 'FK Accuracy',
             'Long Passing','Ball Control', 'Acceleration', 'Sprint Speed', 'Agility', 'Reactions', 'Balance', 'Shot Power',
             'Jumping', 'Stamina', 'Strength', 'Long Shots', 'Aggression', 'Interceptions', 'Positioning', 'Vision',
             'Penalties', 'Composure', 'Marking', 'Standing Tackle', 'Sliding Tackle', 'GK Diving', 'GK Handling',
             'GK Kicking', 'GK Positioning','GK Reflexes']
aux_attr_list = ['Player Category','Age', 'Height', 'Weight', 'Overall Rating', 'Value']
attr_len = len(attr_list)
aux_attr_len = len(aux_attr_list)


def _split_words(text):
    """Split `text` on whitespace, separating letters from digits/signs.

    A word that starts with a digit is kept as is. Any other word is split
    into its non-numeric characters and its digits / '+' / '-' characters,
    emitted as two consecutive words (letters first) when both are non-empty.
    """
    words = []
    for word in text.strip().split():
        if word[0].isdigit():
            words.append(word)
            continue
        letters = ''
        number = ''
        for char in word:
            if char.isdigit() or char == '+' or char == '-':
                number += char
            else:
                letters += char
        if letters:
            words.append(letters)
        if number:
            words.append(number)
    return words


def _extract_ratings(lines):
    """Return the ratings found in `lines`, matched in `attr_list` order.

    A rating is a word starting with a digit whose following non-numeric words
    (joined by single spaces) equal the next expected attribute name. Anything
    from the first '+' or '-' onwards is dropped from the rating.
    """
    ratings = []
    attr_index = 0
    for line in lines:
        for i, word in enumerate(line):
            if not word[0].isdigit():
                continue
            label_words = []
            for next_word in line[i+1:]:
                if next_word[0].isdigit():
                    break
                label_words.append(next_word)
            label = ' '.join(label_words)
            # Stop matching once every attribute has been found, so a repeated
            # final label cannot add surplus ratings.
            if attr_index < attr_len and label == attr_list[attr_index]:
                ratings.append(re.split(r'[+-]', word)[0])
                attr_index += 1
    return ratings


def _extract_aux(soup):
    """Return the auxiliary values found in the `content_aux_list` divs of `soup`.

    Values are appended in the order they are met: category, age, height and
    weight where a number follows a word ending in 'Age', then the word that
    follows 'Overall Rating' or 'Value'. The caller validates the length.
    """
    aux_data = []
    for css_class in content_aux_list:
        for block in soup.find_all('div', {'class': css_class}):
            line = block.text.strip().split()
            for i, word in enumerate(line):
                # No previous word exists for the first word; avoid wrapping
                # around to the end of the line via a negative index.
                word_prev = line[i-1] if i > 0 else ''
                if word[0].isdigit() and word_prev[-3:] == aux_attr_list[1]:
                    # The position code is either glued in front of 'Age' or
                    # is the word before it.
                    if len(word_prev[:-3]) < 2:
                        position = line[i-2] if i >= 2 else ''
                    else:
                        position = word_prev[:-3]
                    category = player_category_map.get(position)
                    if category is not None:
                        aux_data.append(category)
                        aux_data.append(word)
                        # Height is the second-to-last word; the weight is the
                        # last word minus its three trailing characters.
                        height, weight = line[-2], line[-1][:-3]
                        aux_data.append(height)
                        aux_data.append(weight)
                    # An unknown position adds nothing, so the caller's length
                    # check reports the player as incomplete.
                if word_prev + ' ' + word == aux_attr_list[4] or word == aux_attr_list[5]:
                    # Guard against the label being the last word of the line.
                    if i + 1 < len(line):
                        aux_data.append(line[i+1])
    return aux_data


for player_name, url in player_urls.items():
    player_url = url_main + url
    try:
        # A timeout keeps one stalled request from hanging the whole scrape.
        page = requests.get(player_url, timeout=30)
        page.raise_for_status()
    except requests.RequestException as error:
        print(player_name, '- datapoint not included:', error)
        continue
    soup = bs(page.content, 'lxml')
    content = soup.find_all('ul', {'class': 'pl'})

    # getting all the words in content
    lines = [_split_words(c.text) for c in content]

    # fetching attribute ratings
    attr_ratings = _extract_ratings(lines)
    # Compare the number of ratings collected: a counter capped at the last
    # index cannot tell whether the final attribute was actually found.
    if len(attr_ratings) != attr_len:
        print(player_name, '- datapoint not included')   # due to missed attributes, if any
        continue

    # fetching auxiliary attribute data
    aux_attr_data = _extract_aux(soup)
    if len(aux_attr_data) != aux_attr_len:
        print(player_name, '- datapoint not included')   # due to missed auxiliary attributes, if any
        continue

    # storing all the attributes
    player_attr[player_name] = aux_attr_data + attr_ratings


In [4]:
# creating and saving the player attributes dataframe
#
# One row per player: the name followed by the values stored in `player_attr`,
# under the columns 'Player Name' + aux_attr_list + attr_list.
# All rows are collected first and the frame is built in a single call:
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.

column_names = ['Player Name'] + aux_attr_list + attr_list
player_rows = [[name] + list(ratings) for name, ratings in player_attr.items()]
players_dataframe = pd.DataFrame(player_rows, columns=column_names)
players_dataframe.to_csv('player_attributes.csv')

In [5]:
# check if all the data were fetched
#
# A column is complete when its non-missing count equals the number of rows;
# the data is complete when that holds for every column.

row_total, column_total = players_dataframe.shape
complete_column_total = (players_dataframe.count() == row_total).sum()
if column_total == complete_column_total:
    print('All data fetched')
else:
    print('Missed some data')

All data fetched


In [6]:
# reading and displaying the player attribute dataframe

# Reload the CSV produced by the to_csv call earlier in this notebook;
# index_col=[0] uses the first CSV column as the row index.
player_attr_dataframe = pd.read_csv('player_attributes.csv', index_col=[0])
# Bare expression on the last line so the notebook renders the frame.
player_attr_dataframe

Unnamed: 0,Player Name,Player Category,Age,Height,Weight,Overall Rating,Value,Crossing,Finishing,Heading Accuracy,...,Penalties,Composure,Marking,Standing Tackle,Sliding Tackle,GK Diving,GK Handling,GK Kicking,GK Positioning,GK Reflexes
0,L. Messi,Striker,31,"5'7""",159,94,€110.5M,86,95,70,...,75,96,33,28,26,6,11,15,14,8
1,Cristiano Ronaldo,Striker,33,"6'2""",183,94,€77M,84,94,89,...,85,95,28,31,23,7,11,15,14,11
2,Neymar Jr,Midfielder,26,"5'9""",150,92,€108M,83,87,62,...,81,94,27,24,33,9,9,15,15,11
3,De Gea,GoalKeeper,27,"6'4""",168,91,€72M,17,13,21,...,40,70,25,21,13,90,85,85,89,94
4,K. De Bruyne,Midfielder,27,"5'11""",154,91,€102M,93,82,55,...,79,90,68,58,51,15,13,5,10,13
5,E. Hazard,Striker,27,"5'8""",168,91,€93M,81,84,61,...,86,91,34,27,22,11,12,6,8,8
6,L. Modrić,Midfielder,32,"5'8""",146,91,€67M,86,72,55,...,82,90,68,76,73,13,9,7,14,9
7,L. Suárez,Striker,31,"6'0""",190,91,€80M,77,92,82,...,85,85,62,45,38,27,25,31,33,37
8,H. Kane,Striker,24,"6'2""",196,90,€96.5M,75,94,86,...,90,91,56,36,38,8,10,11,14,11
9,J. Oblak,GoalKeeper,25,"6'2""",192,90,€68M,13,11,15,...,11,70,27,12,18,86,92,78,88,89
