In [26]:
import requests
from time import sleep
from bs4 import BeautifulSoup, Comment
import pandas as pd

In [39]:
leagues = [('/9', '/Premier-League-Stats'), ('/12', '/La-Liga-Stats'), ('/13/3243', '/2019-2020-Ligue-1-Stats'),
                ('/20', '/Bundesliga-Stats'), ('/11', '/Serie-A-Stats')]

pages = ['/stats', '/shooting', '/passing', '/passing_types', '/gca', '/defense', '/possession', '/misc']
selectors = ['#all_stats_standard', '#all_stats_shooting', '#all_stats_passing', '#all_stats_passing_types', 
             '#all_stats_gca', '#all_stats_defense', '#all_stats_possession', '#all_stats_misc']
col_idx = [[0,2,3,4,8,9,10,20,21,22], 
           [8,9,15],
           [7,8,10,11,12,13,15,16,18,19,24,25,26,27,28],
           [8,9,11,12,13,14,19,20,21,30],
           [7,14],
           [7,8,9,10,11,12,13,16,17,19,20,21,22,23,25,26,28],
           [8,9,10,11,12,13,14,15,17,18,19,20,21,22,23,25,26],
           [10,19,20,21]]
col_names = [['Player', 'Pos', 'Squad', 'Age', 'Min', 'Gls', 'Ast', 'xG', 'npxG', 'xA'], 
             ['Sh', 'SoT', 'FK'],
             ['PassCmp', 'PassAttm', 'TotalPassDist', 'PrgPassDist', 'ShortCmp', 'ShortAttm', 
              'MedCmp', 'MedAttm', 'LongCmp', 'LongAttm', 'KeyPasses', 'FinalThirdPass', 'PenPass', 
              'CrsPA', 'PrgPassNum'],
             ['Live', 'Dead', 'ThroughBalls', 'PassUnderPressure', 'SwitchingPass', 'Cross',
              'GroundPass', 'LowPass', 'HighPass', 'InterceptedPass'],
             ['SCA', 'GCA'],
             ['Tkl', 'TklW', 'TklDef', 'TklMid', 'TklAtt', 'DribTkl', 'DrbTklAttm', 'NumPressed', 
              'SuccPress', 'PressDef', 'PressMid', 'PressAtt', 'TotalBlocks', 'ShotsBlocked', 
              'PassesBlocked', 'PassesIntercepted', 'Clearances'],
             ['TouchesDefPen', 'TouchesDef', 'TouchesMid', 'TouchesAtt', 'TouchesAttPen', 'LiveTouches',
              'SuccDrib', 'AttmDrib', 'PlayersDrib', 'Nutmegs', 'Carries', 'DistCarried', 'PrgDistCarried',
              'TargetOfPass', 'PassesRecvd', 'Miscontrols', 'Disposessed'],
             ['Fouls', 'Recoveries', 'AerialsWon', 'AerialsLost']]


In [40]:
base_url = 'https://fbref.com/en/comps'

league_dfs = []
for league in leagues:
    dfs = []
    for page, sel, cidx, cnames in zip(pages, selectors, col_idx, col_names):
        url = base_url + league[0] + page + league[1]
        
        print('Fetching', url, '...')
        soup = BeautifulSoup(requests.get(url).content, 'html.parser')
        table = BeautifulSoup(soup.select_one(sel).find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')

        table_rows = table.find_all('tr')
        l = []
        for tr in table_rows:
            td = tr.find_all('td')
            row = [tr.text for tr in td]
            if row:
                sts = []
                for idx in cidx:
                    if idx > 3:
                        val = row[idx].replace(',', '')
                        if val:
                            val = float(val)
                        else:
                            val = 0
                    else:
                        val = row[idx]
                    sts.append(val)
                l.append(sts)
        
        df_new = pd.DataFrame(l, columns=cnames)
        dfs.append(df_new)
        sleep(10)
    
    tmp = pd.concat(dfs, axis=1)
    fp = './data' + league[1] + '_raw.csv'
    tmp.to_csv(fp)
    
    valid = tmp['Min'] > 180
    tmp = tmp[valid]
    gk = tmp['Pos'] == 'GK'
    df_player = tmp[~gk]
    for c in df_player.columns[5:]:
        df_player[c] = df_player[c] / df_player['Min'] * 90

    league_dfs.append(df_player)
    
df_all = pd.concat(league_dfs)
fp = './data/all_leagues.csv'
df_all.to_csv(fp)

Fetching https://fbref.com/en/comps/9/stats/Premier-League-Stats ...
Fetching https://fbref.com/en/comps/9/shooting/Premier-League-Stats ...
Fetching https://fbref.com/en/comps/9/passing/Premier-League-Stats ...
Fetching https://fbref.com/en/comps/9/passing_types/Premier-League-Stats ...
Fetching https://fbref.com/en/comps/9/gca/Premier-League-Stats ...
Fetching https://fbref.com/en/comps/9/defense/Premier-League-Stats ...
Fetching https://fbref.com/en/comps/9/possession/Premier-League-Stats ...
Fetching https://fbref.com/en/comps/9/misc/Premier-League-Stats ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fetching https://fbref.com/en/comps/12/stats/La-Liga-Stats ...
Fetching https://fbref.com/en/comps/12/shooting/La-Liga-Stats ...
Fetching https://fbref.com/en/comps/12/passing/La-Liga-Stats ...
Fetching https://fbref.com/en/comps/12/passing_types/La-Liga-Stats ...
Fetching https://fbref.com/en/comps/12/gca/La-Liga-Stats ...
Fetching https://fbref.com/en/comps/12/defense/La-Liga-Stats ...
Fetching https://fbref.com/en/comps/12/possession/La-Liga-Stats ...
Fetching https://fbref.com/en/comps/12/misc/La-Liga-Stats ...
Fetching https://fbref.com/en/comps/13/3243/stats/2019-2020-Ligue-1-Stats ...
Fetching https://fbref.com/en/comps/13/3243/shooting/2019-2020-Ligue-1-Stats ...
Fetching https://fbref.com/en/comps/13/3243/passing/2019-2020-Ligue-1-Stats ...
Fetching https://fbref.com/en/comps/13/3243/passing_types/2019-2020-Ligue-1-Stats ...
Fetching https://fbref.com/en/comps/13/3243/gca/2019-2020-Ligue-1-Stats ...
Fetching https://fbref.com/en/comps/13/3243/defense/2019-2020-Ligue-1-Stats ..

In [47]:
player_values = df_all.values[:, 5:]

In [62]:
from sklearn.decomposition import PCA, SparsePCA, NMF
import matplotlib.pyplot as plt
import numpy as np


def player_mapping(name):
    name2idx = {}
    idx2name = {}
    for i, nm in enumerate(name):
        name2idx[nm] = i
        idx2name[i] = nm
    
    return name2idx, idx2name

name2idx, idx2name = player_mapping(df_all['Player'])

def cosine_similarity(name, data):
    idx = name2idx[name]
    player = data[idx,:]
    scores = data @ player
    
    player_norm = np.linalg.norm(player)
    data_norm = np.linalg.norm(data, axis=1)
    scores = scores / data_norm / player_norm
    
    sorted_scores = -np.sort(-scores)[1:]
    ranked_idx = np.argsort(-scores)[1:]
    names = [idx2name[idx] for idx in ranked_idx]
        
    return names, sorted_scores

def closest_players(name, data):
    idx = name2idx[name]
    player = data[idx,:]
    scores = data @ player
    
    player_norm = np.linalg.norm(player)**2
    data_norm = np.linalg.norm(data, axis=1)**2
    scores = (data_norm - 2*scores) + player_norm
    
    sorted_scores = np.sort(scores)[1:]
    ranked_idx = np.argsort(scores)[1:]
    names = [idx2name[idx] for idx in ranked_idx]
        
    return names, sorted_scores

In [49]:
nmf = NMF(n_components=10)
player_trans = nmf.fit_transform(player_values)

In [74]:
names, scores = closest_players('Abdoulaye Doucouré', player_trans)
for name, score in zip(names, scores):
    print(name, score)

Dennis Praet 0.07363436041273808
Marko Grujić 0.11411304333810612
Yannick Gerhardt 0.13650806003717975
Danny Latza 0.1960587501045019
Youssouf Fofana 0.2119577985783465
Philip Billing 0.23185274364610287
Mikel Vesga 0.23807459099352535
Saša Lukić 0.2442804854162084
Vincent Pajot 0.2450436417964834
Edu Expósito 0.2534801660596475
Lewis Baker 0.26996400558762446
Pape Cheikh Diop 0.3034648206285979
Arnaud Lusamba 0.3053522008092475
James McArthur 0.31189808592839974
Sergi Darder 0.3128168631357333
Muhamed Bešić 0.31773296032701914
Žan Majer 0.3199612610843179
Jean-Eudes Aholou 0.3253030495201976
Amir Abrashi 0.33096095732787845
Wakaso 0.33313250732279975
Ibrahima Sissoko 0.3459664231079813
Dominik Kohr 0.3503324387187199
Theo Valls 0.352582568403637
Imran Louza 0.37674256684910645
Ondrej Duda 0.3829155306022791
Per Ciljan Skjelbred 0.3912866816350675
Tom Davies 0.3940014134934522
Weston McKennie 0.40734877462250907
Alessandro Deiola 0.4081871357295572
Víctor Camarasa 0.41570310971732205
H

Ben Zolinski 3.09716474021986
Jack Robinson 3.0977846956279116
Angelo Fulgini 3.098017829824986
Paolo Ghiglione 3.0988618024334134
Pedro León 3.1009022538191715
Wahbi Khazri 3.1027735759474524
Alfonso Pedraza 3.106233647090022
Kevin-Prince Boateng 3.108443590738675
Marvin Friedrich 3.114517297018505
Alberto Cerri 3.116512557866905
Šime Vrsaljko 3.1203281996013317
Matthias Zimmermann 3.1216390334993953
Bouna Sarr 3.1259456204911054
Junior Stanislas 3.127917121882424
Florian Miguel 3.1282226187724618
Phil Jagielka 3.130653574812783
Raphael Dias Belloli 3.135709150447205
Brandon Williams 3.142093621991993
Julian Chabot 3.143851336830707
Kevin Agudelo 3.1515328024967175
Oliver Fink 3.1538822282472783
Adam Masina 3.156838908924641
Nicola Sansone 3.1575663387396595
Matt Targett 3.1628791822172406
Cristian Battocchio 3.166276507503177
Nacho Vidal 3.1714771764249186
Mason Greenwood 3.1728081417376863
Ronaël Pierre-Gabriel 3.174610548479926
Vitolo 3.181507667907402
Jóhann Berg Guðmundsson 3.184

Filip Kostić 5.1054956287381295
Ibrahima Niane 5.1072817757199935
Matheus Cunha 5.108996934362613
Krzysztof Piątek 5.10909786504509
Lago Junior 5.118949860107939
Fabio Quagliarella 5.1259076730438995
Eliaquim Mangala 5.130791571624016
Lorenzo Insigne 5.136849489137486
Santiago Arias 5.140020128506659
Federico Ceccherini 5.141285292452093
Pierre-Emerick Aubameyang 5.142717655138181
Lucas Leiva 5.146584766575838
Edin Džeko 5.14751124271274
Abdallah N'dour 5.151567981303781
Breel Embolo 5.155543565131515
Andrea Favilli 5.163739386781874
Gary Medel 5.166399201869417
Hugo Mallo 5.168041787013774
Andrea Masiello 5.171997993652063
Nenad Tomović 5.172556830059737
Adrián López 5.178614064565489
Sehrou Guirassy 5.1792913104348965
Danny Rose 5.18247586857208
Bas Dost 5.1840939341462295
Andrea Pinamonti 5.184464123112582
Adrien Silva 5.1853411947192605
Ruben Gabrielsen 5.191457240617565
Harvey Barnes 5.196338998384071
Musa Barrow 5.201815493998408
Nacho 5.209997296961763
André Gomes 5.210580478044

Mauro Icardi 8.164381556670556
Eden Hazard 8.178597729642998
Emerson Palmieri 8.183363002945402
Karim Rekik 8.186638532761211
John Brooks 8.198645482383593
Ishak Belfodil 8.200193072279419
Mattia De Sciglio 8.21443026999356
Casemiro 8.229990570090319
Kamil Glik 8.240317382270153
El Bilal Touré 8.251006517820894
Rabbi Matondo 8.252591210263024
Tobias Strobl 8.256397827035965
José Arnaiz 8.257514186101217
Nélson Semedo 8.266242678703541
Yaya Sanogo 8.281193588401898
Kevin Stöger 8.282992338656907
Mehdi Bourabia 8.296452726862594
Nampalys Mendy 8.308927491460711
Hugo Duro 8.313764052031994
Marcos Alonso 8.332404189036897
Riccardo Sottil 8.366724335010886
Charles Aránguiz 8.368252268746776
Noah Joel Sarenren Bazee 8.368912337646144
Damien Da Silva 8.379837037823414
Jamie Vardy 8.39451711792238
Leonardo Spinazzola 8.405020043966971
Rafael Czichos 8.427301294452239
Jonathan Calleri 8.438755302027833
Lars Bender 8.485287280379108
Daniel Brosinski 8.487185037435694
Ozan Kabak 8.499962093296308

In [73]:
df_proto = pd.DataFrame(nmf.components_, columns=df_all.columns[5:])
df_proto.to_csv('./data/prototypes.csv', index=False)