In [1]:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib as mpl
import warnings
import numpy as np
from math import pi
from urllib.request import urlopen
import matplotlib.patheffects as pe
from highlight_text import fig_text
from adjustText import adjust_text
from tabulate import tabulate
import matplotlib.style as style
import unicodedata
from fuzzywuzzy import fuzz
from fuzzywuzzy import process




In [2]:
def remove_accents(input_str):
    """Return ``input_str`` reduced to plain ASCII, with hyphens turned into spaces.

    The text is NFKD-normalised so accented letters decompose into a base
    letter plus combining marks; encoding to ASCII with ``'ignore'`` then drops
    the marks, along with any character that has no ASCII form.

    :param input_str: the text to normalise
    :return: ASCII-only ``str`` in which every ``-`` is replaced by a space
    """
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    # Decode the bytes instead of slicing ``str(bytes)``: the repr doubles
    # backslashes and may escape quotes, which corrupted such inputs.
    only_ascii = nfkd_form.encode('ASCII', 'ignore').decode('ASCII')
    return only_ascii.replace('-', ' ')

In [3]:
def fuzzy_merge(df_1, df_2, key1, key2, threshold=97, limit=1):
    """
    Fuzzy-match the key column of one table against the key column of another.

    :param df_1: the left table to join
    :param df_2: the right table to join
    :param key1: key column of the left table
    :param key2: key column of the right table
    :param threshold: minimum score, as reported by ``process.extract``, a candidate must reach to be kept
    :param limit: the maximum number of candidates requested per row from ``process.extract``
    :return: a copy of ``df_1`` with an added ``matches`` column holding the
        accepted values of ``df_2[key2]`` joined by ``', '`` (an empty string
        when no candidate reaches ``threshold``)
    """
    candidates = df_2[key2].tolist()

    # Work on a copy so the caller's frame is never modified; writing into a
    # frame that is itself a slice is what triggers SettingWithCopyWarning.
    result = df_1.copy()
    result['matches'] = result[key1].apply(
        lambda value: ', '.join(
            match[0]
            for match in process.extract(value, candidates, limit=limit)
            if match[1] >= threshold
        )
    )
    return result

In [158]:
def position_grouping(x):
    """Translate a raw position code into a position-group label.

    :param x: position code such as ``'GK'``, ``'DF'`` or ``'MF,FW'``
    :return: the label of the first group containing ``x``, or
        ``"unidentified position"`` when no group lists it
    """
    # (label, codes) pairs, checked in order.
    groups = (
        ("GK", ('GK',)),
        ("Defender", ('DF', 'DF,MF')),
        ("Wing-Back", ('FW,DF', 'DF,FW')),
        ("Defensive-Midfielders", ('MF,DF',)),
        ("Central Midfielders", ('MF',)),
        ("Attacking Midfielders", ('MF,FW', 'FW,MF')),
        ("Forwards", ('FW',)),
    )
    for label, codes in groups:
        if x in codes:
            return label
    return "unidentified position"

In [159]:
def get_360_scouting_report(url):
    """Build a scouting-report address from a player URL.

    ``scout/365_euro/`` is inserted after the first 38 characters of ``url``
    and ``-Scouting-Report`` is appended to the end.

    :param url: the player URL to transform
    :return: the transformed URL string
    """
    split_at = 38  # number of leading characters kept ahead of the inserted path
    head = url[:split_at]
    tail = url[split_at:]
    return head + "scout/365_euro/" + tail + "-Scouting-Report"

In [160]:
def years_converter(variable_value):
    """Convert an age written as ``"<years>-<days>"`` into fractional years.

    The two parts are split on the hyphen rather than at fixed offsets, so the
    result is correct whatever the number of digits in either part. A value
    without both parts is converted to a number as-is.

    :param variable_value: age such as ``"31-289"`` or ``"25"``; non-string
        values are passed straight to ``pd.to_numeric``
    :return: numeric age in years, with the day count contributing ``days / 365``
    :raises ValueError: if a part cannot be parsed as a number
    """
    if not isinstance(variable_value, str):
        return pd.to_numeric(variable_value)

    years, separator, days = variable_value.partition('-')
    if not (separator and years and days):
        # No "<years>-<days>" structure: treat the whole text as one number.
        return pd.to_numeric(variable_value)

    return pd.to_numeric(years) + pd.to_numeric(days) / 365

In [156]:
# Read the team-page list and keep each distinct address once, in file order.
team_urls = pd.read_csv("CSVs/EPL_Teams_URLs.csv")
full_urls = team_urls["urls"].drop_duplicates().tolist()


In [155]:
import pandas as pd
import requests
import re

Seasons = ['2017-2018', '2018-2019', '2019-2020', '2020-2021','2021-2022']
def multi_season_data(Seasons):
    """Download and combine the Big-5 player tables for several seasons.

    For every season the standard stats player table is fetched first, then
    each additional category table is fetched and the columns it adds are
    merged in on ``Player``.

    :param Seasons: iterable of season strings such as ``'2021-2022'``
    :return: one DataFrame with all seasons stacked, exact duplicate rows
        dropped and a fresh 0..n-1 index; an empty DataFrame when ``Seasons``
        is empty
    """
    base_url = 'https://fbref.com/en/comps/Big5'
    categories = ['shooting', 'passing', 'gca', 'defense', 'possession',
                  'misc', 'keepers', 'keepersadv', 'passing_types']

    def _fetch_player_table(url):
        # Strip HTML comment markers before parsing, so any table wrapped in
        # a comment becomes visible to read_html.
        html = requests.get(url).text.replace('<!--', '').replace('-->', '')
        table = pd.read_html(html, header=1)[1]
        # Drop rows whose 'Rk' cell is the literal text 'Rk', and return an
        # independent copy so later column assignments are safe.
        return table[table['Rk'].ne('Rk')].copy()

    season_frames = []
    for season in Seasons:
        player_table = _fetch_player_table(
            f'{base_url}/{season}/stats/players/{season}-Big-5-European-Leagues-Stats'
        )
        player_table['Season'] = season

        for cat in categories:
            print(cat)
            # Parse the category page itself. The original downloaded it but
            # then re-parsed the standard stats page, so nothing was added.
            temp_df = _fetch_player_table(
                f'{base_url}/{season}/{cat}/players/{season}-Big-5-European-Leagues-Stats'
            )

            newCols = ['Player'] + [x for x in temp_df.columns if x not in player_table.columns]
            # NOTE(review): merging on 'Player' alone pairs up every row that
            # shares a player name - TODO confirm whether another key column
            # (e.g. the squad) should be part of the join.
            player_table = pd.merge(player_table, temp_df[newCols], how='outer', on='Player')

        season_frames.append(player_table)
        print('Collected: ', season)

    if not season_frames:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()

    results = pd.concat(season_frames)
    results = results.drop_duplicates()
    results = results.reset_index(drop=True)

    return results

In [157]:
import time 


# Build one table per team: squad rows (Player, Pos, Age, Starts) joined to
# the player page URLs found on the same team page.
team_frames = []
for team_url in full_urls:
    print(team_url)
    # Download the page once and reuse it for table parsing and link extraction.
    raw_html = requests.get(team_url).text

    # Strip HTML comment markers so any table wrapped in a comment is parsed too.
    dfs = pd.read_html(raw_html.replace('<!--', '').replace('-->', ''), header=1)
    team_table = dfs[0]

    # Player links are anchors inside <th> cells whose href contains 'players'.
    # The unmodified HTML is parsed here, as before, so the link set is unchanged.
    links = BeautifulSoup(raw_html, 'html.parser').select('th a')
    p_url = sorted({link['href'] for link in links if 'players' in link['href']})

    player_db = pd.DataFrame({
        # href[21:] drops the leading path (presumably '/en/players/<id>/' -
        # TODO confirm), leaving the hyphenated name.
        'Player': [href[21:].replace('-', ' ') for href in p_url],
        'urls': ["https://fbref.com" + href for href in p_url],
    })

    # .copy() gives an independent frame, so assigning 'Player' below does not
    # raise SettingWithCopyWarning.
    df = team_table[["Player","Pos","Age", "Starts"]].copy()
    df['Player'] = df['Player'].apply(remove_accents)

    test_merge = fuzzy_merge(df, player_db, 'Player', 'Player', threshold=90)
    # Swap the names: the matched name becomes the join key 'Player', and the
    # original table name is kept as 'matches'.
    test_merge = test_merge.rename(columns={'matches': 'Player', 'Player': 'matches'})
    final_merge = test_merge.merge(player_db, on='Player', how='left')
    team_frames.append(final_merge)

    # Pause between team pages to keep the request rate low.
    time.sleep(7)

appended_data = pd.concat(team_frames)
appended_data

https://fbref.com/en/squads/19538871/Manchester-United-Stats


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


https://fbref.com/en/squads/5bfb9659/Leeds-United-Stats
https://fbref.com/en/squads/a2d435b3/Leicester-City-Stats
https://fbref.com/en/squads/fd962109/Fulham-Stats
https://fbref.com/en/squads/7c21e445/West-Ham-United-Stats
https://fbref.com/en/squads/e4a775cb/Nottingham-Forest-Stats
https://fbref.com/en/squads/18bb7c10/Arsenal-Stats
https://fbref.com/en/squads/4ba7cbea/Bournemouth-Stats
https://fbref.com/en/squads/b2b47a98/Newcastle-United-Stats
https://fbref.com/en/squads/cd051869/Brentford-Stats
https://fbref.com/en/squads/33c895d4/Southampton-Stats
https://fbref.com/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats
https://fbref.com/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats
https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats
https://fbref.com/en/squads/cff3d9bb/Chelsea-Stats
https://fbref.com/en/squads/47c64c55/Crystal-Palace-Stats
https://fbref.com/en/squads/d3fd31cc/Everton-Stats
https://fbref.com/en/squads/822bd0ba/Liverpool-Stats
https://fbref.com/en/squads/8602292

Unnamed: 0,matches,Pos,Age,Starts,Player,urls
0,David de Gea,GK,31-289,3,David de Gea,https://fbref.com/en/players/7ba6d84e/David-de...
1,Bruno Fernandes,MF,27-349,3,Bruno Fernandes,https://fbref.com/en/players/507c7bdf/Bruno-Fe...
2,Diogo Dalot,DF,23-158,3,Diogo Dalot,https://fbref.com/en/players/d9565625/Diogo-Dalot
3,Marcus Rashford,FW,24-296,3,Marcus Rashford,https://fbref.com/en/players/a1d5bd30/Marcus-R...
4,Christian Eriksen,"MF,FW",30-190,3,Christian Eriksen,https://fbref.com/en/players/980522ec/Christia...
5,Lisandro Martinez,DF,24-217,3,Lisandro Martinez,https://fbref.com/en/players/bac46a10/Lisandro...
6,Jadon Sancho,FW,22-151,3,Jadon Sancho,https://fbref.com/en/players/dbf053da/Jadon-Sa...
7,Scott McTominay,MF,25-258,2,Scott McTominay,https://fbref.com/en/players/d93c2511/Scott-Mc...
8,Harry Maguire,DF,29-171,2,Harry Maguire,https://fbref.com/en/players/d8931174/Harry-Ma...
9,Luke Shaw,DF,27-042,2,Luke Shaw,https://fbref.com/en/players/9c94165b/Luke-Shaw


In [165]:
def apply_dset_fixes(appended_data, name):
    """Clean the scraped squad table, add derived columns and save it to CSV.

    Steps: drop the ``'Squad Total'`` row(s), remove the ``matches`` column,
    renumber the index, add ``scouting_url`` and ``position_group``, convert
    ``Starts`` to numeric and write ``CSVs/{name}_Player_Database.csv``.

    :param appended_data: DataFrame with at least the columns ``matches``,
        ``urls``, ``Pos`` and ``Starts``
    :param name: prefix used in the output CSV file name
    :return: the cleaned DataFrame (``appended_data`` itself is not modified)
    """
    df = (
        appended_data[appended_data["matches"] != 'Squad Total']
        .drop(columns=['matches'])
        # The original discarded the reset_index result, so the index was
        # never actually renumbered.
        .reset_index(drop=True)
    )
    # df['Age'] = df.apply(lambda x: years_converter(x['Age']), axis=1)

    # Rows without a string URL (e.g. NaN left by an unmatched left merge)
    # keep their missing value instead of crashing the URL builder.
    df['scouting_url'] = df['urls'].apply(
        lambda url: get_360_scouting_report(url) if isinstance(url, str) else url
    )
    df["position_group"] = df["Pos"].apply(position_grouping)
    df["Starts"] = pd.to_numeric(df["Starts"])

    df.to_csv(f"CSVs/{name}_Player_Database.csv")

    return df

EPL_DB = apply_dset_fixes(appended_data, "EPL")


In [171]:
# Selection criteria: one position group, and strictly more starts than pl_starts.
position = 'Central Midfielders'
pl_starts = 2
subset_of_data = EPL_DB[
    (EPL_DB['position_group'] == position) & (EPL_DB['Starts'] > pl_starts)
]
# Distinct player page URLs for the selected rows.
players_needed = list(subset_of_data['urls'].unique())

In [260]:
def get_player_multi_data(url_list:list):
    """Scrape the first table of each page in ``url_list`` into one row per player.

    For every URL the page is downloaded, the text of its eighth ``<span>`` is
    taken as the player name, and the first ``<th>`` / first ``<td>`` of each
    body row of the first table are read as metric name / metric value. Pages
    that do not provide 20 metrics are skipped.

    :param url_list: list of page URLs to scrape
    :return: DataFrame with a ``Name`` column followed by the first 20 metric
        columns, one row per scraped player (each row keeps index 0, as
        before); an empty DataFrame when nothing could be scraped
    """
    n_metrics = 20   # number of metric values placed in each row
    name_span = 7    # the name is read from the 8th <span> - TODO confirm it is stable

    # Silence warnings once, rather than on every iteration.
    warnings.filterwarnings("ignore")

    player_frames = []
    for url in url_list:
        print(url)
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')

        spans = [element.text for element in soup.find_all("span")]
        tables = soup.find_all('table')

        if len(spans) > name_span and tables and tables[0].tbody is not None:
            metric_names = []
            metric_values = []
            # A single pass over the rows collects both names and values.
            for row in tables[0].tbody.find_all('tr'):
                header_cells = row.find_all('th')
                value_cells = row.find_all('td')
                if header_cells:
                    metric_names.extend(str(item) for item in header_cells[0].contents)
                if value_cells:
                    metric_values.extend(str(item) for item in value_cells[0].contents)

            # The original guard was `< 15` while indices up to 19 were read,
            # so pages with 15-19 values raised IndexError.
            if len(metric_names) >= n_metrics and len(metric_values) >= n_metrics:
                player_frames.append(
                    pd.DataFrame(
                        [[spans[name_span], *metric_values[:n_metrics]]],
                        columns=['Name', *metric_names[:n_metrics]],
                    )
                )

        # Pause after every request, including skipped pages.
        time.sleep(5)

    if not player_frames:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(player_frames)

In [291]:
# NOTE(review): the scraping call below is commented out, so `df` must already
# exist in the kernel; no visible cell assigns it - TODO re-enable the call or
# load a cached copy so the notebook survives Restart & Run All.
# df = get_player_multi_data(players_needed)
df

Unnamed: 0,Name,Non-Penalty Goals,Non-Penalty xG,Shots Total,Assists,xG Assisted,npxG + xA,Shot-Creating Actions,Passes Attempted,Pass Completion %,...,Progressive Carries,Dribbles Completed,Touches (Att Pen),Progressive Passes Rec,Pressures,Tackles,Interceptions,Blocks,Clearances,Aerials won
0,Bruno Fernandes,0.17,0.21,2.35,0.31,0.3,0.51,4.26,59.21,75.2%,...,5.88,0.7,3.27,6.25,17.53,1.79,1.09,0.92,1.23,0.61
0,Tyler Adams,0.0,0.01,0.19,0.05,0.04,0.05,1.22,58.89,86.1%,...,4.32,0.61,0.66,1.17,22.46,2.26,1.55,2.54,1.13,1.17
0,Marc Roca,0.0,0.03,0.8,0.0,0.05,0.08,1.46,63.62,87.1%,...,3.84,0.66,1.06,1.46,23.46,2.52,1.86,1.72,0.66,0.27
0,James Maddison,0.46,0.31,2.9,0.34,0.19,0.49,4.06,41.88,73.7%,...,5.59,1.56,3.21,5.47,18.12,1.44,0.76,0.95,0.37,0.21
0,Wilfred Ndidi,0.04,0.07,0.77,0.0,0.0,0.07,0.73,52.5,83.9%,...,2.23,0.43,1.03,0.69,22.8,3.77,2.53,1.76,3.13,2.49
0,Kiernan Dewsbury-Hall,0.11,0.06,1.1,0.07,0.12,0.18,2.72,53.12,77.1%,...,6.39,1.17,1.77,3.89,19.89,1.84,1.13,1.77,1.17,1.17
0,João Palhinha,0.11,0.02,0.78,0.0,0.0,0.02,0.22,31.27,78.9%,...,1.57,0.67,0.78,0.45,29.14,3.25,1.35,1.91,1.35,2.13
0,Declan Rice,0.08,0.05,0.85,0.08,0.06,0.11,1.71,59.58,90.2%,...,6.41,1.08,1.42,0.93,13.79,2.37,2.83,1.71,1.48,1.4
0,Tomáš Souček,0.11,0.15,1.38,0.02,0.04,0.2,1.14,39.71,78.7%,...,2.21,0.31,2.91,1.66,14.29,1.99,1.73,1.38,3.13,3.91
0,Jesse Lingard,0.24,0.13,1.45,0.12,0.16,0.3,3.27,34.28,84.1%,...,4.48,0.61,2.42,4.24,21.8,1.7,1.09,0.48,0.48,0.12


In [292]:
# Keep only the name and the two metrics of interest.
df_test = df.loc[:, ["Name", "Passes Attempted", "Progressive Carries"]]

In [293]:
df_test

Unnamed: 0,Name,Passes Attempted,Progressive Carries
0,Bruno Fernandes,59.21,5.88
0,Tyler Adams,58.89,4.32
0,Marc Roca,63.62,3.84
0,James Maddison,41.88,5.59
0,Wilfred Ndidi,52.5,2.23
0,Kiernan Dewsbury-Hall,53.12,6.39
0,João Palhinha,31.27,1.57
0,Declan Rice,59.58,6.41
0,Tomáš Souček,39.71,2.21
0,Jesse Lingard,34.28,4.48


In [321]:
# NOTE(review): `sel_df` is not assigned in any visible cell, and the saved
# output is a list of Text(...) entries rather than a table - presumably left
# over from an earlier version of this cell; TODO confirm.
sel_df


[Text([0.88875594], [0.4659745], 'Dewsbury-Hall'),
 Text([-0.10869677], [0.46673681], 'Rice'),
 Text([-0.41560529], [0.79772842], 'Ødegaard'),
 Text([0.70333204], [-0.7227788], 'Guimarães'),
 Text([-1.44502764], [0.01988174], 'Rodri'),
 Text([1.21484625], [0.06323857], 'Bruyne'),
 Text([-1.86702686], [-1.46474004], 'Jorginho'),
 Text([1.2276341], [0.51498877], 'Doucouré'),
 Text([-1.4578155], [-0.87452771], 'Kamara'),
 Text([-0.00639393], [-1.25424962], 'Ramsey')]

In [323]:
# NOTE(review): neither `StandardScaler` (presumably
# sklearn.preprocessing.StandardScaler) nor `df2` is imported or defined in
# any visible cell - TODO confirm where they come from.
# Scales the single "Passes_Attempted" column, reshaped to shape (n, 1).
StandardScaler().fit_transform(df2["Passes_Attempted"].values.reshape(-1, 1))

array([[ 0.47754451],
       [ 0.4551668 ],
       [ 0.78593745],
       [-0.73434882],
       [ 0.00831172],
       [ 0.05166855],
       [-1.47631005],
       [ 0.50341875],
       [-0.88609772],
       [-1.26581964],
       [ 0.03138875],
       [ 0.45866331],
       [-0.05532491],
       [-1.31267299],
       [ 0.68104189],
       [-1.93295538],
       [-0.26861255],
       [-0.61266997],
       [ 0.44257933],
       [-0.98330094],
       [-0.38120044],
       [-0.1091713 ],
       [ 0.89642744],
       [ 2.52720369],
       [ 0.55866374],
       [ 0.6817412 ],
       [ 1.54468198],
       [-1.37561032],
       [ 0.45097097],
       [-1.10008466],
       [ 1.85517282],
       [-0.87420956],
       [-0.91476917],
       [ 1.27125423],
       [ 0.60132127]])

In [326]:
# NOTE(review): `df2` is not defined in any visible cell - TODO confirm its origin.
df2[["Name","Passes_Attempted"]]

Unnamed: 0,Name,Passes_Attempted
0,Bruno Fernandes,59.21
0,Tyler Adams,58.89
0,Marc Roca,63.62
0,James Maddison,41.88
0,Wilfred Ndidi,52.5
0,Kiernan Dewsbury-Hall,53.12
0,João Palhinha,31.27
0,Declan Rice,59.58
0,Tomáš Souček,39.71
0,Jesse Lingard,34.28


In [269]:
# NOTE(review): `plot_extents` is not defined in any visible cell - TODO confirm its origin.
plot_extents

(-2.023434259500962, 2.933431956252556, -1.101633686269896, 3.182497315890811)

In [270]:
# NOTE(review): `xs` is not defined in any visible cell - TODO confirm its origin.
xs.min()

-2.023434259500962

In [None]:
def extract_names_and_urls(data):
    """Collect each athlete's name and URL from a nested team payload.

    :param data: mapping in which ``data['details']['sportsTeamJSONLD']['athlete']``
        is a sequence of mappings with ``'name'`` and ``'url'`` keys
    :return: DataFrame with the columns ``Name`` and ``URL``, one row per athlete
    """
    athletes = data['details']['sportsTeamJSONLD']['athlete']
    # One pass over the athletes, reading name then URL for each entry.
    pairs = [(entry['name'], entry['url']) for entry in athletes]
    return pd.DataFrame({
        'Name': [name for name, _ in pairs],
        'URL': [url for _, url in pairs],
    })

# Name/URL table for every athlete found in `dict1`.
player_rating = extract_names_and_urls(dict1)

# Module-level rating accumulator and the distinct player URLs.
fm_ratings = []
player_ids = player_rating["URL"].unique()
def extract_rating(data):
    """Look up the 'FotMob rating' stat for every player and return it with name and URL.

    :param data: either a DataFrame with ``Name`` and ``URL`` columns (as
        returned by ``extract_names_and_urls``) or the raw payload accepted by
        ``extract_names_and_urls``
    :return: DataFrame with the columns ``Name``, ``URL`` and ``fm_rating``;
        ``fm_rating`` is ``None`` for a player whose stats hold no
        'FotMob rating' entry
    """
    roster = data if isinstance(data, pd.DataFrame) else extract_names_and_urls(data)
    names = roster['Name'].tolist()
    urls = roster['URL'].tolist()

    ratings = []
    for url in urls:
        # Characters 31-36 of the URL are used as the player id - presumably
        # a 6-character id following a fixed-length prefix; TODO confirm.
        player_id = url[31:37]
        player = client.get_player(player_id)

        career = player['careerStatistics'][0]
        # Flatten each list of dicts into one dict; on key clashes the later
        # entry wins, as in the original comprehensions.
        seasons = {k: v for entry in career['seasons'] for (k, v) in entry.items()}
        stats = {k: v for entry in seasons['stats'] for (k, v) in entry.items()}

        # Reset per player so a missing rating is None, never the previous
        # player's value.
        rating_value = None
        for item in stats['statsArr']:
            if item[0] == 'FotMob rating':
                rating_value = item[1]['value']['num']
                break
        ratings.append(rating_value)

    return pd.DataFrame({'Name': names, 'URL': urls, 'fm_rating': ratings})

extract_names_and_urls(dict1)