## Intro to using Beautiful Soup to scrape football data from the web

### Install Beautiful Soup

In [None]:
# pip install beautifulsoup4


## Import all the packages required for scraping and analysis

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
# import klib as kb
import seaborn as sb
import matplotlib.pyplot as plt
# import wes
import matplotlib as mpl
import warnings
import numpy as np
from math import pi
from urllib.request import urlopen
from matplotlib.transforms import Affine2D
import mpl_toolkits.axisartist.floating_axes as floating_axes
from sklearn.preprocessing import StandardScaler


In [None]:
# fbref.com league stats pages, one per league: Premier League, Serie A, Ligue 1,
# La Liga and Bundesliga (in that order).
top_5_league_stats_urls = [
    'https://fbref.com/en/comps/9/Premier-League-Stats',
    'https://fbref.com/en/comps/11/Serie-A-Stats',
    'https://fbref.com/en/comps/13/Ligue-1-Stats',
    'https://fbref.com/en/comps/12/La-Liga-Stats',
    'https://fbref.com/en/comps/20/Bundesliga-Stats',
]

In [None]:
def generate_big_database(top_5_league_stats_urls):
    """Build one player table covering every league page in ``top_5_league_stats_urls``.

    For each league URL the function:

    1. calls ``get_team_urls`` and takes the unique values of the result's ``urls``,
    2. passes them to ``general_url_database`` to obtain a per-league player frame,
    3. drops that frame's ``matches`` column, and
    4. adds a ``scouting_url`` column by calling ``get_360_scouting_report`` on every
       value of the frame's ``urls`` column.

    Parameters
    ----------
    top_5_league_stats_urls : iterable of str
        League stats page URLs to process, in order.

    Returns
    -------
    pandas.DataFrame
        The per-league frames concatenated in input order. Row labels are kept
        exactly as the per-league frames carry them (the index is not reset).
        An empty ``DataFrame`` is returned when no league URL is supplied.

    Raises
    ------
    KeyError
        If a per-league frame has no ``matches`` or no ``urls`` column.
    """
    league_frames = []
    for league_url in top_5_league_stats_urls:
        team_urls = get_team_urls(league_url)
        full_urls = list(team_urls.urls.unique())
        player_db = general_url_database(full_urls)
        # player_db['Age'] = player_db.apply(lambda x: years_converter(x['Age']), axis=1)
        player_db = player_db.drop(columns=['matches'])
        # Map over the column instead of DataFrame.apply(axis=1): on an empty frame the
        # row-wise apply hands back a DataFrame, which cannot be assigned to one column.
        player_db['scouting_url'] = player_db['urls'].map(get_360_scouting_report)
        league_frames.append(player_db)

    # pd.concat raises ValueError on an empty list, so handle "no leagues" explicitly.
    if not league_frames:
        return pd.DataFrame()
    return pd.concat(league_frames)

In [None]:
# Build the combined player table for every league page listed in
# top_5_league_stats_urls.
# NOTE(review): presumably this scrapes the site over the network and is slow —
# confirm before re-running.
EU_TOP_5_DB = generate_big_database(
    top_5_league_stats_urls=top_5_league_stats_urls,
)

In [None]:

# Diamond ("ball progression") plot: carries vs. passes into the final third.
#
# Input: `df` must already exist in the kernel with "Player", "Carries_1/3" and "1/3"
# columns. The commented lines below record the preprocessing that was presumably
# applied to it beforehand (midfielders only, minimum 90s played, per-90 rates) —
# TODO confirm. NOTE(review): re-enabling them as written would divide the two metric
# columns by "90s" again on every re-run, because they overwrite `df` in place.
# df = df[["Player", "Pos", "90s", "Carries_1/3", "1/3"]]
# mf_positions = ['MF']
# df = df[(df["90s"]>min_90s) & (df["Pos"].isin(mf_positions))].reset_index(drop=True)
# df[["Carries_1/3", "1/3"]] = df[["Carries_1/3", "1/3"]].div(df["90s"], axis=0)

# Minimum number of 90s quoted in the figure subtitle; it was previously defined only
# inside the commented-out preprocessing, so the subtitle raised NameError on a fresh kernel.
min_90s = 8

# Standardise each metric (StandardScaler defaults: zero mean, unit variance).
# ravel() flattens the (n, 1) column returned by fit_transform to shape (n,), so that
# individual points are plain scalars when passed to ax.text below.
xs = StandardScaler().fit_transform(df["Carries_1/3"].values.reshape(-1, 1)).ravel()
ys = StandardScaler().fit_transform(df["1/3"].values.reshape(-1, 1)).ravel()


fig = plt.figure(figsize=(8,8))

# Axes whose grid is rotated by 45 degrees, which gives the plot its diamond shape.
plot_extents = -2.4, 5.6, -2.4, 5.6
transform = Affine2D().rotate_deg(45)
helper = floating_axes.GridHelperCurveLinear(transform, plot_extents)
ax = floating_axes.FloatingSubplot(fig, 111, grid_helper=helper)
ax.grid(alpha=0.5, linestyle="-.")
fig.add_subplot(ax)

# NOTE(review): the points are drawn on the host axes. Matplotlib's floating-axes demo
# plots through ax.get_aux_axes(transform) to place data in the rotated grid's
# coordinates — confirm which of the two is intended here.
ax.scatter(xs, ys, ec='k', alpha=.5, s=50, marker="h")
ax.set_aspect(1)

###highlight top percentile players
# Union of the 7 highest rows by each metric (set() removes players present in both).
player_names = list(set(df.sort_values("Carries_1/3")["Player"].tail(7).tolist() + df.sort_values("1/3")["Player"].tail(7).tolist()))

# Select with a positional boolean mask rather than df's index labels: xs / ys are
# NumPy arrays, so label-based indexing is only correct when df has a 0..n-1 index.
is_selected = df["Player"].isin(player_names).to_numpy()
sel_df = df[is_selected]
player_names = sel_df["Player"].tolist()
sel_xs = xs[is_selected]
sel_ys = ys[is_selected]

ax.scatter(sel_xs, sel_ys, color="dodgerblue", ec="k", alpha=.5, s=70, marker="h")
for name, x, y in zip(player_names, sel_xs, sel_ys):
    # Label each highlighted point with the last word of the player's name.
    ax.text(x, y, name.split(" ")[-1], fontsize=8, fontstyle="italic")
# Make the tick labels on every axis fully transparent.
ax.axis[:].major_ticklabels.set_alpha(0)

ax.set(xlabel="Carries into Final Third", ylabel="Passes into Final Third")
fig.text(x=0.5, y=0.95, s="Ball Progression Profile", fontsize=18, fontweight="light", ha="center")
fig.text(x=0.5, y=0.9, s= f"Europe's Top 5 Leagues | Position: Midfielders | Minimum minutes: {min_90s*90}", 
    fontsize=12, fontweight="light", ha="center")

fig.savefig("diamond_plot.png", dpi=180)
plt.show()