# Football players

#### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
import altair as alt
import altair_stiles as altstiles
import requests
import json
from bs4 import BeautifulSoup
import time
import random

In [3]:
# Let pandas show up to 100 columns and 100 rows, then register and activate
# the "stiles" Altair theme for every chart in this notebook.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
alt.themes.register("stiles", altstiles.theme)
alt.themes.enable("stiles")

ThemeRegistry.enable('stiles')

In [4]:
# Current date rendered as zero-padded MM/DD/YYYY text.
DATE_FORMAT = "%m/%d/%Y"
today = pd.Timestamp.today().strftime(DATE_FORMAT)

---

## Get data

#### Player data from [Soccer Wiki](https://en.soccerwiki.org/download.php)

In [5]:
# Read the Soccer Wiki export and keep only its "PlayerData" entry.
# The context manager closes the file as soon as it has been parsed, and the
# explicit UTF-8 encoding keeps accented player names (e.g. "Gaël") decoding
# the same way regardless of the platform's default encoding.
with open("data/raw/SoccerWiki_2022-11-18_1668783632.json", encoding="utf-8") as f:
    data = json.load(f)["PlayerData"]

In [6]:
# Turn the parsed "PlayerData" records into a DataFrame.
src = pd.DataFrame(data=data)

In [7]:
# Normalise every column label to lower case.
src.columns = [label.lower() for label in src.columns]

In [8]:
# Title-case the surnames (first letter of each word upper, the rest lower).
title_cased_surnames = src["surname"].str.title()
src["surname"] = title_cased_surnames

In [9]:
# Rename the two name fields to the labels used in the rest of the notebook
# and keep the result as an independent copy of `src`.
name_columns = {"forename": "firstname", "surname": "lastname"}
src_df = src.rename(columns=name_columns).copy()

In [10]:
# Preview the first five player records.
src_df.head(n=5)

Unnamed: 0,id,firstname,lastname,imageurl
0,9,Justin,Hoyte,https://cdn.soccerwiki.org/images/player/9.png
1,10,Gaël,Clichy,https://cdn.soccerwiki.org/images/player/10.png
2,16,Sebastian,Larsson,https://cdn.soccerwiki.org/images/player/16.png
3,19,Cesc,Fàbregas,https://cdn.soccerwiki.org/images/player/19.png
4,28,Arturo,Lupoli,https://cdn.soccerwiki.org/images/player/28.png


---

In [19]:
# One search URL per results page for players rated 70-99:
# offsets 0, 15, 30, ... up to 4995.
urls = [
    f"https://en.soccerwiki.org/search/player?minrating=70&maxrating=99&offset={offset}"
    for offset in range(0, 5000, 15)
]

In [20]:
# Scrape every search-results page and collect one list of stripped cell
# strings per table row in `data`.
data = []

# A single session reuses the HTTP connection across the requests to the same host.
with requests.Session() as session:
    for url in urls:
        # Bound each request, and stop on an HTTP error instead of trying to
        # parse an error page.
        response = session.get(url, timeout=30)
        response.raise_for_status()

        # The parser is chosen by the second positional argument (`features`);
        # a `parser=` keyword does not select it.
        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find("table", attrs={"class": "table-roster"})
        table_body = table.find("tbody") if table is not None else None
        if table_body is None:
            # Report pages without a roster table rather than failing with an
            # AttributeError on None.
            print(f"No roster table found at {url}; skipping")
            continue

        for row in table_body.find_all("tr"):
            data.append([cell.text.strip() for cell in row.find_all("td")])

In [41]:
# Label the scraped cells. The first two cells of each row are given
# placeholder names ("drop", "drop2") and are left out of the final frame.
player_columns = ["name", "club", "position", "height_cm", "foot", "age", "rating"]
df = pd.DataFrame(data, columns=["drop", "drop2", *player_columns])[player_columns]

In [42]:
# Preview the first five scraped players.
df.head(n=5)

Unnamed: 0,name,club,position,height_cm,foot,age,rating
0,Lionel Messi,Paris Saint-Germain,"AM,F(RC)",170,Left,35,97
1,Robert Lewandowski,Barcelona,F(C),184,Right,34,97
2,Kylian Mbappé,Paris Saint-Germain,"AM(RL),F(RLC)",178,Right,23,96
3,Mohamed Salah,Liverpool,"AM(R),F(RC)",175,Left,30,96
4,Júnior Neymar,Paris Saint-Germain,"AM,F(LC)",174,Both,30,96


In [43]:
# Split the position text at its first comma: the part before it stays in
# "position", everything after it goes to "position_second".
# NOTE: this overwrites "position", so run the cell only once per scrape.
position_parts = df["position"].str.split(",", n=1, expand=True)
df[["position", "position_second"]] = position_parts

In [44]:
# Number of scraped player rows.
df.shape[0]

5010

In [45]:
# Heights were scraped as text, so cast to int before converting
# centimetres to inches (2.54 cm per inch), rounded to two decimals.
CM_PER_INCH = 2.54
df["height_inches"] = df["height_cm"].astype(int).div(CM_PER_INCH).round(2)

In [50]:
# Rank players from highest to lowest rating.
# The ratings were scraped as text, so compare them as numbers (text ordering
# would place "100" before "99"). The stable sort keeps players with equal
# ratings in their existing relative order, so the result is the same on
# every run.
df = df.sort_values("rating", ascending=False, key=pd.to_numeric, kind="stable")

In [51]:
# Row count after sorting.
df.shape[0]

5010

---

## Exports

In [52]:
# Write the player table to disk without the DataFrame index.
output_path = "data/processed/world_soccer_players_rating_top_5000.csv"
df.to_csv(output_path, index=False)

---

In [17]:
# %%time

# player_dicts = []

# for i in src["id"].head(100).to_list():
#     response = requests.get(f"https://en.soccerwiki.org/player.php?pid={i}")
#     soup = BeautifulSoup(response.text, "html.parser")
#     div = soup.find("div", class_="player-info-main")
#     for p in div.findAll("p", class_="player-info-subtitle"):
#         player_dict = {
#             "player_name": div.select_one("p:nth-of-type(1)").text.replace(
#                 "Full Name: ", ""
#             ),
#             "shirt_name": div.select_one("p:nth-of-type(2)").text.replace(
#                 "Shirt Name: ", ""
#             ),
#             "shirt_number": div.select("p:nth-of-type(5)")[1].text.replace(
#                 "Squad Number: ", ""
#             ),
#             "nation": div.select("p:nth-of-type(1)")[1].text.replace("Nation: ", ""),
#             "club": div.select("p:nth-of-type(4)")[1].text.replace("Club: ", ""),
#             "position": div.select_one("p:nth-of-type(3)").text.replace(
#                 "Position: ", ""
#             ),
#             "rating": div.select_one("p:nth-of-type(4)").text.replace("Rating: ", ""),
#             "age": div.select_one("p:nth-of-type(5)").text.replace("Age: ", ""),
#             "height": div.select("p:nth-of-type(2)")[1].text.replace(
#                 "Height (cm): ", ""
#             ),
#             "weight": div.select("p:nth-of-type(3)")[1].text.replace(
#                 "Weight (Kg): ", ""
#             ),
#             "foot": div.select_one("p:nth-of-type(6)").text.replace(
#                 "Preferred Foot: ", ""
#             ),
#             "hair_color": div.select_one("p:nth-of-type(7)").text.replace(
#                 "Hair Colour: ", ""
#             ),
#             "hair_style": div.select_one("p:nth-of-type(8)").text.replace(
#                 "Hairstyle: ", ""
#             ),
#             "skin": div.select_one("p:nth-of-type(9)").text.replace(
#                 "Skin Colour: ", ""
#             ),
#             "facial": div.select_one("p:nth-of-type(10)").text.replace(
#                 "Facial Hair: ", ""
#             ),
#         }
#     player_dicts.append(player_dict)
#     time.sleep(random.randint(1, 3))

In [18]:
# df = pd.DataFrame(player_dicts)