# Football players

#### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
import altair as alt
import altair_stiles as altstiles
import requests
import json
from bs4 import BeautifulSoup
import time
import random

In [3]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
alt.themes.register("stiles", altstiles.theme)
alt.themes.enable("stiles")

ThemeRegistry.enable('stiles')

In [4]:
today = pd.Timestamp("today").strftime("%m/%d/%Y")

---

## Get data

#### Club data from [Soccer Wiki](https://en.soccerwiki.org/download.php)

In [13]:
f = open("data/raw/SoccerWiki_2022-11-18_1668783632.json")
data = json.load(f)["ClubData"]

In [14]:
club_base = pd.DataFrame(data)

In [15]:
club_base.columns = club_base.columns.str.lower()

In [16]:
club_base["id"] = club_base["id"].astype(str)

In [17]:
club_base.head()

---

#### Loop though club ids, scrape details from their pages

In [32]:
%%time

stad_dicts = []

for i in club_base.id.head().to_list():
    response = requests.get(f"https://en.soccerwiki.org/squad.php?clubdid={i}")
    soup = BeautifulSoup(response.text, "html.parser")
    div = soup.find("div", class_="col-8")
    try:
        img_url = div.find("img", class_="lozad img-fluid img-thumbnail")["data-src"]

    except: 
        img_url = 'No image url'
    stad_dict = {
        "name": div.find("h1").text,
        "capacity": div.select_one("p:nth-of-type(1)")
        .text.replace("Capacity: ", "")
        .replace(",", ""),
        'img_url': img_url,
        "city": div.select_one("p:nth-of-type(2)").text.replace("City: ", ""),
        "country": div.select_one("p:nth-of-type(3)").text.replace("Country: ", ""),
        "club": div.select_one("p:nth-of-type(4)").text.replace("Club: ", ""),
    }
    stad_dicts.append(stad_dict)

AttributeError: 'NoneType' object has no attribute 'find'

In [31]:
div = soup.find("div", class_="col-8")
div

In [24]:
df = pd.DataFrame(stad_dicts)

In [117]:
df["capacity"] = df["capacity"].astype(int)

In [118]:
df.country.value_counts()

 Spain            138
 England          125
 Italy            117
 Germany           70
 France            59
                 ... 
 Mali               1
 Liechtenstein      1
 Syria              1
 San Marino         1
 North Korea        1
Name: country, Length: 112, dtype: int64

---

## Exports

In [43]:
df.to_csv('data/processed/world_soccer_clubs.csv', index=False)