In [None]:
# !pip3 install -r requirements.txt
# !pip3 install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [6]:
import pandas as pd
import streamlit
import altair as alt
from vega_datasets import data
import math

In [7]:
# Load in the datasets: the FIFA 19 player table (its first CSV column is used
# as the row index) and the countries table used for the country gps lookup
fifa = pd.read_csv('data/fifa19.csv', index_col=0)
countries = pd.read_csv('data/countries.csv')

In [8]:
# Extract numeric value from cost string
def convert_amount(cost):
    """Turn a currency-prefixed amount string into a float.

    The first character is dropped as the currency symbol. A trailing 'K'
    multiplies the number by 1,000 and a trailing 'M' by 1,000,000; any other
    final character is treated as part of the number itself.

    Values that are not plain ``str`` (e.g. NaN for missing entries) are
    returned unchanged.
    """
    if type(cost) is not str:  # handle nan cases
        return cost
    multipliers = {'K': 1000, 'M': 1000000}
    suffix = cost[-1]
    if suffix in multipliers:
        # Strip both the currency symbol and the unit suffix before parsing
        return float(cost[1:-1]) * multipliers[suffix]
    # No unit suffix: only the currency symbol is stripped
    return float(cost[1:])

# Extract numeric value from height str in inches
def get_height_in_inches(height):
    """Convert a ``feet'inches`` string (e.g. ``5'7``) to total inches.

    Values that are not plain ``str`` (e.g. NaN for missing entries) are
    returned unchanged. A string that does not split into exactly two
    integer parts around the apostrophe raises ``ValueError``.
    """
    if type(height) is not str:  # handle nan cases
        return height
    feet_text, inches_text = height.split("'")
    total_inches = int(feet_text) * 12
    total_inches += int(inches_text)
    return total_inches

# Extract numeric value from weight str in lb
def get_weight_in_lbs(weight):
    """Convert a weight string such as ``159lbs`` to an integer number of lbs.

    Only the text before the first ``lbs`` is parsed. Values that are not
    plain ``str`` (e.g. NaN for missing entries) are returned unchanged.
    """
    if type(weight) is not str:  # handle nan cases
        return weight
    number_text, _, _ = weight.partition('lbs')
    return int(number_text)

In [9]:
# Parse the cost-string columns into plain numbers (units in €)
for money_column in ('Wage', 'Value', 'Release Clause'):
    fifa[money_column] = fifa[money_column].apply(convert_amount)

# Parse the height strings into numbers (units in inches)
fifa['Height'] = fifa['Height'].apply(get_height_in_inches)

# Parse the weight strings into numbers (units in lbs)
fifa['Weight'] = fifa['Weight'].apply(get_weight_in_lbs)

In [12]:
# Add a 'Nationality Country' column to the FIFA 19 data, seeded with a copy of
# 'Nationality'; later cells rewrite its values so they match the country names
# used in the countries (GPS) dataset
fifa['Nationality Country'] = fifa['Nationality'].copy()

In [13]:
# Nationalities in the FIFA data that have no matching name in the country gps data
fifa_nationalities = set(fifa['Nationality Country'])
gps_country_names = set(countries['name'])
diff = fifa_nationalities - gps_country_names
print(len(diff))
print(diff)

23
{'Korea Republic', 'Congo', 'Ivory Coast', 'FYR Macedonia', 'South Sudan', 'Northern Ireland', 'St Lucia', 'Central African Rep.', 'Palestine', 'Korea DPR', 'Antigua & Barbuda', 'Scotland', 'China PR', 'Trinidad & Tobago', 'Wales', 'St Kitts Nevis', 'Bosnia Herzegovina', 'Guinea Bissau', 'England', 'DR Congo', 'São Tomé & Príncipe', 'Curacao', 'Republic of Ireland'}


In [14]:
# Collapse the 4 countries that make up the UK into 'United Kingdom' (FIFA => GPS)
uk_home_nations = ['England', 'Scotland', 'Wales', 'Northern Ireland']
fifa['Nationality Country'] = fifa['Nationality Country'].replace(uk_home_nations, 'United Kingdom')

In [15]:
# Make names that refer to the same country in fifa and countries dfs uniform.
# Keys are the fifa spellings, values are the countries-df naming convention.
# No value is also a key, so one simultaneous replace gives the same result as
# replacing the names one at a time.
fifa_to_countries_name = {
    'China PR': 'China',
    'Congo': 'Congo [Republic]',
    'DR Congo': 'Congo [DRC]',
    'Ivory Coast': "Côte d'Ivoire",
    'Bosnia Herzegovina': "Bosnia and Herzegovina",
    'São Tomé & Príncipe': 'São Tomé and Príncipe',
    'Republic of Ireland': 'Ireland',
    'Antigua & Barbuda': 'Antigua and Barbuda',
    'St Lucia': 'Saint Lucia',
    'FYR Macedonia': 'Macedonia [FYROM]',
    'Central African Rep.': 'Central African Republic',
    'Korea DPR': 'North Korea',
    'Korea Republic': 'South Korea',
    'St Kitts Nevis': 'Saint Kitts and Nevis',
    'Guinea Bissau': 'Guinea-Bissau',
    # NOTE(review): South Sudan players are folded into Sudan here, presumably
    # because the countries df has no South Sudan row - confirm this is intended
    'South Sudan': 'Sudan',
    'Palestine': 'Palestinian Territories',
    'Trinidad & Tobago': 'Trinidad and Tobago',
    'Curacao': 'Curaçao',
}
fifa['Nationality Country'] = fifa['Nationality Country'].replace(fifa_to_countries_name)

In [16]:
# Nationalities in the FIFA data that still have no matching name in the country gps data
fifa_nationalities = set(fifa['Nationality Country'])
gps_country_names = set(countries['name'])
diff = fifa_nationalities - gps_country_names
print(len(diff))
print(diff)

1
{'Curaçao'}


In [17]:
# Manually lookup geographic information for Curaçao and add it to countries df.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
# is added with pd.concat instead. The membership check keeps the cell safe to
# re-run: a second Curaçao row would duplicate every Curaçao player in the
# later left merge on country name.
curacao_row = pd.DataFrame([{'country': 'CW', 'latitude': 12.1696, 'longitude': -68.9900, 'name': 'Curaçao'}])
if not countries['name'].eq('Curaçao').any():
    countries = pd.concat([countries, curacao_row], ignore_index=True)

In [18]:
# Nationalities in the FIFA data that have no matching name in the country gps data
# (expected to be empty once every name has been harmonised)
fifa_nationalities = set(fifa['Nationality Country'])
gps_country_names = set(countries['name'])
diff = fifa_nationalities - gps_country_names
print(len(diff))
print(diff)

0
set()


In [19]:
# Rename country df columns to the display-style names used after the merge
country_column_names = {
    'country': 'Country Abrv',
    'latitude': 'Latitude',
    'longitude': 'Longitude',
    'name': 'Country Name',
}
countries = countries.rename(columns=country_column_names)

In [20]:
# Attach each player's country coordinates; a left join keeps every player row
# even when no country name matches
fifa_with_gps = pd.merge(
    fifa,
    countries,
    how='left',
    left_on='Nationality Country',
    right_on='Country Name',
)

In [21]:
# Persist the merged player + country-coordinates frame; with to_csv's default
# settings the DataFrame index is written as the first (unnamed) CSV column
fifa_with_gps.to_csv('data/clean_fifa_with_gps.csv')

In [27]:
# Per-country summary of the player attributes, plus one coordinate pair per country.
summary_stats = ['mean', 'min', 'max']
player_attributes = [
    'Age',
    'Height',
    'Weight',
    'Overall',
    'Potential',
    'International Reputation',
    'Wage',
    'Value',
    'Release Clause',
]
agg_spec = {attribute: summary_stats for attribute in player_attributes}
# Latitude/Longitude are aggregated with 'mean' only, giving a single
# coordinate column each per country
agg_spec['Latitude'] = ['mean']
agg_spec['Longitude'] = ['mean']

fifa_country_avgs = fifa_with_gps.groupby('Nationality Country').agg(agg_spec).reset_index()

# Flatten the (column, statistic) MultiIndex into '<column>_<statistic>' names,
# keeping the group key and the coordinate columns un-suffixed. Iterating the
# MultiIndex directly yields the (column, statistic) tuples, which avoids
# Index.ravel() (deprecated in pandas 1.x, different return type in 2.0).
unsuffixed_columns = {'Nationality Country', 'Latitude', 'Longitude'}
fifa_country_avgs.columns = [
    column if column in unsuffixed_columns else '_'.join((column, statistic))
    for column, statistic in fifa_country_avgs.columns
]
fifa_country_avgs

Unnamed: 0,Nationality Country,Age_mean,Age_min,Age_max,Height_mean,Height_min,Height_max,Weight_mean,Weight_min,Weight_max,...,Wage_min,Wage_max,Value_mean,Value_min,Value_max,Release Clause_mean,Release Clause_min,Release Clause_max,Latitude,Longitude
0,Afghanistan,22.500000,20,26,68.750000,68.0,70.0,149.250000,143.0,152.0,...,1000.0,2000.0,3.425000e+05,220000.0,450000.0,6.172500e+05,374000.0,878000.0,33.939110,67.709953
1,Albania,24.025000,18,34,71.410256,67.0,77.0,164.384615,137.0,196.0,...,1000.0,58000.0,1.920250e+06,80000.0,20000000.0,3.621784e+06,116000.0,35500000.0,41.153332,20.168331
2,Algeria,27.050000,19,35,71.366667,66.0,77.0,167.250000,132.0,207.0,...,1000.0,205000.0,4.670917e+06,160000.0,40500000.0,8.768946e+06,259000.0,78000000.0,28.033886,1.659626
3,Andorra,28.000000,28,28,72.000000,72.0,72.0,174.000000,174.0,174.0,...,1000.0,1000.0,2.900000e+05,290000.0,290000.0,3.840000e+05,384000.0,384000.0,42.546245,1.601554
4,Angola,25.866667,19,34,70.533333,67.0,74.0,166.533333,143.0,192.0,...,1000.0,44000.0,2.018333e+06,325000.0,10000000.0,3.475846e+06,463000.0,17800000.0,-11.202692,17.873887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,Uruguay,26.610738,18,37,70.845638,62.0,77.0,165.308725,132.0,212.0,...,0.0,455000.0,4.811074e+06,0.0,80000000.0,9.379158e+06,100000.0,164000000.0,-32.522779,-55.765835
156,Uzbekistan,29.500000,29,30,71.500000,71.0,72.0,164.000000,163.0,165.0,...,1000.0,17000.0,3.100000e+06,200000.0,6000000.0,6.725000e+06,250000.0,13200000.0,41.377491,64.585262
157,Venezuela,24.313433,18,34,70.727273,63.0,78.0,164.651515,121.0,216.0,...,0.0,42000.0,2.237313e+06,0.0,22500000.0,4.093058e+06,108000.0,35400000.0,6.423750,-66.589730
158,Zambia,22.222222,18,29,70.888889,67.0,75.0,162.000000,141.0,201.0,...,1000.0,7000.0,1.129444e+06,190000.0,3500000.0,2.174000e+06,356000.0,6600000.0,-13.133897,27.849332


In [29]:
# Persist the per-country aggregate frame; with to_csv's default settings the
# DataFrame index is written as the first (unnamed) CSV column
fifa_country_avgs.to_csv('data/clean_fifa_country_aggs.csv')

In [30]:
# Display the per-country aggregate frame
fifa_country_avgs

Unnamed: 0,Nationality Country,Age_mean,Age_min,Age_max,Height_mean,Height_min,Height_max,Weight_mean,Weight_min,Weight_max,...,Wage_min,Wage_max,Value_mean,Value_min,Value_max,Release Clause_mean,Release Clause_min,Release Clause_max,Latitude,Longitude
0,Afghanistan,22.500000,20,26,68.750000,68.0,70.0,149.250000,143.0,152.0,...,1000.0,2000.0,3.425000e+05,220000.0,450000.0,6.172500e+05,374000.0,878000.0,33.939110,67.709953
1,Albania,24.025000,18,34,71.410256,67.0,77.0,164.384615,137.0,196.0,...,1000.0,58000.0,1.920250e+06,80000.0,20000000.0,3.621784e+06,116000.0,35500000.0,41.153332,20.168331
2,Algeria,27.050000,19,35,71.366667,66.0,77.0,167.250000,132.0,207.0,...,1000.0,205000.0,4.670917e+06,160000.0,40500000.0,8.768946e+06,259000.0,78000000.0,28.033886,1.659626
3,Andorra,28.000000,28,28,72.000000,72.0,72.0,174.000000,174.0,174.0,...,1000.0,1000.0,2.900000e+05,290000.0,290000.0,3.840000e+05,384000.0,384000.0,42.546245,1.601554
4,Angola,25.866667,19,34,70.533333,67.0,74.0,166.533333,143.0,192.0,...,1000.0,44000.0,2.018333e+06,325000.0,10000000.0,3.475846e+06,463000.0,17800000.0,-11.202692,17.873887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,Uruguay,26.610738,18,37,70.845638,62.0,77.0,165.308725,132.0,212.0,...,0.0,455000.0,4.811074e+06,0.0,80000000.0,9.379158e+06,100000.0,164000000.0,-32.522779,-55.765835
156,Uzbekistan,29.500000,29,30,71.500000,71.0,72.0,164.000000,163.0,165.0,...,1000.0,17000.0,3.100000e+06,200000.0,6000000.0,6.725000e+06,250000.0,13200000.0,41.377491,64.585262
157,Venezuela,24.313433,18,34,70.727273,63.0,78.0,164.651515,121.0,216.0,...,0.0,42000.0,2.237313e+06,0.0,22500000.0,4.093058e+06,108000.0,35400000.0,6.423750,-66.589730
158,Zambia,22.222222,18,29,70.888889,67.0,75.0,162.000000,141.0,201.0,...,1000.0,7000.0,1.129444e+06,190000.0,3500000.0,2.174000e+06,356000.0,6600000.0,-13.133897,27.849332


In [33]:
# Create world map with one hoverable point per nationality country
# source: https://altair-viz.github.io/gallery/index.html#maps

# Data generators for the background
sphere = alt.sphere()
graticule = alt.graticule()

# Source of land data
source = alt.topo_feature(data.world_110m.url, 'countries')

# Layering and configuring the components
background = alt.layer(
    alt.Chart(sphere).mark_geoshape(fill='lightblue'),
    alt.Chart(graticule).mark_geoshape(stroke='white', strokeWidth=0.2),
    alt.Chart(source).mark_geoshape(fill='lightgray', stroke='black')
).project(
    type='equirectangular'
).properties(width=800, height=400).configure_view(stroke=None)

# Hover selection keyed on the point's coordinates. The fields must name columns
# that exist in fifa_country_avgs ('Latitude' / 'Longitude'); the previous
# 'Lat' / 'Lng' names are not columns of that frame, so the selection had no
# real values to match the hovered point against.
hover = alt.selection(type='single', on='mouseover', nearest=True, fields=['Latitude', 'Longitude'])

# Shared encodings: position each country at its coordinates and show its name on hover
base = alt.Chart(fifa_country_avgs).encode(
    longitude='Longitude:Q',
    latitude='Latitude:Q',
    tooltip=['Nationality Country']
)

# The hovered point is drawn red and larger; all other points are green and small
points = base.mark_point().encode(
    color=alt.condition(~hover, alt.value('green'), alt.value('red')),
    size=alt.condition(~hover, alt.value(30), alt.value(100))
).add_selection(hover)

# Last expression of the cell: render the point layer on top of the background map
background + points