# Initial setting

## libraries

In [1]:
#RUN THIS CELL 
import requests
from IPython.core.display import HTML
styles = requests.get(
    "https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css"
).text
HTML(styles)

In [15]:
import re
import requests
from time import sleep
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

%matplotlib inline

import seaborn as sns
sns.set(style='whitegrid')
pd.set_option('display.width', 1500)
pd.set_option('display.max_columns', 100)

## dataset

In [92]:
# Load data
fifa15 = pd.read_csv('data/players_15.csv')
fifa16 = pd.read_csv('data/players_16.csv')
fifa17 = pd.read_csv('data/players_17.csv')
fifa18 = pd.read_csv('data/players_18.csv')
fifa19 = pd.read_csv('data/players_19.csv')
fifa20 = pd.read_csv('data/players_20.csv')
team_league = pd.read_csv('data/teams_and_leagues.csv')


To do  
1. create year variable DONE
2. create team name variable DONE
3. merge all years DONE
4. create dummies for player_tags DONE
5. create dummies for player_traits DONE
6. create dummies for team_position and select primary position for each player DONE
7. remove +- signs from abilities
8. drop unnecessary columns

# Data cleaning

## team_league.csv - You can skip this section as it takes forever

In [70]:
# set a base url for team pages
base_url = 'https://sofifa.com/team/'

# create an empty list to store team names
team_name = []

# for each team url
for team in team_league.url:
    
    # open team page and get a page title
    team_url = base_url + str(team)
    soup = BeautifulSoup(requests.get(team_url).text, 'html.parser')
    title = soup.find_all('title')
    
    # delete redundant strings and append it to the list
    name = re.sub(' FIFA.*', '', title[0].contents[0])
    team_name.append(name)

In [96]:
# concat the original team_league and the team_name list
df_team = pd.concat([team_league, pd.Series(team_name)], axis=1)
df_team.columns = ['url', 'league_name', 'club']
df_team.league_name = df_team.league_name.str.strip()

# save a csv file
df_team.to_csv('data/df_team.csv', index=False)

## player datasets - You can resume from here

### merge all years

In [303]:
# read df_team
df_team = pd.read_csv('data/df_team.csv')

# add year variable for each player dataset
fifa15['year'] = 2015
fifa16['year'] = 2016
fifa17['year'] = 2017
fifa18['year'] = 2018
fifa19['year'] = 2019
fifa20['year'] = 2020

# merge all datasets and add league name
fifa = pd.concat([fifa15, fifa16, fifa17, fifa18, fifa19, fifa20])
df_fifa = fifa.merge(df_team)

### player traits and tags

In [304]:
# create dummy variables for player traits
# first make a list for all player traits (without CPU AI Only)
all_traits = []
player_traits = [x for x in df_fifa.player_traits.unique() if str(x) != 'nan']

for traits in player_traits:
    #traits_list =traits.split(',')
    traits_list = [x for x in traits.split(',') if 'CPU AI Only' not in x]
    for trait in traits_list:
        all_traits.append(trait.strip())     

# create a unique set for player traits
all_traits = list(set(all_traits))

# then add dummy variables using this list
for trait in all_traits:
    df_fifa['d_trait_' + trait] = [1 if trait in str(x) else 0 for x in df_fifa.player_traits]

In [305]:
# first make a list for all player tags
all_tags = []
player_tags = [x for x in df_fifa.player_tags.unique() if str(x) != 'nan']

for tags in player_tags:
    tags_list = tags.split(',')
    for tag in tags_list:
        all_tags.append(tag.strip())         

# create a unique set for player tags
all_tags = [x.replace('#','') for x in set(all_tags)]

# then add dummy variables using this list
for tag in all_tags:
    df_fifa['d_tag_' + tag] = [1 if tag in str(x) else 0 for x in df_fifa.player_tags]

### position

In [306]:
# set the first position in player_positions as his main position
df_fifa['main_position'] = [x.split(',')[0] for x in df_fifa['player_positions']]

# create multiple position dummies
df_fifa['d_multiple_position'] = [1 if len(x.split(',')) > 1 else 0 for x in df_fifa['player_positions']]

# make a list for all player positions
all_positions = []
player_positions = [x for x in df_fifa.player_positions.unique() if str(x) != 'nan']

for positions in player_positions:
    positions_list = positions.split(',')
    for position in positions_list:
        all_positions.append(position.strip())         

# create a unique set for player tags
all_positions = list(set(all_positions))

# then add dummy variables using this list
for position in all_positions:
    df_fifa['d_pos_' + position] = [1 if position in str(x) else 0 for x in df_fifa.player_positions]

### variables regarding abilities

In [307]:
# create a list for varibales regarding player abilities
var_ab_list = ['pace','shooting','passing','dribbling','defending','physic']
var_ab_detail = [x for x in df_fifa.columns if x.startswith('gk_') | x.startswith('attacking_') | x.startswith('skill_') | 
                x.startswith('movement_') | x.startswith('power_') | x.startswith('mentality_') | x.startswith('defending_') |
                x.startswith('goalkeeping_')]
var_ab_list = var_ab_list + var_ab_detail

# for each variable
for var_ab in var_ab_list:
    
    # remove '+-' just to focus on basic ability level
    # then remove 'nan' string and convert the series to numeric
    # rename the variable names so that it gets easier to filter variables later
    df_fifa[var_ab] = [re.sub('[+-].*', '', str(x)) for x in df_fifa[var_ab]]
    df_fifa[var_ab] = [x.replace('nan', '') for x in df_fifa[var_ab]]
    df_fifa['ab_' + var_ab] = pd.to_numeric(df_fifa[var_ab])

### miscellaneous

In [308]:
# create a dummy for preferred foot
df_fifa['d_foot_left'] = [1 if x == 'Left' else 0 for x in df_fifa['preferred_foot']]

# replace whitespace within variable names with underscore
df_fifa.columns = [x.replace(' ', '_') for x in df_fifa.columns]

# select variables
var_list = ['short_name','age','dob','height_cm','weight_kg','nationality','club','overall','value_eur','wage_eur',
            'preferred_foot','international_reputation','weak_foot','work_rate','body_type','release_clause_eur','year',
            'team_position','team_jersey_number','loaned_from','joined','contract_valid_until','league_name','main_position']
var_others = [x for x in df_fifa.columns if x.startswith('d_') | x.startswith('ab_')]
var_list = var_list + var_others
df_fifa = df_fifa[df_fifa.columns.intersection(var_list)]

### save the final dataset as csv

In [309]:
df_fifa.to_csv('data/df_fifa.csv', index=False)

In [310]:
df_fifa.head()

Unnamed: 0,short_name,age,dob,height_cm,weight_kg,nationality,club,overall,value_eur,wage_eur,preferred_foot,international_reputation,weak_foot,work_rate,body_type,release_clause_eur,team_position,team_jersey_number,loaned_from,joined,contract_valid_until,year,league_name,d_trait_Flair,d_trait_Set_Play_Specialist,d_trait_Through_Ball,d_trait_Beat_Offside_Trap,d_trait_One_Club_Player,d_trait_Inflexible,d_trait_Long_Throw-in,d_trait_Avoids_Using_Weaker_Foot,d_trait_Power_Free-Kick,d_trait_Power_Header,d_trait_GK_Up_for_Corners,d_trait_GK_Long_Throw,d_trait_Outside_Foot_Shot,d_trait_Skilled_Dribbling,d_trait_Leadership,d_trait_Comes_For_Crosses,d_trait_Early_Crosser,d_trait_Target_Forward,d_trait_Rushes_Out_Of_Goal,d_trait_Takes_Finesse_Free_Kicks,d_trait_Cautious_With_Crosses,d_trait_Injury_Prone,d_trait_Backs_Into_Player,d_trait_Injury_Free,d_trait_Giant_Throw-in,d_trait_Saves_with_Feet,d_trait_Flair_Passes,...,d_pos_RB,d_pos_CB,ab_pace,ab_shooting,ab_passing,ab_dribbling,ab_defending,ab_physic,ab_skill_moves,ab_gk_diving,ab_gk_handling,ab_gk_kicking,ab_gk_reflexes,ab_gk_speed,ab_gk_positioning,ab_attacking_crossing,ab_attacking_finishing,ab_attacking_heading_accuracy,ab_attacking_short_passing,ab_attacking_volleys,ab_skill_dribbling,ab_skill_curve,ab_skill_fk_accuracy,ab_skill_long_passing,ab_skill_ball_control,ab_movement_acceleration,ab_movement_sprint_speed,ab_movement_agility,ab_movement_reactions,ab_movement_balance,ab_power_shot_power,ab_power_jumping,ab_power_stamina,ab_power_strength,ab_power_long_shots,ab_mentality_aggression,ab_mentality_interceptions,ab_mentality_positioning,ab_mentality_vision,ab_mentality_penalties,ab_mentality_composure,ab_defending_marking,ab_defending_standing_tackle,ab_defending_sliding_tackle,ab_goalkeeping_diving,ab_goalkeeping_handling,ab_goalkeeping_kicking,ab_goalkeeping_positioning,ab_goalkeeping_reflexes,d_foot_left
0,L. Messi,27,1987-06-24,169,67,Argentina,FC Barcelona,93,0,0,Left,5,3,Medium/Low,Normal,,CF,10.0,,2004-07-01,2018.0,2015,Spain Primera Division,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,93.0,89.0,86.0,96.0,27.0,63.0,4,,,,,,,84,94,71,89,85,96,89,90,76,96,96,90,94,94,95,80,73,77,60,88,48,22,92,90,76,,25,21,20,6,11,15,14,8,1
1,L. Messi,27,1987-06-24,169,67,Argentina,FC Barcelona,93,0,0,Left,5,3,Medium/Low,Normal,,CF,10.0,,2004-07-01,2018.0,2015,Spain Primera Division,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,93.0,89.0,86.0,96.0,27.0,63.0,4,,,,,,,84,94,71,89,85,96,89,90,76,96,96,90,94,94,95,80,73,77,60,88,48,22,92,90,76,,25,21,20,6,11,15,14,8,1
2,L. Suárez,27,1987-01-24,181,81,Uruguay,FC Barcelona,89,0,0,Right,5,4,High/Medium,Normal,,RES,9.0,,2014-07-11,2019.0,2015,Spain Primera Division,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,83.0,87.0,79.0,88.0,42.0,79.0,4,,,,,,,77,91,75,82,85,90,86,84,64,89,88,79,86,91,60,84,69,86,76,82,78,41,88,84,85,,30,45,38,27,25,31,33,37,0
3,L. Suárez,27,1987-01-24,181,81,Uruguay,FC Barcelona,89,0,0,Right,5,4,High/Medium,Normal,,RES,9.0,,2014-07-11,2019.0,2015,Spain Primera Division,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,83.0,87.0,79.0,88.0,42.0,79.0,4,,,,,,,77,91,75,82,85,90,86,84,64,89,88,79,86,91,60,84,69,86,76,82,78,41,88,84,85,,30,45,38,27,25,31,33,37,0
4,Neymar,22,1992-02-05,175,64,Brazil,FC Barcelona,86,0,0,Right,5,5,High/Medium,Lean,,LW,11.0,,2013-07-01,2018.0,2015,Spain Primera Division,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,90.0,80.0,72.0,92.0,30.0,58.0,5,,,,,,,71,85,62,72,83,94,78,78,72,90,91,89,92,86,84,77,61,86,45,70,56,36,87,72,81,,21,24,33,9,9,15,15,11,0
