## FIFA EDA 

### Libraries

In [66]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme()

from sklearn import set_config; set_config(display='diagram')
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer, make_column_selector,ColumnTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler,MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_validate

### Data


In [67]:
df = pd.read_csv('../raw_data/fifa22/players_22.csv')

  df = pd.read_csv('../raw_data/fifa22/players_22.csv')


In [69]:
df.club_contract_valid_until.head()

0    2023.0
1    2023.0
2    2023.0
3    2025.0
4    2025.0
Name: club_contract_valid_until, dtype: float64

In [68]:
for index, col in enumerate(df.columns):
    print(f"{col:<26} | ", end="")
    if (index % 3 == 2):
        print("\n", end="")

sofifa_id                  | player_url                 | short_name                 | 
long_name                  | player_positions           | overall                    | 
potential                  | value_eur                  | wage_eur                   | 
age                        | dob                        | height_cm                  | 
weight_kg                  | club_team_id               | club_name                  | 
league_name                | league_level               | club_position              | 
club_jersey_number         | club_loaned_from           | club_joined                | 
club_contract_valid_until  | nationality_id             | nationality_name           | 
nation_team_id             | nation_position            | nation_jersey_number       | 
preferred_foot             | weak_foot                  | skill_moves                | 
international_reputation   | work_rate                  | body_type                  | 
real_face                  | rel

In [None]:
to_drop = ['club_logo_url','nation_flag_url','club_flag_url','nation_logo_url','player_face_url','dob','player_url',
           'real_face','nation_jersey_number','nation_position']

df2 = df.drop(to_drop, axis = 1)

In [None]:
positions = ['ls','st','rs','lw','lf','cf','rf','rw','lam','cam','ram','lm','lcm','cm','rcm',
           'rm','lwb','ldm', 'cdm','rdm','rwb','lb','lcb','cb','rcb','rb']
for col in positions:
  df2[col] = df2[col].str.split('+',n=1,expand = True)[0]

### Descriptive Stats

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df2.describe()

In [None]:
val = df2['player_positions']
pref_pos = []

for i in val:
    a = i.split(',')
    pref_pos.append(a[0])

df2['prefered_pos'] = pref_pos


In [None]:
df2.head()

In [None]:
df2['expected_imporvement'] = df2.potential - df2.overall 

In [None]:
df2[df2['expected_imporvement'] < 0].shape

In [None]:
df_growth = df2[df2['expected_imporvement'] > 0]

df_growth.describe()

### Viz

In [None]:
sns.set(rc={'figure.figsize':(15,8)})
sns.boxplot(x='prefered_pos',y='overall',data=df2)
plt.ticklabel_format(style='plain', axis='y')

In [None]:
sns.set(rc={'figure.figsize':(15,8)})
sns.boxplot(x='prefered_pos',y='value_eur',data=df2)
plt.ticklabel_format(style='plain', axis='y')

In [None]:
sns.relplot(x='overall',y='value_eur',hue='age',palette = 'viridis', sizes=(15, 200),aspect=2,data=df2)
plt.title('Overall Rating vs Value in Euros',fontsize = 20)
plt.xlabel('Overall Rating')
plt.ylabel('Value in Euros')
plt.show()

In [None]:
sns.relplot(x='potential',y='value_eur',hue='age',palette = 'viridis', sizes=(15, 200),aspect=2,data=df2)
plt.title('Potential Rating vs Value in Euros',fontsize = 20)
plt.xlabel('Potential Rating')
plt.ylabel('Value in Euros')
plt.show()

In [None]:
df['value_eur'].hist()

In [None]:
physical_attributes = [
    "height_cm",
    "weight_kg",
    "power_strength",
    "pace",
    "movement_sprint_speed",
    "physic"
    ]

f = plt.figure(figsize=(20, 9))
gs = f.add_gridspec(2, 4)

with sns.axes_style("white"):
    sns.set_style("ticks")
    x = 0
    y = 0
    for attr in physical_attributes:
        ax = f.add_subplot(gs[x, y])
        # sns.histplot(data=df, x=attr, bins=25, kde=True);
        sns.kdeplot(data=df, x=attr, cut=0, fill=True, palette="crest", linewidth=0, alpha=.5);
        plt.title(f"Distribution of {attr}");
        plt.axvline(x=np.mean(df[attr]),c='red',label=f'Mean {attr}')
        plt.xlabel(attr);
        plt.ylabel("Frequency");
        plt.legend(loc="upper left")
        sns.despine(trim=True, offset=5)
        y+=1
        if (y % 3) == 0:
            y = 0
            x +=1

f.tight_layout()

In [None]:
sns.set(rc={'figure.figsize':(15,8)})
sns.boxplot(x='league_level',y='expected_imporvement',data=df_growth)
plt.ticklabel_format(style='plain', axis='y')

In [None]:
df_growth.league_level.value_counts()

In [None]:
df_growth.groupby('league_level')[['expected_imporvement']].max()

In [None]:
sns.displot(
  data=df_growth,
  x="overall",
  col="league_level",
  kind="hist",
  aspect=1.4,
  log_scale=10,
  bins=20
)
plt.ticklabel_format(style='plain', axis='y')

In [None]:
df_growth.groupby('league_level')[['potential']].max()

In [None]:
df_growth[(df_growth['league_level']==3) &(df_growth['potential']==84)]

In [None]:
df_growth[(df_growth['league_level']==4) &(df_growth['expected_imporvement']==23)]

In [70]:
df.club_position.value_counts()

SUB    8299
RES    3168
RCB     701
GK      701
LCB     701
RB      515
LB      515
ST      476
RCM     470
LCM     470
LM      410
RM      410
CAM     292
LDM     223
RDM     223
LS      223
RS      223
CB      186
RW      185
LW      185
CDM     168
RWB     116
LWB     116
CM       83
RF       35
LF       35
LAM      21
RAM      21
CF        7
Name: club_position, dtype: int64

In [None]:

def is_bench(d):
    if d in ("SUB", "RES", ""):
        return True 
    return False

df2['is_bench']=df2['club_position'].apply(is_bench)

df2.head()

In [None]:
df2['club_position'].value_counts()

## Baseline Model


In [None]:
# positions = ['ls','st','rs','lw','lf','cf','rf','rw','lam','cam','ram','lm','lcm','cm','rcm',
#            'rm','lwb','ldm', 'cdm','rdm','rwb','lb','lcb','cb','rcb','rb']

# for col in positions:
#     df22[col] = df22[col].str.split('+',n=1,expand = True)[0]
#     df22[col].astype('int32')
    
#     df21[col] = df21[col].str.split('+',n=1,expand = True)[0]
#     df21[col].astype('int32')
    
#     df20[col] = df20[col].str.split('+',n=1,expand = True)[0]
#     df20[col].astype('int32')
    
#     df19[col] = df19[col].str.split('+',n=1,expand = True)[0]
#     df19[col].astype('int32')
    
#     df18[col] = df18[col].str.split('+',n=1,expand = True)[0]
#     df18[col].astype('int32')

### Data Load

In [2]:
def is_bench(d):
    if d in ("SUB", "RES", ""):
        return True 
    return False


In [3]:
df_22 = pd.read_csv('../raw_data/fifa22/players_22.csv')
df_21 = pd.read_csv('../raw_data/fifa22/players_21.csv')
df_20 = pd.read_csv('../raw_data/fifa22/players_20.csv')
df_19 = pd.read_csv('../raw_data/fifa22/players_19.csv')
df_18 = pd.read_csv('../raw_data/fifa22/players_18.csv')

  df_22 = pd.read_csv('../raw_data/fifa22/players_22.csv')
  df_18 = pd.read_csv('../raw_data/fifa22/players_18.csv')


In [4]:
to_drop = ['club_logo_url','nation_flag_url','club_flag_url','nation_logo_url','player_face_url','dob','player_url',
           'real_face','nation_jersey_number','nation_position','club_loaned_from','long_name','player_url'
          ,'ls','st','rs','lw','lf','cf','rf','rw','lam','cam','ram','lm','lcm','cm','rcm',
           'rm','lwb','ldm', 'cdm','rdm','rwb','lb','lcb','cb','rcb','rb','gk','club_jersey_number','nation_jersey_number']



In [5]:
df22 = df_22.drop(to_drop, axis = 1)
df21 = df_21.drop(to_drop, axis = 1)
df20 = df_20.drop(to_drop, axis = 1)
df19 = df_19.drop(to_drop, axis = 1)
df18 = df_18.drop(to_drop, axis = 1)

In [6]:
dfs = [df18, df19, df20, df21,df22]

for frame in dfs:
    frame['is_bench'] = frame['club_position'].apply(is_bench)

### data types

In [7]:
df_all.select_dtypes("bool").nunique()

NameError: name 'df_all' is not defined

In [8]:
# df21.dtypes
df21.dtypes.value_counts()

int64      44
float64    14
object     12
bool        1
dtype: int64

In [71]:
df.player_positions.value_counts()

CB              2423
GK              2132
ST              1770
CDM, CM          953
CM               726
                ... 
CAM, CDM, ST       1
CDM, RWB, CB       1
ST, CAM, RW        1
ST, CF, RM         1
CDM, LM, LB        1
Name: player_positions, Length: 674, dtype: int64

In [9]:
pos22 = df22['player_positions']
pos21 = df21['player_positions']
pos20 = df20['player_positions']
pos19 = df19['player_positions']
pos18 = df18['player_positions']

In [10]:

pref_pos_22 = []
pref_pos_21 = []
pref_pos_20 = []
pref_pos_19 = []
pref_pos_18 = []

for i in pos22:
    a = i.split(',')
    pref_pos_22.append(a[0])
df22['prefered_pos'] = pref_pos_22

for i in pos21:
    a = i.split(',')
    pref_pos_21.append(a[0])
df21['prefered_pos'] = pref_pos_21


for i in pos20:
    a = i.split(',')
    pref_pos_20.append(a[0])
df20['prefered_pos'] = pref_pos_20

for i in pos19:
    a = i.split(',')
    pref_pos_19.append(a[0])
df19['prefered_pos'] = pref_pos_19

for i in pos18:
    a = i.split(',')
    pref_pos_18.append(a[0])
df18['prefered_pos'] = pref_pos_18

In [11]:
df22 = df22.drop('player_positions', axis = 1)
df21 = df21.drop('player_positions', axis = 1)
df20 = df20.drop('player_positions', axis = 1)
df19 = df19.drop('player_positions', axis = 1)
df18 = df18.drop('player_positions', axis = 1)

In [12]:

df22['prefered_pos'] = df22['prefered_pos'].astype('category')
df21['prefered_pos'] = df21['prefered_pos'].astype('category')
df20['prefered_pos'] = df20['prefered_pos'].astype('category')
df19['prefered_pos'] = df19['prefered_pos'].astype('category')
df18['prefered_pos'] = df18['prefered_pos'].astype('category')


In [13]:
df20.dtypes.value_counts()

int64       44
float64     14
object      11
bool         1
category     1
dtype: int64

### Join Dataframes

In [24]:

df_all = df22.join(df21, rsuffix='_21')
df_all = df_all.join(df20, rsuffix='_20')
df_all = df_all.join(df19, rsuffix='_19')
df_all = df_all.join(df18, rsuffix='_18')



In [22]:

df21_mod = df21.add_suffix('_21')
df_mod = df21_mod.join(df20, rsuffix=f'_20')
df_mod = df_mod.join(df19, rsuffix='_19')
# df_model = df_model.join(df18, rsuffix='_18')


In [25]:
df_all.select_dtypes("category").nunique()

prefered_pos       15
prefered_pos_21    15
prefered_pos_20    15
prefered_pos_19    15
prefered_pos_18    15
dtype: int64

In [34]:
df_all['is_bench_21'] = df_all['is_bench_21'].astype('bool')
df_all['is_bench_20'] = df_all['is_bench_20'].astype('bool')
df_all['is_bench_19'] = df_all['is_bench_19'].astype('bool')
df_all['is_bench_18'] = df_all['is_bench_18'].astype('bool')


#### Check distinct values for Bool / object types

In [35]:
df_all.select_dtypes("bool").nunique()

is_bench       2
is_bench_21    2
is_bench_20    2
is_bench_19    2
is_bench_18    2
dtype: int64

In [26]:
df_all['player_tags'].value_counts()

#Strength                                                                   494
#Acrobat                                                                    235
#Engine                                                                     203
#Speedster                                                                  146
#Aerial Threat, #Strength                                                    58
                                                                           ... 
#Aerial Threat, #Distance Shooter, #Clinical Finisher, #Complete Forward      1
#Tactician, #Strength                                                         1
#Dribbler, #Engine                                                            1
#Dribbler, #Acrobat, #Clinical Finisher, #Complete Forward                    1
#Aerial Threat, #Clinical Finisher, #Complete Forward                         1
Name: player_tags, Length: 71, dtype: int64

#### Create set of cat features with less than 10 values per feature

In [36]:
feat_categorical_nunique = df_all.select_dtypes("object").nunique()
# feat_categorical_nunique.sum()
# print(feat_categorical_nunique.keys())

In [37]:
feat_categorical_small = feat_categorical_nunique[feat_categorical_nunique < 10]
feat_categorical_small

preferred_foot       2
work_rate            9
preferred_foot_21    2
work_rate_21         9
preferred_foot_20    2
work_rate_20         9
preferred_foot_19    2
work_rate_19         9
preferred_foot_18    2
work_rate_18         9
dtype: int64

In [None]:
for index, col in enumerate(feat_categorical_nunique.keys()):
    print(f"{col:<26} | ", end="")
    if (index % 3 == 2):
        print("\n", end="")

### Preprocessing 

In [38]:
feat_categorical_small.keys()


Index(['preferred_foot', 'work_rate', 'preferred_foot_21', 'work_rate_21',
       'preferred_foot_20', 'work_rate_20', 'preferred_foot_19',
       'work_rate_19', 'preferred_foot_18', 'work_rate_18'],
      dtype='object')

In [62]:
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', MinMaxScaler())
])

num_col = make_column_selector(dtype_include=['int64','float64'])
cat_col = [feat_categorical_small.keys()] 
# ,"is_bench","is_bench_21","is_bench_20","is_bench_19","is_bench_18"]

# cat_col

cat_transformer = Pipeline([
    ('imputer',SimpleImputer(strategy = 'most_frequent')),
    ('ohe',OneHotEncoder(handle_unknown='ignore', sparse=False))])

# cat_transformer
preprocessor = ColumnTransformer([
    ('num_tr', num_transformer, num_col),
#     ('cat_tr', cat_transformer, feat_categorical_small.keys())],
    ('cat_tr', cat_transformer, cat_col)],
    remainder='drop')

preprocessor


In [45]:
clf = make_pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LinearRegression())]
)

In [63]:
lm = LinearRegression()

clf =make_pipeline(preprocessor, lm)
clf

### Model Fit 

In [52]:
X_drop = ['sofifa_id','sofifa_id_21','sofifa_id_20','sofifa_id_19','sofifa_id_18',
         'value_eur']

X = df_all.drop(X_drop, axis = 1)
y = df_all.value_eur

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [55]:
from sklearn.model_selection import cross_validate

cv = cross_validate(clf, X_train, y_train,
                     cv=5,
                     scoring=('r2', 'neg_mean_squared_error','neg_mean_absolute_error','neg_root_mean_squared_error')
                    )

pd.DataFrame(cv)

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/user/.pyenv/versions/3.8.12/envs/opti_recruit/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/user/.pyenv/versions/3.8.12/envs/opti_recruit/lib/python3.8/site-packages/sklearn/pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Users/user/.pyenv/versions/3.8.12/envs/opti_recruit/lib/python3.8/site-packages/sklearn/pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/user/.pyen

Unnamed: 0,fit_time,score_time,test_r2,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_neg_root_mean_squared_error
0,0.020786,0.0,,,,
1,0.019632,0.0,,,,
2,0.021207,0.0,,,,
3,0.024494,0.0,,,,
4,0.022287,0.0,,,,


In [64]:
from sklearn.model_selection import cross_val_score


score_baseline = cross_val_score(clf, X_train, y_train, cv=5, scoring=( 'neg_mean_squared_error'))

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/user/.pyenv/versions/3.8.12/envs/opti_recruit/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/user/.pyenv/versions/3.8.12/envs/opti_recruit/lib/python3.8/site-packages/sklearn/pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Users/user/.pyenv/versions/3.8.12/envs/opti_recruit/lib/python3.8/site-packages/sklearn/pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/user/.pyen