In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load and tidy the player data (profiles).
# 'Unnamed: 0' is renamed to 'Id' and then dropped together with 'No.'.
player_df = (
    pd.read_csv('../data/new/players_all.csv')
    .rename(columns={'Unnamed: 0': 'Id'})
    .drop(columns=['No.', 'Id'])
)

In [3]:
# Load and tidy the 1xbet data (stats).
# Team / Age / Position are removed here (presumably because the profile table
# already supplies them -- TODO confirm); the two 'Drb_*' columns get
# explicit offensive / defensive names.
xbet_df = (
    pd.read_csv('../data/new/1xbet_all.csv')
    .drop(columns=['Team', 'Age', 'Position'])
    .rename(columns={'Drb_x': 'Drb_Off', 'Drb_y': 'Drb_Def'})
)

In [4]:
# Load and tidy the understat data (stats): drop the 'No' and 'Team' columns.
understat_df = (
    pd.read_csv('../data/new/understat_all.csv')
    .drop(columns=['No', 'Team'])
)

In [5]:
# Load the capology data (salaries); no columns are altered at this stage.
capology_df = pd.read_csv(filepath_or_buffer='../data/new/capology_all.csv')

In [6]:
# Profile + salary: left-join the three salary columns onto the player
# profiles, matching rows on (Name, year).
eda_df = (
    player_df
    .merge(
        capology_df[['Weekly Salary', 'Base Salary', 'ADJ Salary', 'Name', 'year']],
        how='left',
        on=['Name', 'year'],
    )
    # Remove duplicates: keep only the first row for each (year, Name) pair.
    .drop_duplicates(subset=['year', 'Name'], keep='first')
)

In [7]:
# Profile + salary + stats (understat): left-join on (Name, year).
# NOTE: this rebinds eda_df to its own merge result, so re-running the cell
# merges understat_df onto the frame a second time.
eda_df = eda_df.merge(understat_df, how='left', on=['Name', 'year'])

In [8]:
# Profile + salary + stats (understat) + stats (1xbet).
# Only the listed 1xbet columns are joined; the match is a left join on
# (Name, year), so every existing eda_df row is kept.
eda_df = eda_df.merge(
    xbet_df[[
        'Name', 'year',
        'SpG', 'KeyP', 'Drb_Off', 'Fouled', 'Off', 'Disp', 'UnsTch', 'Rating',
        'Tackles', 'Inter', 'Fouls', 'Offsides', 'Clear', 'Drb_Def', 'Blocks',
        'AvgP', 'PS%',
    ]],
    how='left',
    on=['Name', 'year'],
)

In [9]:
# Summarise the merged frame: per-column dtype and non-null count.
eda_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4756 entries, 0 to 4755
Data columns (total 42 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   year           4756 non-null   int64  
 1   Name           4756 non-null   object 
 2   Age            4756 non-null   int64  
 3   Team           4756 non-null   object 
 4   Position       4756 non-null   object 
 5   Weekly Salary  4598 non-null   float64
 6   Base Salary    4598 non-null   float64
 7   ADJ Salary     4598 non-null   float64
 8   Apps           4744 non-null   float64
 9   Min            4744 non-null   float64
 10  G              4744 non-null   float64
 11  NPG            4744 non-null   float64
 12  A              4744 non-null   float64
 13  xG             4744 non-null   float64
 14  NPxG           4744 non-null   float64
 15  xA             4744 non-null   float64
 16  xGChain        4744 non-null   float64
 17  xGBuildup      4744 non-null   float64
 18  xG90    

In [10]:
# Discard every row that has a missing value in any column.
eda_df = eda_df.dropna(axis=0, how='any')

In [12]:
# Descriptive statistics of the numeric columns, rounded to two decimals.
eda_df.describe().round(2)

Unnamed: 0,year,Age,Weekly Salary,Base Salary,ADJ Salary,Apps,Min,G,NPG,A,...,Rating,Tackles,Inter,Fouls,Offsides,Clear,Drb_Def,Blocks,AvgP,PS%
count,4588.0,4588.0,4588.0,4588.0,4588.0,4588.0,4588.0,4588.0,4588.0,4588.0,...,4588.0,4588.0,4588.0,4588.0,4588.0,4588.0,4588.0,4588.0,4588.0,4588.0
mean,2017.93,25.91,53921.12,2803898.52,3076834.48,20.58,1466.58,1.96,1.81,1.39,...,6.65,1.12,0.78,0.7,0.13,1.53,0.61,0.22,29.0,76.91
std,2.6,4.21,52782.05,2744666.44,2942732.53,11.49,1032.21,3.52,3.18,2.19,...,0.36,0.84,0.7,0.46,0.23,1.75,0.48,0.29,16.44,12.42
min,2014.0,15.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,4.95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2016.0,23.0,20000.0,1040000.0,1252784.0,11.0,532.5,0.0,0.0,0.0,...,6.4,0.5,0.18,0.38,0.0,0.33,0.25,0.0,17.0,72.73
50%,2018.0,26.0,40000.0,2080000.0,2301310.0,22.0,1390.0,1.0,1.0,1.0,...,6.65,1.0,0.65,0.68,0.0,0.86,0.53,0.11,27.29,79.32
75%,2020.0,29.0,70000.0,3640000.0,4071549.0,31.0,2326.25,2.0,2.0,2.0,...,6.88,1.63,1.2,1.0,0.15,2.09,0.88,0.33,38.44,84.32
max,2022.0,41.0,600000.0,31200000.0,33671679.0,38.0,3420.0,36.0,31.0,20.0,...,8.34,7.0,4.22,4.0,2.0,11.5,4.0,2.05,90.41,100.0


In [16]:
# Bucket Age into the label column 'Age Lev'. Conditions are evaluated in
# order and the first match wins: <25, then <30, then <35.
# NOTE: everything else falls into '>35', which therefore also includes
# players aged exactly 35.
eda_df['Age Lev'] = np.select(
    [eda_df['Age'] < 25, eda_df['Age'] < 30, eda_df['Age'] < 35],
    ['<25', '<30', '<35'],
    default='>35',
)

In [17]:
# Preview the first five rows, including the new 'Age Lev' column.
eda_df.head(n=5)

Unnamed: 0,year,Name,Age,Team,Position,Weekly Salary,Base Salary,ADJ Salary,Apps,Min,...,Tackles,Inter,Fouls,Offsides,Clear,Drb_Def,Blocks,AvgP,PS%,Age Lev
0,2014,Eden Hazard,23,Chelsea,Forward,185827.0,9663000.0,11640054.0,38.0,3389.0,...,0.736842,0.578947,0.315789,0.0,0.236842,0.605263,0.026316,58.368421,86.834986,<25
1,2014,Alexis Sanchez,25,Arsenal,Forward,140000.0,7280000.0,8769491.0,35.0,2967.0,...,1.971429,1.171429,1.257143,0.0,0.171429,1.542857,0.0,42.0,76.802721,<30
2,2014,Sergio Aguero,26,Man City,Forward,160000.0,8320000.0,10022275.0,33.0,2551.0,...,0.666667,0.393939,0.545455,0.0,0.060606,0.515152,0.0,25.242424,85.594238,<30
3,2014,Cesc Fabregas,27,Chelsea,Midfielder,171481.0,8917000.0,10741422.0,34.0,2895.0,...,2.588235,0.676471,0.882353,0.0,0.882353,2.117647,0.147059,80.647059,85.19329,<30
4,2014,Santi Cazorla,29,Arsenal,Midfielder,90000.0,4680000.0,5637530.0,37.0,2999.0,...,1.594595,1.081081,0.540541,0.0,0.648649,1.459459,0.135135,64.027027,89.024905,<30
