In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Load the dataset from a CSV path that is relative to the notebook's working directory.
dataset = pd.read_csv('projects/data.csv')
# Preview the first rows as a quick sanity check of the load.
dataset.head()

In [None]:
# Print column dtypes and non-null counts to see which columns have missing values.
dataset.info()

In [None]:
# List every column name available in the raw table.
dataset.columns

In [None]:
# Show the first record as a Series so every column's value is visible at once.
dataset.iloc[0]

## Simple EDA
- 포지션별 Overall, Potential 평균 통계

In [None]:
# Show how many rows each position has, including rows with a missing position.
# Printed explicitly: a bare expression in the middle of a cell is discarded.
print(dataset['Position'].value_counts(dropna=False))

# Keep only rows that have a position.
dataset = dataset[dataset['Position'].notna()]

# Mean Overall / Potential per position, computed in a single grouped pass
# instead of re-filtering the frame once per position.
# sort=False keeps positions in order of first appearance, i.e. the same
# order dataset.Position.unique() would give.
# NOTE: .mean() skips NaN entries, whereas a plain sum()/len() would
# propagate them.
position_means = dataset.groupby('Position', sort=False)[['Overall', 'Potential']].mean()

positionList = position_means.index.tolist()
meanPosO = position_means['Overall'].tolist()
meanPosP = position_means['Potential'].tolist()
print(meanPosO)
print(meanPosP)

In [None]:
# Compare the per-position mean Overall and mean Potential on one figure.
fig = plt.figure(figsize=(15,7))
plt.plot(positionList, meanPosO, label='Overall')
plt.plot(positionList, meanPosP, color='green', label='Potential')
plt.title('potential and overall for positions')
plt.xlabel('Positions')
plt.ylabel('mean values')
# Without this call the label= arguments above are never rendered,
# so the two lines cannot be told apart.
plt.legend()
plt.show()

##### 어떤 feature가 wage와 value에 가장 큰 영향을 미칠까?
- 쓸모없는 칼럼을 제거하는 작업을 먼저 해준다.

In [None]:
# Columns removed before modelling, grouped only for readability.
processed_cols = [
    # index / media / miscellaneous columns
    'Unnamed: 0', 'Photo', 'Flag', 'Club Logo', 'Special', 'Body Type',
    'Real Face', 'Jersey Number', 'Joined', 'Loaned From',
    'Contract Valid Until',
    # columns named after position codes
    'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
    'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM',
    'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB',
    # GK* columns
    'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
]
# drop() returns a new frame; `dataset` itself is left untouched.
df = dataset.drop(columns=processed_cols)

In [None]:
# Preview the frame after the column drop.
df.head()

In [None]:
# Discard every row that still has a missing value in any remaining column.
df = df.dropna()

In [None]:
# Confirm the non-null counts and dtypes after dropping incomplete rows.
df.info()

In [None]:
# Wage and Value will be the model targets, but both columns hold strings,
# so they have to be converted to numbers first.
# Step 1: collect the last character of every entry (the magnitude suffix).
value_suffix = [entry[-1] for entry in df.Value]
wage_suffix = [entry[-1] for entry in df.Wage]

In [None]:
# Attach the suffix lists as two helper columns next to Value and Wage.
df = df.assign(**{
    'Value Factor': value_suffix,
    'Wage Factor': wage_suffix,
})

In [None]:
# Multiplier for each magnitude suffix found at the end of a Value / Wage string.
# The suffixes are one-character strings, so a bare amount ending in zero
# shows up as the string '0'; the integer key 0 alone would never match it.
# The integer key is kept so any existing lookup by 0 still works.
factors_dict = {'M': 1000000,
                'K': 1000,
                '0': 1,
                0: 1}

In [None]:
# Map the suffix characters in both helper columns to their multipliers.
# A nested dict limits each replacement to the named column.
df.replace(
    {
        'Value Factor': factors_dict,
        'Wage Factor': factors_dict,
    },
    inplace=True,
)

In [None]:
# Preview the frame with the helper factor columns filled in.
df.head()

In [None]:
# Convert both helper columns from str/object to a numeric dtype.
for factor_col in ('Value Factor', 'Wage Factor'):
    df[factor_col] = pd.to_numeric(df[factor_col])

In [None]:
# Check that the two factor columns now have a numeric dtype.
df.info()

In [None]:
# Remove every '€', 'M' and 'K' character so only the numeric text remains.
strip_table = str.maketrans('', '', '€MK')
for money_col in ('Value', 'Wage'):
    df[money_col] = df[money_col].str.translate(strip_table)
df.head()

In [None]:
# Parse the cleaned strings as numbers and scale each one by its multiplier.
for money_col, factor_col in (('Value', 'Value Factor'), ('Wage', 'Wage Factor')):
    df[money_col] = pd.to_numeric(df[money_col]) * df[factor_col]

# The helper columns existed only for the multiplication above.
df = df.drop(columns=['Value Factor', 'Wage Factor'])
df.head()

## Simple Linear Regression
- 먼저 이론적으로 선수의 가치와 가장 연관성이 높을 것 같은 Overall을 기준으로 모델을 만들어 봅니다.

In [None]:
# Single-feature design matrix: selecting with a list keeps a 2-D (n, 1) shape,
# which is what the estimators expect.
X = df[['Overall']].values
# Regression targets as 1-D arrays.
y_wage = df['Wage'].values
y_value = df['Value'].values

In [None]:
from sklearn.model_selection import train_test_split
# Both targets use the same 80/20 split settings and the same seed.
split_opts = dict(test_size=0.2, random_state=0)
(X_wage_train, X_wage_test,
 y_wage_train, y_wage_test) = train_test_split(X, y_wage, **split_opts)
(X_value_train, X_value_test,
 y_value_train, y_value_test) = train_test_split(X, y_value, **split_opts)

In [None]:
from sklearn.linear_model import LinearRegression
# One simple linear model per target; fit() returns the estimator itself.
SL_regressor_forWage = LinearRegression().fit(X_wage_train, y_wage_train)
SL_regressor_forValue = LinearRegression().fit(X_value_train, y_value_train)
# Leave the fitted value model as the cell's displayed result.
SL_regressor_forValue

In [None]:
# Predict on the held-out test splits; the test-set plot cells draw these as the fitted line.
y_wage_pred = SL_regressor_forWage.predict(X_wage_test)
y_value_pred = SL_regressor_forValue.predict(X_value_test)

In [None]:
# Training split: actual wages (red points) against the fitted line (blue).
plt.scatter(X_wage_train, y_wage_train, color='red')
plt.plot(X_wage_train, SL_regressor_forWage.predict(X_wage_train), color='blue')
plt.title('wage model(train)')
# Axis labels so the figure can be read on its own.
plt.xlabel('Overall')
plt.ylabel('Wage (€)')
plt.show()

In [None]:
# Test split: actual wages (red points) against the model's predictions (blue).
plt.scatter(X_wage_test, y_wage_test, color='red')
plt.plot(X_wage_test, y_wage_pred, color='blue')
plt.title('wage model(test)')
# Axis labels so the figure can be read on its own.
plt.xlabel('Overall')
plt.ylabel('Wage (€)')
plt.show()

In [None]:
# Training split: actual values (red points) against the fitted line (blue).
plt.scatter(X_value_train, y_value_train, color='red')
plt.plot(X_value_train, SL_regressor_forValue.predict(X_value_train), color='blue')
plt.title('value model(train)')
# Axis labels so the figure can be read on its own.
plt.xlabel('Overall')
plt.ylabel('Value (€)')
plt.show()

In [None]:
# Test split: actual values (red points) against the model's predictions (blue).
plt.scatter(X_value_test, y_value_test, color='red')
plt.plot(X_value_test, y_value_pred, color='blue')
plt.title('value model(test)')
# Axis labels so the figure can be read on its own.
plt.xlabel('Overall')
plt.ylabel('Value (€)')
plt.show()

##### 예상대로 평범한 Linear Regression으론 좋은 fit을 가지는 모델을 만들기 힘들었다.
- 하지만 플롯을 보면 Wage보다 Value가 Overall과 더 뚜렷한 상관관계를 가진다는 것을 유추할 수 있다.

In [None]:
# Feature columns fed to the random-forest models, in the order they are passed.
rf_feature_cols = [
    'Potential', 'Overall', 'Age', 'International Reputation', 'Weak Foot',
    'Skill Moves', 'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing',
    'Volleys', 'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing',
    'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Reactions',
    'Balance', 'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
    'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
    'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
]
# 2-D array with one row per record and one column per feature above.
X_rf = df[rf_feature_cols].values

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random-forest models for wage and value on the multi-feature matrix.
# random_state is fixed so the forests, and every number derived from them,
# are reproducible across runs; the splits below already use the same seed.
rf_wage_regressor = RandomForestRegressor(n_estimators=100, random_state=0)
rf_value_regressor = RandomForestRegressor(n_estimators=100, random_state=0)
# Re-split using X_rf; this rebinds the *_train / *_test names that the
# single-feature models used earlier.
X_wage_train, X_wage_test, y_wage_train, y_wage_test = train_test_split(X_rf, y_wage, test_size = 0.2, random_state = 0)
X_value_train, X_value_test, y_value_train, y_value_test = train_test_split(X_rf, y_value, test_size = 0.2, random_state = 0)
rf_wage_regressor.fit(X_wage_train,y_wage_train)
rf_value_regressor.fit(X_value_train,y_value_train)

In [None]:
# Feature row(s) for the record named 'L. Messi', using the same columns and
# column order as the random-forest training matrix.
messi_feature_cols = [
    'Potential', 'Overall', 'Age', 'International Reputation', 'Weak Foot',
    'Skill Moves', 'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing',
    'Volleys', 'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing',
    'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Reactions',
    'Balance', 'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
    'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
    'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
]
is_messi = df['Name'] == 'L. Messi'
messi = df.loc[is_messi, messi_feature_cols].values

In [None]:
# Predict Lionel Messi's market value (Overall 94) with the simple linear model.
# ndarray.item() replaces np.asscalar, which was removed from NumPy; like
# asscalar, it requires the prediction array to hold exactly one element.
linear_estimate = SL_regressor_forValue.predict([[94]]).item()
print('Linear Regressor says Lionel Messi worth about €', linear_estimate)
# Same prediction with the random-forest model, using his full feature row.
forest_estimate = rf_value_regressor.predict(messi).item()
print('Random Forest Regressor says Lionel Messi worth about €', forest_estimate)