In [1]:
import pandas as pd
import altair as alt
import numpy as np

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split, RandomizedSearchCV, GridSearchCV

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso

from scipy.stats import loguniform

In [4]:
import os
# Change the working directory to the current path minus its last three
# characters.
# NOTE(review): presumably this strips a three-letter subfolder name so that
# the relative 'data/...' path read later resolves — confirm the intended layout.
# Re-running this cell trims three more characters each time, so it is only
# safe to run once per kernel session.
os.chdir(os.getcwd()[:-3])

In [13]:
# Load the player table; the first CSV column becomes the row index.
df = pd.read_csv('data/data-21.csv', index_col=0)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,...,Penalties,Composure,Defensive Awareness,Standing Tackle,Sliding Tackle,GK Diving,GK Handling,GK Kicking,GK Positioning,GK Reflexes
0,253283,Facundo Pellistri,18,https://cdn.sofifa.com/players/253/283/20_60.png,Uruguay,https://cdn.sofifa.com/flags/uy.png,71,87,Peñarol,https://cdn.sofifa.com/teams/101110/light_30.png,...,66.0,61.0,35.0,11.0,18.0,9.0,12.0,7.0,8.0,7.0
1,179813,Edinson Cavani,32,https://cdn.sofifa.com/players/179/813/20_60.png,Uruguay,https://cdn.sofifa.com/flags/uy.png,86,86,Paris Saint-Germain,https://cdn.sofifa.com/teams/73/light_30.png,...,85.0,80.0,57.0,48.0,39.0,12.0,5.0,13.0,13.0,10.0
2,245541,Giovanni Reyna,17,https://cdn.sofifa.com/players/245/541/20_60.png,United States,https://cdn.sofifa.com/flags/us.png,68,87,Borussia Dortmund,https://cdn.sofifa.com/teams/22/light_30.png,...,50.0,59.0,30.0,23.0,24.0,10.0,13.0,14.0,12.0,7.0
3,233419,Raphael Dias Belloli,23,https://cdn.sofifa.com/players/233/419/20_60.png,Brazil,https://cdn.sofifa.com/flags/br.png,81,85,Stade Rennais FC,https://cdn.sofifa.com/teams/74/light_30.png,...,73.0,79.0,45.0,54.0,38.0,8.0,7.0,13.0,8.0,14.0
4,198710,James Rodríguez,28,https://cdn.sofifa.com/players/198/710/20_60.png,Colombia,https://cdn.sofifa.com/flags/co.png,82,82,Everton,https://cdn.sofifa.com/teams/7/light_30.png,...,81.0,87.0,52.0,41.0,44.0,15.0,15.0,15.0,5.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18536,5594,Rémy Vercoutre,37,https://cdn.sofifa.com/players/005/594/18_60.png,France,https://cdn.sofifa.com/flags/fr.png,77,77,Stade Malherbe Caen,https://cdn.sofifa.com/teams/210/light_30.png,...,23.0,20.0,,11.0,11.0,76.0,76.0,77.0,77.0,77.0
18537,3395,Shaun Wright-Phillips,34,https://cdn.sofifa.com/players/003/395/17_60.png,England,https://cdn.sofifa.com/flags/gb-eng.png,68,68,New York Red Bulls,https://cdn.sofifa.com/teams/689/light_30.png,...,64.0,70.0,,27.0,33.0,14.0,9.0,14.0,11.0,12.0
18538,388,Sol Campbell,35,https://cdn.sofifa.com/players/000/388/11_60.png,England,https://cdn.sofifa.com/flags/gb-eng.png,75,79,Newcastle United,https://cdn.sofifa.com/teams/13/light_30.png,...,37.0,,,77.0,73.0,11.0,7.0,12.0,5.0,9.0
18539,2956,Stiliyan Petrov,32,https://cdn.sofifa.com/players/002/956/13_60.png,Bulgaria,https://cdn.sofifa.com/flags/bg.png,76,76,Aston Villa,https://cdn.sofifa.com/teams/2/light_30.png,...,67.0,,,62.0,70.0,9.0,9.0,7.0,9.0,15.0


In [14]:
# Club names used to subset the full player table.
clubs = [
    'Manchester United',
    'Tottenham Hotspur',
    'Manchester City',
    'Liverpool',
    'Burnley',
    'Chelsea',
    'Everton',
    'Leicester City',
    'Southampton',
    'Aston Villa',
    'Newcastle United',
    'Sheffield United',
    'Arsenal',
    'Brighton & Hove Albion',
    'Bournemouth',
    'West Bromwich Albion',
    'Watford',
    'West Ham United',
    'Crystal Palace',
    'Leeds United',
]

# Keep only the rows whose Club is one of the names listed above.
en_df = df.loc[df['Club'].isin(clubs)]
en_df

Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,...,Penalties,Composure,Defensive Awareness,Standing Tackle,Sliding Tackle,GK Diving,GK Handling,GK Kicking,GK Positioning,GK Reflexes
4,198710,James Rodríguez,28,https://cdn.sofifa.com/players/198/710/20_60.png,Colombia,https://cdn.sofifa.com/flags/co.png,82,82,Everton,https://cdn.sofifa.com/teams/7/light_30.png,...,81.0,87.0,52.0,41.0,44.0,15.0,15.0,15.0,5.0,14.0
10,246147,Mason Greenwood,18,https://cdn.sofifa.com/players/246/147/20_60.png,England,https://cdn.sofifa.com/flags/gb-eng.png,77,89,Manchester United,https://cdn.sofifa.com/teams/11/light_30.png,...,64.0,76.0,35.0,39.0,33.0,5.0,6.0,6.0,9.0,8.0
12,221479,Dominic Calvert-Lewin,23,https://cdn.sofifa.com/players/221/479/20_60.png,England,https://cdn.sofifa.com/flags/gb-eng.png,79,84,Everton,https://cdn.sofifa.com/teams/7/light_30.png,...,77.0,71.0,38.0,31.0,23.0,12.0,12.0,8.0,7.0,11.0
22,251954,Crysencio Summerville,18,https://cdn.sofifa.com/players/251/954/20_60.png,Netherlands,https://cdn.sofifa.com/flags/nl.png,68,83,Leeds United,https://cdn.sofifa.com/teams/8/light_30.png,...,59.0,58.0,20.0,24.0,26.0,9.0,9.0,15.0,11.0,10.0
25,236610,Moise Kean,20,https://cdn.sofifa.com/players/236/610/20_60.png,Italy,https://cdn.sofifa.com/flags/it.png,74,83,Everton,https://cdn.sofifa.com/teams/7/light_30.png,...,65.0,69.0,27.0,28.0,16.0,13.0,8.0,10.0,7.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18524,2148,Hatem Trabelsi,29,https://cdn.sofifa.com/players/002/148/07_60.png,Tunisia,https://cdn.sofifa.com/flags/tn.png,79,82,Manchester City,https://cdn.sofifa.com/teams/10/light_30.png,...,72.0,,,84.0,,8.0,10.0,72.0,15.0,7.0
18526,5467,Jamie Carragher,34,https://cdn.sofifa.com/players/005/467/13_60.png,England,https://cdn.sofifa.com/flags/gb-eng.png,77,77,Liverpool,https://cdn.sofifa.com/teams/9/light_30.png,...,38.0,,,76.0,80.0,15.0,13.0,8.0,14.0,15.0
18532,684,Mark Schwarzer,42,https://cdn.sofifa.com/players/000/684/16_60.png,Australia,https://cdn.sofifa.com/flags/au.png,73,73,Leicester City,https://cdn.sofifa.com/teams/95/light_30.png,...,34.0,,,19.0,20.0,70.0,73.0,67.0,78.0,69.0
18538,388,Sol Campbell,35,https://cdn.sofifa.com/players/000/388/11_60.png,England,https://cdn.sofifa.com/flags/gb-eng.png,75,79,Newcastle United,https://cdn.sofifa.com/teams/13/light_30.png,...,37.0,,,77.0,73.0,11.0,7.0,12.0,5.0,9.0


# 3. Methods

## 3.1. Data preprocessing

In [15]:
def four_positions(df1):
    """Collapse a player's detailed position code into one of four groups.

    Intended for row-wise use, e.g. ``frame.apply(four_positions, axis=1)``.

    Parameters
    ----------
    df1 : pandas.Series or mapping
        A single player record exposing a ``'Position'`` entry.

    Returns
    -------
    object
        ``'GK'``, ``'DF'``, ``'MF'`` or ``'ST'`` when the code is recognised;
        otherwise the record's ``Position`` value unchanged (e.g. NaN).
    """
    # Read the value directly: the record is never modified, so the per-row
    # copy the function used to make was pure overhead.
    position = df1['Position']

    # Checked in order; tuple membership keeps plain equality semantics, so a
    # missing (NaN) position matches nothing and falls through unchanged.
    groups = (
        ('GK', ('GK',)),
        ('DF', ('RB', 'LB', 'CB', 'LCB', 'RCB', 'RWB', 'LWB')),
        ('MF', ('LDM', 'RDM', 'CDM', 'LCM', 'RCM', 'CM',
                'LM', 'RM', 'LAM', 'RAM', 'CAM')),
        ('ST', ('RS', 'ST', 'LS', 'LF', 'RF', 'RW', 'LW', 'CF')),
    )
    for label, codes in groups:
        if position in codes:
            return label
    return position
    
def growth(df1):
    """Return a copy of ``df1`` with a ``Growth`` column added.

    ``Growth`` is ``Potential`` minus ``Overall``; the input frame itself is
    left untouched.
    """
    headroom = df1['Potential'] - df1['Overall']
    return df1.assign(Growth=headroom)

In [16]:
# Rebuild the club subset from the full table and add the Growth column to it.
en_df = growth(df.loc[df['Club'].isin(clubs)])

# Replace each player's detailed Position code, row by row, via four_positions.
en_df['Position'] = en_df.apply(four_positions, axis=1)

In [17]:
# Partition the raw column names by the type of the value stored at index
# label 1: str -> str_list, anything else -> num_list.
# NOTE(review): a text column whose entry at label 1 is missing (NaN) would be
# filed under num_list — confirm that this probe row is representative.
str_list = []
num_list = []
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, Series) pairs on old and new pandas alike.
for colname, colvalue in df.items():
    if isinstance(colvalue[1], str):
        str_list.append(colname)
    else:
        num_list.append(colname)

target_feature = ['Growth']
drop_features = ['ID', 'Name', 'Photo', 'Flag', 'Club Logo','Jersey Number','Joined',
                 'Special','Body Type', 'Release Clause',
               'Weight','Height','Contract Valid Until','LS', 'ST', 'RS', 'LW',
       'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM',
       'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB',
       'GK', 'Overall', 'Potential', 'Work Rate', 'Nationality', 'Real Face']

# str_list and num_list are disjoint by construction, so only the target and
# the dropped columns need removing. Filtering in column order (instead of
# list(set(...))) keeps the feature order identical from run to run; set
# iteration order over strings changes between Python processes.
_excluded = set(target_feature) | set(drop_features)

numeric_features = [col for col in num_list if col not in _excluded]

categorical_features = [col for col in str_list if col not in _excluded]

In [18]:
# Keep only the first character of each of these text columns.
# NOTE(review): presumably the leading digit of a rating string — confirm
# against the raw values.
for text_col in ('Weak Foot', 'International Reputation', 'Skill Moves'):
    en_df[text_col] = en_df[text_col].str[:1]

## Model preparation

In [19]:
# Club groupings that get their own tier label.
top3_clubs = ["Manchester City", "Liverpool", "Chelsea"]
bottom3_clubs = ["Bournemouth", "West Bromwich Albion", "Watford"]

# One boolean mask per tier, evaluated in order by np.select.
conditions = [
    en_df['Club'] == "Manchester United",
    en_df['Club'].isin(top3_clubs),
    en_df['Club'].isin(bottom3_clubs),
]

# Tier labels; the last one is the fallback for every club not matched above.
values = ['ManUtd', 'Top3', 'Bottom3', 'Others']

# Pass the fallback as `default` rather than as a catch-all `True` condition:
# np.select's implicit default is the integer 0, which newer NumPy releases
# refuse to combine with string choices.
# NOTE: this overwrites Club in place, so running the cell a second time maps
# every row to 'Others' (the original club names are gone by then).
en_df['Club'] = np.select(conditions, values[:-1], default=values[-1])
en_df.Club.unique()

array(['Others', 'ManUtd', 'Top3', 'Bottom3'], dtype=object)

In [20]:
# Fill missing values per column group: 0.0 for the numeric inputs, "NA" for
# the categorical inputs, and 0.0 for the target.
for cols, filler in (
    (numeric_features, 0.0),
    (categorical_features, "NA"),
    (target_feature, 0.0),
):
    en_df[cols] = en_df[cols].fillna(filler)

In [21]:
# Hold out the 'Bottom3' tier as the test set; all other tiers form the training set.
is_bottom3 = en_df['Club'] == 'Bottom3'
train_df, test_df = en_df[~is_bottom3], en_df[is_bottom3]

In [22]:
# Encoder settings shared by both constructor spellings below.
_ohe_kwargs = dict(drop="if_binary", handle_unknown="ignore", dtype="int")
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
    # the older `sparse` keyword was removed in 1.4.
    _onehot = OneHotEncoder(sparse_output=False, **_ohe_kwargs)
except TypeError:
    # Older scikit-learn only accepts the original `sparse` keyword.
    _onehot = OneHotEncoder(sparse=False, **_ohe_kwargs)

# Scale the numeric columns, one-hot encode the categorical ones, and
# explicitly drop the columns listed in drop_features.
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    (_onehot, categorical_features),
    ('drop', drop_features),
)

# Input column names in the order the transformer receives them.
feature_names = numeric_features + categorical_features

# 4. Results

# 5. Conclusion

In [23]:
# Separate the predictors from the Growth target for both partitions.
X_train = train_df.drop(columns=['Growth'])
y_train = train_df['Growth']
X_test = test_df.drop(columns=['Growth'])
y_test = test_df['Growth']

In [24]:
# Ridge regression with default settings on top of the shared preprocessor.
ridge_pipe = make_pipeline(preprocessor, Ridge())

In [25]:
# Fit on the training partition; the trailing semicolon suppresses the cell's output.
ridge_pipe.fit(X_train, y_train);

In [26]:
def mean_cross_val_scores(model, X_train, y_train, **kwargs):
    """Cross-validate ``model`` and return the mean of each reported metric.

    Parameters
    ----------
    model : estimator
        Passed straight through to ``cross_validate``.
    X_train, y_train
        Training features and target, forwarded to ``cross_validate``.
    **kwargs
        Extra keyword arguments for ``cross_validate``
        (e.g. ``cv``, ``return_train_score``).

    Returns
    -------
    pandas.Series
        One entry per key returned by ``cross_validate``, holding that key's
        mean across folds rounded to 5 decimal places.
    """
    scores = cross_validate(model, X_train, y_train, **kwargs)

    # Series.round does the per-entry rounding in one step and avoids
    # integer-position lookups (mean_scores[i]) on a string-labelled Series,
    # which pandas has deprecated.
    return pd.DataFrame(scores).mean().round(5)

In [27]:
# Collects one Series of mean cross-validation scores per model, keyed by a display name.
results = {}

In [28]:

# 10-fold cross-validation of the ridge pipeline, keeping train scores as well.
results['Ridge regression'] = mean_cross_val_scores(ridge_pipe, X_train, y_train, cv=10, return_train_score=True)

In [29]:
# Render the collected scores as a table with one column per model.
pd.DataFrame(results)

Unnamed: 0,Ridge regression
fit_time,0.00813
score_time,0.00281
test_score,0.82485
train_score,0.88353


In [30]:
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression

In [31]:
# Baseline that always predicts the mean of the training target.
dummy_pipe = make_pipeline(preprocessor, DummyRegressor(strategy="mean"))

In [32]:
# Unregularised linear regression on the same preprocessed features.
lr_pipe = make_pipeline(preprocessor, LinearRegression())

In [33]:
# Same 10-fold protocol as the ridge model, applied to the dummy baseline.
results['Baseline dummy'] = mean_cross_val_scores(dummy_pipe, X_train, y_train, cv=10, return_train_score=True)

In [34]:
# Same 10-fold protocol, applied to plain linear regression.
results['Linear regression'] = mean_cross_val_scores(lr_pipe, X_train, y_train, cv=10, return_train_score=True)

In [35]:
# Side-by-side comparison of the models evaluated so far.
pd.DataFrame(results)

Unnamed: 0,Ridge regression,Baseline dummy,Linear regression
fit_time,0.00813,0.00729,0.01778
score_time,0.00281,0.00289,0.00279
test_score,0.82485,-0.08033,0.81835
train_score,0.88353,0.0,0.8833


In [36]:
from sklearn.preprocessing import PolynomialFeatures

In [37]:
# Ridge regression on polynomial expansions (default settings) of the preprocessed features.
poly_pipe = make_pipeline(preprocessor, PolynomialFeatures(), Ridge())

In [38]:
# Same 10-fold protocol, applied to the polynomial-feature pipeline.
results['Polynomial regression'] = mean_cross_val_scores(poly_pipe, X_train, y_train, cv=10, return_train_score=True)

In [39]:
# Final comparison table, rounded to two decimals for display.
# In the recorded output the polynomial model reaches train_score 1.0 but
# test_score 0.53, i.e. a much larger train/validation gap than the others.
pd.DataFrame(results).round(2)

Unnamed: 0,Ridge regression,Baseline dummy,Linear regression,Polynomial regression
fit_time,0.01,0.01,0.02,0.05
score_time,0.0,0.0,0.0,0.0
test_score,0.82,-0.08,0.82,0.53
train_score,0.88,0.0,0.88,1.0


In [40]:
# Fit the ridge pipeline on the training partition so its coefficients can be
# inspected below. ridge_pipe was already fitted on the same data in an earlier
# cell, so this repeats that fit.
ridge_pipe.fit(X_train, y_train);

In [41]:
# Output column labels in the order the column transformer lists its steps:
# the scaled numeric features first, then the one-hot encoded columns.
onehot_step = ridge_pipe.named_steps["columntransformer"].named_transformers_["onehotencoder"]
feature_names = numeric_features + onehot_step.get_feature_names_out().tolist()

# One row per transformed feature, ranked by absolute coefficient size.
ridge_coefs = ridge_pipe.named_steps["ridge"].coef_.flatten()
feature_coef_ridge = pd.DataFrame(
    data={"Coefficient": ridge_coefs, "Magnitude": abs(ridge_coefs)},
    index=feature_names,
).sort_values("Magnitude", ascending=False)

In [42]:
# Ten largest coefficients by magnitude, rounded for display.
feature_coef_ridge.head(10).round(2)

Unnamed: 0,Coefficient,Magnitude
Age,-4.88,4.88
International Reputation_4,2.57,2.57
International Reputation_1,-1.88,1.88
Following,-1.12,1.12
Likes,1.12,1.12
International Reputation_2,-1.07,1.07
Club_Others,-1.03,1.03
Wage,-1.03,1.03
Dribbling,-0.87,0.87
GK Diving,-0.82,0.82


In [43]:
# Score the fitted ridge pipeline on the held-out 'Bottom3' partition.
ridge_pipe.score(X_test, y_test)



0.8408845480826803

## Find the players

In [44]:
# Predicted Growth for every player in the held-out partition.
prediction = ridge_pipe.predict(X_test)



In [45]:
# Copy of the test features with the model's predicted Growth attached.
final_test = X_test.assign(Growth=prediction)

In [46]:
# final_test was built from X_test, which still contains Name, so this
# reassigns the same values.
final_test['Name'] = X_test.Name

In [47]:
# Names of the under-20 players, highest predicted Growth first; used below to sort the chart's y axis.
order = final_test.query('Age < 20').sort_values('Growth', ascending=False).Name.tolist()

In [48]:
# Bubble chart of the under-20 test players: Wage on x, Name on y (sorted by
# `order`), bubble size = predicted Growth, colour = position group.
# NOTE(review): the x-axis title says thousands of dollars, but final_test's
# Wage is not divided by 1000 the way df_eda's Wage is in the EDA section —
# confirm the unit shown here.
alt.Chart(final_test.query('Age < 20'),
          title="Rico Richards is good midfielder option"
         ).mark_circle(interpolate='monotone').encode(
    x=alt.X('Wage', title="Player wage (in thousand dollars)", scale=alt.Scale(zero=False)),
    y=alt.Y('Name', title='', sort=order),
    size='Growth',
color='Position').properties(height=200,width=300)

# EDA

In [49]:
# EDA copy of the club subset with Wage floor-divided by 1000.
df_eda = en_df.assign(Wage=en_df['Wage'] // 1000)

In [50]:
# Line chart of mean Growth by Age for players aged 30 or younger, one line
# per club tier; the legend is positioned manually inside the plot area.
alt.Chart(df_eda.query('Age<=30'), title="ManUtd youth players with low growth potential").mark_line(interpolate='monotone').encode(
    x=alt.X('Age', title="Player age (in years)"),
    y=alt.Y('mean(Growth)', title='Mean growth of the player'),
    color=alt.Color('Club', title="", legend=alt.Legend(
        orient='none',
        legendX=330, legendY=20,
        direction='vertical',
        titleAnchor='middle'))).properties(height=200,width=300)

In [51]:
# Distinct raw Position codes in the full table (before grouping by four_positions).
df.Position.unique()

array(['RM', 'ST', 'LM', 'RW', 'LCM', 'RS', 'RB', 'LW', 'CM', 'CB', 'CDM',
       'CAM', 'LB', 'RAM', 'RCM', 'RCB', 'RWB', 'LDM', 'LAM', 'LCB', 'CF',
       'LS', 'GK', 'LWB', 'LF', 'RDM', 'RF', nan], dtype=object)

In [52]:
# Box plots of Growth per Position, faceted into one panel per club tier
# (two panels per row); colour repeats the tier, so its legend is hidden.
alt.Chart(df_eda).mark_boxplot().encode(
    y='Position',
    x=alt.X('Growth', scale=alt.Scale(zero=False), title="Player growth"),
    color=alt.Color('Club', legend=None)
).properties(height=75,width=150).facet(alt.Facet('Club', title=""), columns=2, 
        title="ManUtd Goalkeepers & Midfielders have low growth potential")