### Libraries

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import opti_recruit.feature_engineering as fe
import opti_recruit.get_team_features as gtf
import opti_recruit.similarity as si
import pandas as pd
from sklearn.pipeline import Pipeline,make_pipeline,make_union
from sklearn.compose import make_column_transformer,make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from opti_recruit.data import get_data, clean_data
import numpy as np

### Loading Data

In [3]:
dfs = get_data()
dfs.keys()

dict_keys([17, 18, 19, 20, 21, 22])

In [4]:
df22 = dfs[22]
df21 = dfs[21]

In [5]:
df22_new = fe.add_features(df22)
df21_new = fe.add_features(df21)

In [14]:
df22_new.head()

Unnamed: 0,sofifa_id,short_name,player_positions,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,...,defending_standing_tackle,defending_sliding_tackle,is_bench,potential_diff,att_rate,def_rate,age_bin,prefered_pos,player_pos,new_nationality
0,158023,L. Messi,"RW, ST, CF",93,93,78000000.0,320000.0,34,170,72,...,35,24,False,0,Medium,Low,30-34,RW,ATT,Argentina
1,188545,R. Lewandowski,ST,92,92,119500000.0,270000.0,32,185,81,...,42,19,False,0,High,Medium,30-34,ST,ATT,Poland
2,20801,Cristiano Ronaldo,"ST, LW",91,91,45000000.0,270000.0,36,187,83,...,32,24,False,0,High,Low,Over 34,ST,ATT,Portugal
3,190871,Neymar Jr,"LW, CAM",91,91,129000000.0,270000.0,29,175,68,...,32,29,False,0,High,Medium,25-29,LW,ATT,Brazil
4,192985,K. De Bruyne,"CM, CAM",91,91,125500000.0,350000.0,30,181,70,...,65,53,False,0,High,High,30-34,CM,MID,Others


### Data Preparation

In [7]:
for index, col in enumerate(df22_new.columns):
    print(f"{col:<26} | ", end="")
    if (index % 3 == 2):
        print("\n", end="")

sofifa_id                  | short_name                 | player_positions           | 
overall                    | potential                  | value_eur                  | 
wage_eur                   | age                        | height_cm                  | 
weight_kg                  | club_team_id               | club_name                  | 
league_name                | league_level               | club_position              | 
club_joined                | club_contract_valid_until  | nationality_name           | 
nation_team_id             | preferred_foot             | weak_foot                  | 
skill_moves                | international_reputation   | work_rate                  | 
body_type                  | release_clause_eur         | player_tags                | 
player_traits              | pace                       | shooting                   | 
passing                    | dribbling                  | defending                  | 
physic                     | att

In [6]:
to_drop = ['sofifa_id','short_name','player_positions','height_cm','weight_kg','club_team_id'
          ,'club_name' ,'league_name','club_position','club_joined','club_contract_valid_until','nationality_name'
          ,'nation_team_id','preferred_foot','weak_foot','work_rate','body_type','player_tags','player_traits'
          ,'is_bench','potential_diff','age_bin','player_pos','new_nationality'
          ,'value_eur','wage_eur','release_clause_eur']
#             'age']

In [7]:
df22_num = df22_new.drop(to_drop, axis = 1)

In [114]:
df22_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19239 entries, 0 to 19238
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   overall                      19239 non-null  int64   
 1   potential                    19239 non-null  int64   
 2   age                          19239 non-null  int64   
 3   league_level                 19178 non-null  float64 
 4   skill_moves                  19239 non-null  int64   
 5   international_reputation     19239 non-null  int64   
 6   pace                         17107 non-null  float64 
 7   shooting                     17107 non-null  float64 
 8   passing                      17107 non-null  float64 
 9   dribbling                    17107 non-null  float64 
 10  defending                    17107 non-null  float64 
 11  physic                       17107 non-null  float64 
 12  attacking_crossing           19239 non-null  int64   
 13  a

### Pipeline for Numerical DFrame

In [115]:
from sklearn.pipeline import Pipeline,make_pipeline,make_union
from sklearn.compose import make_column_transformer,make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn import set_config; set_config(display='diagram')



num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
num_col = make_column_selector(dtype_include=['float64','int64'])

cat_transformer = OneHotEncoder()
cat_col = make_column_selector(dtype_include=['object','category'])

preproc_basic = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough')


preproc_full = make_union(preproc_basic)
# preproc_full


In [116]:
SimpleImputer.get_feature_names_out = (lambda self, names=None: self.feature_names_in_)

In [117]:
X_train_transformed = preproc_basic.fit_transform(df22_num)


test_df = pd.DataFrame(X_train_transformed, 
             columns=preproc_basic.get_feature_names_out()
            )


In [118]:
def get_index(x):
    return df22_new[df22_new['short_name']==x].index.tolist()[0]

In [119]:
get_index('Cristiano Ronaldo')

2

In [120]:
test_df.head()

Unnamed: 0,pipeline__overall,pipeline__potential,pipeline__age,pipeline__league_level,pipeline__skill_moves,pipeline__international_reputation,pipeline__pace,pipeline__shooting,pipeline__passing,pipeline__dribbling,...,onehotencoder__prefered_pos_GK,onehotencoder__prefered_pos_LB,onehotencoder__prefered_pos_LM,onehotencoder__prefered_pos_LW,onehotencoder__prefered_pos_LWB,onehotencoder__prefered_pos_RB,onehotencoder__prefered_pos_RM,onehotencoder__prefered_pos_RW,onehotencoder__prefered_pos_RWB,onehotencoder__prefered_pos_ST
0,3.957501,3.60178,1.851089,-0.4746,2.146241,10.525295,1.628331,2.992852,3.548135,3.564477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,3.812154,3.43747,1.429869,-0.4746,2.146241,10.525295,0.949332,2.992852,2.284233,2.575529,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3.666806,3.27316,2.272309,-0.4746,3.448937,10.525295,1.822331,3.143798,2.389558,2.795295,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3.666806,3.27316,0.798039,-0.4746,3.448937,10.525295,2.210331,2.313597,3.021509,3.454594,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.666806,3.27316,1.008649,-0.4746,2.146241,7.83052,0.755332,2.540015,3.758786,2.795295,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Cosine Similarity

#### Cosine Similarity Matrix 

In [174]:
from sklearn.metrics.pairwise import cosine_similarity 
similarities = cosine_similarity(test_df)

#### Cricri 

In [122]:
playerIndex = similarities[2,:].argsort()[-16:][::-1][1:]
for i in playerIndex:
        print(df22_new.loc[i,'short_name'])

R. Lewandowski
S. Agüero
K. Benzema
S. Mané
A. Lacazette
A. Griezmann
L. Messi
K. Mbappé
A. Sánchez
Alexandre Pato
C. Immobile
L. Suárez
Rodrigo
P. Aubameyang
M. Salah


In [166]:
index = get_index('L. Messi')
playerIndex = similarities[index,:].argsort()[-11:][::-1][1:]
for i in playerIndex:
        print(df22_new.loc[i,'short_name'])

E. Hazard
Neymar Jr
M. Salah
M. Reus
H. Son
A. Sánchez
C. Vela
K. Coman
Á. Di María
K. Benzema


In [144]:
def normalize(array):
    return np.array([round(num, 2) for num in (array - min(array))*100/(max(array)-min(array))])

In [180]:
norm_simili= normalize(similarities[index,:])

In [179]:
np.sort(similarities[index,:])[-11:][::-1][1:]

array([0.97480236, 0.96869053, 0.96535517, 0.963597  , 0.96356528,
       0.96308683, 0.96255531, 0.96085239, 0.95864721, 0.95754507])

In [181]:
np.sort(norm_simili)[-11:][::-1][1:]

array([98.56, 98.21, 98.01, 97.91, 97.91, 97.88, 97.85, 97.76, 97.63,
       97.57])

In [184]:
dict(zip(similarities[index,:].argsort()[-11:][::-1][1:],
    np.sort(norm_simili)[-11:][::-1][1:]))

{75: 98.56,
 3: 98.21,
 17: 98.01,
 79: 97.91,
 13: 97.91,
 405: 97.88,
 141: 97.85,
 64: 97.76,
 33: 97.63,
 11: 97.57}

In [178]:
# Do not keep the first value as its the player similitude
similarities[index,:].argsort()[-11:][::-1][1:]

array([ 75,   3,  17,  79,  13, 405, 141,  64,  33,  11])

In [211]:
def cosine_recommendation(player):
    index = get_index(player)
    norm_sim_array= np.sort(normalize(similarities[index,:]))[-11:][::-1][1:]
    index_matrix = similarities[index,:].argsort()[-11:][::-1][1:]
    return {'index': index_matrix,'score':norm_sim_array}

In [151]:
df22_new.iloc[3]['short_name']

'Neymar Jr'

In [214]:
pd.DataFrame(cosine_recommendation('K. Havertz'))

Unnamed: 0,index,score
0,6,97.27
1,344,97.0
2,41,96.95
3,182,96.93
4,17,96.81
5,64,96.75
6,429,96.73
7,132,96.63
8,342,96.6
9,11,96.56


In [215]:
# def cosine_recommendation(df,player):
#     index = get_index(df,player)
#     similarities = get_similarity_dataframe(df)
#     norm_sim_array= np.sort(normalize(similarities[index,:]))[-11:][::-1][1:]
#     index_matrix = similarities[index,:].argsort()[-11:][::-1][1:]
#     dict_simili = {'index': index_matrix,'score':norm_sim_array}
#     reco_df = pd.DataFrame(dict_simili)
#     reco_df['Name'] = df.iloc[reco_df.index]['short_name']
#     return reco_df

In [9]:
res = si.cosine_recommendation('K. Havertz')

In [13]:
res.reset_index().to_dict()

{'index_player': {0: 6,
  1: 344,
  2: 41,
  3: 182,
  4: 17,
  5: 64,
  6: 429,
  7: 132,
  8: 342,
  9: 11},
 'score': {0: 97.27,
  1: 97.0,
  2: 96.95,
  3: 96.93,
  4: 96.81,
  5: 96.75,
  6: 96.73,
  7: 96.63,
  8: 96.6,
  9: 96.56},
 'sofifa_id': {0: 231747,
  1: 212194,
  2: 211110,
  3: 220834,
  4: 209331,
  5: 213345,
  6: 202166,
  7: 222492,
  8: 211300,
  9: 165153}}

In [204]:
for i,v in cosine_recommendation('K. Havertz').items():
    print(df22_new.iloc[i]['short_name'], v)

K. Mbappé 97.27
J. Brandt 97.0
P. Dybala 96.95
Marco Asensio 96.93
M. Salah 96.81
K. Coman 96.75
J. Draxler 96.73
L. Sané 96.63
A. Martial 96.6
K. Benzema 96.56


### KNN Similarity

In [185]:
from sklearn.neighbors import NearestNeighbors

reco = NearestNeighbors(n_neighbors=11, algorithm='ball_tree').fit(test_df)



In [140]:
reco.kneighbors(test_df)

(array([[0.        , 4.37251756, 5.09786212, ..., 5.76909196, 5.92727728,
         5.94435066],
        [0.        , 2.81167229, 3.64997796, ..., 4.96830537, 5.0442159 ,
         5.13351665],
        [0.        , 3.64997796, 4.99744547, ..., 5.31557724, 5.35417191,
         5.43989973],
        ...,
        [0.        , 1.5152964 , 1.59202776, ..., 2.19327266, 2.22910393,
         2.23736362],
        [0.        , 1.97220919, 2.22265254, ..., 2.91015225, 2.91222025,
         2.93074848],
        [0.        , 2.77200417, 2.79294138, ..., 3.01583612, 3.04754987,
         3.07467065]]),
 array([[    0,     3,     2, ...,     1,    83,    79],
        [    1,    23,     2, ...,    83,    17,    25],
        [    2,     1,    11, ...,     3,    97,    83],
        ...,
        [19236, 19158, 19226, ..., 19207, 19208, 19234],
        [19237, 19041, 19056, ..., 19014, 18923, 18801],
        [19238, 19110, 19217, ..., 18852, 18490, 18991]]))

In [186]:
players_recommended_index = reco.kneighbors(test_df)[1]

In [187]:
players_recommended_scores = reco.kneighbors(test_df)[0]

In [201]:
players_recommended_scores[index][1:]


array([4.37251756, 5.09786212, 5.31253871, 5.40636048, 5.57693262,
       5.737519  , 5.74582279, 5.76909196, 5.92727728, 5.94435066])

In [142]:
print(f'Here are 10 players similar to K.Mbappe :' '\n')
for i in players_recommended[index][1:]:
    print(df22_new.iloc[i]['short_name'])

Here are 10 players similar to K.Mbappe :

Neymar Jr
Cristiano Ronaldo
M. Salah
E. Hazard
H. Son
K. Benzema
Á. Di María
R. Lewandowski
A. Griezmann
M. Reus
