In [142]:
#===============================================================================
# Import libraries
#===============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import mode
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score, f1_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
%matplotlib inline

In [119]:
df = pd.read_csv('./dataset/players_without_nan.csv')

In [120]:
df['player'] = df['player'].factorize(sort=True)[0]
df['club'] = df['club'].factorize(sort=True)[0]
df['leagueCountry'] = df['leagueCountry'].factorize(sort=True)[0]
df['position'] = df['position'].factorize(sort=True)[0]
df['skintone'] = df['skintone'].apply(lambda x: 0 if x >= 0.5 else 1)
df['games_meanIAT'] = df['games'] * df['meanIAT']
df['games_nIAT'] = df['games'] * df['nIAT']
df['games_seIAT'] = df['games'] * df['seIAT']
df['games_meanExp'] = df['games'] * df['meanExp']
df['games_nExp'] = df['games'] * df['nExp']
df['games_seExp'] = df['games'] * df['seExp']
useless_cols = ['playerShort', 'birthday', 'photoID', 'rater1', 'rater2', 'Alpha_3', 'refCount']
df.drop(useless_cols, axis=1, inplace=True)

In [169]:
df_agg = pd.DataFrame()
df_agg['total_game'] = df.groupby('player')['games'].agg('sum')
df_agg['total_yellow'] = df.groupby('player')['yellowCards'].agg('sum') / df_agg['total_game']
df_agg['total_yellow_red'] = df.groupby('player')['yellowReds'].agg('sum') / df_agg['total_game']
df_agg['total_direct_red'] = df.groupby('player')['redCards'].agg('sum') / df_agg['total_game']
df_agg['total_red'] = df.groupby('player')['allreds'].agg('sum') / df_agg['total_game']
df_agg['total_win'] = df.groupby('player')['victories'].agg('sum') / df_agg['total_game']
df_agg['total_lose'] = df.groupby('player')['defeats'].agg('sum') / df_agg['total_game']
df_agg['total_draw'] = df.groupby('player')['ties'].agg('sum') / df_agg['total_game']
df_agg['total_goal'] = df.groupby('player')['goals'].agg('sum') / df_agg['total_game']
df_agg['weighted_meanIAT'] = df.groupby('player')['games_meanIAT'].agg('sum') / df_agg['total_game']
#df_agg['weighted_nIAT'] = df.groupby('player')['games_nIAT'].agg('sum') / df_agg['total_game']
df_agg['weighted_seIAT'] = df.groupby('player')['games_seIAT'].agg('sum') / df_agg['total_game']
df_agg['weighted_meanExp'] = df.groupby('player')['games_meanExp'].agg('sum') / df_agg['total_game']
#df_agg['weighted_nExp'] = df.groupby('player')['games_nExp'].agg('sum') / df_agg['total_game']
df_agg['weighted_seExp'] = df.groupby('player')['games_seExp'].agg('sum') / df_agg['total_game']

There are some players who transfer to another country. For such players, we consider the country with highest number of occurrence.

In [170]:
df_agg['league'] = df.groupby('player')['leagueCountry'].apply(lambda x: mode(x)[0][0])
df_agg['club'] = df.groupby('player')['club'].apply(lambda x: mode(x)[0][0])
#df_agg['height'] = df.groupby('player')['height'].apply(lambda x: mode(x)[0][0])
#df_agg['weight'] = df.groupby('player')['weight'].apply(lambda x: mode(x)[0][0])
df_agg['position'] = df.groupby('player')['position'].apply(lambda x: mode(x)[0][0])
#df_agg['birthyear'] = df.groupby('player')['birthyear'].apply(lambda x: mode(x)[0][0])
df_agg['skintone'] = df.groupby('player')['skintone'].apply(lambda x: mode(x)[0][0])

If your variables are of incomparable units (e.g. height in cm and weight in kg) then you should standardize variables, of course. Even if variables are of the same units but show quite different variances it is still a good idea to standardize before K-means. You see, K-means clustering is "isotropic" in all directions of space and therefore tends to produce more or less round (rather than elongated) clusters. In this situation leaving variances unequal is equivalent to putting more weight on variables with smaller variance, so clusters will tend to be separated along variables with greater variance.

In [171]:
X = df_agg.iloc[:,1:-1]
Y = df_agg.iloc[:,-1].values
minmax_scaler = MinMaxScaler()
X_nrm = minmax_scaler.fit_transform(X)
kmeans = KMeans(n_clusters=2).fit(X_nrm)
labels = kmeans.labels_
print('silhouette score is:',
      silhouette_score(X_nrm, labels, metric="euclidean"))
print('accuracy of clusters is:',
      max(accuracy_score(Y, labels), 1 - accuracy_score(Y, labels)))
print('f1-score of clusters is:',
      max(f1_score(Y, labels), 1 - f1_score(Y, labels)))

silhouette score is: 0.330968459135
accuracy of clusters is: 0.656995486783
f1-score of clusters is: 0.558968732858


In [96]:
df['lvl1'] = df['games']*df['meanIAT']

In [99]:
df.groupby('player')['lvl1'].agg('sum') / df_agg['total_game']

Index(['player', 'club', 'leagueCountry', 'height', 'weight', 'position',
       'games', 'victories', 'ties', 'defeats', 'goals', 'yellowCards',
       'yellowReds', 'redCards', 'refNum', 'refCountry', 'meanIAT', 'nIAT',
       'seIAT', 'meanExp', 'nExp', 'seExp', 'allreds', 'skintone', 'birthyear',
       'lvl1'],
      dtype='object')

In [113]:
df_agg.describe()

Unnamed: 0,total_game,total_yellow,total_yellow_red,total_direct_red,total_red,total_win,total_lose,total_draw,total_goal,weighted_meanIAT,weighted_nIAT,weighted_seIAT,weighted_meanExp,weighted_nExp,weighted_seExp,skintone
count,1551.0,1551.0,1551.0,1551.0,1551.0,1551.0,1551.0,1551.0,1551.0,1551.0,1551.0,1551.0,1551.0,1551.0,1551.0,1551.0
mean,227.245648,0.135046,0.004318,0.004545,0.008863,0.425263,0.333703,0.241034,0.102878,0.346524,14767.017412,0.000276,0.439821,15366.07895,0.00133,0.758221
std,141.324412,0.080505,0.006596,0.008101,0.011106,0.102921,0.087955,0.056975,0.110786,0.014458,20582.876366,0.00036,0.098837,21310.018184,0.001878,0.428299
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.28753,483.666667,1e-05,0.065271,500.333333,3.7e-05,0.0
25%,121.0,0.07934,0.0,0.0,0.0,0.362941,0.281205,0.214286,0.022603,0.33657,3156.419513,0.000109,0.360363,3303.283402,0.000505,1.0
50%,206.0,0.125,0.0,0.00216,0.006006,0.41453,0.339744,0.241294,0.064378,0.340113,7611.47486,0.000228,0.398529,7839.669967,0.001002,1.0
75%,312.0,0.178439,0.006623,0.006689,0.0128,0.482107,0.384615,0.268432,0.150188,0.36066,19245.244028,0.000291,0.543682,19985.410978,0.00137,1.0
max,895.0,1.0,0.045455,0.166667,0.166667,1.0,1.0,1.0,0.778455,0.396512,348464.744048,0.009063,1.031407,358534.630952,0.04565,1.0


In [117]:
df.head()

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,...,meanIAT,nIAT,seIAT,meanExp,nExp,seExp,allreds,refCount,skintone,birthyear
0,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,1,0,...,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0,29,0.125,1979
1,aleksandar-kolarov,Aleksandar Kolarov,Manchester City,England,10.11.1985,187.0,80.0,Left Fullback,1,1,...,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0,29,0.125,1985
2,alexander-tettey,Alexander Tettey,Norwich City,England,04.04.1986,180.0,68.0,Defensive Midfielder,1,0,...,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0,29,1.0,1986
3,anders-lindegaard,Anders Lindegaard,Manchester United,England,13.04.1984,193.0,80.0,Goalkeeper,1,0,...,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0,29,0.25,1984
4,andreas-beck,Andreas Beck,1899 Hoffenheim,Germany,13.03.1987,180.0,70.0,Right Fullback,1,1,...,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0,29,0.0,1987


In [137]:
df_agg['league'].unique()

array([ 1.        ,  0.33333333,  0.        ,  0.66666667])