In [13]:
#===============================================================================
# Import libraries
#===============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import mode
from sklearn.cluster import KMeans
%matplotlib inline
from sklearn.metrics import silhouette_score
from sklearn import preprocessing
from sklearn import metrics

In [60]:
df = pd.read_csv('ppplayers_without_nan.csv')

In [61]:
df['player'] = df['player'].factorize(sort=True)[0]
df['club'] = df['club'].factorize(sort=True)[0]
df['leagueCountry'] = df['leagueCountry'].factorize(sort=True)[0]
df['position'] = df['position'].factorize(sort=True)[0]
df['skintone'] = df['skintone'].apply(lambda x: 0 if x <= 0.5 else 1)

useless_cols = ['playerShort', 'birthday', 'photoID', 'rater1', 'rater2', 'Alpha_3', 'refCount']
df.drop(useless_cols, axis=1, inplace=True)

#### Aggregate data by summing

In [46]:
df_agg = pd.DataFrame()
df_agg['total_yellow'] = df.groupby('player')['yellowCards'].agg('sum')
df_agg['total_yellow_red'] = df.groupby('player')['yellowReds'].agg('sum')
df_agg['total_direct_red'] = df.groupby('player')['redCards'].agg('sum')
df_agg['total_red'] = df.groupby('player')['allreds'].agg('sum')
df_agg['total_game'] = df.groupby('player')['games'].agg('sum')
df_agg['total_win'] = df.groupby('player')['victories'].agg('sum')
df_agg['total_lose'] = df.groupby('player')['defeats'].agg('sum')
df_agg['total_draw'] = df.groupby('player')['ties'].agg('sum')
df_agg['total_goal'] = df.groupby('player')['goals'].agg('sum')

In [47]:
df_agg['league'] = df.groupby('player')['leagueCountry'].apply(lambda x: mode(x)[0][0])
df_agg['club'] = df.groupby('player')['club'].apply(lambda x: mode(x)[0][0])
df_agg['height'] = df.groupby('player')['height'].apply(lambda x: mode(x)[0][0])
df_agg['weight'] = df.groupby('player')['weight'].apply(lambda x: mode(x)[0][0])
df_agg['position'] = df.groupby('player')['position'].apply(lambda x: mode(x)[0][0])
#df_agg['birthyear'] = df.groupby('player')['birthyear'].apply(lambda x: mode(x)[0][0])
df_agg['skintone'] = df.groupby('player')['skintone'].apply(lambda x: mode(x)[0][0])

#### Aggregate by taking mean values

In [64]:
df_mean = pd.DataFrame()
df_mean['mean_yellow'] = df.groupby('player')['yellowCards'].agg('mean')
df_mean['mean_yellow_red'] = df.groupby('player')['yellowReds'].agg('mean')
df_mean['mean_direct_red'] = df.groupby('player')['redCards'].agg('mean')
df_mean['mean_red'] = df.groupby('player')['allreds'].agg('mean')
#df_mean['total_game'] = df.groupby('player')['games'].agg('mean')
df_mean['mean_win'] = df.groupby('player')['victories'].agg('mean')
df_mean['mean_lose'] = df.groupby('player')['defeats'].agg('mean')
df_mean['mean_draw'] = df.groupby('player')['ties'].agg('mean')
df_mean['mean_goal'] = df.groupby('player')['goals'].agg('mean')

In [65]:
df_mean['league'] = df_agg['league']
df_mean['club'] = df_agg['club']
df_mean['height'] = df_agg['height']
df_mean['weight'] = df_agg['weight']
df_mean['position'] = df_agg['position']
#df_agg['birthyear'] = df.groupby('player')['birthyear'].apply(lambda x: mode(x)[0][0])
df_mean['skintone'] = df_agg['skintone']

In [131]:
df_mean.describe()

Unnamed: 0,mean_yellow,mean_yellow_red,mean_direct_red,mean_red,mean_win,mean_lose,mean_draw,mean_goal,league,club,height,weight,position,skintone
count,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0
mean,0.39664,0.012182,0.012882,0.025064,1.241414,0.946435,0.700037,0.309289,1.599873,46.96374,182.059796,76.053435,4.804707,0.169847
std,0.284908,0.017692,0.017862,0.028073,0.529053,0.433371,0.316185,0.365928,1.128114,27.754068,6.748476,7.070506,3.434858,0.375618
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,161.0,54.0,0.0,0.0
25%,0.201163,0.0,0.0,0.0,0.845979,0.642857,0.487649,0.0636,1.0,25.0,178.0,71.0,2.0,0.0
50%,0.333333,0.0,0.007874,0.017621,1.207222,0.86327,0.654859,0.186047,2.0,45.5,182.0,76.0,5.0,0.0
75%,0.527237,0.018576,0.019608,0.036397,1.573558,1.163265,0.878393,0.429915,3.0,70.0,187.0,81.0,7.0,0.0
max,2.482759,0.132075,0.166667,0.24359,3.657343,3.45098,2.27451,3.299145,3.0,96.0,203.0,100.0,12.0,1.0


In [132]:
df_agg.describe()

Unnamed: 0,total_yellow,total_yellow_red,total_direct_red,total_red,total_game,total_win,total_lose,total_draw,total_goal,league,club,height,weight,position,skintone
count,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0
mean,31.632952,0.956107,1.010814,1.966921,237.201018,106.385496,74.166667,56.648855,28.001908,1.599873,46.96374,182.059796,76.053435,4.804707,0.169847
std,25.355665,1.39882,1.34709,2.28106,143.977858,78.043527,41.849328,34.415758,41.726421,1.128114,27.754068,6.748476,7.070506,3.434858,0.375618
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,161.0,54.0,0.0,0.0
25%,13.0,0.0,0.0,0.0,131.0,52.75,44.0,31.0,4.0,1.0,25.0,178.0,71.0,2.0,0.0
50%,26.0,0.0,1.0,1.0,215.0,85.5,69.0,52.0,13.0,2.0,45.5,182.0,76.0,5.0,0.0
75%,43.0,1.0,2.0,3.0,324.0,140.25,98.0,78.0,35.0,3.0,70.0,187.0,81.0,7.0,0.0
max,197.0,12.0,13.0,19.0,945.0,527.0,309.0,234.0,420.0,3.0,96.0,203.0,100.0,12.0,1.0


In [None]:
#### 

In [81]:
kmeanss = KMeans(n_clusters=2,init='k-means++').fit(df_agg.iloc[:,:-1])
labelss = kmeanss.labels_
a=metrics.silhouette_score(df_agg.iloc[:,:-1],labels,metric='euclidean')
kmeanss = KMeans(n_clusters=2,init='k-means++').fit(df_agg)
labelss = kmeanss.labels_
b=metrics.silhouette_score(df_agg,labels,metric='euclidean')

0.52566382123176392

In [79]:
kmeansm = KMeans(n_clusters=2,init='k-means++').fit(df_mean.iloc[:,:-1])
labelsm = kmeansm.labels_
metrics.silhouette_score(df_mean.iloc[:,:-1],labelsm,metric='euclidean')

0.5185165756638056

Standardization of datasets is a common requirement for many machine learning estimators. Standardize the data to get better clustering. 

In [83]:
scaled_agg = preprocessing.scale(df_agg)
kmeanssa = KMeans(n_clusters=2,init='k-means++').fit(scaled_agg)
labelssa = kmeanssa.labels_
metrics.silhouette_score(scaled_agg,labelssa,metric='euclidean')

0.24810979469353819

In [118]:
mdata=df_mean.iloc[:,7:10]
scaled_mean = preprocessing.normalize(mdata)
kmeanssm = KMeans(n_clusters=2,init='k-means++').fit(scaled_mean)
labelssm = kmeanssm.labels_
metrics.silhouette_score(scaled_mean,labelssm,metric='euclidean')

0.91157380653552711

In [130]:
adata=df_agg.iloc[:,5:14]
scaled_agg = preprocessing.normalize(adata)
kmeanssa = KMeans(n_clusters=2,init='k-means++').fit(scaled_agg)
labelssa = kmeanssa.labels_
metrics.silhouette_score(scaled_agg,labelssa,metric='euclidean')

0.3706553284466193

In [121]:
df_mean.columns[13]

'skintone'

In [151]:
df_agg.columns[0:1]

Index(['total_yellow'], dtype='object')

### Iteratively drop features and see the results

In [164]:
silh_results=np.zeros(df_mean.shape[1])
for i in range(2,14):
    Sdd=preprocessing.normalize(df_mean.iloc[:,:i])
    kmeansp = KMeans(n_clusters=2).fit(Sdd)
    labelsp = kmeansp.labels_
    silh_results[i]=metrics.silhouette_score(Sdd,labelsp,metric='euclidean')

In [165]:
import operator
max_index,max_value=max(enumerate(silh_results),key=operator.itemgetter(1))
print(max_index)
print(max_value)

2
0.961475774269


In [166]:
silh_results

array([ 0.        ,  0.        ,  0.96147577,  0.9190977 ,  0.88896111,
        0.53073336,  0.38558711,  0.34042811,  0.25590232,  0.54316036,
        0.88270567,  0.62110438,  0.58900474,  0.57304312])

### Now iteratively use different set of features and check the results

In [143]:
results=np.zeros((df_agg.shape[1]-1,df_agg.shape[1]-1))
for i in range(1,14):
    for j in range(i+1,15):
        for k in range(i,j):# feature with index k will not be used
            Sdd=preprocessing.scale(df_agg.iloc[:,i:j])
            kmeansp = KMeans(n_clusters=2).fit(Sdd)
            labelsp = kmeansp.labels_
            results[i-1][j-1]=metrics.silhouette_score(Sdd,labelsp,metric='euclidean')
        
        

In [161]:
results=np.zeros((df_agg.shape[1]-1,df_agg.shape[1]-1))
for i in range(1,14):
    for j in range(i+1,15):
        Sdd=preprocessing.normalize(df_agg.iloc[:,i:j])
        kmeansp = KMeans(n_clusters=2).fit(Sdd)
        labelsp = kmeansp.labels_
        results[i-1][j-1]=metrics.silhouette_score(Sdd,labelsp,metric='euclidean')

ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

In [48]:
kmeans = KMeans(n_clusters=2,init='k-means++').fit(df_agg.iloc[:,:-1])
labels = kmeans.labels_

In [50]:
labels = kmeans.labels_
metrics.silhouette_score(df_agg.iloc[:,:-1],labels,metric='euclidean')

0.52566749884053066

In [51]:
(df_agg['skintone']==labels).sum()

951

In [54]:
#preprocess and apply
Sdd=preprocessing.scale(df_agg.iloc[:,:-1])
kmeansp = KMeans(n_clusters=2).fit(Sdd)
labelsp = kmeansp.labels_
metrics.silhouette_score(Sdd,labelsp,metric='euclidean')

0.26359741572503315

In [56]:
df_agg.columns

Index(['total_yellow', 'total_yellow_red', 'total_direct_red', 'total_red',
       'total_game', 'total_win', 'total_lose', 'total_draw', 'total_goal',
       'league', 'club', 'height', 'weight', 'position', 'skintone'],
      dtype='object')

In [57]:
df.head(3)

Unnamed: 0,player,club,leagueCountry,height,weight,position,games,victories,ties,defeats,...,refNum,refCountry,meanIAT,nIAT,seIAT,meanExp,nExp,seExp,allreds,skintone
0,1046,70,3,177.0,72.0,1,1,0,0,1,...,1,1,0.326391,712.0,0.000564,0.396,750.0,0.002696,0,0
1,919,51,1,179.0,82.0,12,1,0,0,1,...,2,2,0.203375,40.0,0.010875,-0.204082,49.0,0.061504,0,1
2,392,34,0,182.0,71.0,2,1,0,0,1,...,4,4,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0,0
