In [1]:
#===============================================================================
# Import libraries
#===============================================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import mode
from sklearn.cluster import KMeans
%matplotlib inline
from sklearn.metrics import silhouette_score,accuracy_score
from sklearn import preprocessing
from sklearn import metrics
import operator

In [2]:
df = pd.read_csv('ppplayers_without_nan.csv') # this is the data cleaned in previous step

In [3]:
#delete useless columns that arent useful for clustering
useless_cols = ['playerShort', 'birthday', 'photoID', 'rater1', 'rater2', 'Alpha_3', 'refCount']
df.drop(useless_cols, axis=1, inplace=True)

# Encode input values as an enumerated type or categorical variable for further processing
df['player'] = df['player'].factorize(sort=True)[0]
df['club'] = df['club'].factorize(sort=True)[0]
df['leagueCountry'] = df['leagueCountry'].factorize(sort=True)[0]
df['position'] = df['position'].factorize(sort=True)[0]
df['skintone'] = df['skintone'].apply(lambda x: 0 if x <= 0.5 else 1)

Two possible ways such as summing or taking a mean value of numerical data(total games,win,red cards,etc.) of corresponding players would be reasonable for preprocessing data to make it ready for clustering step.

#### Aggregate data by summing and save it in df_agg for clustering

In [4]:
df_agg = pd.DataFrame()
df_agg['total_yellow'] = df.groupby('player')['yellowCards'].agg('sum')
df_agg['total_yellow_red'] = df.groupby('player')['yellowReds'].agg('sum')
df_agg['total_direct_red'] = df.groupby('player')['redCards'].agg('sum')
df_agg['total_red'] = df.groupby('player')['allreds'].agg('sum')
df_agg['total_game'] = df.groupby('player')['games'].agg('sum')
df_agg['total_win'] = df.groupby('player')['victories'].agg('sum')
df_agg['total_lose'] = df.groupby('player')['defeats'].agg('sum')
df_agg['total_draw'] = df.groupby('player')['ties'].agg('sum')
df_agg['total_goal'] = df.groupby('player')['goals'].agg('sum')

In [5]:
df_agg['league'] = df.groupby('player')['leagueCountry'].apply(lambda x: mode(x)[0][0])
df_agg['club'] = df.groupby('player')['club'].apply(lambda x: mode(x)[0][0])
df_agg['height'] = df.groupby('player')['height'].apply(lambda x: mode(x)[0][0])
df_agg['weight'] = df.groupby('player')['weight'].apply(lambda x: mode(x)[0][0])
df_agg['position'] = df.groupby('player')['position'].apply(lambda x: mode(x)[0][0])
#df_agg['birthyear'] = df.groupby('player')['birthyear'].apply(lambda x: mode(x)[0][0])
df_agg['skintone'] = df.groupby('player')['skintone'].apply(lambda x: mode(x)[0][0])

#### Aggregate by taking mean values

In [6]:
df_mean = pd.DataFrame()
df_mean['mean_yellow'] = df.groupby('player')['yellowCards'].agg('mean')
df_mean['mean_yellow_red'] = df.groupby('player')['yellowReds'].agg('mean')
df_mean['mean_direct_red'] = df.groupby('player')['redCards'].agg('mean')
df_mean['mean_red'] = df.groupby('player')['allreds'].agg('mean')
#df_mean['total_game'] = df.groupby('player')['games'].agg('mean')
df_mean['mean_win'] = df.groupby('player')['victories'].agg('mean')
df_mean['mean_lose'] = df.groupby('player')['defeats'].agg('mean')
df_mean['mean_draw'] = df.groupby('player')['ties'].agg('mean')
df_mean['mean_goal'] = df.groupby('player')['goals'].agg('mean')

In [7]:
df_mean['league'] = df_agg['league']
df_mean['club'] = df_agg['club']
df_mean['height'] = df_agg['height']
df_mean['weight'] = df_agg['weight']
df_mean['position'] = df_agg['position']
#df_agg['birthyear'] = df.groupby('player')['birthyear'].apply(lambda x: mode(x)[0][0])
df_mean['skintone'] = df_agg['skintone']

In [8]:
df_mean.describe()

Unnamed: 0,mean_yellow,mean_yellow_red,mean_direct_red,mean_red,mean_win,mean_lose,mean_draw,mean_goal,league,club,height,weight,position,skintone
count,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0
mean,0.39664,0.012182,0.012882,0.025064,1.241414,0.946435,0.700037,0.309289,1.599873,46.96374,182.059796,76.053435,4.804707,0.169847
std,0.284908,0.017692,0.017862,0.028073,0.529053,0.433371,0.316185,0.365928,1.128114,27.754068,6.748476,7.070506,3.434858,0.375618
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,161.0,54.0,0.0,0.0
25%,0.201163,0.0,0.0,0.0,0.845979,0.642857,0.487649,0.0636,1.0,25.0,178.0,71.0,2.0,0.0
50%,0.333333,0.0,0.007874,0.017621,1.207222,0.86327,0.654859,0.186047,2.0,45.5,182.0,76.0,5.0,0.0
75%,0.527237,0.018576,0.019608,0.036397,1.573558,1.163265,0.878393,0.429915,3.0,70.0,187.0,81.0,7.0,0.0
max,2.482759,0.132075,0.166667,0.24359,3.657343,3.45098,2.27451,3.299145,3.0,96.0,203.0,100.0,12.0,1.0


In [9]:
df_agg.describe()

Unnamed: 0,total_yellow,total_yellow_red,total_direct_red,total_red,total_game,total_win,total_lose,total_draw,total_goal,league,club,height,weight,position,skintone
count,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0,1572.0
mean,31.632952,0.956107,1.010814,1.966921,237.201018,106.385496,74.166667,56.648855,28.001908,1.599873,46.96374,182.059796,76.053435,4.804707,0.169847
std,25.355665,1.39882,1.34709,2.28106,143.977858,78.043527,41.849328,34.415758,41.726421,1.128114,27.754068,6.748476,7.070506,3.434858,0.375618
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,161.0,54.0,0.0,0.0
25%,13.0,0.0,0.0,0.0,131.0,52.75,44.0,31.0,4.0,1.0,25.0,178.0,71.0,2.0,0.0
50%,26.0,0.0,1.0,1.0,215.0,85.5,69.0,52.0,13.0,2.0,45.5,182.0,76.0,5.0,0.0
75%,43.0,1.0,2.0,3.0,324.0,140.25,98.0,78.0,35.0,3.0,70.0,187.0,81.0,7.0,0.0
max,197.0,12.0,13.0,19.0,945.0,527.0,309.0,234.0,420.0,3.0,96.0,203.0,100.0,12.0,1.0


Best clustering to compare the results to is created by clustering only the skintone. So simply skintone values themselves(which consists of 0's and 1's) are the labels for the player's corresponding cluster that we want to compare later with different feature configurations.


In [10]:
best_cluster_labels=df_mean.iloc[:,df_mean.shape[1]-1]# 'skincolor' column

### Check how related the resulting cluster is to the skincolor at the same time computing the silhoutte score of the result cluster
For checking the accuracy of cluster in terms of its ability to cluster according to the skintone compare it with the 'best labels' defined above.

Firstly, we use aggregated features by mean values (df_mean dataframe) for clustering.

### Iteratively drop features and see the results
Standardization of datasets is a common requirement for many machine learning estimators and in particular for our clustering problem it gives better results. We standardized the data to get better clustering. 

In [11]:
silh_results=np.zeros(df_mean.shape[1])# for silhoutte scores for different feature configuration scenarios
overl_cluster=np.zeros(df_mean.shape[1])# calculate how good players seperated by clusters according to skin color by
#finding the overlapping number of clustering labels
accuracy_res=np.zeros(df_mean.shape[1])#for standard accuracy score measure
for i in range(2,df_mean.shape[1]):
    Sdd=preprocessing.normalize(df_mean.iloc[:,:i])
    kmeansp = KMeans(n_clusters=2).fit(Sdd)
    labelsp = kmeansp.labels_
    overl_cluster[i]=(best_cluster_labels==labelsp).sum()
    accuracy_res[i]=metrics.accuracy_score(best_cluster_labels,labelsp)
    silh_results[i]=metrics.silhouette_score(Sdd,labelsp,metric='euclidean')

We compute accuracy score by comparing the result cluster labels with 'best cluster' labels

In [12]:
accuracy_res

array([ 0.        ,  0.        ,  0.83015267,  0.82888041,  0.82569975,
        0.57251908,  0.53244275,  0.45292621,  0.53307888,  0.36832061,
        0.21310433,  0.45547074,  0.45165394,  0.54834606])

In [13]:
overl_cluster

array([    0.,     0.,  1305.,  1303.,  1298.,   900.,   837.,   712.,
         838.,   579.,   335.,   716.,   710.,   862.])

In [14]:
silh_results

array([ 0.        ,  0.        ,  0.96147577,  0.9190977 ,  0.88896111,
        0.53073336,  0.38558711,  0.34042811,  0.25594822,  0.54316036,
        0.88270567,  0.62110438,  0.58900474,  0.57304312])

For finding 'best' results(silhouette score is high where players with dark and light skin colors belong to different clusters ) we do following:
We multiply results from label overlaps with the corresponding results(with the same feature configuration) of silhoutte scores.
Best combination of features would be when the correspodning product is the highest.

In [15]:
products=np.zeros(df_mean.shape[1])
for i in range(0,df_mean.shape[1]):
    products[i]=accuracy_res[i]*silh_results[i]

In [16]:
products

array([ 0.        ,  0.        ,  0.79817168,  0.76182208,  0.73401496,
        0.30385498,  0.20530306,  0.15418881,  0.13644059,  0.20005715,
        0.1881084 ,  0.28289487,  0.26602631,  0.31422594])

In [17]:
max_index,max_value=max(enumerate(products),key=operator.itemgetter(1))
print(max_index)
print(max_value)

2
0.798171682838


In [18]:
df_mean.iloc[:,:2].head(2)

Unnamed: 0_level_0,mean_yellow,mean_yellow_red
player,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.537037,0.009259
1,0.166667,0.0


As we can see the maximum case is the case where mean yellow data for players are clustered. In this case we get good result for silhoutte results which means data was clustered well, as well as in this case we got players with different skin colors belonged to different classes. This result is sadly consistent with the idea that player with different colors have gotten yellow cards correlated. 

In [19]:
df_mean.iloc[:,:3].head(2)

Unnamed: 0_level_0,mean_yellow,mean_yellow_red,mean_direct_red
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.537037,0.009259,0.027778
1,0.166667,0.0,0.0


As we could see from result 'products' scores using 3 features by adding mean direct red scores also had quite high score of silhoutte and accuracy.

### We do the same procedure with the summed data(df_agg dataframe) instead of previously averaged data

In [20]:
silh_results=np.zeros(df_agg.shape[1])# for silhoutte scores for different feature configuration scenarios
overl_cluster=np.zeros(df_agg.shape[1])# calculate how good players seperated by clusters according to skin color by
#finding the overlapping number of clustering labels
accuracy_res=np.zeros(df_mean.shape[1])#for standard accuracy score measure
for i in range(2,df_mean.shape[1]):
    Sdd=preprocessing.normalize(df_agg.iloc[:,:i])
    kmeansp = KMeans(n_clusters=2).fit(Sdd)
    labelsp = kmeansp.labels_
    overl_cluster[i]=(best_cluster_labels==labelsp).sum()
    accuracy_res[i]=metrics.accuracy_score(best_cluster_labels,labelsp)
    silh_results[i]=metrics.silhouette_score(Sdd,labelsp,metric='euclidean')

In [21]:
products=np.zeros(df_mean.shape[1])
for i in range(0,df_mean.shape[1]):
    products[i]=accuracy_res[i]*silh_results[i]

In [22]:
max_index,max_value=max(enumerate(products),key=operator.itemgetter(1))
print(max_index)
print(max_value)

2
0.798171682842


In [23]:
df_agg.iloc[:,:2].head(2)

Unnamed: 0_level_0,total_yellow,total_yellow_red
player,Unnamed: 1_level_1,Unnamed: 2_level_1
0,58,1
1,19,0


In [24]:
products

array([ 0.        ,  0.        ,  0.79817168,  0.76182208,  0.73401496,
        0.28862501,  0.17267861,  0.20644677,  0.19169418,  0.14599327,
        0.11550396,  0.42712527,  0.28119867,  0.17665136])

Not surprisingly, we get the same result as above showing that clustering according to total yellow cards will yield in a cluster where players with different skincolors yield in different clusters. Again here also by adding third feature mean direct red cards we get approximately the same results. So,mainly features consisting of red and yellow cards do better clustering of data and result in players with dark and light skin colors belong to different clusters.