In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the player table from a pickle stored relative to the notebook directory.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
df = pd.read_pickle('../data/player_stats.pkl')

In [3]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,Player,G,GS,MP,FG,FGA,FG%,2P,2PA,2P%,...,TOV,PF,PTS,ORtg,DRtg,Team,Season,Class,Pos,Height
0,Julian Edmonson,30,5.0,568,10.5,26.2,0.401,6.2,15.2,0.411,...,5.6,6.6,34.5,102.3,107.9,abilene-christian,2014,3.0,G,73.0
1,Jacob Lancaster,31,11.0,434,12.0,21.7,0.553,11.8,21.5,0.551,...,5.6,6.7,29.1,102.9,99.3,abilene-christian,2014,3.0,F,82.0
2,Parker Wentz,31,29.0,1040,7.7,15.7,0.491,3.2,5.6,0.576,...,3.4,4.0,24.5,124.7,107.5,abilene-christian,2014,2.0,G,69.0
3,LaDarrien Williams,16,14.0,474,8.1,21.8,0.371,6.4,16.3,0.389,...,4.2,5.2,24.4,101.2,104.1,abilene-christian,2014,3.0,G,72.0
4,Harrison Hawkins,16,16.0,481,6.8,18.1,0.374,3.7,10.4,0.353,...,6.0,5.8,24.1,100.9,108.5,abilene-christian,2014,3.0,G,72.0


## Functions

In [4]:
def guards(df):
    """Return the rows of ``df`` whose 'Pos' column equals 'G'."""
    is_guard = df['Pos'] == 'G'
    return df[is_guard]

In [5]:
def forwards(df):
    """Return the rows of ``df`` whose 'Pos' column equals 'F'."""
    is_forward = df['Pos'] == 'F'
    return df[is_forward]

In [6]:
def centers(df):
    """Return the rows of ``df`` whose 'Pos' column equals 'C'."""
    is_center = df['Pos'] == 'C'
    return df[is_center]

In [7]:
def ID(row):
    """Store a 'Team,Season' key under ``row['ID']`` and return the row.

    ``row`` is modified in place. 'Season' is passed through ``str`` before
    being joined to 'Team' with a comma.
    """
    team, season = row['Team'], str(row['Season'])
    row['ID'] = ",".join((team, season))
    return row

## Roster Counts

In [8]:
# Number of rows per position in the full table.
df.Pos.value_counts()

G    13323
F     8508
C     1480
Name: Pos, dtype: int64

In [9]:
# Same count after dropping every row that has a missing value in any column.
df.dropna().Pos.value_counts()

G    11381
F     5599
C      489
Name: Pos, dtype: int64

## Reduced DataFrame

In [10]:
# Columns kept for each position group: identifying columns plus the stats
# used as clustering features.
c_reduced_df_cols = [
    'Player', 'MP', '2P', '3P', 'TRB', 'AST', 'STL',
    'BLK', 'TOV', 'PTS', 'Team', 'Season', 'Pos', 'Height',
]
f_reduced_df_cols = [
    'Player', 'MP', '2P', '2PA', '3P', '3PA', 'TRB', 'AST', 'STL',
    'BLK', 'TOV', 'PTS', 'Team', 'Season', 'Pos',
]
g_reduced_df_cols = [
    'Player', 'MP', '3P', 'AST', 'STL', 'TOV',
    'PTS', 'TRB', 'Team', 'Season', 'Pos',
]

# Column subsets of the full table; these still contain every position.
c_df_r = df[c_reduced_df_cols]  # centers
f_df_r = df[f_reduced_df_cols]  # forwards
g_df_r = df[g_reduced_df_cols]  # guards

## Deal with NaNs

In [11]:
# Remove rows with any missing value, then keep only the matching position
# in each reduced frame.
c_df_r = c_df_r.dropna().pipe(centers)
f_df_r = f_df_r.dropna().pipe(forwards)
g_df_r = g_df_r.dropna().pipe(guards)

## Vectorize dfs

In [12]:
def cluster_matrix(df):
    """Split a position frame into a feature matrix and label arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Player', 'MP', 'Team', 'Season' and 'Pos';
        every other column is treated as a feature.

    Returns
    -------
    X : numpy.ndarray
        Values of all columns except 'Player', 'MP', 'Team', 'Season', 'Pos'.
    player : numpy.ndarray
        Values of the 'Player' column.
    position : numpy.ndarray
        Values of the 'Pos' column.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and is available in old and new pandas alike.
    player = df['Player'].values
    position = df['Pos'].values
    X = df.drop(['Player', 'MP', 'Team', 'Season', 'Pos'], axis=1).values
    return X, player, position

In [13]:
# For each position frame: feature matrix, player names and positions,
# as returned by cluster_matrix.
X_c, player_c, position_c = cluster_matrix(c_df_r)
X_f, player_f, position_f = cluster_matrix(f_df_r)
X_g, player_g, position_g = cluster_matrix(g_df_r)

In [14]:
# Names of the feature columns behind X_c / X_f / X_g: every column except
# the identifying ones, taken from the column index without copying the frame.
X_c_cols, X_f_cols, X_g_cols = (
    frame.columns.drop(['Player', 'MP', 'Team', 'Season', 'Pos']).tolist()
    for frame in (c_df_r, f_df_r, g_df_r)
)

## Standardize

In [15]:
# Standardize each feature matrix. The single scaler is refit on every
# matrix in turn, so afterwards `scale` holds the fit for the guards.
scale = StandardScaler()
X_c, X_f, X_g = (scale.fit_transform(matrix) for matrix in (X_c, X_f, X_g))

## Cluster

In [16]:
# Ks = range(2, 10, 1)
# km = [KMeans(n_clusters=i) for i in Ks]
# sil_score_c = [silhouette_score(X_c, km[i].fit(X_c).labels_) for i in range(len(km))]
# sil_score_f = [silhouette_score(X_f, km[i].fit(X_f).labels_) for i in range(len(km))]
# sil_score_g = [silhouette_score(X_g, km[i].fit(X_g).labels_) for i in range(len(km))]

In [17]:
# fig = plt.figure(figsize=(15, 7))
# plt.plot(Ks, sil_score_c)
# plt.plot(Ks, sil_score_f)
# plt.plot(Ks, sil_score_g)
# plt.legend(['Centers', 'Forwards', 'Guards'], loc='best')
# plt.grid(True)
# plt.title('Silhouette Score Plot');

In [18]:
# Fit a 3-cluster k-means model on the standardized center features.
# precompute_distances, n_jobs and algorithm='auto' are no longer accepted by
# current scikit-learn, so they are left at the library defaults. A fixed
# random_state keeps the cluster numbering stable across Restart & Run All.
ckmeans = KMeans(n_clusters=3, init='k-means++', n_init=20, max_iter=500, tol=0.0001,
                 verbose=0, random_state=42, copy_x=True)
ckmeans.fit(X_c);

In [19]:
# Fit a 3-cluster k-means model on the standardized forward features.
# precompute_distances, n_jobs and algorithm='auto' are no longer accepted by
# current scikit-learn, so they are left at the library defaults. A fixed
# random_state keeps the cluster numbering stable across Restart & Run All.
fkmeans = KMeans(n_clusters=3, init='k-means++', n_init=20, max_iter=500, tol=0.0001,
                 verbose=0, random_state=42, copy_x=True)
fkmeans.fit(X_f);

In [20]:
# Fit a 4-cluster k-means model on the standardized guard features.
# precompute_distances, n_jobs and algorithm='auto' are no longer accepted by
# current scikit-learn, so they are left at the library defaults. A fixed
# random_state keeps the cluster numbering stable across Restart & Run All.
gkmeans = KMeans(n_clusters=4, init='k-means++', n_init=20, max_iter=500, tol=0.0001,
                 verbose=0, random_state=42, copy_x=True)
gkmeans.fit(X_g);

In [21]:
# Cluster label of every row, taken from the three fitted models.
cclusters, fclusters, gclusters = ckmeans.labels_, fkmeans.labels_, gkmeans.labels_

In [22]:
# center_labels_dict = {0: 'All Team Centers', 1: 'B Team Centers', 2: 'Shooting Centers'}
# forward_labels_dict = {0: 'Deep Forwards', 1: 'Versatile Forwards', 2: 'Supporting Forwards'}
# guard_labels_dict = {0: 'Supporting Guards', 1: 'B Team Guards', 2: 'All Team Guards', 3: 'Benchwarmer Guards'}

## Recompose player df with clusters

In [23]:
# Attach the k-means labels as a 'Cluster' column. assign() returns a new
# frame, so c_df_r stays untouched. A plain `centers_df = c_df_r` only aliases
# it: the column assignment then writes into a filtered slice
# (SettingWithCopyWarning) and 'Cluster' would become a feature if the
# vectorize cells were re-run.
centers_df = c_df_r.assign(Cluster=cclusters)
# centers_df['Cluster_names'] = centers_df['Cluster'].map(center_labels_dict)
centers_df.head()

Unnamed: 0,Player,MP,2P,3P,TRB,AST,STL,BLK,TOV,PTS,Team,Season,Pos,Height,Cluster
13,Zach Moer,52,8.2,0.0,9.4,5.9,1.2,4.7,5.9,24.7,air-force,2014,C,83.0,1
19,Darrius Parker,238,5.7,0.5,9.8,3.3,1.0,1.5,3.1,17.7,air-force,2014,C,79.0,0
36,Pat Forsythe,595,7.2,0.0,11.6,1.7,1.4,3.4,4.8,17.2,akron,2014,C,83.0,1
37,Isaiah Johnson,414,6.8,0.0,12.7,1.8,1.2,4.0,2.8,17.2,akron,2014,C,82.0,0
49,Justin Banks,408,6.6,0.0,17.7,0.9,1.0,4.4,3.0,17.7,alabama-am,2014,C,85.0,1


In [24]:
# Per-cluster means of the center stats, rounded to whole numbers.
# The columns are selected with a list: bare comma-separated names (a tuple)
# on a GroupBy were deprecated in pandas 1.0 and raise in pandas 2.0.
centers_df.groupby('Cluster')[['MP', '2P', '3P', 'AST', 'STL', 'TOV', 'PTS', 'TRB', 'BLK']].mean().round()

Unnamed: 0_level_0,MP,2P,3P,AST,STL,TOV,PTS,TRB,BLK
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,271.0,4.0,0.0,1.0,1.0,4.0,11.0,12.0,3.0
1,547.0,9.0,0.0,2.0,1.0,4.0,24.0,16.0,3.0
2,489.0,5.0,3.0,3.0,1.0,3.0,22.0,11.0,2.0


In [25]:
# Attach the k-means labels as a 'Cluster' column. assign() returns a new
# frame, so f_df_r stays untouched. A plain `forwards_df = f_df_r` only
# aliases it: the column assignment then writes into a filtered slice
# (SettingWithCopyWarning) and 'Cluster' would become a feature if the
# vectorize cells were re-run.
forwards_df = f_df_r.assign(Cluster=fclusters)
# forwards_df['Cluster'] = forwards_df['Cluster'].map(forward_labels_dict)
forwards_df.head()

Unnamed: 0,Player,MP,2P,2PA,3P,3PA,TRB,AST,STL,BLK,TOV,PTS,Team,Season,Pos,Cluster
1,Jacob Lancaster,434,11.8,21.5,0.1,0.1,19.6,1.0,1.0,5.0,5.6,29.1,abilene-christian,2014,F,0
5,James Pegues,445,7.2,15.8,0.1,0.8,13.4,2.1,2.9,0.4,6.1,20.9,abilene-christian,2014,F,0
7,Alexsander Milosavljevic,127,7.9,14.9,0.0,0.5,15.4,1.4,0.0,1.4,4.7,17.7,abilene-christian,2014,F,0
8,Austin Cooke,914,2.1,4.3,3.0,7.5,11.4,3.9,1.9,0.3,1.9,15.8,abilene-christian,2014,F,2
10,Riley Payne,584,3.9,8.3,0.0,0.0,7.9,2.4,1.8,0.3,3.3,11.1,abilene-christian,2014,F,1


In [26]:
# Per-cluster means of the forward stats, rounded to whole numbers.
# The columns are selected with a list: bare comma-separated names (a tuple)
# on a GroupBy were deprecated in pandas 1.0 and raise in pandas 2.0.
forwards_df.groupby('Cluster')[['MP', '2P', '2PA', '3P', '3PA', 'AST', 'STL', 'TOV', 'PTS', 'TRB']].mean().round()

Unnamed: 0_level_0,MP,2P,2PA,3P,3PA,AST,STL,TOV,PTS,TRB
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,591.0,9.0,16.0,0.0,2.0,2.0,2.0,4.0,24.0,15.0
1,305.0,4.0,8.0,0.0,1.0,2.0,1.0,3.0,11.0,12.0
2,534.0,4.0,8.0,3.0,10.0,3.0,1.0,3.0,20.0,10.0


In [27]:
# Attach the k-means labels as a 'Cluster' column. assign() returns a new
# frame, so g_df_r stays untouched. A plain `guards_df = g_df_r` only aliases
# it: the column assignment then writes into a filtered slice
# (SettingWithCopyWarning) and 'Cluster' would become a feature if the
# vectorize cells were re-run.
guards_df = g_df_r.assign(Cluster=gclusters)
# guards_df['Cluster'] = guards_df['Cluster'].map(guard_labels_dict)
guards_df.head()

Unnamed: 0,Player,MP,3P,AST,STL,TOV,PTS,TRB,Team,Season,Pos,Cluster
0,Julian Edmonson,568,4.3,3.0,2.3,5.6,34.5,5.7,abilene-christian,2014,G,2
2,Parker Wentz,1040,4.5,4.9,2.8,3.4,24.5,3.9,abilene-christian,2014,G,2
3,LaDarrien Williams,474,1.7,7.7,2.4,4.2,24.4,10.5,abilene-christian,2014,G,3
4,Harrison Hawkins,481,3.1,6.3,2.1,6.0,24.1,5.0,abilene-christian,2014,G,3
6,Rafael Farley,421,4.6,1.7,0.6,2.5,18.7,4.9,abilene-christian,2014,G,2


In [28]:
# Per-cluster means of the guard stats, rounded to whole numbers.
# The columns are selected with a list: bare comma-separated names (a tuple)
# on a GroupBy were deprecated in pandas 1.0 and raise in pandas 2.0.
guards_df.groupby('Cluster')[['MP', '3P', 'AST', 'STL', 'TOV', 'PTS', 'TRB']].mean().round()

Unnamed: 0_level_0,MP,3P,AST,STL,TOV,PTS,TRB
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,331.0,1.0,3.0,2.0,3.0,12.0,8.0
1,3.0,0.0,6.0,33.0,6.0,22.0,8.0
2,657.0,4.0,3.0,2.0,3.0,25.0,7.0
3,614.0,2.0,8.0,2.0,5.0,17.0,7.0


In [29]:
# Number of guards assigned to each cluster.
# NOTE(review): in the saved output one cluster holds only 22 players who
# average about 3 minutes played — presumably low-minute outliers; consider a
# minimum-MP filter before clustering.
guards_df.groupby('Cluster')['Player'].count()

Cluster
0    4923
1      22
2    4838
3    3486
Name: Player, dtype: int64

## Team df

In [30]:
# Reduce each clustered position frame to the columns needed for the
# team-level table; c, f and g are the center, forward and guard subsets.
positions = [
    frame[['Player', 'MP', 'Team', 'Season', 'Cluster', 'Pos']]
    for frame in (centers_df, forwards_df, guards_df)
]
c, f, g = positions

In [31]:
# Stack the center, forward and guard subsets into one long player table.
teams = pd.concat(positions)

In [32]:
def pos_cluster(row):
    """Store position + cluster number (e.g. 'G2') under ``row['pos_cluster']``.

    ``row`` is modified in place and returned; 'Cluster' is converted with
    ``str`` before being appended to 'Pos'.
    """
    position, cluster = row['Pos'], row['Cluster']
    row['pos_cluster'] = position + str(cluster)
    return row

In [33]:
# Label every player with position + cluster number (e.g. 'G2').
# Built with one vectorized string concatenation instead of a Python-level
# call per row; the resulting 'pos_cluster' values are the same.
teams = teams.assign(pos_cluster=teams['Pos'] + teams['Cluster'].astype(str))

In [34]:
# Total minutes played per (Team, Season) in each position cluster;
# combinations with no players are filled with 0.
piv = teams.pivot_table(
    index=['Team', 'Season'],
    columns=['pos_cluster'],
    values='MP',
    aggfunc='sum',
    fill_value=0,
)

In [35]:
# Check which type pivot_table returned.
type(piv)

pandas.core.frame.DataFrame

In [36]:
 # piv = pd.DataFrame(piv)

In [37]:
# One column per position cluster.
piv.columns

Index(['C0', 'C1', 'C2', 'F0', 'F1', 'F2', 'G0', 'G1', 'G2', 'G3'], dtype='object', name='pos_cluster')

In [38]:
# Preview the pivot while Team and Season are still index levels.
piv.head()

Unnamed: 0_level_0,pos_cluster,C0,C1,C2,F0,F1,F2,G0,G1,G2,G3
Team,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
abilene-christian,2014,0,0,0,1006,584,914,642,0,2029,1100
abilene-christian,2015,0,0,0,689,526,902,668,0,2382,1033
abilene-christian,2016,0,0,0,1642,0,825,605,0,918,2260
abilene-christian,2017,0,0,563,451,0,413,2009,0,827,1637
abilene-christian,2018,0,105,714,326,0,300,2911,0,8,2086


In [39]:
# Turn the Team / Season index levels into ordinary columns.
# Re-running this cell on the already reset frame would insert an extra
# 'index' column and shift the positional slices used further down.
piv = piv.reset_index()

In [40]:
# Preview the pivot with Team and Season as columns.
piv.head()

pos_cluster,Team,Season,C0,C1,C2,F0,F1,F2,G0,G1,G2,G3
0,abilene-christian,2014,0,0,0,1006,584,914,642,0,2029,1100
1,abilene-christian,2015,0,0,0,689,526,902,668,0,2382,1033
2,abilene-christian,2016,0,0,0,1642,0,825,605,0,918,2260
3,abilene-christian,2017,0,0,563,451,0,413,2009,0,827,1637
4,abilene-christian,2018,0,105,714,326,0,300,2911,0,8,2086


In [41]:
# Convert each team-season's minutes per position cluster into shares of the
# row total. DataFrame.as_matrix() was removed in pandas 1.0, so .values is used.
team_and_season = piv.iloc[:, :2].values
clusts = piv.iloc[:, 2:].values
cols = piv.columns.tolist()
mp = clusts.sum(axis=1)
# A row whose total is 0 would produce NaN here.
clustsnorm = clusts / mp.reshape(-1, 1)
# Join the key columns and the shares column-wise as DataFrames. np.hstack of
# the string keys with the float shares yields an object array, which would
# leave every share column with object dtype.
clusters_df = pd.concat(
    [piv.iloc[:, :2],
     pd.DataFrame(clustsnorm, columns=cols[2:], index=piv.index)],
    axis=1,
)

In [57]:
# Preview the per-team cluster shares.
clusters_df.head()

Unnamed: 0,Team,Season,C0,C1,C2,F0,F1,F2,G0,G1,G2,G3
0,abilene-christian,2014,0,0.0,0.0,0.160319,0.0930677,0.145657,0.102311,0,0.323347,0.175299
1,abilene-christian,2015,0,0.0,0.0,0.111129,0.0848387,0.145484,0.107742,0,0.384194,0.166613
2,abilene-christian,2016,0,0.0,0.0,0.26272,0.0,0.132,0.0968,0,0.14688,0.3616
3,abilene-christian,2017,0,0.0,0.0954237,0.0764407,0.0,0.07,0.340508,0,0.140169,0.277458
4,abilene-christian,2018,0,0.0162791,0.110698,0.0505426,0.0,0.0465116,0.451318,0,0.00124031,0.323411


## Team Experience Feature

In [65]:
# Per-player rows with team, season, class and minutes played, taken from the
# full table (one row per player, not per team).
class_df = df[['Team', 'Season', 'Class', 'MP']]
class_df.head()

Unnamed: 0,Team,Season,Class,MP
0,abilene-christian,2014,3.0,568
1,abilene-christian,2014,3.0,434
2,abilene-christian,2014,2.0,1040
3,abilene-christian,2014,3.0,474
4,abilene-christian,2014,3.0,481


In [98]:
# Total minutes played per (Team, Season) for each Class value, with Team and
# Season moved back to ordinary columns.
# NOTE(review): this rebinds `piv`, replacing the position-cluster pivot that
# was stored under the same name earlier in the notebook.
piv = class_df.pivot_table(
    index=['Team', 'Season'],
    columns=['Class'],
    values='MP',
    aggfunc='sum',
    fill_value=0,
).reset_index()

In [123]:
# Split the class pivot into its key columns and the minutes-per-class matrix.
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same array.
team_and_season = piv.iloc[:, :2].values
classes = piv.iloc[:, 2:].values
cols = piv.columns.tolist()


In [125]:
# Experience factor: the Class values weighted by each class's share of the
# team-season's minutes.
# This cell shortens `cols`, so running it twice in a row changes `cl`.
mp = classes.sum(axis=1)
classesnorm = classes / mp[:, np.newaxis]
cl = np.array(cols[2:])              # Class values taken from the pivot's column labels
cols = cols[:2] + ['exp_factor']     # key column names plus the new feature name
exp = cl * classesnorm
expfactor = exp.sum(axis=1)

In [126]:
# Build the key columns first and add the factor as its own column.
# Stacking the string keys and the floats with np.hstack yields an object
# array, which would leave exp_factor with object dtype.
# infer_objects() restores a numeric dtype for the key columns where possible.
classes_df = pd.DataFrame(team_and_season, columns=cols[:-1]).infer_objects()
classes_df[cols[-1]] = expfactor

In [127]:
# Preview the per-team experience factor.
classes_df.head()

Unnamed: 0,Team,Season,exp_factor
0,abilene-christian,2014,2.18072
1,abilene-christian,2015,2.76839
2,abilene-christian,2016,2.6672
3,abilene-christian,2017,2.15559
4,abilene-christian,2018,2.97798


## Merge These Mothas!

In [53]:
# Add the 'Team,Season' ID key to every row of piv (row-wise apply of ID).
# NOTE(review): read top to bottom, piv was last rebound to the Class pivot,
# yet the saved final_df preview shows the C0..G3 cluster columns and this
# cell's execution count is lower than that pivot's. It looks like it was run
# out of order — confirm which frame is meant here.
piv = piv.apply(ID, axis=1)

In [54]:
# Add the 'Team,Season' ID key to class_df, then drop the two columns it was
# built from so that only ID identifies the row.
# Because Team and Season are dropped, this cell cannot be run a second time
# on its own output.
# NOTE(review): as defined in the Team Experience section, class_df has one
# row per player, not per team — confirm that is the frame intended here.
class_df = class_df.apply(ID, axis=1).drop(['Team', 'Season'], axis=1)

In [55]:
# Left-join class_df onto piv using the shared 'ID' key.
# NOTE(review): the saved preview below (cluster-minute columns plus a single
# per-team 'Class' value) does not match what piv and class_df hold when the
# notebook is read top to bottom: piv is the Class pivot and class_df is
# per-player, which would repeat each team row once per player. Presumably
# the merge was run against earlier versions of both frames; verify on a
# fresh Restart & Run All.
final_df = pd.merge(piv, class_df, how='left', on='ID')

In [56]:
# Preview the merged table.
final_df.head()

Unnamed: 0,Team,Season,C0,C1,C2,F0,F1,F2,G0,G1,G2,G3,ID,Class
0,abilene-christian,2014,0,0,0,1006,584,914,642,0,2029,1100,"abilene-christian,2014",2.25
1,abilene-christian,2015,0,0,0,689,526,902,668,0,2382,1033,"abilene-christian,2015",2.36
2,abilene-christian,2016,0,0,0,1642,0,825,605,0,918,2260,"abilene-christian,2016",2.45
3,abilene-christian,2017,0,0,563,451,0,413,2009,0,827,1637,"abilene-christian,2017",2.0
4,abilene-christian,2018,0,105,714,326,0,300,2911,0,8,2086,"abilene-christian,2018",2.57
