In [None]:
# loading packages

import os

import pandas as pd
import numpy as np

# plotting packages
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as clrs


# Kmeans algorithm from scikit-learn
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

In [None]:
# Load the player ranking table used by the shot-based clustering below.
# NOTE(review): the path has no file extension - confirm it matches the dataset file name.
rank_path = '../input/rank-best-goalers/rank_of_players'
raw = pd.read_csv(rank_path)
raw

## Clustering based on shots

In [None]:
# Plot the distribution of each shot column, one histogram per figure.
# Every figure is titled with its column name so each plot can be identified
# on its own (figures are numbered 1..4).
shot_hist_columns = ['Shot<25%', 'Shot 25%-50%', 'Shot 50%-75%', 'Shot>75%']
for fig_number, column in enumerate(shot_hist_columns, start=1):
    plt.figure(fig_number)
    raw[column].plot(kind='hist', title=column, alpha=0.5)

In [None]:
# Feature matrix for clustering: the four shot columns of `raw`.
X = raw.loc[:, ['Shot<25%', 'Shot 25%-50%', 'Shot 50%-75%', 'Shot>75%']]

In [None]:
# Elbow plot: fit KMeans for k = 1..9 on X and plot the inertia of each fit.
# random_state is fixed (same seed as the other seeded KMeans fits in this
# notebook) so the curve is identical on every run.
Ks = range(1, 10)
inertia = [
    KMeans(n_clusters=k_candidate, random_state=1).fit(X).inertia_
    for k_candidate in Ks
]

fig, ax = plt.subplots()
ax.plot(Ks, inertia, '-bo')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia (within-cluster sum of squares)')
plt.show()

In [None]:
# Silhouette analysis: print the mean silhouette score of a seeded KMeans fit
# on X for every k from 2 to 10.
range_n_clusters = list(range(2, 11))
for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters, random_state=1)
    cluster_labels = clusterer.fit(X).labels_
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters=", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

In [None]:
# Fit the 2-cluster model on the shot features (seeded, so the result is repeatable).
k = 2
kmeans = KMeans(n_clusters=k, random_state=1).fit(X)

# Report the inertia of the fit and the cluster centres.
print("inertia for k=2 is", kmeans.inertia_)
print("cluster centers: ", kmeans.cluster_centers_)

# Keep the per-row cluster labels in `y` and take a quick look at them.
y = kmeans.labels_
print("cluster labels: ", y)

In [None]:
# Preview the first five rows of the clustering features.
X.head(n=5)

In [None]:
# Collapse the four shot columns into two: the sum of the two columns below 50%
# and the sum of the two columns above 50%.
raw['Shot<50%'] = raw['Shot<25%'].add(raw['Shot 25%-50%'])
raw['Shot>50%'] = raw['Shot 50%-75%'].add(raw['Shot>75%'])
raw.head()

In [None]:
import plotly.express as px
import random

# Cluster the players into two groups on the four shot columns (X) and plot them
# in the aggregated (Shot<50%, Shot>50%) plane, coloured by cluster.
# random_state=1 is the same seed as the earlier k=2 fit, so the cluster ids
# (and therefore the colours) are the same on every run.
kmeans = KMeans(n_clusters=2, random_state=1)
clusters = kmeans.fit_predict(X)
print(clusters.shape)

# Store the ids as a categorical column so each cluster is treated as a
# discrete group rather than a numeric value.
raw['cluster'] = pd.Series(clusters, index=raw.index).astype('category')

print(X.shape)

fig = px.scatter(
    raw,
    x='Shot<50%',
    y='Shot>50%',
    color='cluster',
    hover_data=['Player'],
)

fig.update_layout(
    height=1000,
    title_text='Players With 2 Clusters for Long and Short Goalers',
)

fig.show()

In [None]:
from sklearn.decomposition import PCA

# Project the shot features onto their first two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X)
pca_df = pd.DataFrame(principalComponents, columns=['pc1', 'pc2'])

# Put the player names next to the components (rows are matched on the index),
# order the rows by pc1, and move the previous index into a regular column.
labels = raw[['Player']]
pca_df = (
    pd.concat([labels, pca_df], axis=1)
    .sort_values(by='pc1', ascending=True)
    .reset_index()
)

pca_df.head(5)

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px
import random
from itertools import cycle
palette = cycle(px.colors.sequential.PuBu)

# Cluster the players in the 2-D PCA space and plot the result.
# The names used here (X_pca, kmeans_pca, pca_clusters) are specific to this
# cell: the shot-feature matrix `X`, the label vector `y` and the fitted
# `kmeans` are read again by the pairwise plots and the cluster_info table
# further down, so they must not be rebound to PCA data here.
X_pca = pca_df[['pc1', 'pc2']].values

# Seeded so that the cluster ids, and with them the colours in the plot,
# are the same on every run.
kmeans_pca = KMeans(n_clusters=2, random_state=1)
pca_clusters = kmeans_pca.fit_predict(X_pca)
print(pca_clusters.shape)
pca_df['cluster'] = pd.Series(pca_clusters, index=pca_df.index).astype('category')

print(X_pca.shape)

fig = px.scatter(
    pca_df,
    x='pc1',
    y='pc2',
    color='cluster',
    hover_data=['Player'],
    text='Player',
)

fig.update_layout(
    height=1000,
    title_text='2D PCA Viz of Players With Two Clusters',
)

# NOTE(review): because `text` is set, the trace mode is presumably
# 'markers+text', in which case this selector matches no trace and the marker
# styling (including size=202) is never applied - confirm the intended
# selector and marker size.
fig.update_traces(
    marker=dict(size=202, line=dict(width=2, color='DarkSlateGrey')),
    selector=dict(mode='markers'),
)

fig.show()

#### Red(1) are long distance shooters and Blue(0) are short distance shooters

In [None]:
# Preview the first five rows of the PCA table.
pca_df.head(n=5)

In [None]:
# Write the PCA table to a CSV file in the working directory.
cluster_csv_path = 'Clustering_Shot_Distance.csv'
pca_df.to_csv(cluster_csv_path)

In [None]:
# Show the row(s) of `raw` for one specific player.
raw.loc[raw['Player'] == 'Cailey Hutchison']

In [None]:
# Heat map of the fitted PCA components: one row per component (PC1, PC2),
# one column per shot feature.
X = raw[['Shot<25%', 'Shot 25%-50%', 'Shot 50%-75%', 'Shot>75%']]
feature_names = X.columns
feature_positions = range(len(feature_names))

plt.matshow(pca.components_, cmap='viridis')
plt.yticks([0, 1], ['PC1', 'PC2'], fontsize=10)
plt.colorbar()
plt.xticks(feature_positions, feature_names, rotation=65, ha='left')
plt.tight_layout()
plt.show()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

figs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
labels = ['Shot<25%','Shot 25%-50%','Shot 50%-75%','Shot>75%']
colors = ['blue','red']
centers = kmeans.cluster_centers_

for i in range(6):
    fig = plt.figure(i, figsize=(8, 8))
    x_1 = figs[i][0]
    x_2 = figs[i][1]
    plt.scatter(X.iloc[:, x_1], X.iloc[:, x_2], c=y, s=0, alpha=0)
    plt.scatter(centers[:, x_1], centers[:, x_2], c='black', s=200, alpha=0.5)
    for j in range(X.shape[0]):
        plt.text(X.iloc[j, x_1], X.iloc[j, x_2], raw['Player'].iloc[j], 
                 color=colors[y[j]], weight='semibold', horizontalalignment = 'center', verticalalignment = 'center')
    plt.xlabel(labels[x_1])
    plt.ylabel(labels[x_2])

plt.show()

In [None]:
# One row per player with its cluster id in `Type`.
# The ids are computed here with the seeded 2-cluster fit on the four shot
# columns of `raw`, instead of being read from the notebook-level `y`, so the
# table is row-aligned with `raw` and does not depend on what `y` is bound to
# when this cell runs.
type_features = raw[['Shot<25%', 'Shot 25%-50%', 'Shot 50%-75%', 'Shot>75%']]
player_types = KMeans(n_clusters=2, random_state=1).fit_predict(type_features)
cluster_info = pd.DataFrame({'Player': raw['Player'], 'Type': player_types})
cluster_info

In [None]:
# Attach the cluster type to the full player table, matching rows on the player name.
fulldata = raw.merge(cluster_info, on='Player')
fulldata

In [None]:
# Rank the Type 0 players by 'Weighted Score', highest first.
# (This notebook treats Type 0 as the long range shooters.)
is_type_0 = fulldata['Type'] == 0
long_range_shooters = fulldata.loc[is_type_0].sort_values('Weighted Score', ascending=False)
long_range_shooters

In [None]:
# Rank the Type 1 players by 'Weighted Score', highest first.
# (This notebook treats Type 1 as the short range shooters.)
is_type_1 = fulldata['Type'] == 1
short_range_shooters = fulldata.loc[is_type_1].sort_values('Weighted Score', ascending=False)
short_range_shooters

# Team style analysis

In [None]:
# Column names, dtypes and non-null counts of the team-level table.
# NOTE(review): `raw2` is not assigned in any cell above this point - it must be
# loaded before this section for the notebook to run on a fresh kernel.
raw2.info()

In [None]:
# Summary statistics of the columns of `raw2`.
raw2.describe()

In [None]:
# team style metrics: team goals, team shots, team skaters (Home/Away)

In [None]:
# Average goals per game for each home team: first reduce the rows of every
# (home team, game) pair to their maximum goal values, then average over games.
goal_columns = ['Home Team Goals', 'Away Team Goals']
hometeam_goal = (
    raw2.groupby(['Home Team', 'game_id'])[goal_columns].max()
    .reset_index()
    .groupby('Home Team')[goal_columns].mean()
    .reset_index()
)
hometeam_goal

In [None]:
# Build one row per team with its average skaters and goals (own and opponents'),
# separately for the rows where it is the home team and where it is the away team.
# Goals are averaged over games (per-game maximum first); skaters are averaged
# directly over all rows of raw2.
team_goal_cols = ['Home Team Goals', 'Away Team Goals']
team_skater_cols = ['Home Team Skaters', 'Away Team Skaters']

# --- as home team ---
hometeam_goal = (
    raw2.groupby(['Home Team', 'game_id'])[team_goal_cols].max()
    .reset_index()
    .groupby('Home Team')[team_goal_cols].mean()
    .reset_index()
)
hometeam_skater = raw2.groupby('Home Team')[team_skater_cols].mean().reset_index()

hometeam = pd.merge(hometeam_skater, hometeam_goal, on='Home Team').rename(columns={
    'Home Team': 'Team',
    'Home Team Skaters': 'As Home Team: Skaters',
    'Away Team Skaters': 'As Home Team: Opponents Skaters',
    'Home Team Goals': 'As Home Team: Goals',
    'Away Team Goals': 'As Home Team: Opponents Goals',
})

# --- as away team ---
awayteam_goal = (
    raw2.groupby(['Away Team', 'game_id'])[team_goal_cols].max()
    .reset_index()
    .groupby('Away Team')[team_goal_cols].mean()
    .reset_index()
)
awayteam_skater = raw2.groupby('Away Team')[team_skater_cols].mean().reset_index()

awayteam = pd.merge(awayteam_skater, awayteam_goal, on='Away Team').rename(columns={
    'Away Team': 'Team',
    'Home Team Skaters': 'As Away Team: Opponents Skaters',
    'Away Team Skaters': 'As Away Team: Skaters',
    'Home Team Goals': 'As Away Team: Opponents Goals',
    'Away Team Goals': 'As Away Team: Goals',
})

# Home and away views side by side, one row per team.
team = pd.merge(hometeam, awayteam, on='Team')
team

In [None]:
# Net scores: the team's own average minus its opponents' average, for skaters
# and goals, as home team and as away team. The columns are added to `team`.
net_score_spec = [
    ('Net Skater Scores (Home)', 'As Home Team: Skaters', 'As Home Team: Opponents Skaters'),
    ('Net Skater Scores (Away)', 'As Away Team: Skaters', 'As Away Team: Opponents Skaters'),
    ('Net Goal Scores (Home)', 'As Home Team: Goals', 'As Home Team: Opponents Goals'),
    ('Net Goal Scores (Away)', 'As Away Team: Goals', 'As Away Team: Opponents Goals'),
]
for net_col, own_col, opponent_col in net_score_spec:
    team[net_col] = team[own_col] - team[opponent_col]

# Keep only the team name and the four net-score columns.
team_lite = team[['Team'] + [spec[0] for spec in net_score_spec]]
team_lite

In [None]:
# Standardise every net-score column: subtract the column mean and divide by
# the column standard deviation (pandas' default, sample std).
# 'Team' is moved into the index only while it is still a column, so running
# this cell a second time does not fail with a KeyError.
if 'Team' in team_lite.columns:
    team_lite = team_lite.set_index('Team')
team_norm = (team_lite - team_lite.mean()) / team_lite.std()
team_norm

In [None]:
# Average skaters (own and opponents') per row of raw2 for every team, pooling
# the rows where it is the home team and where it is the away team:
# (home sum + away sum) / (home row count + away row count).
skater_pair = ['Home Team Skaters', 'Away Team Skaters']
as_home_skaters = {'Home Team Skaters': 'Skaters', 'Away Team Skaters': 'Opponents Skaters'}
as_away_skaters = {'Away Team Skaters': 'Skaters', 'Home Team Skaters': 'Opponents Skaters'}
# For an away team its own column is the second one, so the away frames are
# reordered to (own, opponents) before renaming. This gives all four frames the
# same column layout: Skaters, Opponents Skaters.
away_skater_order = ['Away Team Skaters', 'Home Team Skaters']

homeskaters1 = raw2.groupby('Home Team')[skater_pair].sum().rename(columns=as_home_skaters)
homeskaters2 = raw2.groupby('Home Team')[skater_pair].count().rename(columns=as_home_skaters)

awayskaters1 = (
    raw2.groupby('Away Team')[skater_pair].sum()[away_skater_order]
    .rename(columns=as_away_skaters)
)
awayskaters2 = (
    raw2.groupby('Away Team')[skater_pair].count()[away_skater_order]
    .rename(columns=as_away_skaters)
)

# The frames are aligned on team name and column label; a team missing from
# either the home or the away side ends up as NaN.
skaters = (homeskaters1 + awayskaters1) / (homeskaters2 + awayskaters2)
skaters

In [None]:
# Average goals (own and opponents') per game for every team, pooling home and
# away games: (home goal sum + away goal sum) / (home games + away games).
# Per-game goals are the maximum over the rows of that game.
goal_pair = ['Home Team Goals', 'Away Team Goals']
as_home_goals = {'Home Team Goals': 'Goals', 'Away Team Goals': 'Opponents Goals'}
as_away_goals = {'Away Team Goals': 'Goals', 'Home Team Goals': 'Opponents Goals'}
# Away frames are reordered to (own, opponents) before renaming.
away_goal_order = ['Away Team Goals', 'Home Team Goals']

# Goal totals as home team.
hometeam_goal1 = (
    raw2.groupby(['Home Team', 'game_id'])[goal_pair].max()
    .reset_index()
    .groupby('Home Team')[goal_pair].sum()
    .rename(columns=as_home_goals)
)

# Number of games as home team (count of per-game rows).
hometeam_goal2 = (
    raw2.groupby(['Home Team', 'game_id'])[goal_pair].count()
    .reset_index()
    .groupby('Home Team')[goal_pair].count()
    .rename(columns=as_home_goals)
)

# Goal totals as away team.
awayteam_goal1 = (
    raw2.groupby(['Away Team', 'game_id'])[goal_pair].max()
    .reset_index()
    .groupby('Away Team')[goal_pair].sum()[away_goal_order]
    .rename(columns=as_away_goals)
)

# Number of games as away team.
awayteam_goal2 = (
    raw2.groupby(['Away Team', 'game_id'])[goal_pair].count()
    .reset_index()
    .groupby('Away Team')[goal_pair].count()[away_goal_order]
    .rename(columns=as_away_goals)
)

avg_goal = (hometeam_goal1 + awayteam_goal1) / (hometeam_goal2 + awayteam_goal2)
avg_goal