In [1]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
# Load the CSV export into a DataFrame; the path is resolved relative to the
# notebook's current working directory.
OWU_game = pd.read_csv(filepath_or_buffer="Game 15 OWU.xlsx - OWU.csv")

In [3]:
# Keep only the rows whose 'Split Name' is exactly 'game'; the original row
# labels from OWU_game are preserved on the result.
is_game_split = OWU_game['Split Name'].eq('game')
OWU_full = OWU_game.loc[is_game_split]
OWU_full

Unnamed: 0,Session Title,Player Name,Minutes Played,Split Name,Distance (miles),Player Name.1,Sprint Distance (yards),Player Name.2,Power Plays,Energy (kcal),...,Accelerations Zone Count: 3 - 4 m/s/s,Accelerations Zone Count: > 4 m/s/s,Deceleration Zone Count: 0 - 1 m/s/s,Deceleration Zone Count: 1 - 2 m/s/s,Deceleration Zone Count: 2 - 3 m/s/s,Deceleration Zone Count: 3 - 4 m/s/s,Deceleration Zone Count: > 4 m/s/s,Unnamed: 99,Unnamed: 100,Player Name.4
0,OWU,O,0.0,game,0.1535,O,0.0,O,0,18.1199,...,0,0,0,0,0,0,0,0.0,,O
1,OWU,A,0.0,game,0.1513,A,0.0,A,0,22.2237,...,0,0,0,0,0,0,0,0.0,,A
2,OWU,J,32.0,game,2.4101,J,85.5859,J,19,471.209,...,19,5,0,94,48,25,13,38.0,38.0,J
3,OWU,Q,67.0,game,4.7929,Q,93.5233,Q,26,1011.3102,...,21,1,0,190,93,41,24,65.0,65.0,Q
4,OWU,B,17.0,game,1.3648,B,141.3917,B,12,240.5307,...,7,2,0,41,23,8,10,18.0,18.0,B
5,OWU,G,42.0,game,2.5185,G,189.5653,G,27,422.5878,...,19,3,0,98,53,20,15,35.0,35.0,G
6,OWU,I,48.0,game,3.8549,I,354.1054,I,38,563.0843,...,19,8,0,149,64,26,22,48.0,48.0,I
7,OWU,Z,0.0,game,0.2394,Z,0.0,Z,0,32.5113,...,0,0,0,1,0,0,0,0.0,,Z
8,OWU,C,90.0,game,6.2563,C,49.0823,C,26,1105.7095,...,45,13,0,352,149,44,28,72.0,72.0,C
9,OWU,N,0.0,game,0.1147,N,0.0,N,0,19.0553,...,0,0,0,0,0,0,0,0.0,,N


In [None]:
# Narrow the frame to the player identifier plus the eleven metric columns
# used in the rest of the analysis.
OWU_volume_intensity = OWU_full.loc[:, [
    "Player Name",
    "Minutes Played",
    "Distance (miles)",
    "Sprint Distance (yards)",
    "Power Plays",
    "Energy (kcal)",
    "Impacts",
    "Top Speed (mph)",
    "Distance Per Min (yd/min)",
    "Power Score (w/kg)",
    "Player Load",
    "Work Ratio",
]]

In [None]:
# Feature matrix for clustering: every selected column except the player
# identifier.
OWU_cluster = OWU_volume_intensity.drop(columns='Player Name')

In [None]:
# Elbow method: fit KMeans for k = 1..9 and record the inertia (within-cluster
# sum of squared distances) of each fit.
#
# random_state and n_init are pinned so the curve is identical on every
# "Restart & Run All" and does not depend on the installed scikit-learn
# version's default for n_init.
#
# NOTE(review): the models are fitted on the raw, unscaled OWU_cluster columns,
# whereas the final clustering runs on min-max-scaled, PCA-reduced data.
# Inertia here is therefore dominated by the largest-magnitude columns —
# confirm this is intended.
OWU_distortions = []
K = range(1,10)
for k in K:
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeanModel.fit(OWU_cluster)
    OWU_distortions.append(kmeanModel.inertia_)

In [None]:
# Plot inertia against k; the "elbow" where the curve flattens suggests a
# reasonable number of clusters.
plt.figure(figsize=(16,8))
plt.plot(K, OWU_distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
# Title typo fixed ("shwoing" -> "showing").
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Rescale every feature column with a min-max scaler so that no single metric
# dominates the distance computations downstream.
x_OWU = OWU_cluster.to_numpy()  # plain numpy array of the feature values
scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(x_OWU)
# Wrap the scaled array back into a DataFrame (default integer row/column labels).
X_norm = pd.DataFrame(data=x_scaled)

In [None]:
# Project the scaled features onto their first two principal components so the
# players can be drawn on a 2D scatter plot.
OWU_pca = PCA(n_components=2)
pca_scores = OWU_pca.fit_transform(X_norm)
reduced_OWU = pd.DataFrame(pca_scores)

In [None]:
# Cluster the players in the 2D PCA space into three groups.
# random_state / n_init are pinned so cluster ids are stable between runs and
# across scikit-learn versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
# Fit and label in a single pass (a separate predict() on the training data is
# redundant). Only the first two columns — the PCA components — are used, so the
# cell can be re-run after 'cluster' and 'names' columns have been added to
# reduced_OWU by a later cell.
labels = kmeans.fit_predict(reduced_OWU.iloc[:, :2])
# centroid values
centroid = kmeans.cluster_centers_
# cluster ids as a plain Python list
clusters = labels.tolist()
# player names for the clustered rows (still carries OWU_full's row labels)
OWU_names = OWU_full['Player Name']

In [None]:
# Attach the cluster id and player name to each PCA point.
reduced_OWU['cluster'] = clusters
# Assign names positionally. OWU_names keeps the row labels of the filtered
# source frame, while reduced_OWU has a fresh 0..n-1 index; assigning the Series
# directly would align on index and could yield NaN or mismatched names.
# Passing a bare array also raises if the lengths ever disagree.
reduced_OWU['names'] = OWU_names.to_numpy()
reduced_OWU.columns = ['x', 'y', 'cluster', 'names']
reduced_OWU.head()

In [1]:
# Scatter plot of the players in PCA space, coloured by cluster and labelled
# with player names.
# NOTE: this cell uses `sns` and `plt` from the import cell at the top of the
# notebook; running it in a fresh kernel before that cell raises
# "NameError: name 'sns' is not defined".
%matplotlib inline
sns.set(style="white")
# fit_reg=False draws the points only (no regression line). Per the seaborn
# docs lmplot returns a FacetGrid, so `ax` is a grid object rather than a
# matplotlib Axes.
ax = sns.lmplot(x="x", y="y", hue='cluster', data = reduced_OWU, legend= True,
fit_reg=False, height = 15, scatter_kws={"s": 700})
# Write each player's name at its point; the created text artists are kept in `texts`.
texts = []
for x, y, s in zip(reduced_OWU.x, reduced_OWU.y, reduced_OWU.names):
    texts.append(plt.text(x, y, s))
# Fix the vertical range of the plot to [-2, 2].
ax.set(ylim=(-2, 2))
plt.tick_params(labelsize=15)
plt.xlabel("OWU PC 1", fontsize = 40)
plt.ylabel("OWU PC 2", fontsize = 40)
plt.title('OWU Clusters', fontsize=40)
# NOTE(review): lmplot is already called with legend=True, so this call may add
# a second legend to the figure — confirm that is the intended result.
plt.legend(fontsize = 20)
plt.show()

NameError: name 'sns' is not defined

In [None]:
# Players assigned to cluster 0.
OWU_group_0 = reduced_OWU.loc[reduced_OWU['cluster'].eq(0)]
OWU_group_0

In [None]:
# Players assigned to cluster 1.
OWU_group_1 = reduced_OWU.loc[reduced_OWU['cluster'].eq(1)]
OWU_group_1

In [None]:
# Players assigned to cluster 2.
OWU_group_2 = reduced_OWU.loc[reduced_OWU['cluster'].eq(2)]
OWU_group_2