In [1]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
# Load the CSV export of the Wittenberg game sheet into a DataFrame.
Witt_game = pd.read_csv(
    filepath_or_buffer="Game 12 Witt.xlsx - Wittenberg.csv",
)

In [3]:
# Keep only the rows whose 'Split Name' is exactly '1st-Half A'.
# The original row index of Witt_game is preserved on the result.
Witt_full = Witt_game.loc[Witt_game['Split Name'].eq('1st-Half A')]
Witt_full

Unnamed: 0,Session Title,Player Code,Minutes Played,Split Name,Tags,Distance (miles),Sprint Distance (yards),Power Plays,Energy (kcal),Impacts,...,Accelerations Zone Count: > 4 m/s/s,Deceleration Zone Count: 0 - 1 m/s/s,Deceleration Zone Count: 1 - 2 m/s/s,Deceleration Zone Count: 2 - 3 m/s/s,Deceleration Zone Count: 3 - 4 m/s/s,Deceleration Zone Count: > 4 m/s/s,Unnamed: 97,Unnamed: 98,Player Code.1,Minutes Played.1
0,Wittenberg,K,90.0,1st-Half A,game,1.9884,67.6671,17,364.8291,1,...,5,0,92,39,17,7,,,K,90.0
1,Wittenberg,L,90.0,1st-Half A,game,2.1,72.6288,20,370.7686,1,...,10,0,102,67,22,10,,,L,90.0
2,Wittenberg,H,25.0,1st-Half A,game,1.7539,143.5002,9,285.0858,0,...,1,0,82,29,18,8,,,H,25.0
3,Wittenberg,V,90.0,1st-Half A,game,2.1698,54.8599,13,396.1563,0,...,4,0,114,76,24,5,,,V,90.0
4,Wittenberg,M,90.0,1st-Half A,game,1.8179,152.1102,10,286.4598,0,...,2,0,80,36,11,7,,,M,90.0
5,Wittenberg,C,0.0,1st-Half A,game,0.021,0.0,0,3.093,0,...,0,0,0,0,0,0,,,C,0.0
6,Wittenberg,O,0.0,1st-Half A,game,0.0435,0.0,0,5.0998,0,...,0,0,0,0,0,0,,,O,0.0
7,Wittenberg,J,11.0,1st-Half A,game,0.0261,0.0,0,4.2404,0,...,0,0,0,0,0,0,,,J,11.0
8,Wittenberg,Z,0.0,1st-Half A,game,0.0405,0.0,0,5.4164,0,...,0,0,0,0,0,0,,,Z,0.0
9,Wittenberg,R,90.0,1st-Half A,game,2.0074,124.7853,15,348.5033,0,...,7,0,72,65,17,14,,,R,90.0


In [None]:
# Select the player identifier plus the volume / intensity metrics that are
# carried forward into the clustering steps.
Witt_volume_intensity = Witt_full.loc[
    :,
    [
        "Player Code",
        "Minutes Played",
        "Distance (miles)",
        "Sprint Distance (yards)",
        "Power Plays",
        "Energy (kcal)",
        "Impacts",
        "Top Speed (mph)",
        "Distance Per Min (yd/min)",
        "Power Score (w/kg)",
        "Player Load",
        "Work Ratio",
    ],
]

In [None]:
# Feature matrix for clustering: every selected column except the
# 'Player Code' identifier.
Witt_cluster = Witt_volume_intensity.drop(columns='Player Code')

In [None]:
# Elbow method: record the k-means inertia (within-cluster sum of squares)
# for a range of candidate cluster counts.

# Min-max scale the features first so that large-magnitude columns
# (e.g. "Energy (kcal)", in the hundreds) do not dominate the Euclidean
# distances over small-magnitude ones (e.g. "Distance (miles)").
Witt_cluster_scaled = preprocessing.MinMaxScaler().fit_transform(Witt_cluster)

# KMeans raises ValueError when n_clusters exceeds the number of samples,
# so cap the largest k at the number of rows (at most 9, as before).
max_k = min(9, len(Witt_cluster))
K = range(1, max_k + 1)

Witt_distortions = []
for k in K:
    # Fixed seed and explicit n_init keep the curve identical across runs
    # and across scikit-learn versions (whose n_init default has changed).
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeanModel.fit(Witt_cluster_scaled)
    Witt_distortions.append(kmeanModel.inertia_)

In [None]:
# Plot inertia against k; the "elbow" of the curve suggests a cluster count.
plt.figure(figsize=(16, 8))
plt.plot(K, Witt_distortions, 'bx-')
# k is an integer count, so place ticks only on the values actually evaluated.
plt.xticks(list(K))
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Rescale every feature column with a MinMaxScaler and wrap the result in a
# fresh DataFrame (default integer column labels and RangeIndex).
scaler = preprocessing.MinMaxScaler()
x_Witt = Witt_cluster.to_numpy()  # raw feature values as a numpy array
x_scaled = scaler.fit_transform(x_Witt)
X_norm = pd.DataFrame(data=x_scaled)

In [None]:
# Project the scaled features onto their first two principal components so
# the players can be drawn on a 2-D scatter plot.
pca_Witt = PCA(n_components=2)
reduced_Witt = pd.DataFrame(
    data=pca_Witt.fit_transform(X_norm),
)

In [None]:
# Cluster the players in the 2-D PCA space.

# Fit only on the two principal-component columns (selected by position) so
# this cell can be re-run after the 'cluster' / 'names' columns have been
# appended to reduced_Witt; the string 'names' column would otherwise make
# KMeans fail.
pc_coords = reduced_Witt.iloc[:, :2]

# Fixed seed and explicit n_init: without them the cluster ids (0/1/2) are
# assigned arbitrarily and can differ from run to run, which changes the
# contents of every per-cluster subset derived from them.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
# fit the input data and get the cluster labels in a single pass
labels = kmeans.fit_predict(pc_coords)
# centroid values
centroid = kmeans.cluster_centers_
# cluster values as a plain Python list
clusters = kmeans.labels_.tolist()
# player codes used to label the points
Witt_names = Witt_full['Player Code']

In [None]:
# Attach the cluster id and player code to each PCA point.
reduced_Witt['cluster'] = clusters
# Assign the names positionally (.values) rather than as a Series.
# Witt_names keeps the row index of the filtered CSV, while reduced_Witt has
# a fresh 0..n-1 index, so a Series assignment would align on index and
# yield NaN / mismatched names whenever those indices differ.
reduced_Witt['names'] = Witt_names.values
reduced_Witt.columns = ['x', 'y', 'cluster', 'names']
reduced_Witt.head()

In [None]:
%matplotlib inline
sns.set(style="white")
ax = sns.lmplot(x="x", y="y", hue='cluster', data = reduced_Witt, legend= True,
fit_reg=False, height = 15, scatter_kws={"s": 250})
texts = []
for x, y, s in zip(reduced_Witt.x, reduced_Witt.y, reduced_Witt.names):
    texts.append(plt.text(x, y, s))
ax.set(ylim=(-2, 2))
plt.tick_params(labelsize=15)
plt.xlabel(" Witt PC 1", fontsize = 20)
plt.ylabel(" Witt PC 2", fontsize = 20)
plt.show()

In [None]:
# Rows of the PCA frame that were assigned cluster id 0.
Witt_group_0 = reduced_Witt.loc[reduced_Witt['cluster'].eq(0)]
Witt_group_0

In [None]:
# Rows of the PCA frame that were assigned cluster id 1.
Witt_group_1 = reduced_Witt.loc[reduced_Witt['cluster'].eq(1)]
Witt_group_1

In [None]:
# Rows of the PCA frame that were assigned cluster id 2.
Witt_group_2 = reduced_Witt.loc[reduced_Witt['cluster'].eq(2)]
Witt_group_2