In [None]:

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

In [None]:
# Read the CSV at the relative path below into `df`; every later cell works from this frame.
df = pd.read_csv(
    filepath_or_buffer="../input/social-power-nba/nba_2016_2017_100.csv"
)

In [None]:
# Preview the first five rows (five is also the default) to check the load worked.
df.head(n=5)

In [None]:
# Summary statistics for the columns of interest.
# NOTE: describe() only summarises numeric columns by default, so PLAYER_NAME
# is presumably dropped from the output if it is a text column.
df.loc[:, ["PLAYER_NAME", "PTS", "SALARY_MILLIONS"]].describe()

The descriptive statistics are given above. Let's see a scatter plot in order to have a better understanding.

In [None]:
# Select the two clustering features directly by column name.
# The previous positional lookup (columns 60 and 59) was redundant with the
# name lookup that followed it and would raise a KeyError if the CSV's column
# order ever changed.
Points = df['PTS']
Salary = df['SALARY_MILLIONS']

In [None]:
# Scatter of points against salary, drawn on the current axes
# (plt.gca() creates a figure/axes if none is active).
ax = plt.gca()
ax.scatter(Points, Salary, color='blue')
ax.set_xlabel('Points')
ax.set_ylabel('Salary (in Millions)')

K-Means Algorithm

The first step in the K-Means algorithm is to choose "K", the number of clusters. We are going to use the Elbow method to determine the best "K".

In [None]:
# Elbow method: fit K-Means for K = 1..10 and record the inertia (sum of
# squared distances of samples to their closest cluster centre) of each fit.

# The feature matrix does not change between iterations, so build it once
# instead of re-concatenating inside the loop.
X = pd.concat([Points, Salary], axis=1)

sse = []  # Sum of Squared Errors, one entry per K
for i in range(1, 11):
    # random_state is fixed so repeated runs produce the same curve.
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0)
    kmeans.fit(X)
    sse.append(kmeans.inertia_)
sse

In [None]:
# Plot SSE against K. The x-values are derived from len(sse) rather than a
# hard-coded range, so they always line up with the number of fitted models.
plt.plot(range(1, len(sse) + 1), sse)
plt.title('Elbow Method for K-Means', color='blue')
plt.xlabel('K: Number of Clusters')
plt.ylabel('Error Sum of Squares')

The idea behind the Elbow Method is to choose the k value at which the decrease in the Sum of Squared Errors slows down abruptly, i.e. the point beyond which adding more clusters yields only a marginal improvement. In other words, we look for the "joint" in the graph that resembles an elbow. Based on this graph, a reasonable value of K is 2 or maybe 3. We will investigate both of them.

Two Clusters

In [None]:
# Fit K-Means with two clusters on the (points, salary) feature matrix.
# Build the feature frame once and reuse it for fitting and for plotting.
X = pd.concat([Points, Salary], axis=1)

# random_state is fixed so that cluster labels (and therefore the colours in
# the plot) are identical on every run; without it the labels can swap.
Model = KMeans(n_clusters=2, init='k-means++', random_state=0)
k_means = Model.fit_predict(X)  # array with one cluster label (0 or 1) per row of X

In [None]:

# Draw each cluster in its own colour; labels are 1-based ("Cluster 1", "Cluster 2").
for cluster_id, colour in enumerate(('orange', 'red')):
    members = X[k_means == cluster_id]  # rows assigned to this cluster
    plt.scatter(
        members['PTS'],
        members['SALARY_MILLIONS'],
        c=colour,
        label=f'Cluster {cluster_id + 1}',
    )
plt.xlabel('Points')
plt.ylabel('Salary')
plt.legend()

Three Clusters

In [None]:
# Fit K-Means with three clusters on the (points, salary) feature matrix.
# Build the feature frame once and reuse it for fitting and for plotting.
X = pd.concat([Points, Salary], axis=1)

# random_state is fixed so that cluster labels (and therefore the colours in
# the plot) are identical on every run; without it the labels can permute.
Model = KMeans(n_clusters=3, init='k-means++', random_state=0)
k_means = Model.fit_predict(X)  # array with one cluster label (0, 1 or 2) per row of X

In [None]:
# Draw each cluster in its own colour; labels are 1-based ("Cluster 1" .. "Cluster 3").
for cluster_id, colour in enumerate(('orange', 'red', 'black')):
    members = X[k_means == cluster_id]  # rows assigned to this cluster
    plt.scatter(
        members['PTS'],
        members['SALARY_MILLIONS'],
        c=colour,
        label=f'Cluster {cluster_id + 1}',
    )
plt.xlabel('Points')
plt.ylabel('Salary')
plt.legend()

When the elbow in the plot is not very pronounced, Silhouette analysis is a better way to choose the number of clusters.

Silhouette Analysis

In [None]:
# Silhouette analysis: for each candidate K (2..5) draw one figure showing the
# sorted per-sample silhouette values of every cluster stacked vertically,
# plus a vertical line at the mean silhouette score.
X = pd.concat([Points, Salary], axis=1)

for num in range(2, 6):
    # Fixed seed so the clustering (and therefore the plot) is reproducible.
    Model = KMeans(n_clusters=num, init='k-means++', random_state=0)
    kmeans = Model.fit_predict(X)
    score = silhouette_score(X, kmeans)      # mean silhouette over all samples
    values = silhouette_samples(X, kmeans)   # one silhouette value per sample

    # plt.subplots() returns a (figure, axes) pair; unpack it and draw on the
    # axes explicitly instead of relying on pyplot's implicit current figure.
    fig, ax = plt.subplots()
    minimum = 0
    for i in range(num):
        # Silhouette values of the samples assigned to cluster i, ascending.
        SVI = np.sort(values[kmeans == i])
        maximum = minimum + SVI.shape[0]
        # Fill horizontally between 0 and each sample's silhouette value.
        ax.fill_betweenx(np.arange(minimum, maximum), SVI)
        minimum = maximum  # the next cluster's band starts where this one ends
    ax.set_title("The silhouette Plot with " + str(num) + " clusters")
    ax.set_xlabel("Silhouette Score")
    ax.set_ylabel("Samples (grouped by cluster)")
    ax.axvline(x=score, color="black")  # mean silhouette score for this K
plt.show()

In [None]:
# Print the mean silhouette score for each candidate K (2..5).
for num in range(2, 6):
    # Fixed seed so the printed scores are the same on every run.
    Model = KMeans(n_clusters=num, init='k-means++', random_state=0)
    kmeans = Model.fit_predict(X)
    score = silhouette_score(X, kmeans)
    print("For "+str(num)+" clusters, the average Silhouette values is equal to", round(score, 2))

From these plots and Silhouette scores, we can conclude that the number of clusters should be 2.