In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory,
# one path per line, in the order os.walk yields them.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Idea
The idea is to cluster players into 5 groups to reveal similarities between them. I think this could be helpful when analyzing similarities between players, and could be used to find players who are not part of the "hype" but have stats like the big stars in the league.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [None]:
# Load the raw per-season player stats CSV from the Kaggle input directory.
STATS_CSV = '/kaggle/input/basketball-players-stats-per-season-49-leagues/players_stats_by_season_full_details.csv'
df = pd.read_csv(STATS_CSV)
# (rows, columns) of the raw table
df.shape

# Data processing

First step: select only the players who played in the last NBA season (2018-2019 regular season).

In [None]:

# Keep only the rows for the 2018 - 2019 NBA regular season; the index is
# rebuilt so the filtered frame is numbered from 0 again.
is_nba = df['League'] == 'NBA'
is_last_season = df['Season'] == '2018 - 2019'
is_regular_season = df['Stage'] == 'Regular_Season'
df_nba = df[is_nba & is_last_season & is_regular_season].reset_index(drop=True)
df_nba.head()

In [None]:
# Number of player rows in the filtered NBA table
print(len(df_nba))
# Names of the columns available in that table
print(df_nba.columns.to_numpy())

Now that we have the players, we need to select the stats that will be used in the clustering.

In [None]:
# Player name, games played / minutes, and the stat columns considered for clustering
columns = [
    'Player', 'GP', 'MIN',
    'FGM', 'FGA', '3PM', '3PA', 'FTM', 'FTA',
    'TOV', 'PF',
    'ORB', 'DRB', 'REB',
    'AST', 'STL', 'BLK', 'PTS',
]
df_nba_ = df_nba.loc[:, columns]

And we have our dataset almost ready to apply KMeans model.

# Some EDA

I did not drop the Games Played column. It will be used now to take out the outliers: players who played few games in the last season.

I will use the interquartile range technique to remove outliers. In this case we don't have outliers above the third quartile, since a player can play at most 82 games. So, I will remove all players who played fewer games than the lower bound calculated below.

In [None]:
#Some EDA...
#function used to find outliers based in quartiles
def find_outliers(data, k=1.5):
    """Return the IQR-based outlier fences ``(lower_bound, upper_bound)``.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles of ``data`` and ``IQR = Q3 - Q1``. Values
    outside the fences are the candidates for removal as outliers.

    Parameters
    ----------
    data : iterable of numbers
        One-dimensional sample. It does not need to be sorted.
    k : float, default 1.5
        Multiplier applied to the IQR. The default reproduces the original
        hard-coded 1.5 rule; larger values give wider fences.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)``.
    """
    # np.percentile orders the values itself, so no explicit sort is needed;
    # list() keeps support for any iterable (Series, list, generator).
    Q1, Q3 = np.percentile(list(data), [25, 75])
    IQR = Q3 - Q1
    lower_bound = Q1 - (k * IQR)
    upper_bound = Q3 + (k * IQR)
    return lower_bound, upper_bound

# Outlier bounds for games played (GP); the lower bound is displayed as the cell output.
gp_bounds = find_outliers(df_nba_['GP'])
gp_lower, gp_upper = gp_bounds
gp_lower

Players who played fewer than 42 games fall more than 1.5 times the interquartile range below the first quartile, so they are considered outliers and will be removed.

In [None]:
# Keep players at or above the lower bound: only players with strictly fewer
# games than the bound count as outliers, so a player sitting exactly on the
# bound stays in the dataset.
df_filt = df_nba_[df_nba_['GP'] >= gp_lower]
# Number of players left after the filter
len(df_filt)

Now I can remove Games Played and Minutes information.

In [None]:
# Frame used by the model: the player name plus the stat columns.
# GP and MIN are dropped by name rather than by position, so the selection
# keeps working if the column order of df_filt ever changes.
df_p = df_filt.drop(columns=['GP', 'MIN']).reset_index(drop=True)
df_p.head()

I will plot the correlation matrix to see whether any variable can explain another. This will help with visualizing the clusters.

In [None]:
# Stat columns used as features: every column except the player name.
# 'cluster' is excluded as well, so re-running this cell after the cluster
# labels have been added to df_p does not turn the label into a feature.
int_cols = df_p.columns.drop(['Player', 'cluster'], errors='ignore').values

# Correlation between the stats (each column of the frame is one variable)
plt.figure(figsize=(25,10))
cm = np.corrcoef(df_p[int_cols].values, rowvar=False)
sns.set(font_scale=1.5)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size':15}, yticklabels=int_cols, xticklabels=int_cols)
plt.show()

FGM, FGA, FTM and FTA have a high correlation with Points. Also, 3PM and 3PA have an almost perfect correlation.

Rebounds, of course, have a high correlation with ORB and DRB.

# Model

Start by selecting the model features, which are all the stats available.

In [None]:
# Feature matrix: the stat columns of df_p, without the player name
X = df_p.loc[:, int_cols]

Then, I will rescale the data with StandardScaler.

In [None]:
# Standardise the features with StandardScaler: learn the per-column
# statistics first, then apply them to the same data.
sc = preprocessing.StandardScaler()
sc.fit(X)
X_std = sc.transform(X)
X_std

And now I have the data ready to apply KMeans model.

In [None]:
# Fit 5 clusters on the standardised stats.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', so leaving it unset gives different clusters (and a FutureWarning on
# some versions) depending on the installed release. 10 restarts is the former
# default; random_state keeps the run reproducible.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)
kmeans.fit(X_std)

In [None]:
# Attach the cluster id predicted for each player to the stats frame
cluster_ids = kmeans.predict(X_std)
df_p['cluster'] = cluster_ids
df_p.head()

I will plot the stats by cluster. This helps to see which stats matter most in each cluster and how players are grouped.

To select the stats that will be plotted, and to keep the chart readable, I used the information from the correlation matrix.

In [None]:
# Mean of a subset of the stats per cluster, drawn as grouped bars
plot_cols = ['3PM', 'TOV', 'PF', 'REB', 'AST', 'STL', 'BLK', 'PTS', 'cluster']
data_plot = df_p[plot_cols]
cluster_means = data_plot.groupby("cluster").mean()
cluster_means.plot.bar(figsize=(15,10))
plt.title("Stats by cluster")

# Conclusion

It seems that the model captured players' styles well. As we can see, Cluster 0 has very good scorers and rebounders, which is confirmed when we see which players are there: Giannis Antetokounmpo, Karl-Anthony Towns, Ben Simmons. And in Cluster 4 we have the pure scorers, like James Harden, Kemba Walker and Trae Young.

However, I think that we could get a better grouping by reducing the number of clusters. This is because Cluster 3's players are like Cluster 4's players but "rescaled", in other words, with fewer minutes played.

I could say the same for the players of Cluster 0 and Cluster 2, but in this case we have some differences in playing style: Cluster 2 has more defensive players.

Besides that, Cluster 1 contains players with low stats in all areas; many of them played only a few minutes per game.