In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install seaborn
!pip install dmba

%matplotlib inline


from pathlib import Path

import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import pairwise
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import scipy.cluster.hierarchy as shc
from sklearn.cluster import KMeans
import matplotlib.pylab as plt
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import parallel_coordinates


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge
import statsmodels.formula.api as sm


from dmba import regressionSummary, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from dmba import classificationSummary


In [None]:
# Load the player statistics file and index the rows by player name.
details_df = (
    pd.read_csv('/kaggle/input/nba-players-stats-20142015/players_stats.csv')
    .set_index('Name')
)
details_df.head()

In [None]:
# Numeric box-score columns kept for the descriptive analysis and the models.
stats_columns = [
    'EFF', 'Games Played', 'MIN', 'PTS',
    'FGM', 'FGA', 'FG%',
    'FTM', 'FTA', 'FT%',
    '3PM', '3PA', '3P%',
    'OREB', 'DREB', 'REB', 'AST',
    'STL', 'BLK', 'TOV', 'PF',
]
stats_df = details_df[stats_columns]
stats_df.head()

In [None]:
# Summary statistics for every column of stats_df (the EFF figures are discussed below).
stats_df.describe()

The df.describe() above presents the descriptive analysis of the whole NBA season. Out of the 490 records, this is the descriptive analysis focusing on Efficiency (EFF):
Mean: 564.33
Std: 464.43
Min: -3
Max: 2202
The output gave us an acceptance range of Efficiency score, guiding us in recognizing a strong contender versus a weaker one. It also gave us indicators on where the player stands based on other stats compared to others (total points, rebounds, steals, to name a few stats recognizing skill level). This is a great reference to compare players in the league.
It's interesting to see the average points scored per player every season; it gives each of them a quick reflection on where they stand in the league.

In [None]:
# Column overview of stats_df as reported by DataFrame.info().
stats_df.info()

FGA/FGM, FTM/FTA, and 3PM/3PA are highly correlated with each other, so the model may have a difficult time distinguishing their individual effects on Efficiency. Yes, multicollinearity appears in this dataset, but it mainly affects how the individual coefficients can be interpreted rather than the accuracy of the model, and these correlations do not imply causation.

The dataset used were mostly numerical values, dummies are not needed. The categorical values were dropped since they did not pertain to our approach.

In [None]:
# Pairwise correlation matrix of the stats_df columns.
stats_df.corr()

Yes, need to normalize the data first to get a cleaner result. Normalizing will give us a common scale without a large difference in ranges. Not normalizing first will result in big distances because some columns have 1000's (eg. MIN) in values while some (eg. OREB) have only 10's.
Below we create a new dataframe, k_stats_df, and then normalize it. The variables used in the k_stats_df dataframe to run the clustering analysis were chosen based on factors that affect the Efficiency rating. These variables are shown below. We also changed the data type to float64 before normalizing it.

In [None]:
# Features used for clustering (the percentage columns are left out),
# cast to float64 ahead of normalization.
cluster_columns = [
    'EFF', 'Games Played', 'MIN', 'PTS',
    'FGM', 'FGA', 'FTM', 'FTA', '3PM', '3PA',
    'OREB', 'DREB', 'REB', 'AST',
    'STL', 'BLK', 'TOV', 'PF',
]
k_stats_df = details_df[cluster_columns].astype('float64')
k_stats_df.head()

In [None]:
# Standardize every column with sklearn's preprocessing.scale so that all
# features share a common scale before distances are computed.
k_stats_df_norm = k_stats_df.apply(preprocessing.scale, axis=0)
k_stats_df_norm.head()

Pairwise Distance using Euclidean metric - it measures the distance between 2 players.

In [None]:
# Euclidean distance between every pair of players in the normalized feature
# space, shown as a player-by-player table (first five rows only).
player_names = k_stats_df_norm.index
d = pairwise.pairwise_distances(k_stats_df_norm, metric='euclidean')
pd.DataFrame(d, index=player_names, columns=player_names).head(5)

K Clustering, we chose K=15. There are 30 teams in the NBA, and we expect 15 clusters can give us a good representation of the players. Even though we had 490 players, 15 clusters should give us a nice spread of members per cluster.

In [None]:
# Partition the normalized players into 15 clusters.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', and the cluster numbers quoted in the text depend on the fitted result.
kmeans = KMeans(n_clusters=15, n_init=10, random_state=1).fit(k_stats_df_norm)

# Cluster label per player, then one printed line of player names per cluster.
memb = pd.Series(kmeans.labels_, index=k_stats_df_norm.index)
for key, item in memb.groupby(memb):
    print(key, ': ', ', '.join(item.index))

Cluster 11, containing only 2 players (James Harden and Russell Westbrook), scored high on 10 out of 18 columns.
Although this could be considered an outlier, we chose to keep it since the dataframe is based on player's performance. We cannot treat great players (or bad players) as outliers, as mentioned before.
Although we didn't mind looking over the players listed in clusters, we found the centroids measurement much easier to analyze. Higher values per cluster/column meant that those group of players score higher on that specific stat.
This is especially important for NBA organizations to get a quick look on a smaller list of players who can contribute a particular skill on the team. They can analyze the clusters list below to find which cluster can offer the most improvement in their team, and refer to the list above for players on that cluster.

In [None]:
# Cluster centers, one row per cluster and one column per (normalized) feature.
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=k_stats_df_norm.columns)

# Print with 3 decimals. The fully qualified 'display.precision' key is used
# because the bare 'precision' key is not accepted by current pandas, and
# option_context restores the previous setting automatically afterwards.
with pd.option_context('display.precision', 3):
    print(centroids)

A good data set should be clustered together; it indicates strong cohesiveness.
And to measure cohesion, we used Within Cluster Sum of Square distances

In [None]:
# Within-cluster sum of squared distances and member count for each cluster.
# The number of clusters is read from the fitted model instead of being
# hard-coded, so this cell stays correct if k is changed above.
num_clusters = kmeans.n_clusters
withinClusterSS = [0] * num_clusters
clusterCount = [0] * num_clusters

# kmeans.transform returns each row's distance to every centroid; only the
# distance to the row's own centroid contributes to its cluster's total.
# The loop variable is named `label` (not `cluster`) so re-running this cell
# cannot overwrite the `cluster` model object defined later in the notebook.
for label, distances in zip(kmeans.labels_, kmeans.transform(k_stats_df_norm)):
    withinClusterSS[label] += distances[label]**2
    clusterCount[label] += 1

for label, (count, within_ss) in enumerate(zip(clusterCount, withinClusterSS)):
    print('Cluster {} ({} members): {:5.2f} within cluster'.format(label, count, within_ss))

In [None]:
# Label each centroid row so parallel_coordinates can colour one line per cluster.
centroids['cluster'] = ['Cluster {}'.format(i) for i in centroids.index]

fig = plt.figure(figsize=(20,30))
fig.subplots_adjust(right=3)
ax = parallel_coordinates(centroids, class_column='cluster', colormap='Dark2', linewidth=5)
plt.legend(loc='upper right', bbox_to_anchor=(0.95, 0.5))

# One axis per feature column (everything except the 'cluster' label).
# The previous fixed limit of 7.5 cut the plot off after the 8th feature.
n_features = centroids.shape[1] - 1
plt.xlim(-0.5, n_features - 0.5)
centroids

Above (centroids), is the average value per cluster per column. Since we are focusing on EFF, Cluster 7 was second in highest EFF at 2.492 (after Harden and Westbrook's 3.158). Other notable clusters were Cluster 4- high Games Played (0.978), Cluster 6- high 3 Points Made (2.271), and Cluster 7- high Rebounds (3.152)

In [None]:
# Elbow chart: average within-cluster squared distance for k = 1..7.
# Each candidate model is bound to its own name so the k=15 `kmeans` model
# fitted earlier is not overwritten by the last iteration of this loop.
k_values = range(1, 8)
inertia = []
for n_clusters in k_values:
    # n_init pinned for the same result across scikit-learn versions.
    elbow_model = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(k_stats_df_norm)
    inertia.append(elbow_model.inertia_ / n_clusters)

inertias = pd.DataFrame({'n_clusters': k_values, 'inertia': inertia})
ax = inertias.plot(x='n_clusters', y='inertia')
plt.xlabel('Number of clusters(k)')
plt.ylabel('Average Within-Cluster Squared Distances')
plt.ylim((0, 1.1 * inertias.inertia.max()))
ax.legend().set_visible(False)
plt.show()

The elbow chart suggests that the k=4 is the optimum amount of clusters.
After analyzing this problem for some time, we came to the conclusion that neither the elbow chart nor KMeans clustering is the best method for analyzing our dataset. Although KMeans is fast at partitioning data into clusters, it doesn't do a great job of scaling to a big data set.



In [None]:
# Review the data once again: the subset of columns used for the
# agglomerative clustering and the scatter plots that follow.
scatter_columns = [
    'PTS', 'FGM', 'FGA', 'FTM', 'FTA', '3PM', '3PA',
    'REB', 'AST', 'STL', 'BLK', 'TOV',
    'EFF', 'MIN',
]
scatter_df = details_df[scatter_columns]
scatter_df.info()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Ward linkage always works on Euclidean distances, so the distance argument
# is simply omitted; the old `affinity=` keyword is deprecated and no longer
# accepted by recent scikit-learn releases.
# NOTE(review): scatter_df is clustered on its raw scale here, unlike the
# normalized frame used for KMeans - confirm that this is intended.
cluster = AgglomerativeClustering(n_clusters=5, linkage='ward')
cluster.fit_predict(scatter_df)

In [None]:
# Dendrogram of players, built from a Ward linkage over scatter_df.
ward_links = shc.linkage(scatter_df, method='ward')

plt.figure(figsize=(10, 7))
plt.title("Player Dendrograms")
dend = shc.dendrogram(ward_links)

In [None]:
# Column 0 is EFF and column 1 is MIN (selected by name rather than position).
eff_min = scatter_df[['EFF', 'MIN']].values

# Minutes go on the x axis and Efficiency on the y axis so the data matches
# the axis labels; previously the two were plotted the other way round.
plt.figure(figsize=(10, 7))
plt.scatter(eff_min[:,1], eff_min[:,0], c=cluster.labels_, cmap='rainbow')
plt.title('Minutes vs Efficiency')
plt.xlabel('Minutes')
plt.ylabel('Efficiency')

Above is a scatter of EFF vs MIN. We assumed the more players stay on the floor the more effective they can be, and this diagram confirms that.
Below is a representation of Assists and Steals. Positions such as Point Guard and Shooting Guard are known to handle the ball more and to have quick hands.

In [None]:
# Select Assists and Steals by name. The previous positional slice
# (columns 10 onwards) actually picked up BLK and TOV, so the chart did not
# show the statistics named in its title and axis labels.
ast_stl = scatter_df[['AST', 'STL']].values
plt.figure(figsize=(10, 7))
plt.scatter(ast_stl[:,0], ast_stl[:,1], c=cluster.labels_, cmap='rainbow')
plt.title('Assists vs Steals')
plt.xlabel('Assist')
plt.ylabel('Steal')

In [None]:
# Column 0 is BLK (x axis) and column 1 is 3PM (y axis).
blk_3pm = scatter_df[['BLK', '3PM']].values
plt.figure(figsize=(10, 7))
plt.scatter(blk_3pm[:,0], blk_3pm[:,1], c=cluster.labels_, cmap='rainbow')
plt.title('Block vs 3 Point Made')
# Axis labels now match the plotted data; they were swapped before.
plt.xlabel('Blocks')
plt.ylabel('3 Point Made')

Lastly, above we have Blocks and 3 Point Made. We were expecting to see a better scatter diagram, but this was the result. We assumed that Center players are tall and don't make 3 point shots often.

Below is a hierarchical clustering dendrogram using "Single Linkage", which we found less meaningful. The code after it draws a dendrogram using "Average Linkage", which we found more useful.

In [None]:
# Hierarchical clustering of the normalized players with single linkage.
player_labels = k_stats_df_norm.index
Z = linkage(k_stats_df_norm, method='single')

# Wide figure with extra bottom margin for the player labels.
fig = plt.figure(figsize=(20, 6))
fig.subplots_adjust(bottom=0.23)
plt.title('NBA - Single Linkage Dendrogram')
plt.xlabel('Player')
dendrogram(Z, labels=player_labels, color_threshold=2.75)
# Dashed horizontal reference line across the tree.
plt.axhline(y=2.65, linestyle='dashed', linewidth=0.5, color='black')
plt.show()

In [None]:
# Hierarchical clustering of the normalized players with average linkage.
player_labels = k_stats_df_norm.index
Z = linkage(k_stats_df_norm, method='average')

# Very wide figure with extra bottom margin for the player labels.
fig = plt.figure(figsize=(50, 12))
fig.subplots_adjust(bottom=0.23)
plt.title('NBA - Average Linkage Dendrogram')
plt.xlabel('Player')
dendrogram(Z, labels=player_labels, color_threshold=3.6)
# Dashed horizontal reference line across the tree.
plt.axhline(y=3.3, linestyle='dashed', linewidth=0.5, color='black')
plt.show()

Above is a dendrogram of players, and below is the list of players in each cluster that the algorithm split the data into.

In [None]:
# Cut the average-linkage tree into at most 17 flat clusters and print the
# players belonging to each one.
average_links = linkage(k_stats_df_norm, 'average')
memb = pd.Series(
    fcluster(average_links, 17, criterion='maxclust'),
    index=k_stats_df_norm.index,
)
for cluster_id, members in memb.groupby(memb):
    print(cluster_id, ': ', ', '.join(members.index))

In [None]:
# Prefix every player with its cluster id on a copy of the normalized frame
# and plot that copy. Previously the labels were written to k_stats_df, which
# is not the frame passed to clustermap, so they never appeared in the figure
# and k_stats_df itself was modified as a side effect.
labelled_norm = k_stats_df_norm.copy()
labelled_norm.index = ['{}: {}'.format(cluster_id, player)
                       for cluster_id, player in zip(memb, k_stats_df_norm.index)]
sns.clustermap(labelled_norm, method='average', col_cluster=False,  cmap="mako_r")
plt.show()

Above is a dendrogram with heatmap.

There is a large difference in number of players per cluster and that was against our expectations.
We found that KMeans clustering at k=15 was the best model. Besides Cluster 11 (Harden and Westbrook), results after partitioning made more sense than single/average linkage model. The label on clusters can be referred above on 10.1 Clustering.
Although there are better visualizations than dendrograms, there is one lesson we can learn from this data. When estimating the player's salary, NBA organizations can take these clusters into consideration to avoid undervaluing or overvaluing potential recruits. Since players are paid for their skills and potential contribution, the players who stand alone in a cluster can set the bar for their salary, while those with members with, let's say, 5 players, creates a range NBA organizations can refer to.

We came to a conclusion that KMeans cluster analysis is not the best model to approach this data. Although it tries to calculate the best partitioning of the given data quickly, the drawback is that it struggles to find clusters with stronger cohesion. We searched online for various ways to approach this data (eg. visualization), tried new codes multiple times and got errors after errors, and only when we finally get the code to work is when we realized that the output was meaningless (scatter plots and dendrograms). We also realized that maybe other neighbor classifier or other clustering methods are more suitable on our dataset. Unfortunately time would not allow us to explore other clustering methods.

**Regression Modeling**

In [None]:
# Predictor columns and the outcome column for the regression on EFF.
predictors = [
    'MIN', 'PTS', 'FG%', 'FT%', '3P%',
    'REB', 'AST',
    'STL', 'BLK', 'TOV',
]
outcome = 'EFF'

# Feature matrix and target vector taken from stats_df.
y = stats_df[outcome]
X = stats_df[predictors]
print(X.shape)
X.head()

In [None]:
#partition data
# Half of the rows (test_size=0.5) are held out as the validation set;
# random_state=1 keeps the split the same on every run.
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.5, random_state=1)

In [None]:
# Linear regression of EFF on the predictors, fitted on the training partition.
# NOTE(review): the `car_lm` name looks inherited from another example; here it models EFF.
car_lm = LinearRegression()
car_lm.fit(train_X, train_y)
#train the model

In [None]:
# Print the fitted intercept and one coefficient per predictor, followed by the
# training-set performance measures (mean error, root mean squared error,
# mean absolute error, ...).
coefficient_table = pd.DataFrame({'Predictor': X.columns, 'coefficient': car_lm.coef_})

print('intercept ', car_lm.intercept_)
print(coefficient_table)

train_predictions = car_lm.predict(train_X)
regressionSummary(train_y, train_predictions)

In the code below, we are using the training set to test the model.

In [None]:
# Goodness-of-fit scores of the linear model on the training partition.
pred_y = car_lm.predict(train_X)

score_functions = (
    ('adjusted r2 : ', adjusted_r2_score),
    ('AIC : ', AIC_score),
    ('BIC : ', BIC_score),
)
for score_label, score_function in score_functions:
    print(score_label, score_function(train_y, pred_y, car_lm))

We are using the validation set to test the model. In addition to that, the code is making predictions on a new set. Shown below are the first 20 predictions.

In [None]:
# Predict the validation partition and show predicted vs. actual EFF with the
# residual for the first 20 rows.
car_lm_pred = car_lm.predict(valid_X)
validation_residuals = valid_y - car_lm_pred

result = pd.DataFrame({
    'Predicted': car_lm_pred,
    'Actual': valid_y,
    'Residual': validation_residuals,
})

print(result.head(20))

Forward Selection:

In [None]:
def train_model(variables):
    """Fit a linear regression on the given training columns.

    Returns None when `variables` is empty; otherwise returns a
    LinearRegression fitted on train_X[variables] against train_y.
    """
    if not len(variables):
        return None
    return LinearRegression().fit(train_X[variables], train_y)

def score_model(model, variables):
    """Return the AIC score of `model` on the training data.

    With no variables, the score is computed for a constant prediction equal
    to the mean of train_y (passing df=1 to AIC_score).
    """
    if len(variables) > 0:
        return AIC_score(train_y, model.predict(train_X[variables]), model)
    mean_only_prediction = [train_y.mean()] * len(train_y)
    return AIC_score(train_y, mean_only_prediction, model, df=1)

# Run dmba's forward_selection over all training columns, using train_model to
# fit each candidate and score_model to score it; verbose=True is passed through.
best_model, best_variables = forward_selection(train_X.columns, train_model, score_model, verbose=True)

# Variables kept by the selection.
print(best_variables)

In [None]:
# Performance of the forward-selection model on the validation partition,
# using only the selected variables.
regressionSummary(valid_y, best_model.predict(valid_X[best_variables]))

Below, is a histogram of residuals.

In [None]:
# Residuals of the linear model on the validation partition.
car_lm_pred = car_lm.predict(valid_X)
all_residuals = valid_y - car_lm_pred

# Share of residuals strictly between -50 and 50.
within_50 = all_residuals[(all_residuals < 50) & (all_residuals > -50)]
print(len(within_50) / len(all_residuals))

# Histogram of all validation residuals.
residual_frame = pd.DataFrame({'Residuals': all_residuals})
ax = residual_frame.hist(bins=25)

plt.tight_layout()
plt.show()

As can be seen, 87% of the predicted efficiency scores are within 50 of the actual efficiency score. This means that less than 13% are off by more than 50 points. Based on the residuals, there are very few that are very large, specifically around the values of -150 and 150. Overall, a majority are between -100 and 100, which is not a surprise. In addition to that, the distribution is fairly symmetrical around 0, which means that the model is not systematically underestimating or overestimating. In a way, it is pretty well balanced.

Listed below, is both the predictors and the outcome. In addition to that, the MLPClassifier is created with one hidden layer of a size of 3 nodes.

In [None]:
predictors = ['MIN', 'PTS', 'FG%', 'FT%', '3P%',
                       'REB', 'AST',
                       'STL', 'BLK', 'TOV']
outcome = 'EFF'

A = stats_df[predictors]
B = stats_df[outcome]
# Class labels come from this cell's own target (B) rather than from the `y`
# variable left over from the regression section.
classes = sorted(B.unique())

# One hidden layer with 3 nodes.
# NOTE(review): the outcome is the EFF score, so the classifier treats every
# distinct EFF value as a separate class - confirm a classifier (rather than a
# regressor) is what is wanted here.
clf = MLPClassifier(hidden_layer_sizes=(3,), activation='logistic', solver='lbfgs', random_state=1)
clf.fit(A, B)

print('Intercepts')
print(clf.intercepts_)

print('Weights')
print(clf.coefs_)

# Class probabilities for the rows the network was fitted on (A). The
# probability frame carries stats_df's index so that concat lines the rows up
# player by player instead of aligning against a fresh 0..n-1 index.
probabilities = pd.DataFrame(clf.predict_proba(A), columns=classes, index=stats_df.index)
print(pd.concat([stats_df, probabilities], axis=1))

The code below partitions the data. In addition to that, clf trains the neural network with only two hidden nodes. It also shows the training performance, as well as the validation performance.

In [None]:
# Split into 60% training / 40% validation rows.
A = stats_df[predictors]
b = stats_df[outcome]
train_A, valid_A, train_b, valid_b = train_test_split(A, b, test_size=0.4, random_state=1)

# Standardize using statistics learned from the training rows only, then apply
# the same transformation to both partitions.
scaler = StandardScaler().fit(train_A)
train_A = scaler.transform(train_A)
valid_A = scaler.transform(valid_A)

# train neural network with 2 hidden nodes
clf = MLPClassifier(
    hidden_layer_sizes=(2),
    activation='logistic',
    solver='lbfgs',
    random_state=1,
)
clf.fit(train_A, train_b.values)

# Training performance, then validation performance.
classificationSummary(train_b, clf.predict(train_A))

classificationSummary(valid_b, clf.predict(valid_A))

Below, is the performance measures being printed, such as mean error, root mean squared error, and mean absolute error.



In [None]:
# Error measures of the neural network on its (standardized) training rows.
# train_A has been standardized, so it must be scored with clf, which was
# fitted on that scaled data; car_lm was fitted on the raw, unscaled predictors
# and gives meaningless predictions on standardized inputs.
regressionSummary(train_b, clf.predict(train_A))

In [None]:
# Print, layer by layer, the shape of the weight matrix followed by the
# intercepts and each row of weights of the fitted network.
for layer_index, layer_weights in enumerate(clf.coefs_):
    layer_name = 'Hidden layer' if layer_index == 0 else 'Output layer'
    print(layer_name, '{0[0]} => {0[1]}'.format(layer_weights.shape))
    print(' Intercepts:\n ', clf.intercepts_[layer_index])
    print(' Weights:')
    for weight_row in layer_weights:
        print(' ', weight_row)
    print()

Below, is the confusion matrix being printed.

In [None]:
# Confusion matrix over all rows. clf was fitted on standardized predictors,
# so A is passed through the same fitted scaler before predicting; feeding the
# raw values would put the inputs on a different scale than the training data.
classificationSummary(b, clf.predict(scaler.transform(A)), class_names=classes)

In [None]:
# Error measures of the neural network on its (standardized) training rows.
# train_A has been standardized, so it must be scored with clf, which was
# fitted on that scaled data; car_lm was fitted on the raw, unscaled predictors
# and gives meaningless predictions on standardized inputs.
regressionSummary(train_b, clf.predict(train_A))

Below, is a histogram of residuals.

In [None]:
# Residuals of the neural network on the validation rows. valid_A has been
# standardized, so predictions come from clf (fitted on the scaled data)
# rather than from car_lm, which was fitted on the raw predictors.
clf_pred = clf.predict(valid_A)
all_residuals = valid_b - clf_pred

# Share of residuals strictly between -50 and 50.
print(len(all_residuals[(all_residuals > -50) & (all_residuals < 50)]) / len(all_residuals))

ax = pd.DataFrame({'Residuals': all_residuals}).hist(bins=25)

plt.tight_layout()
plt.show()

After running the regression analysis, the conclusion that can be drawn is that James Harden has the highest efficiency, at 2202. Our prediction that the player with the highest efficiency would be either a point guard or a shooting guard was correct. James Harden is a shooting guard, and his points were significantly higher than those of all the other players, which gave him a huge increase in efficiency. Conversely, the player with the lowest efficiency, at -3, was Julius Randle. The gap between the highest efficiency score of 2202 and the lowest efficiency score of -3 was an insanely large difference that was quite surprising.