# NBAPositionify: Leveraging Data Mining Techniques to Classify Professional Basketball Players into Positions

In [None]:
# Import Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
## Get Dataset from Kaggle
# Downloads the "drgilermo/nba-players-stats" dataset and unzips it into ./data/.
# Requires the kaggle package to download dataset. Uncomment the line below to install
# !pip3 install kaggle
import kaggle
# Requires kaggle.json (Kaggle API Key) placed in ~/.kaggle
# Authentication must happen before the download call below.
kaggle.api.authenticate()
kaggle.api.dataset_download_files("drgilermo/nba-players-stats", path="./data/", unzip=True)

## Exploratory Data Analysis

Our data is from `basketball-reference.com`. Three datasets:

In [None]:
# Load player_data.csv from the downloaded dataset folder and preview its first row.
player_data = pd.read_csv("./data/player_data.csv")
player_data.head(1)


In [None]:
# Load Players.csv from the downloaded dataset folder and preview its first row.
players = pd.read_csv("./data/Players.csv")
players.head(1)

In [None]:
# Bar chart of how many player_data rows carry each position label.
position_sizes = player_data.groupby('position').size()
fig, ax = plt.subplots()
ax.bar(position_sizes.index, position_sizes.values)
ax.set_xlabel('Position')
ax.set_ylabel('Count')
ax.set_title('Counts for each position')
plt.show()

In [None]:
# Number of players per career start year.
start_year_counts = player_data.groupby('year_start').size()
plt.bar(start_year_counts.index, start_year_counts.values)
plt.xlabel('Start Year')
plt.ylabel('Count')
plt.title('Counts for Start Year')
plt.show()

# Number of players per career end year. This chart must group by 'year_end';
# grouping by 'year_start' would just redraw the first chart under an "End Year" label.
end_year_counts = player_data.groupby('year_end').size()
plt.bar(end_year_counts.index, end_year_counts.values)
plt.xlabel('End Year')
plt.ylabel('Count')
plt.title('Counts for End Year')
plt.show()

In [None]:
# Load the per-season stat lines, give the unnamed first column a name, and drop two columns.
# NOTE(review): 'blanl' / 'blank2' are dropped exactly as spelled here; presumably they match
# the CSV header verbatim — confirm against the file before "correcting" the spelling.
stats = (
    pd.read_csv("./data/Seasons_Stats.csv")
    .rename(columns={"Unnamed: 0": "Index"})
    .drop(columns=['blanl', 'blank2'])
)

# Split hyphenated position labels ("A-B") into one row per position.
# explode() repeats the row label for every extra position, so the first occurrence of a
# label is the primary position and any later duplicate is an additional one. Rows with a
# missing Pos pass through unchanged.
exploded = stats.assign(Pos=stats["Pos"].str.split("-")).explode("Pos")
is_extra_position = exploded.index.duplicated(keep="first")

# Primary rows keep their original order; rows for additional positions are appended after
# them, and the index is renumbered 0..n-1.
stats = pd.concat(
    [exploded[~is_extra_position], exploded[is_extra_position]],
    ignore_index=True,
)

complete_rows = stats.dropna()
print(stats.shape)
print(complete_rows.head())
print(complete_rows.shape)
stats.to_csv("season_stats_separated_pos.csv")

# Unsupervised Naive K-Means Clustering (Points, Rebounds, Assists, Steals, Blocks)

As part of our exploratory data analysis, we did a round of naive k-means clustering, taking into account five stat categories which are subjectively "most relevant" to player performance and player type: points, rebounds, assists, steals and blocks (rebounds are split into offensive and defensive, giving six feature columns). We used 5 clusters in the hope of seeing each cluster associated with a different position. Instead, the results yielded clusters whose datapoints were evenly distributed between the 5 position classes. We hypothesized that a reason for this could be a lack of data preprocessing - some players accumulated larger raw totals than others, so they would be grouped in the same cluster regardless of which stats stood out. To that end, we decided to try multiple types of preprocessing on these columns - division by the number of minutes played, and normalization.

In [None]:
from sklearn.cluster import KMeans

# Feature matrix for the naive clustering: six counting stats, keeping only rows where
# every one of them is present.
unsupervised_kmeans_stats = stats[["PTS", "ORB", "DRB", "AST", "STL", "BLK"]].dropna()

unsupervised_kmeans_stats_1 = unsupervised_kmeans_stats.copy()
# K-means starts from random centroids; fixing random_state keeps the cluster ids (and the
# per-cluster plots that follow) stable across kernel restarts.
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(unsupervised_kmeans_stats_1)

unsupervised_kmeans_stats_1["cluster"] = kmeans.labels_
# Inserting a Series aligns on the row index, so each remaining row gets its own position back.
unsupervised_kmeans_stats_1.insert(0, "Pos", stats["Pos"])
unsupervised_kmeans_stats_1.head()

In [None]:
# For each k-means cluster, show how its rows are spread across positions.
for cluster in range(5):
    cur_df = unsupervised_kmeans_stats_1.loc[unsupervised_kmeans_stats_1["cluster"] == cluster]

    # Bar chart: row count per position, grouped once and reused for x and y.
    position_sizes = cur_df.groupby('Pos').size()
    plt.bar(position_sizes.index, position_sizes.values)
    plt.xlabel('Position')
    plt.ylabel('Count')
    plt.title(f'Counts for each position in cluster {cluster}')
    plt.show()

    # Pie chart: a single value_counts() Series drives both the wedges and the legend,
    # so legend entries always line up with the wedge order.
    position_counts = cur_df['Pos'].value_counts()
    # Scale the share to 0-100 so the "%" suffix in the legend is truthful.
    position_percentages = position_counts / len(cur_df) * 100
    plt.pie(position_counts)
    plt.title(f"Distribution of players by position in cluster {cluster}")
    plt.axis("equal")
    legend_labels = [f"{position}: {round(percentage, 1)}%" for position, percentage in position_percentages.items()]
    plt.legend(legend_labels, loc="best")
    plt.show()

### Normalization

In [None]:
# Normalization: z-score every feature column (subtract the column mean, divide by the
# column standard deviation) so no single stat dominates the distance computation.
unsupervised_kmeans_stats_2 = (unsupervised_kmeans_stats-unsupervised_kmeans_stats.mean())/unsupervised_kmeans_stats.std()

# Fixed random_state keeps the cluster ids reproducible across kernel restarts.
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(unsupervised_kmeans_stats_2)

unsupervised_kmeans_stats_2["cluster"] = kmeans.labels_
# Inserting a Series aligns on the row index, so each row gets its own position label.
unsupervised_kmeans_stats_2.insert(0, "Pos", stats["Pos"])
unsupervised_kmeans_stats_2.head()

In [None]:
# For each cluster of the normalized k-means run, show its position make-up.
for cluster in range(5):
    cur_df = unsupervised_kmeans_stats_2.loc[unsupervised_kmeans_stats_2["cluster"] == cluster]

    # Bar chart: row count per position, grouped once and reused for x and y.
    position_sizes = cur_df.groupby('Pos').size()
    plt.bar(position_sizes.index, position_sizes.values)
    plt.xlabel('Position')
    plt.ylabel('Count')
    plt.title(f'Counts for each position in cluster {cluster}')
    plt.show()

    # Pie chart: a single value_counts() Series drives both the wedges and the legend,
    # so legend entries always line up with the wedge order.
    position_counts = cur_df['Pos'].value_counts()
    # Scale the share to 0-100 so the "%" suffix in the legend is truthful.
    position_percentages = position_counts / len(cur_df) * 100
    plt.pie(position_counts)
    plt.title(f"Distribution of players by position in cluster {cluster}")
    plt.axis("equal")
    legend_labels = [f"{position}: {round(percentage, 1)}%" for position, percentage in position_percentages.items()]
    plt.legend(legend_labels, loc="best")
    plt.show()

### Division by Number of Minutes

In [None]:
# Per-minute rates: divide every counting stat by the minutes played ("MP") in the same row.
minutes_played = stats["MP"].reindex(unsupervised_kmeans_stats.index)
unsupervised_kmeans_stats_3 = (
    unsupervised_kmeans_stats
    .div(minutes_played, axis=0)
    # A non-zero stat over zero minutes yields +/-inf, which KMeans.fit rejects; treat it
    # like any other undefined rate (0/0 or missing minutes already produce NaN).
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Fixed random_state keeps the cluster ids reproducible across kernel restarts.
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(unsupervised_kmeans_stats_3)

unsupervised_kmeans_stats_3["cluster"] = kmeans.labels_
# Inserting a Series aligns on the row index, so each row gets its own position label.
unsupervised_kmeans_stats_3.insert(0, "Pos", stats["Pos"])
unsupervised_kmeans_stats_3.head()

In [None]:
# For each cluster of the per-minute k-means run, show its position make-up.
for cluster in range(5):
    cur_df = unsupervised_kmeans_stats_3.loc[unsupervised_kmeans_stats_3["cluster"] == cluster]

    # Bar chart: row count per position, grouped once and reused for x and y.
    position_sizes = cur_df.groupby('Pos').size()
    plt.bar(position_sizes.index, position_sizes.values)
    plt.xlabel('Position')
    plt.ylabel('Count')
    plt.title(f'Counts for each position in cluster {cluster}')
    plt.show()

    # Pie chart: a single value_counts() Series drives both the wedges and the legend,
    # so legend entries always line up with the wedge order.
    position_counts = cur_df['Pos'].value_counts()
    # Scale the share to 0-100 so the "%" suffix in the legend is truthful.
    position_percentages = position_counts / len(cur_df) * 100
    plt.pie(position_counts)
    plt.title(f"Distribution of players by position in cluster {cluster}")
    plt.axis("equal")
    legend_labels = [f"{position}: {round(percentage, 1)}%" for position, percentage in position_percentages.items()]
    plt.legend(legend_labels, loc="best")
    plt.show()

In [None]:
# List every column label of the stats table, one per line.
for column_name in stats.columns:
    print(column_name)

# Agglomerative Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Six counting stats used as clustering features; rows missing any of them are dropped.
agglomerative_stats = stats.loc[:, ["PTS", "ORB", "DRB", "AST", "STL", "BLK"]].dropna()

# Fit on the raw (unscaled) features, then attach each row's cluster id and position label.
agglomerative_clustering = AgglomerativeClustering(n_clusters=5).fit(agglomerative_stats)
agglomerative_stats_1 = agglomerative_stats.assign(cluster=agglomerative_clustering.labels_)
agglomerative_stats_1.insert(0, "Pos", stats["Pos"])
agglomerative_stats_1.head()

In [None]:
# For each agglomerative cluster (raw features), show its position make-up.
for cluster in range(5):
    cur_df = agglomerative_stats_1.loc[agglomerative_stats_1["cluster"] == cluster]

    # Bar chart: row count per position, grouped once and reused for x and y.
    position_sizes = cur_df.groupby('Pos').size()
    plt.bar(position_sizes.index, position_sizes.values)
    plt.xlabel('Position')
    plt.ylabel('Count')
    plt.title(f'Counts for each position in cluster {cluster}')
    plt.show()

    # Pie chart: a single value_counts() Series drives both the wedges and the legend,
    # so legend entries always line up with the wedge order.
    position_counts = cur_df['Pos'].value_counts()
    # Scale the share to 0-100 so the "%" suffix in the legend is truthful.
    position_percentages = position_counts / len(cur_df) * 100
    plt.pie(position_counts)
    plt.title(f"Distribution of players by position in cluster {cluster}")
    plt.axis("equal")
    legend_labels = [f"{position}: {round(percentage, 1)}%" for position, percentage in position_percentages.items()]
    plt.legend(legend_labels, loc="best")
    plt.show()

### Normalization

In [None]:
# Normalization: z-score each feature column (subtract its mean, divide by its standard deviation).
feature_means = agglomerative_stats.mean()
feature_stds = agglomerative_stats.std()
agglomerative_stats_2 = agglomerative_stats.sub(feature_means).div(feature_stds)

# Cluster the standardized features into 5 groups.
agglomerative_clustering = AgglomerativeClustering(n_clusters=5).fit(agglomerative_stats_2)

# Attach the cluster id as the last column and the position label as the first.
agglomerative_stats_2 = agglomerative_stats_2.assign(cluster=agglomerative_clustering.labels_)
agglomerative_stats_2.insert(0, "Pos", stats["Pos"])
agglomerative_stats_2.head()

In [None]:
# For each agglomerative cluster (normalized features), show its position make-up.
for cluster in range(5):
    cur_df = agglomerative_stats_2.loc[agglomerative_stats_2["cluster"] == cluster]

    # Bar chart: row count per position, grouped once and reused for x and y.
    position_sizes = cur_df.groupby('Pos').size()
    plt.bar(position_sizes.index, position_sizes.values)
    plt.xlabel('Position')
    plt.ylabel('Count')
    plt.title(f'Counts for each position in cluster {cluster}')
    plt.show()

    # Pie chart: a single value_counts() Series drives both the wedges and the legend,
    # so legend entries always line up with the wedge order.
    position_counts = cur_df['Pos'].value_counts()
    # Scale the share to 0-100 so the "%" suffix in the legend is truthful.
    position_percentages = position_counts / len(cur_df) * 100
    plt.pie(position_counts)
    plt.title(f"Distribution of players by position in cluster {cluster}")
    plt.axis("equal")
    legend_labels = [f"{position}: {round(percentage, 1)}%" for position, percentage in position_percentages.items()]
    plt.legend(legend_labels, loc="best")
    plt.show()

### Normalization with 3 Clusters

In [None]:
# Normalization: same z-scoring as the 5-cluster run, this time clustered into 3 groups.
centered = agglomerative_stats - agglomerative_stats.mean()
agglomerative_stats_3 = centered / agglomerative_stats.std()

agglomerative_clustering = AgglomerativeClustering(n_clusters=3).fit(agglomerative_stats_3)

# Attach the cluster id as the last column and the position label as the first.
agglomerative_stats_3 = agglomerative_stats_3.assign(cluster=agglomerative_clustering.labels_)
agglomerative_stats_3.insert(0, "Pos", stats["Pos"])
agglomerative_stats_3.head()

In [None]:
# For each of the 3 agglomerative clusters (normalized features), show its position make-up.
for cluster in range(3):
    cur_df = agglomerative_stats_3.loc[agglomerative_stats_3["cluster"] == cluster]

    # Bar chart: row count per position, grouped once and reused for x and y.
    position_sizes = cur_df.groupby('Pos').size()
    plt.bar(position_sizes.index, position_sizes.values)
    plt.xlabel('Position')
    plt.ylabel('Count')
    plt.title(f'Counts for each position in cluster {cluster}')
    plt.show()

    # Pie chart: a single value_counts() Series drives both the wedges and the legend,
    # so legend entries always line up with the wedge order.
    position_counts = cur_df['Pos'].value_counts()
    # Scale the share to 0-100 so the "%" suffix in the legend is truthful.
    position_percentages = position_counts / len(cur_df) * 100
    plt.pie(position_counts)
    plt.title(f"Distribution of players by position in cluster {cluster}")
    plt.axis("equal")
    legend_labels = [f"{position}: {round(percentage, 1)}%" for position, percentage in position_percentages.items()]
    plt.legend(legend_labels, loc="best")
    plt.show()

# Outlier Analysis - From Clustering

Visual observation of the counts from clusters 1 and 2 shows that cluster 1 contains mostly frontcourt players - those playing center or power forward. Similarly, cluster 2 contains mostly backcourt players - those playing point guard and shooting guard. As a result, we'll inspect the player info of the rows in those clusters which do not fit within those groups and therefore can be classified as "outliers" in a sense.

In [None]:
# Rows of cluster 1 whose position is PG or SG are the positional outliers inspected here.
cluster_1 = agglomerative_stats_3.loc[agglomerative_stats_3["cluster"] == 1]
cluster_1_pg_outliers = cluster_1.loc[cluster_1["Pos"] == "PG"]
cluster_1_sg_outliers = cluster_1.loc[cluster_1["Pos"] == "SG"]
cluster_1_outliers = pd.concat([cluster_1_pg_outliers, cluster_1_sg_outliers], axis=0)
# Look the same row labels up in the full stats table to see every column for those rows.
stats.loc[cluster_1_outliers.index]

In [None]:
# Rows of cluster 2 whose position is PF or C are the positional outliers inspected here.
cluster_2 = agglomerative_stats_3.loc[agglomerative_stats_3["cluster"] == 2]
cluster_2_pf_outliers = cluster_2.loc[cluster_2["Pos"] == "PF"]
cluster_2_c_outliers = cluster_2.loc[cluster_2["Pos"] == "C"]
cluster_2_outliers = pd.concat([cluster_2_pf_outliers, cluster_2_c_outliers], axis=0)
# Look the same row labels up in the full stats table to see every column for those rows.
stats.loc[cluster_2_outliers.index]

# Outlier Detection - iForest Approach

_add description of iForest approach for outlier detection here_

In [None]:
from sklearn.ensemble import IsolationForest

def iForest(data, n_estimators=100, outlier_fraction='auto', random_state=None):
    """Split ``data`` into outlier and non-outlier rows using an Isolation Forest.

    Parameters
    ----------
    data
        Feature table to fit and score; it must support boolean-mask indexing
        (``data[mask]``), as a DataFrame does.
    n_estimators : int, default 100
        Number of trees, forwarded to ``IsolationForest``.
    outlier_fraction : 'auto' or float, default 'auto'
        Forwarded to ``IsolationForest`` as ``contamination``.
    random_state : int or None, default None
        Forwarded to ``IsolationForest``. Pass an integer to get the same split on
        every run; ``None`` leaves the forest randomly seeded.

    Returns
    -------
    tuple
        ``(outliers_only, no_outliers)``: the rows predicted as outliers, then the rest.
    """
    isolation_forest = IsolationForest(
        n_estimators=n_estimators,
        contamination=outlier_fraction,
        random_state=random_state,
    )
    # Fit the model to the data
    isolation_forest.fit(data)
    # predict() labels outliers as -1; turn that into a boolean row mask
    is_outlier = isolation_forest.predict(data) == -1
    outliers_only = data[is_outlier]
    no_outliers = data[~is_outlier]
    return outliers_only, no_outliers

# Run the isolation forest separately inside each of the 3 agglomerative clusters.
for cluster in range(3):
    cur_df = agglomerative_stats_3.loc[agglomerative_stats_3["cluster"] == cluster]
    # iForest returns (outliers, inliers); unpack it so only the outlier rows are reported
    # instead of printing the whole tuple.
    outliers, _inliers = iForest(cur_df[["PTS", "ORB", "DRB", "AST", "STL", "BLK"]])
    print(f"Cluster {cluster}: {len(outliers)} outliers out of {len(cur_df)} rows")
    print(outliers)