In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import plotly.express as px

In [3]:
filename = '../../../data/NBAShot Locations1997-2020.csv'
df = pd.read_csv(filename)

In [4]:
df.head()

Unnamed: 0,Game ID,Game Event ID,Player ID,Player Name,Team ID,Team Name,Period,Minutes Remaining,Seconds Remaining,Action Type,...,Shot Zone Area,Shot Zone Range,Shot Distance,X Location,Y Location,Shot Made Flag,Game Date,Home Team,Away Team,Season Type
0,29700427,389,100,Tim Legler,1610612764,Washington Wizards,4,11,22,Jump Shot,...,Right Side(R),8-16 ft.,15,117,109,1,19980102,WAS,IND,Regular Season
1,29700427,406,100,Tim Legler,1610612764,Washington Wizards,4,9,36,Jump Shot,...,Right Side(R),8-16 ft.,14,143,25,0,19980102,WAS,IND,Regular Season
2,29700427,475,100,Tim Legler,1610612764,Washington Wizards,4,3,7,Jump Shot,...,Left Side(L),8-16 ft.,10,-87,55,0,19980102,WAS,IND,Regular Season
3,29700427,487,100,Tim Legler,1610612764,Washington Wizards,4,1,45,Jump Shot,...,Center(C),Less Than 8 ft.,5,-1,53,0,19980102,WAS,IND,Regular Season
4,29700427,497,100,Tim Legler,1610612764,Washington Wizards,4,0,45,Jump Shot,...,Right Side(R),8-16 ft.,14,89,113,0,19980102,WAS,IND,Regular Season


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4729512 entries, 0 to 4729511
Data columns (total 22 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   Game ID            int64 
 1   Game Event ID      int64 
 2   Player ID          int64 
 3   Player Name        object
 4   Team ID            int64 
 5   Team Name          object
 6   Period             int64 
 7   Minutes Remaining  int64 
 8   Seconds Remaining  int64 
 9   Action Type        object
 10  Shot Type          object
 11  Shot Zone Basic    object
 12  Shot Zone Area     object
 13  Shot Zone Range    object
 14  Shot Distance      int64 
 15  X Location         int64 
 16  Y Location         int64 
 17  Shot Made Flag     int64 
 18  Game Date          int64 
 19  Home Team          object
 20  Away Team          object
 21  Season Type        object
dtypes: int64(12), object(10)
memory usage: 793.8+ MB


In [6]:
three_point_shots = df[df['Shot Type'] == '3PT Field Goal']

In [7]:
player_stats = three_point_shots.groupby('Player Name').agg(
    shooting_percentage=('Shot Made Flag', 'mean'),
    total_attempts=('Game ID', 'count'),
    avg_shot_distance=('Shot Distance', 'mean')
).reset_index()

In [8]:
player_stats.head()

Unnamed: 0,Player Name,shooting_percentage,total_attempts,avg_shot_distance
0,A.C. Green,0.045455,22,33.545455
1,A.J. Guyton,0.375,192,25.171875
2,AJ Hammons,0.5,10,23.8
3,AJ Price,0.318697,706,25.430595
4,Aaron Brooks,0.367698,2328,25.721649


In [9]:
player_stats.describe()

Unnamed: 0,shooting_percentage,total_attempts,avg_shot_distance
count,1917.0,1917.0,1917.0
mean,0.267691,586.075639,26.311453
std,0.146272,1045.323658,4.924854
min,0.0,1.0,22.0
25%,0.2,13.0,24.462857
50%,0.314476,96.0,24.936508
75%,0.357143,664.0,25.641509
max,1.0,8089.0,84.0


In [10]:
player_games = three_point_shots.groupby(['Player Name', 'Game ID']).size().reset_index(name='Attempts')
player_attempts_per_game = player_games.groupby('Player Name')['Attempts'].mean().reset_index()


In [11]:
player_attempts_per_game.describe()

Unnamed: 0,Attempts
count,1917.0
mean,2.240827
std,1.223158
min,1.0
25%,1.2
50%,1.941176
75%,2.93921
max,8.557621


In [12]:
player_stats = pd.merge(player_stats, player_attempts_per_game, on='Player Name')


In [13]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(player_stats[['shooting_percentage', 'Attempts', 'avg_shot_distance']])


kmeans = KMeans(n_clusters=6, random_state=42)
clusters = kmeans.fit_predict(scaled_features)

player_stats['cluster'] = clusters




In [14]:
player_stats['cluster'] = clusters

# Create an interactive scatter plot using Plotly
fig = px.scatter(player_stats, x='shooting_percentage', y='Attempts', color='cluster',
                 hover_data=['Player Name'], color_continuous_scale='viridis', title='NBA Players Clustering Based on 3-Point Shooting')

fig.update_layout(xaxis_title='Shooting Percentage', yaxis_title='Attempts per Game')
fig.show()

In [15]:
df.head()

Unnamed: 0,Game ID,Game Event ID,Player ID,Player Name,Team ID,Team Name,Period,Minutes Remaining,Seconds Remaining,Action Type,...,Shot Zone Area,Shot Zone Range,Shot Distance,X Location,Y Location,Shot Made Flag,Game Date,Home Team,Away Team,Season Type
0,29700427,389,100,Tim Legler,1610612764,Washington Wizards,4,11,22,Jump Shot,...,Right Side(R),8-16 ft.,15,117,109,1,19980102,WAS,IND,Regular Season
1,29700427,406,100,Tim Legler,1610612764,Washington Wizards,4,9,36,Jump Shot,...,Right Side(R),8-16 ft.,14,143,25,0,19980102,WAS,IND,Regular Season
2,29700427,475,100,Tim Legler,1610612764,Washington Wizards,4,3,7,Jump Shot,...,Left Side(L),8-16 ft.,10,-87,55,0,19980102,WAS,IND,Regular Season
3,29700427,487,100,Tim Legler,1610612764,Washington Wizards,4,1,45,Jump Shot,...,Center(C),Less Than 8 ft.,5,-1,53,0,19980102,WAS,IND,Regular Season
4,29700427,497,100,Tim Legler,1610612764,Washington Wizards,4,0,45,Jump Shot,...,Right Side(R),8-16 ft.,14,89,113,0,19980102,WAS,IND,Regular Season


In [16]:
df['Game Date'] = pd.to_datetime(df['Game Date'], format='%Y%m%d')
df['year'] = df['Game Date'].dt.year

In [17]:
df.head()

Unnamed: 0,Game ID,Game Event ID,Player ID,Player Name,Team ID,Team Name,Period,Minutes Remaining,Seconds Remaining,Action Type,...,Shot Zone Range,Shot Distance,X Location,Y Location,Shot Made Flag,Game Date,Home Team,Away Team,Season Type,year
0,29700427,389,100,Tim Legler,1610612764,Washington Wizards,4,11,22,Jump Shot,...,8-16 ft.,15,117,109,1,1998-01-02,WAS,IND,Regular Season,1998
1,29700427,406,100,Tim Legler,1610612764,Washington Wizards,4,9,36,Jump Shot,...,8-16 ft.,14,143,25,0,1998-01-02,WAS,IND,Regular Season,1998
2,29700427,475,100,Tim Legler,1610612764,Washington Wizards,4,3,7,Jump Shot,...,8-16 ft.,10,-87,55,0,1998-01-02,WAS,IND,Regular Season,1998
3,29700427,487,100,Tim Legler,1610612764,Washington Wizards,4,1,45,Jump Shot,...,Less Than 8 ft.,5,-1,53,0,1998-01-02,WAS,IND,Regular Season,1998
4,29700427,497,100,Tim Legler,1610612764,Washington Wizards,4,0,45,Jump Shot,...,8-16 ft.,14,89,113,0,1998-01-02,WAS,IND,Regular Season,1998


In [18]:
def cluster_data(data, start_year, end_year):
    
    data_decade = data[(data['year'] >= start_year) & (data['year'] <= end_year)]

    three_point_shots = data_decade[data_decade['Shot Type'] == '3PT Field Goal']

    player_stats = three_point_shots.groupby('Player ID').agg(
        player_name=('Player Name', 'first'),
        shooting_percentage=('Shot Made Flag', 'mean'),
        total_attempts=('Game ID', 'count'),
        avg_shot_distance=('Shot Distance', 'mean')
    ).reset_index()

    player_games = three_point_shots.groupby(['Player ID', 'Game ID']).size().reset_index(name='Attempts')
    player_attempts_per_game = player_games.groupby('Player ID')['Attempts'].mean().reset_index()

    player_stats = pd.merge(player_stats, player_attempts_per_game, on='Player ID')

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(player_stats[['shooting_percentage', 'Attempts', 'avg_shot_distance']])

    kmeans = KMeans(n_clusters=6, random_state=42)
    clusters = kmeans.fit_predict(scaled_features)

    player_stats['cluster'] = clusters

    return player_stats

In [19]:
df_1998_2004 = cluster_data(df, start_year=1998, end_year=2004)
df_2005_2012 = cluster_data(df, start_year=2005, end_year=2012)
df_2013_2020 = cluster_data(df, start_year=2013, end_year=2020)









In [20]:
for idx, decade_data in enumerate([df_1998_2004, df_2005_2012, df_2013_2020], start=1):
    fig = px.scatter(decade_data, x='shooting_percentage', y='Attempts', color='cluster',
                     hover_data=['player_name'], color_continuous_scale='viridis',
                     title=f'NBA Players Clustering Based on 3-Point Shooting (Period {idx})')
    fig.update_layout(xaxis_title='Shooting Percentage', yaxis_title='Attempts per Game')
    fig.show()

In [29]:
clutch_time_shots = df[(df['Minutes Remaining'] <= 5) & (df['Period'] >= 4)]

In [37]:
clutch_shots = clutch_time_shots.groupby('Player ID').agg(
        player_name=('Player Name', 'first'),
        shooting_percentage=('Shot Made Flag', 'mean'),
        total_attempts=('Game ID', 'count'),
    ).reset_index()


clutch_shots = clutch_shots[clutch_shots['total_attempts'] >= 100]
clutch_shots.head()

Unnamed: 0,Player ID,player_name,shooting_percentage,total_attempts
0,3,Grant Long,0.417526,194
3,15,Eric Piatkowski,0.478088,502
4,17,Clyde Drexler,0.42446,139
5,21,Greg Anthony,0.41195,318
6,22,Rik Smits,0.491525,295


In [41]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(clutch_shots[['shooting_percentage', 'total_attempts']])

kmeans = KMeans(n_clusters=4, random_state=42)
clusters = kmeans.fit_predict(scaled_features)

clutch_shots['cluster'] = clusters

fig = px.scatter(clutch_shots, x='shooting_percentage', y='total_attempts', color='cluster',
                 hover_data=['player_name'], color_continuous_scale='viridis', title='NBA Players Clustering Based on Clutch shooting')

fig.update_layout(xaxis_title='Shooting Percentage', yaxis_title='Total Attempts')
fig.show()



