# Clustering NBA Players Based on 2-Game Rolling Averages


# Intro and Setup
We will begin by importing the necessary libraries. The purpose of this notebook is to test, evaluate, and compare different clustering algorithms for use in training prediction models later on.

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

## Load Data and Check for Unique Player IDs

In [24]:
# Read each cleaned season file and tag every row with the season it came from.
season_22_23 = pd.read_csv(
  '/content/NBA-Fantasy-Points-Prediction/data/processed/Season(2022-23)_cleaned.csv'
).assign(Season='2022-23')
season_23_24 = pd.read_csv(
  '/content/NBA-Fantasy-Points-Prediction/data/processed/Season(2023-24)_cleaned.csv'
).assign(Season='2023-24')

def get_unique_ids(df):
  """Return the players whose name is associated with more than one id.

  Args:
    df: DataFrame with at least the columns 'Name' and 'id'.

  Returns:
    A Series indexed by 'Name' holding the number of distinct ids for each
    name that has more than one. It is empty when every name maps to a
    single id.
  """
  id_counts = df.groupby('Name')['id'].nunique()
  return id_counts.loc[id_counts.gt(1)]

# Run the multiple-id check on each season and show both results.
season_22_23_ids = get_unique_ids(season_22_23)
season_23_24_ids = get_unique_ids(season_23_24)

for multi_id_players in (season_22_23_ids, season_23_24_ids):
  print(multi_id_players)

Series([], Name: id, dtype: int64)
Series([], Name: id, dtype: int64)


## Calculate 2-game rolling averages

In [25]:
stats_columns = ['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', '+-', 'fpts_fanduel']

def calculate_rolling_avg(df, columns=None, window=2):
  """Add, per player, the mean of each stat over the previous `window` games.

  For every column `c` a new column `f'{c}_{window}game_avg'` is written. The
  value on a given row only uses that player's *earlier* games (the current
  game is excluded), so a player's first `window` rows are NaN.

  Args:
    df: DataFrame with 'id', 'Date' and the stat columns. It is sorted by
      ('id', 'Date') and extended in place.
    columns: Stat columns to average. Defaults to the module-level
      `stats_columns`.
    window: Number of previous games to average over. Defaults to 2.

  Returns:
    The same `df` object, sorted and with the new average columns added.
  """
  if columns is None:
    columns = stats_columns
  columns = list(columns)

  # Sort on parsed dates rather than on the raw column: 'Date' may still hold
  # strings here, and string ordering is only chronological for ISO-formatted
  # dates. The column itself is left untouched.
  df.sort_values(
    by=['id', 'Date'],
    key=lambda col: pd.to_datetime(col) if col.name == 'Date' else col,
    inplace=True,
  )

  # Shift and roll *inside* each player's group so no value can cross from one
  # player to the next; no post-hoc masking of first rows is needed.
  prior_avgs = df.groupby('id')[columns].transform(
    lambda games: games.shift(1).rolling(window=window).mean()
  )
  for column in columns:
    df[f'{column}_{window}game_avg'] = prior_avgs[column]
  return df

# Compute the rolling averages season by season, then stack both seasons
# into a single frame with a fresh 0..n-1 index.
season_22_23, season_23_24 = (
  calculate_rolling_avg(season_frame)
  for season_frame in (season_22_23, season_23_24)
)

df = pd.concat((season_22_23, season_23_24), ignore_index=True)


## Convert MP to total minutes as floating-point values

In [26]:
# Convert MM:SS format to total minutes as float
def convert_mp_to_minutes(mp_str):
  """Convert a minutes-played value to total minutes.

  Args:
    mp_str: Either an 'MM:SS' string, an already-numeric number of minutes,
      or a missing value (None / NaN / anything else).

  Returns:
    Minutes as a float for strings and numbers; 0 for missing or unsupported
    values.

  Raises:
    ValueError: If a string is not two ':'-separated integers.
  """
  import numbers  # local import keeps this cell self-contained

  if isinstance(mp_str, str):
    minutes, seconds = map(int, mp_str.split(':'))
    return minutes + seconds / 60.0
  # Values that are already numeric minutes pass through unchanged, so
  # re-running the conversion on an already-converted column does not zero it
  # out. `mp_str == mp_str` is False only for NaN, which is treated as missing.
  if (isinstance(mp_str, numbers.Real)
      and not isinstance(mp_str, bool)
      and mp_str == mp_str):
    return float(mp_str)
  return 0

# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Turn MP into numeric minutes and Date into real datetimes.
df['MP'] = df['MP'].map(convert_mp_to_minutes)
df['Date'] = pd.to_datetime(df['Date'])

# One-row overview of the resulting column dtypes.
df.dtypes.to_frame().T

Unnamed: 0,Date,Name,Team,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+-,TeamAbbr,GameLink,Opponent,Home,GameType,id,fpts_fanduel,Season,FG_2game_avg,FGA_2game_avg,FG%_2game_avg,3P_2game_avg,3PA_2game_avg,3P%_2game_avg,FT_2game_avg,FTA_2game_avg,FT%_2game_avg,ORB_2game_avg,DRB_2game_avg,TRB_2game_avg,AST_2game_avg,STL_2game_avg,BLK_2game_avg,TOV_2game_avg,PF_2game_avg,PTS_2game_avg,GmSc_2game_avg,+-_2game_avg,fpts_fanduel_2game_avg
0,datetime64[ns],object,object,float64,int64,int64,float64,int64,int64,float64,int64,int64,float64,int64,int64,int64,int64,int64,int64,int64,int64,int64,float64,int64,object,object,object,int64,object,int64,float64,object,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64


## Display Data

In [36]:
# Order all rows by game date and show the last five rows.
df = df.sort_values('Date')
df.tail()

Unnamed: 0,Date,Name,Team,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+-,TeamAbbr,GameLink,Opponent,Home,GameType,id,fpts_fanduel,Season,FG_2game_avg,FGA_2game_avg,FG%_2game_avg,3P_2game_avg,3PA_2game_avg,3P%_2game_avg,FT_2game_avg,FTA_2game_avg,FT%_2game_avg,ORB_2game_avg,DRB_2game_avg,TRB_2game_avg,AST_2game_avg,STL_2game_avg,BLK_2game_avg,TOV_2game_avg,PF_2game_avg,PTS_2game_avg,GmSc_2game_avg,+-_2game_avg,fpts_fanduel_2game_avg
38463,2024-06-17,maxi kleber,Dallas Mavericks,13.433333,1,3,0.333,0,2,0.0,0,0,0.0,1,2,3,1,0,0,0,2,2,1.5,-12,DAL,https://www.basketball-reference.com/boxscores...,Boston Celtics,0,Finals,20001909,7.1,2023-24,0.5,2.0,0.25,0.5,1.0,0.5,1.0,1.0,0.5,0.0,1.0,1.0,0.5,0.5,0.0,0.0,0.0,2.5,2.45,6.0,5.95
49700,2024-06-17,a.j. lawson,Dallas Mavericks,2.616667,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0.0,3,DAL,https://www.basketball-reference.com/boxscores...,Boston Celtics,0,Finals,20002725,0.0,2023-24,0.5,1.5,0.25,0.0,0.5,0.0,0.5,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5,0.45,-2.5,1.5
55034,2024-06-17,olivier-maxence prosper,Dallas Mavericks,2.616667,0,0,0.0,0,0,0.0,0,0,0.0,0,1,1,0,0,0,0,0,0,0.3,3,DAL,https://www.basketball-reference.com/boxscores...,Boston Celtics,0,Finals,20003067,1.2,2023-24,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,-0.05,-4.5,1.95
40606,2024-06-17,svi mykhailiuk,Boston Celtics,0.633333,0,0,0.0,0,0,0.0,0,0,0.0,0,1,1,0,0,0,0,0,0,0.3,0,BOS,https://www.basketball-reference.com/boxscores...,Dallas Mavericks,1,Finals,20002035,1.2,2023-24,1.0,4.0,0.5715,0.5,2.5,0.1,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,2.5,0.25,-5.0,3.1
43790,2024-06-17,oshae brissett,Boston Celtics,0.633333,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0.0,0,BOS,https://www.basketball-reference.com/boxscores...,Dallas Mavericks,1,Finals,20002356,0.0,2023-24,1.0,2.0,0.25,0.5,0.5,0.5,1.0,1.0,0.5,0.5,1.0,1.5,0.0,0.0,0.5,0.0,0.0,3.5,3.5,-1.5,6.8


# Data Prep for Clustering
In this section, we will prepare the data for clustering. This includes selecting the relevant features, handling any missing values, and scaling the features to ensure they contribute equally to the clustering process.


# Save Colab notebook and push to GitHub Repo

In [None]:
import os
from getpass import getpass

# Mount Google Drive only when the mount point does not exist yet.
if not os.path.exists('/content/drive'):
  from google.colab import drive
  drive.mount('/content/drive')
else:
  print("Google Drive already mounted")

# Ask for the GitHub token interactively instead of hard-coding it.
# NOTE(review): the token is interpolated into the git URLs below, so it is
# part of the shell command line; for a fresh clone it presumably also ends up
# in the repo's stored remote URL - confirm, and consider a credential helper.
token = getpass('Enter your GitHub token: ')

# Clone the repo when it is missing; otherwise enter it and pull.
# NOTE(review): the clone runs in the current working directory, while the
# commands below use /content/NBA-Fantasy-Points-Prediction - this assumes the
# working directory is /content at that point; verify.
if not os.path.exists('/content/NBA-Fantasy-Points-Prediction'):
  !git clone https://{token}@github.com/sammig6i/NBA-Fantasy-Points-Prediction.git
else:
  print("Repo already cloned, pulling the latest changes")
  %cd /content/NBA-Fantasy-Points-Prediction
  !git pull https://{token}@github.com/sammig6i/NBA-Fantasy-Points-Prediction.git

# Copy the notebook from Drive into the repo's notebooks/ folder.
!cp /content/drive/MyDrive/NBA_Fantasy_Clustering.ipynb /content/NBA-Fantasy-Points-Prediction/notebooks/

# The git commands below run from inside the repo.
%cd /content/NBA-Fantasy-Points-Prediction

!git add notebooks/NBA_Fantasy_Clustering.ipynb

!git commit -m "Add clustering notebook for NBA Fantasy project"

!git push https://{token}@github.com/sammig6i/NBA-Fantasy-Points-Prediction.git


Google Drive already mounted
