# Cluster-Based Predictive Modeling (NBA Fantasy Points)

## Setup

### Imports

In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

### Load Data into DataFrame

In [3]:
# Directory that holds the input parquet file.
data_dir = '/teamspace/uploads/'
# Build the path with os.path.join: data_dir already ends with '/', so the
# previous f-string concatenation produced '/teamspace/uploads//player_stats...'.
data_path = os.path.join(data_dir, 'player_stats_with_clusters_2022_2024_2.parquet')
df = pd.read_parquet(data_path)
# Quick sanity check of what was loaded: frame shape and column names.
print(f'Shape: {df.shape} \n{df.columns}')

Shape: (55968, 55) 
Index(['Date', 'Name', 'Team', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%',
       'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV',
       'PF', 'PTS', 'GmSc', '+-', 'TeamAbbr', 'GameLink', 'Opponent', 'Home',
       'GameType', 'id', 'fpts_fanduel', 'Season', 'FG_2game_avg',
       'FGA_2game_avg', 'FG%_2game_avg', '3P_2game_avg', '3PA_2game_avg',
       '3P%_2game_avg', 'FT_2game_avg', 'FTA_2game_avg', 'FT%_2game_avg',
       'ORB_2game_avg', 'DRB_2game_avg', 'TRB_2game_avg', 'AST_2game_avg',
       'STL_2game_avg', 'BLK_2game_avg', 'TOV_2game_avg', 'PF_2game_avg',
       'PTS_2game_avg', 'GmSc_2game_avg', '+-_2game_avg',
       'fpts_fanduel_2game_avg', 'PTS_per_FGA_2game_avg', 'kmeans_cluster'],
      dtype='object')


In [4]:
# Preview the first five rows of the loaded frame (rich display as the cell's last expression).
df.head(n=5)

Unnamed: 0,Date,Name,Team,MP,FG,FGA,FG%,3P,3PA,3P%,...,STL_2game_avg,BLK_2game_avg,TOV_2game_avg,PF_2game_avg,PTS_2game_avg,GmSc_2game_avg,+-_2game_avg,fpts_fanduel_2game_avg,PTS_per_FGA_2game_avg,kmeans_cluster
4557,2022-10-18,noah vonleh,Boston Celtics,20.166667,1,2,0.5,0,0,0.0,...,0.0,0.0,1.0,2.5,1.0,-1.25,-2.0,2.4,0.666667,3
14351,2022-10-18,de'anthony melton,Philadelphia 76ers,20.55,2,4,0.5,1,2,0.5,...,0.5,0.0,0.0,0.0,1.5,1.8,-9.0,6.45,0.6,3
3461,2022-10-18,blake griffin,Boston Celtics,8.283333,0,2,0.0,0,1,0.0,...,1.5,0.0,2.0,4.5,6.0,5.95,-9.5,21.4,1.0,2
9848,2022-10-18,danuel house jr.,Philadelphia 76ers,16.2,0,2,0.0,0,1,0.0,...,1.0,0.0,1.0,1.0,1.5,1.6,-3.0,12.95,0.272727,3
12649,2022-10-18,donte divincenzo,Golden State Warriors,21.616667,3,6,0.5,1,2,0.5,...,1.0,0.0,0.5,1.0,11.0,8.4,-3.5,17.4,1.375,0


### Correlation Matrix

In [31]:
# Stats that appear in the frame both as a raw per-game column and as a
# column carrying the '_2game_avg' suffix.
box_score_stats = [
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%',
    'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'GmSc', '+-', 'fpts_fanduel',
]

# Columns fed into the correlation matrix, in display order:
# raw stats, the 'Home' flag, the '_2game_avg' counterparts, then the two extras.
# NOTE(review): 'kmeans_cluster' looks like a cluster label; its correlation
# values may not be meaningful — confirm before interpreting.
numeric_columns = (
    ['MP']
    + box_score_stats
    + ['Home']
    + [f'{stat}_2game_avg' for stat in box_score_stats]
    + ['PTS_per_FGA_2game_avg', 'kmeans_cluster']
)

# Pairwise correlation over the selected columns only.
filtered_df = df.loc[:, numeric_columns]
correlation_matrix = filtered_df.corr()

# Long ("tidy") form of the matrix: one row per (Feature1, Feature2) pair.
correlation_matrix_long = (
    correlation_matrix
    .reset_index()
    .melt(id_vars='index')
    .set_axis(['Feature1', 'Feature2', 'Correlation'], axis=1)
)

# Interactive heatmap; the colour scale is pinned to [-1, 1].
feature_labels = correlation_matrix.columns
fig = px.imshow(
    correlation_matrix,
    x=feature_labels,
    y=feature_labels,
    labels={'x': "Features", 'y': "Features", 'color': "Correlation"},
    zmin=-1,
    zmax=1,
    color_continuous_scale='RdBu_r',
    aspect="auto",
)

# Enlarge the figure so all feature labels fit, and show the feature pair
# together with its correlation value on hover.
fig = (
    fig
    .update_layout(
        title="Correlation Heatmap of Selected Features with Target Variable",
        xaxis_title="Features",
        yaxis_title="Features",
        width=1400,
        height=1200,
    )
    .update_traces(
        hovertemplate='Correlation: %{z}<br>Feature 1: %{x}<br>Feature 2: %{y}'
    )
)

# Render the interactive plot.
fig.show()

Based on the correlation heatmap, we select the following features:
- **TODO:** list the selected features here.

## Model 1