# Model Development

In [105]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Prepare Data for Model Development

In [67]:
# Load the cleaned per-game box scores, one frame per season.
season_22_23, season_23_24 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/Season(2022-23)_cleaned.csv',
        '../data/processed/Season(2023-24)_cleaned.csv',
    )
)

In [68]:
# Label every row with the season it came from so the frames can be told apart once combined.
for season_frame, season_label in ((season_22_23, '2022-23'), (season_23_24, '2023-24')):
  season_frame['Season'] = season_label

season_22_23.head()

Unnamed: 0,Date,Name,Team,MP,FG,FGA,FG%,3P,3PA,3P%,...,GmSc,+-,TeamAbbr,GameLink,Opponent,Home,GameType,id,fpts_fanduel,Season
0,2022-10-18,tyrese maxey,Philadelphia 76ers,38:11,8,16,0.5,2,5,0.4,...,13.7,-6,PHI,https://www.basketball-reference.com/boxscores...,Boston Celtics,0,Season,20002546,30.2,2022-23
1,2022-10-18,james harden,Philadelphia 76ers,37:17,9,14,0.643,5,9,0.556,...,31.9,1,PHI,https://www.basketball-reference.com/boxscores...,Boston Celtics,0,Season,20000544,52.1,2022-23
2,2022-10-18,joel embiid,Philadelphia 76ers,37:16,9,18,0.5,1,6,0.167,...,17.7,-13,PHI,https://www.basketball-reference.com/boxscores...,Boston Celtics,0,Season,20000726,48.5,2022-23
3,2022-10-18,tobias harris,Philadelphia 76ers,34:14,7,14,0.5,3,6,0.5,...,13.4,-1,PHI,https://www.basketball-reference.com/boxscores...,Boston Celtics,0,Season,20000676,29.4,2022-23
4,2022-10-18,p.j. tucker,Philadelphia 76ers,33:01,3,5,0.6,0,2,0.0,...,3.6,-6,PHI,https://www.basketball-reference.com/boxscores...,Boston Celtics,0,Season,20000856,11.8,2022-23


In [69]:
def get_unique_ids(df):
  """Return the names that are associated with more than one distinct id.

  Args:
    df: DataFrame with at least the columns 'Name' and 'id'.

  Returns:
    Series indexed by 'Name' holding the number of distinct ids for that
    name, restricted to names whose count is greater than one. It is empty
    when every name maps to a single id.
  """
  return (
    df.groupby('Name')['id']
    .nunique()
    .loc[lambda id_counts: id_counts > 1]
  )

# An empty Series means every player name maps to exactly one id in that season.
season_22_23_ids, season_23_24_ids = map(get_unique_ids, (season_22_23, season_23_24))

for multi_id_players in (season_22_23_ids, season_23_24_ids):
  print(multi_id_players)

Series([], Name: id, dtype: int64)
Series([], Name: id, dtype: int64)


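For each player, calculate a rolling average of every stat over the two games *before* the current one (the current game itself is excluded). This adds one `<stat>_2game_avg` column per stat to each season's dataset; a player's first two games in a season have no value because there are not yet two earlier games to average.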

In [70]:
# Per-game stat columns that receive a trailing rolling-average feature.
stats_columns = ['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', '+-', 'fpts_fanduel']

def calculate_rolling_avg(df, window=2, columns=None):
  """Add, per player, the mean of each stat over the previous `window` games.

  The value on a given row only uses that player's earlier games (the current
  game is excluded), so the first `window` games of each player are NaN.

  Args:
    df: DataFrame with an 'id' column (player key), a 'Date' column and the
      stat columns to average. It is not modified.
      Assumes 'Date' sorts chronologically (e.g. ISO `YYYY-MM-DD` strings).
    window: Number of previous games to average. Defaults to 2.
    columns: Stat columns to average. Defaults to the module-level
      `stats_columns`.

  Returns:
    A new DataFrame sorted by ['id', 'Date'] (original index labels kept)
    with one extra `<column>_<window>game_avg` column per averaged stat.
  """
  if columns is None:
    columns = stats_columns
  columns = list(columns)

  # Work on a sorted copy so the caller's frame is left untouched.
  ordered = df.sort_values(by=['id', 'Date'])

  # Shift *inside* each player's group before rolling: shifting the combined
  # result instead would hand each player's first row the previous player's
  # last average.
  prior_avg = ordered.groupby('id')[columns].transform(
    lambda games: games.shift(1).rolling(window=window).mean()
  )

  # assign() aligns on the index and overwrites same-named columns, so calling
  # this again on an already-augmented frame is safe.
  return ordered.assign(
    **{f'{column}_{window}game_avg': prior_avg[column] for column in columns}
  )

# Compute the averages season by season so a player's window never reaches back
# into the previous season, then stack both seasons under a fresh 0..n-1 index.
season_frames = [calculate_rolling_avg(season_frame) for season_frame in (season_22_23, season_23_24)]
season_22_23, season_23_24 = season_frames

df = pd.concat(season_frames, ignore_index=True)


In [71]:
# Preview the combined frame; the new *_2game_avg columns appear at the right-hand side.
df.head()

Unnamed: 0,Date,Name,Team,MP,FG,FGA,FG%,3P,3PA,3P%,...,TRB_2game_avg,AST_2game_avg,STL_2game_avg,BLK_2game_avg,TOV_2game_avg,PF_2game_avg,PTS_2game_avg,GmSc_2game_avg,+-_2game_avg,fpts_fanduel_2game_avg
0,2022-10-19,bradley beal,Washington Wizards,37:31,9,17,0.529,2,7,0.286,...,,,,,,,,,,
1,2022-10-21,bradley beal,Washington Wizards,36:39,9,14,0.643,1,2,0.5,...,,,,,,,,,,
2,2022-10-23,bradley beal,Washington Wizards,41:35,11,16,0.688,0,1,0.0,...,3.0,7.0,0.0,1.5,4.0,2.0,21.0,16.2,6.0,35.6
3,2022-10-25,bradley beal,Washington Wizards,21:49,4,9,0.444,1,4,0.25,...,2.5,6.5,1.0,1.0,5.5,2.5,23.0,16.8,-0.5,36.25
4,2022-10-28,bradley beal,Washington Wizards,37:13,11,18,0.611,2,5,0.4,...,4.0,5.5,1.0,0.0,4.0,2.5,20.0,15.3,0.0,32.05


In [55]:
# NOTE(review): this repeats the concat already done right after the rolling averages are
# computed, and the cell's execution count is out of order. Re-running it is harmless but
# redundant; consider removing this cell.
df = pd.concat([season_22_23, season_23_24], ignore_index=True)

Unnamed: 0,Date,Name,Team,MP,FG,FGA,FG%,3P,3PA,3P%,...,GmSc,+-,TeamAbbr,GameLink,Opponent,Home,GameType,id,fpts_fanduel,Season
0,2022-10-18,tyrese maxey,Philadelphia 76ers,38:11,8,16,0.5,2,5,0.4,...,13.7,-6,PHI,https://www.basketball-reference.com/boxscores...,Boston Celtics,0,Season,20002546,30.2,2022-23
1,2022-10-18,james harden,Philadelphia 76ers,37:17,9,14,0.643,5,9,0.556,...,31.9,1,PHI,https://www.basketball-reference.com/boxscores...,Boston Celtics,0,Season,20000544,52.1,2022-23
2,2022-10-18,joel embiid,Philadelphia 76ers,37:16,9,18,0.5,1,6,0.167,...,17.7,-13,PHI,https://www.basketball-reference.com/boxscores...,Boston Celtics,0,Season,20000726,48.5,2022-23
3,2022-10-18,tobias harris,Philadelphia 76ers,34:14,7,14,0.5,3,6,0.5,...,13.4,-1,PHI,https://www.basketball-reference.com/boxscores...,Boston Celtics,0,Season,20000676,29.4,2022-23
4,2022-10-18,p.j. tucker,Philadelphia 76ers,33:01,3,5,0.6,0,2,0.0,...,3.6,-6,PHI,https://www.basketball-reference.com/boxscores...,Boston Celtics,0,Season,20000856,11.8,2022-23


In [88]:
# Spot-check one player's raw stats against the derived averages.
player_id = 20003022
inspect_columns = ['Date', 'FG', 'FG_2game_avg', 'PTS', 'PTS_2game_avg']
player_data = df.loc[df['id'] == player_id, inspect_columns]
player_data.head(10)

Unnamed: 0,Date,FG,FG_2game_avg,PTS,PTS_2game_avg
27738,2023-02-15,0,,2,
53362,2023-10-28,0,,0,
53363,2023-10-30,0,,0,
53364,2023-11-01,0,0.0,0,0.0
53365,2023-11-02,0,0.0,0,0.0
53366,2023-11-05,0,0.0,0,0.0
53367,2023-11-06,5,0.0,15,0.0
53368,2023-11-08,1,2.5,2,7.5
53369,2023-11-10,1,3.0,10,8.5
53370,2023-11-17,2,1.0,10,6.0


In [103]:
# Players who appeared on 2023-06-12 (the last 2022-23 date in the data).
player_ids_in_both_seasons = df.loc[df['Date'] == '2023-06-12', 'id'].unique()
# Players with at least one 2023-24 game. The comparison is inclusive so that
# opening-night games on 2023-10-24 (the first 2023-24 date) are not dropped.
player_ids_in_both_seasons_2023_24 = df.loc[df['Date'] >= '2023-10-24', 'id'].unique()
player_ids_in_both_seasons = np.intersect1d(player_ids_in_both_seasons, player_ids_in_both_seasons_2023_24)

# Fail with a clear message instead of an IndexError when nobody qualifies.
assert len(player_ids_in_both_seasons) > 0, "no player appears on 2023-06-12 and in the 2023-24 season"

# np.intersect1d returns sorted ids, so this picks the smallest qualifying id.
selected_player_id = player_ids_in_both_seasons[0]

In [104]:
# Show the selected player's games on either side of the season boundary to
# confirm the rolling average restarts at the beginning of the new season.
selected_player_data = df.loc[df['id'] == selected_player_id]
boundary_columns = ['Date', 'PTS', 'PTS_2game_avg']

print("End of 2022-23 season:")
after_may_1 = selected_player_data['Date'] > '2023-05-01'
through_june_12 = selected_player_data['Date'] <= '2023-06-12'
end_of_2022_23 = selected_player_data[after_may_1 & through_june_12]
print(end_of_2022_23[boundary_columns])

print("\nBeginning of 2023-24 season:")
from_oct_24 = selected_player_data['Date'] >= '2023-10-24'
through_nov_30 = selected_player_data['Date'] <= '2023-11-30'
beginning_of_2023_24 = selected_player_data[from_oct_24 & through_nov_30]
print(beginning_of_2023_24[boundary_columns])


End of 2022-23 season:
           Date  PTS  PTS_2game_avg
405  2023-05-02    6           14.0
406  2023-05-06   14           12.0
407  2023-05-08   15           10.0
408  2023-05-10    9           14.5
409  2023-05-12   11           12.0
410  2023-05-17   15           10.0
411  2023-05-19    0           13.0
412  2023-05-21    7            7.5
413  2023-05-23    5            3.5
414  2023-05-25    5            6.0
415  2023-05-27    8            5.0
416  2023-05-29    7            6.5
417  2023-06-01   11            7.5
418  2023-06-04    9            9.0
419  2023-06-07    9           10.0
420  2023-06-09   13            9.0
421  2023-06-12   12           11.0

Beginning of 2023-24 season:
             Date  PTS  PTS_2game_avg
28007  2023-10-25    0            NaN
28008  2023-10-27   13            NaN
28009  2023-10-28    3            6.5
28010  2023-10-30    8            8.0
28011  2023-11-01    6            5.5
28012  2023-11-03    3            7.0
28013  2023-11-06    5           

In [83]:
# Last date of the earlier season and first date of the later one.
print(df.loc[df['Season'] == '2022-23', 'Date'].max())
print(df.loc[df['Season'] == '2023-24', 'Date'].min())

2023-06-12
2023-10-24


# Model Development