# This Jupyter notebook is to bring in the game data and prepare it for Machine Learning algorithims

In [235]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm
tqdm.pandas()

In [310]:
# Importing csv file
df_game_data = pd.read_csv(Path('../Resources/games.csv'))
df_game_data.sort_index(ascending = False, inplace = True)
df_game_data

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
25795,2014-10-04,11400001,Final,1610612748,1610612740,2014,1610612748,86.0,0.431,0.679,...,18.0,42.0,1610612740,98.0,0.462,0.706,0.438,19.0,42.0,0
25794,2014-10-05,11400002,Final,1610612761,1610612758,2014,1610612761,99.0,0.440,0.771,...,21.0,30.0,1610612758,94.0,0.469,0.725,0.385,18.0,45.0,1
25793,2014-10-06,11400005,Final,1610612747,1610612743,2014,1610612747,98.0,0.448,0.682,...,29.0,45.0,1610612743,95.0,0.387,0.659,0.500,19.0,43.0,1
25792,2014-10-06,11400004,Final,1610612741,1610612764,2014,1610612741,81.0,0.338,0.719,...,18.0,40.0,1610612764,85.0,0.411,0.636,0.267,17.0,47.0,0
25791,2014-10-06,11400007,Final,1610612737,1610612740,2014,1610612737,93.0,0.419,0.821,...,24.0,50.0,1610612740,87.0,0.366,0.643,0.375,17.0,43.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,2022-03-12,22101009,Final,1610612743,1610612761,2021,1610612743,115.0,0.551,0.750,...,32.0,39.0,1610612761,127.0,0.471,0.760,0.387,28.0,50.0,0
3,2022-03-12,22101008,Final,1610612744,1610612749,2021,1610612744,122.0,0.484,0.933,...,33.0,55.0,1610612749,109.0,0.413,0.696,0.386,27.0,39.0,1
2,2022-03-12,22101007,Final,1610612759,1610612754,2021,1610612759,108.0,0.412,0.813,...,28.0,52.0,1610612754,119.0,0.489,1.000,0.389,23.0,47.0,0
1,2022-03-12,22101006,Final,1610612741,1610612739,2021,1610612741,101.0,0.443,0.933,...,20.0,46.0,1610612739,91.0,0.419,0.824,0.208,19.0,40.0,1


In [311]:
# Checking Data types
df_game_data.dtypes

GAME_DATE_EST        object
GAME_ID               int64
GAME_STATUS_TEXT     object
HOME_TEAM_ID          int64
VISITOR_TEAM_ID       int64
SEASON                int64
TEAM_ID_home          int64
PTS_home            float64
FG_PCT_home         float64
FT_PCT_home         float64
FG3_PCT_home        float64
AST_home            float64
REB_home            float64
TEAM_ID_away          int64
PTS_away            float64
FG_PCT_away         float64
FT_PCT_away         float64
FG3_PCT_away        float64
AST_away            float64
REB_away            float64
HOME_TEAM_WINS        int64
dtype: object

In [312]:
# Fixing Data types
df_game_data['GAME_DATE_EST'] = pd.to_datetime(df_game_data['GAME_DATE_EST'])
# Checking if any game status is not 'Final'
df_check = df_game_data.where(df_game_data['GAME_STATUS_TEXT'] != 'Final').dropna()
df_check
# Dropping column
df_game_data.drop(columns= ['GAME_STATUS_TEXT'], inplace = True)

In [313]:
# Re-checking data types
df_game_data.dtypes

GAME_DATE_EST      datetime64[ns]
GAME_ID                     int64
HOME_TEAM_ID                int64
VISITOR_TEAM_ID             int64
SEASON                      int64
TEAM_ID_home                int64
PTS_home                  float64
FG_PCT_home               float64
FT_PCT_home               float64
FG3_PCT_home              float64
AST_home                  float64
REB_home                  float64
TEAM_ID_away                int64
PTS_away                  float64
FG_PCT_away               float64
FT_PCT_away               float64
FG3_PCT_away              float64
AST_away                  float64
REB_away                  float64
HOME_TEAM_WINS              int64
dtype: object

In [314]:
#df_game_data.to_csv('../Resources/nba_game_data.csv')
df_game_data

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
25795,2014-10-04,11400001,1610612748,1610612740,2014,1610612748,86.0,0.431,0.679,0.333,18.0,42.0,1610612740,98.0,0.462,0.706,0.438,19.0,42.0,0
25794,2014-10-05,11400002,1610612761,1610612758,2014,1610612761,99.0,0.440,0.771,0.333,21.0,30.0,1610612758,94.0,0.469,0.725,0.385,18.0,45.0,1
25793,2014-10-06,11400005,1610612747,1610612743,2014,1610612747,98.0,0.448,0.682,0.500,29.0,45.0,1610612743,95.0,0.387,0.659,0.500,19.0,43.0,1
25792,2014-10-06,11400004,1610612741,1610612764,2014,1610612741,81.0,0.338,0.719,0.381,18.0,40.0,1610612764,85.0,0.411,0.636,0.267,17.0,47.0,0
25791,2014-10-06,11400007,1610612737,1610612740,2014,1610612737,93.0,0.419,0.821,0.421,24.0,50.0,1610612740,87.0,0.366,0.643,0.375,17.0,43.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,2022-03-12,22101009,1610612743,1610612761,2021,1610612743,115.0,0.551,0.750,0.407,32.0,39.0,1610612761,127.0,0.471,0.760,0.387,28.0,50.0,0
3,2022-03-12,22101008,1610612744,1610612749,2021,1610612744,122.0,0.484,0.933,0.400,33.0,55.0,1610612749,109.0,0.413,0.696,0.386,27.0,39.0,1
2,2022-03-12,22101007,1610612759,1610612754,2021,1610612759,108.0,0.412,0.813,0.324,28.0,52.0,1610612754,119.0,0.489,1.000,0.389,23.0,47.0,0
1,2022-03-12,22101006,1610612741,1610612739,2021,1610612741,101.0,0.443,0.933,0.429,20.0,46.0,1610612739,91.0,0.419,0.824,0.208,19.0,40.0,1


# This dataset presented heavy look ahead bias as all datapoints were from the gameplayer, not the game before. I will now use rolling averages to fix this issue

In [315]:
# Creating new dataframe for values changing to rolling averages
df_game_data_stats = df_game_data.select_dtypes(include = 'float64')
df_game_data_stats['TEAM_ID_home'] = df_game_data['TEAM_ID_home']
# Creating a list of columns with float dtype
float_dtypes = list(df_game_data.select_dtypes(include = 'float64'))
# Shifting all columns down one row 
for column in df_game_data:
   if column in float_dtypes:
    df_game_data_stats[column] = df_game_data_stats[column].shift(periods = 1)
# Displaying dataframe
display(df_game_data_stats)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,TEAM_ID_home
25795,,,,,,,,,,,,,1610612748
25794,86.0,0.431,0.679,0.333,18.0,42.0,98.0,0.462,0.706,0.438,19.0,42.0,1610612761
25793,99.0,0.440,0.771,0.333,21.0,30.0,94.0,0.469,0.725,0.385,18.0,45.0,1610612747
25792,98.0,0.448,0.682,0.500,29.0,45.0,95.0,0.387,0.659,0.500,19.0,43.0,1610612741
25791,81.0,0.338,0.719,0.381,18.0,40.0,85.0,0.411,0.636,0.267,17.0,47.0,1610612737
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,134.0,0.558,0.710,0.390,21.0,44.0,125.0,0.500,0.857,0.394,27.0,33.0,1610612743
3,115.0,0.551,0.750,0.407,32.0,39.0,127.0,0.471,0.760,0.387,28.0,50.0,1610612744
2,122.0,0.484,0.933,0.400,33.0,55.0,109.0,0.413,0.696,0.386,27.0,39.0,1610612759
1,108.0,0.412,0.813,0.324,28.0,52.0,119.0,0.489,1.000,0.389,23.0,47.0,1610612741


In [316]:
# Changing all columns to rolling averages using GroupBy and rolling
df_group_game_data = df_game_data_stats.groupby(by ='TEAM_ID_home', as_index= True).rolling(window = 20).mean()
# Displaying dataframe
display(df_group_game_data)

Unnamed: 0_level_0,Unnamed: 1_level_0,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away
TEAM_ID_home,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1610612737,25791,,,,,,,,,,,,
1610612737,25729,,,,,,,,,,,,
1610612737,25719,,,,,,,,,,,,
1610612737,25656,,,,,,,,,,,,
1610612737,25602,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1610612766,111,106.05,0.44970,0.78575,0.35080,24.25,43.40,108.90,0.47170,0.79275,0.36455,23.45,44.65
1610612766,99,105.80,0.44800,0.77465,0.35155,24.50,42.65,108.15,0.46880,0.79010,0.36275,23.25,45.60
1610612766,54,105.00,0.44480,0.77300,0.35155,24.10,42.45,107.65,0.46500,0.79570,0.35785,22.85,45.80
1610612766,31,105.90,0.44815,0.76050,0.35850,24.45,42.15,109.35,0.46680,0.79430,0.36500,23.20,45.95


In [318]:
# Dropping extra index column
df_group_game_data = df_group_game_data.droplevel('TEAM_ID_home')


In [319]:
# Editing original dataframe so it can be concatenated with the stats dataframe
df_game_data = df_game_data.drop(columns = float_dtypes)
df_game_data.sort_values(by = ['HOME_TEAM_ID', 'GAME_DATE_EST'], inplace = True)
# Displaying dataframe
df_game_data


Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,TEAM_ID_away,HOME_TEAM_WINS
18417,2003-10-08,10300011,1610612737,1610612739,2003,1610612737,1610612739,0
18373,2003-10-16,10300058,1610612737,1610612748,2003,1610612737,1610612748,0
18321,2003-10-24,10300108,1610612737,1610612764,2003,1610612737,1610612764,0
18284,2003-11-01,20300029,1610612737,1610612754,2003,1610612737,1610612754,0
18273,2003-11-03,20300042,1610612737,1610612740,2003,1610612737,1610612740,1
...,...,...,...,...,...,...,...,...
111,2022-02-25,22100896,1610612766,1610612761,2021,1610612766,1610612761,1
99,2022-02-27,22100915,1610612766,1610612765,2021,1610612766,1610612765,0
54,2022-03-05,22100955,1610612766,1610612759,2021,1610612766,1610612759,1
31,2022-03-08,22100975,1610612766,1610612751,2021,1610612766,1610612751,0


In [320]:
# Concatenating both dataframes
df_game_data_final = pd.concat([df_game_data, df_group_game_data], join='inner', axis = 1)
df_game_data_final

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,TEAM_ID_away,HOME_TEAM_WINS,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away
18417,2003-10-08,10300011,1610612737,1610612739,2003,1610612737,1610612739,0,106.20,0.43210,0.75900,0.30090,22.25,46.85,108.85,0.44225,0.77540,0.36805,23.35,46.45
18373,2003-10-16,10300058,1610612737,1610612748,2003,1610612737,1610612748,0,,,,,,,,,,,,
18321,2003-10-24,10300108,1610612737,1610612764,2003,1610612737,1610612764,0,,,,,,,,,,,,
18284,2003-11-01,20300029,1610612737,1610612754,2003,1610612737,1610612754,0,,,,,,,,,,,,
18273,2003-11-03,20300042,1610612737,1610612740,2003,1610612737,1610612740,1,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,2022-02-25,22100896,1610612766,1610612761,2021,1610612766,1610612761,1,106.05,0.44970,0.78575,0.35080,24.25,43.40,108.90,0.47170,0.79275,0.36455,23.45,44.65
99,2022-02-27,22100915,1610612766,1610612765,2021,1610612766,1610612765,0,105.80,0.44800,0.77465,0.35155,24.50,42.65,108.15,0.46880,0.79010,0.36275,23.25,45.60
54,2022-03-05,22100955,1610612766,1610612759,2021,1610612766,1610612759,1,105.00,0.44480,0.77300,0.35155,24.10,42.45,107.65,0.46500,0.79570,0.35785,22.85,45.80
31,2022-03-08,22100975,1610612766,1610612751,2021,1610612766,1610612751,0,105.90,0.44815,0.76050,0.35850,24.45,42.15,109.35,0.46680,0.79430,0.36500,23.20,45.95


In [321]:
df_game_data_final.isna().sum()

GAME_DATE_EST         0
GAME_ID               0
HOME_TEAM_ID          0
VISITOR_TEAM_ID       0
SEASON                0
TEAM_ID_home          0
TEAM_ID_away          0
HOME_TEAM_WINS        0
PTS_home           1221
FG_PCT_home        1221
FT_PCT_home        1221
FG3_PCT_home       1221
AST_home           1221
REB_home           1221
PTS_away           1221
FG_PCT_away        1221
FT_PCT_away        1221
FG3_PCT_away       1221
AST_away           1221
REB_away           1221
dtype: int64

In [323]:
# Dropping nulls and counting nulls
df_game_data_final = df_game_data_final.dropna()
df_game_data.isna().sum().sum()

0

In [325]:
# Saving dataframe to CSV for Machine Learning Notebooks
df_game_data_final.to_csv('../Resources/nba_game_data.csv')