# NBA Shots Taken 2014-15 Data Cleaning


#### Import Python Packages

In [1]:
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
from src.shot_functions import *

#### Import Original Data

In [3]:
# Read the original shots CSV into the pandas DataFrame `df`
df = pd.read_csv(filepath_or_buffer='data/original_shots_data.csv')

In [4]:
# Report, per column, whether it holds at least one missing value
df.isnull().any(axis=0)

GAME_ID                       False
MATCHUP                       False
LOCATION                      False
W                             False
FINAL_MARGIN                  False
SHOT_NUMBER                   False
PERIOD                        False
GAME_CLOCK                    False
SHOT_CLOCK                     True
DRIBBLES                      False
TOUCH_TIME                    False
SHOT_DIST                     False
PTS_TYPE                      False
SHOT_RESULT                   False
CLOSEST_DEFENDER              False
CLOSEST_DEFENDER_PLAYER_ID    False
CLOSE_DEF_DIST                False
FGM                           False
PTS                           False
player_name                   False
player_id                     False
dtype: bool

In [5]:
# Rebind df to a version without incomplete rows.
# dropna removes a row when ANY of its columns is null; the null check
# output above shows SHOT_CLOCK as the only column containing nulls.
df = df.dropna(axis='index', how='any')

In [6]:
# Print a summary of df: index range, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122502 entries, 0 to 128067
Data columns (total 21 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   GAME_ID                     122502 non-null  int64  
 1   MATCHUP                     122502 non-null  object 
 2   LOCATION                    122502 non-null  object 
 3   W                           122502 non-null  object 
 4   FINAL_MARGIN                122502 non-null  int64  
 5   SHOT_NUMBER                 122502 non-null  int64  
 6   PERIOD                      122502 non-null  int64  
 7   GAME_CLOCK                  122502 non-null  object 
 8   SHOT_CLOCK                  122502 non-null  float64
 9   DRIBBLES                    122502 non-null  int64  
 10  TOUCH_TIME                  122502 non-null  float64
 11  SHOT_DIST                   122502 non-null  float64
 12  PTS_TYPE                    122502 non-null  int64  
 13  SHOT_RESULT   

In [7]:
#df.head()

#### Clean Original Data

In [8]:
# Columns retained for the cleaned dataset, in output order
feature_columns = [
    'LOCATION', 'W', 'FINAL_MARGIN', 'SHOT_NUMBER', 'PERIOD',
    'GAME_CLOCK', 'SHOT_CLOCK', 'DRIBBLES', 'TOUCH_TIME', 'SHOT_DIST',
    'CLOSE_DEF_DIST', 'PTS_TYPE', 'SHOT_RESULT', 'FGM', 'PTS',
]

# Build df2 from df using only those columns (all rows kept)
df2 = df.loc[:, feature_columns]

In [9]:
# Verify that df2 holds exactly the selected feature columns
# (prints column names, non-null counts and dtypes)
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122502 entries, 0 to 128067
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   LOCATION        122502 non-null  object 
 1   W               122502 non-null  object 
 2   FINAL_MARGIN    122502 non-null  int64  
 3   SHOT_NUMBER     122502 non-null  int64  
 4   PERIOD          122502 non-null  int64  
 5   GAME_CLOCK      122502 non-null  object 
 6   SHOT_CLOCK      122502 non-null  float64
 7   DRIBBLES        122502 non-null  int64  
 8   TOUCH_TIME      122502 non-null  float64
 9   SHOT_DIST       122502 non-null  float64
 10  CLOSE_DEF_DIST  122502 non-null  float64
 11  PTS_TYPE        122502 non-null  int64  
 12  SHOT_RESULT     122502 non-null  object 
 13  FGM             122502 non-null  int64  
 14  PTS             122502 non-null  int64  
dtypes: float64(4), int64(7), object(4)
memory usage: 15.0+ MB


In [10]:
# Keep only shots with a strictly positive TOUCH_TIME (zero and negative values are discarded)
df2 = df2.loc[df2['TOUCH_TIME'].gt(0)]

In [11]:
# Discard shots labelled as 3-pointers whose recorded distance is 22 or less.
# NOTE(review): 22 and 23.75 are presumably the NBA three-point distances
# in feet (corner / above the break) — confirm.
is_short_three = df2['PTS_TYPE'].eq(3) & df2['SHOT_DIST'].le(22)
df2 = df2.loc[~is_short_three]

# Discard shots labelled as 2-pointers whose recorded distance exceeds 23.75
is_long_two = df2['PTS_TYPE'].eq(2) & df2['SHOT_DIST'].gt(23.75)
df2 = df2.loc[~is_long_two]

In [12]:
# Spot-check the row labelled 1: show its SHOT_RESULT next to its FGM flag
df2.loc[1, 'SHOT_RESULT'], df2.loc[1, 'FGM']

('missed', 0)

In [13]:
# Confirm FGM (1 & 0) agrees with SHOT_RESULT ('made' & 'missed') once the
# text labels have been passed through binary_shot_result
encoded_result = df2['SHOT_RESULT'].apply(binary_shot_result)
df2['FGM'].equals(encoded_result)

True

In [14]:
# Confirm PTS equals PTS_TYPE * FGM on every row
points_from_type = df2['PTS_TYPE'].mul(df2['FGM'])
df2['PTS'].equals(points_from_type)

True

In [15]:
# Use PERIOD and GAME_CLOCK to change GAME_CLOCK into seconds.
# Each (PERIOD, GAME_CLOCK) pair is passed to game_seconds (presumably
# provided by the src.shot_functions star import — confirm).
# The pairs are walked with zip rather than DataFrame.apply(axis=1), which
# would build a full row Series for every shot just to read two fields.
# assign() returns a new frame, so the filtered frame is never written to
# in place.
# NOTE: re-running this cell feeds the already-converted values back into
# game_seconds, so run it only once per kernel session.
df2 = df2.assign(
    GAME_CLOCK=[
        game_seconds(period, clock)
        for period, clock in zip(df2['PERIOD'], df2['GAME_CLOCK'])
    ]
)

In [16]:
# Preview the first rows of df2 after cleaning
df2.head()

Unnamed: 0,LOCATION,W,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,CLOSE_DEF_DIST,PTS_TYPE,SHOT_RESULT,FGM,PTS
0,A,W,24,1,1,69,10.8,2,1.9,7.7,1.3,2,made,1,2
1,A,W,24,2,1,14,3.4,0,0.8,28.2,6.1,3,missed,0,0
3,A,W,24,4,2,1427,10.3,2,1.9,17.2,3.4,2,missed,0,0
4,A,W,24,5,2,1354,10.9,2,2.7,3.7,1.1,2,missed,0,0
5,A,W,24,6,2,1215,9.1,2,4.4,18.4,2.6,2,missed,0,0


In [17]:
# Write the cleaned frame to CSV, leaving the row index out of the file
df2.to_csv(path_or_buf='data/clean_shots_data.csv', index=False)