In [None]:
import pandas as pd
import numpy as np
# below is all visualization
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import *

In [None]:
# Load the raw shot-log CSV into a DataFrame.
SHOT_LOGS_CSV = '../input/nba-shot-logs/shot_logs.csv'
NBA_Shot_Logs = pd.read_csv(SHOT_LOGS_CSV)

In [None]:
# Partition the columns by dtype: float64/int64 columns are treated as
# numerical, object columns as categorical.
# NOTE: both frames are snapshots of NBA_Shot_Logs as loaded; they are not
# refreshed when later cells call drop_duplicates() / dropna() on it.
NUMERIC_DTYPES = ['float64', 'int64']
CATEGORICAL_DTYPES = ['object']
numerical_col = NBA_Shot_Logs.select_dtypes(include = NUMERIC_DTYPES)
categorical_col = NBA_Shot_Logs.select_dtypes(include = CATEGORICAL_DTYPES)

In [None]:
# Preview the first rows of the numerical (float64/int64) columns
numerical_col.head()

In [None]:
# Preview the first rows of the categorical (object) columns
categorical_col.head()

In [None]:
# Display the full frame (pandas abbreviates the rendering according to its display options)
NBA_Shot_Logs

In [None]:
# First 15 rows of the dataset
NBA_Shot_Logs.head(15)

In [None]:
# Last 8 rows of the dataset
NBA_Shot_Logs.tail(8)

# Summarize Data

In [None]:
# Show the dtype pandas inferred for each column
NBA_Shot_Logs.dtypes

In [None]:
# Number of rows and columns as a (rows, columns) tuple
NBA_Shot_Logs.shape

In [None]:
# Count how many columns share each dtype
NBA_Shot_Logs.dtypes.value_counts()

In [None]:
# Collect the rows that are exact repeats of an earlier row and report how many there are.
duplicate_mask = NBA_Shot_Logs.duplicated()
NBA_Shot_Logs_duplicate_rows = NBA_Shot_Logs.loc[duplicate_mask]
print('Numbers of duplicated rows', len(NBA_Shot_Logs_duplicate_rows))
# Keep only the first occurrence of each row (a no-op when nothing is duplicated).
NBA_Shot_Logs = NBA_Shot_Logs.drop_duplicates()

In [None]:
# Percentage of missing values per column, as a one-column frame (column label 0).
null_counts = NBA_Shot_Logs.isnull().sum()
missing_percentage = (null_counts / len(NBA_Shot_Logs) * 100).to_frame()
# Only keep the columns that actually contain missing values.
has_missing = missing_percentage[0] > 0
missing_percentage = missing_percentage[has_missing]
missing_percentage

In [None]:
# Summary statistics for the dataset
NBA_Shot_Logs.describe()

# Numerical Variable Analysis

In [None]:
# Preview the numerical columns again at the start of this section
numerical_col.head()

In [None]:
# Number of distinct values in each numerical column
numerical_col.nunique()

In [None]:
# (rows, columns) of the numerical subset
numerical_col.shape

In [None]:
# Plot one histogram per numerical column on a 10 x 2 grid.
# plt.tight_layout() adjusts the padding so the panels do not overlap.
HIST_GRID = (10, 2)
HIST_FIGSIZE = (25, 80)
sns.set()
numerical_col.hist(layout = HIST_GRID, figsize = HIST_FIGSIZE)
plt.tight_layout()
plt.show()

In [None]:
# Zoomed-in histograms: each panel shows one column restricted to a sub-range
# (mostly a tail of its distribution) so that sparsely populated values, which
# are hard to see in the full histograms above, become visible.
tail_filters = [
    ('FINAL_MARGIN', NBA_Shot_Logs.FINAL_MARGIN > 30),
    ('SHOT_NUMBER', NBA_Shot_Logs.SHOT_NUMBER > 21),
    ('PERIOD', NBA_Shot_Logs.PERIOD > 5),
    ('DRIBBLES', NBA_Shot_Logs.DRIBBLES > 10),
    ('TOUCH_TIME', NBA_Shot_Logs.TOUCH_TIME < 0),
    ('SHOT_DIST', NBA_Shot_Logs.SHOT_DIST > 25),
    ('CLOSE_DEF_DIST', NBA_Shot_Logs.CLOSE_DEF_DIST > 10),
    ('FGM', NBA_Shot_Logs.FGM < 1),
    ('PTS', NBA_Shot_Logs.PTS > 2.5),
]

plt.figure(figsize = (25, 18), dpi = 100)

# Panels are numbered from 1 and fill the 7 x 2 grid row by row.
for panel_no, (tail_column, tail_mask) in enumerate(tail_filters, start = 1):
    plt.subplot(7, 2, panel_no)
    plt.hist(NBA_Shot_Logs[tail_column][tail_mask], color = 'r')
    plt.title(tail_column)

plt.tight_layout()


In [None]:
# Pick one extreme FINAL_MARGIN value and display the rows that carry it
margin_is_42 = NBA_Shot_Logs['FINAL_MARGIN'] == 42
NBA_Shot_Logs.loc[margin_is_42]

In [None]:
# Pick one extreme DRIBBLES value and display the rows that carry it
dribbles_is_23 = NBA_Shot_Logs['DRIBBLES'] == 23
NBA_Shot_Logs.loc[dribbles_is_23]

In [None]:
# Heatmap of the pairwise correlations between the numerical features.
# Observed: the pairs SHOT_NUMBER/PERIOD, DRIBBLES/TOUCH_TIME,
# SHOT_DIST/PTS_TYPE and FGM/PTS are highly correlated.
corr = numerical_col.corr()
sns.heatmap(data = corr)

In [None]:
# Show the correlation matrix behind the heatmap as a table
corr

# Categorical Variable Analysis

In [None]:
# Number of distinct values in each categorical column
categorical_col.nunique()

In [None]:
# (rows, columns) of the categorical subset
categorical_col.shape

In [None]:
# Bar plot of how often each SHOT_RESULT value occurs.
# Observed: there are approximately 12000 more missed shots than made shots.
shot_result_counts = categorical_col['SHOT_RESULT'].value_counts()
shot_result_counts.plot.bar()

# Target Variable Analysis

In [None]:
# Discard every row that has a missing value in any column; all later cells
# that read NBA_Shot_Logs see this reduced frame.
NBA_Shot_Logs = NBA_Shot_Logs.dropna()
plt.figure(figsize = (35, 20))

# Histogram plus density estimate over the full range of SHOT_NUMBER.
# Observed: the density decreases as SHOT_NUMBER increases.
shot_numbers = NBA_Shot_Logs['SHOT_NUMBER']
sns.distplot(shot_numbers, color = 'r', hist_kws = dict(alpha = 0.75))
plt.title('Full Distribution')

In [None]:
# Right tail of the distribution: SHOT_NUMBER values above 20
right_tail = NBA_Shot_Logs.loc[NBA_Shot_Logs['SHOT_NUMBER'] > 20, 'SHOT_NUMBER']
plt.hist(right_tail)

In [None]:
# Left tail of the distribution: SHOT_NUMBER values below 3
left_tail = NBA_Shot_Logs.loc[NBA_Shot_Logs['SHOT_NUMBER'] < 3, 'SHOT_NUMBER']
plt.hist(left_tail)

In [None]:
# Correlation of every numerical column with SHOT_NUMBER
numerical_col.corr()['SHOT_NUMBER']

In [None]:
# Scatter SHOT_NUMBER against the numerical columns, two columns per figure.
sns.set(font_scale = 0.8)
numeric_names = numerical_col.columns
for first in range(0, len(numeric_names), 2):
    sns.pairplot(
        data = numerical_col,
        x_vars = numeric_names[first:first + 2],
        y_vars = ['SHOT_NUMBER'],
        plot_kws = {'alpha': 0.6},  # alpha sets marker opacity, so overlapping points stay visible
    )

In [None]:
# Box plots of SHOT_NUMBER grouped by W: left panel with outliers, right panel without.
# Both columns are read from NBA_Shot_Logs. Previously x came from categorical_col
# (a snapshot taken before rows were dropped) while y came from the cleaned frame,
# so the plot depended on implicit index alignment between frames of different length.
fig, axes = plt.subplots(1, 2, figsize = (16, 12))

for ax, show_outliers in zip(axes, (True, False)):
    sns.boxplot(x = 'W', y = 'SHOT_NUMBER', data = NBA_Shot_Logs,
                showfliers = show_outliers, ax = ax)
    ax.set_title('With outliers' if show_outliers else 'Without outliers')
    ax.tick_params(axis = 'x', labelrotation = 45)

In [None]:
# Box plots of SHOT_NUMBER grouped by LOCATION: left panel with outliers, right panel without.
# Both columns are read from NBA_Shot_Logs. Previously x came from categorical_col
# (a snapshot taken before rows were dropped) while y came from the cleaned frame,
# so the plot depended on implicit index alignment between frames of different length.
fig, axes = plt.subplots(1, 2, figsize = (16, 12))

for ax, show_outliers in zip(axes, (True, False)):
    sns.boxplot(x = 'LOCATION', y = 'SHOT_NUMBER', data = NBA_Shot_Logs,
                showfliers = show_outliers, ax = ax)
    ax.set_title('With outliers' if show_outliers else 'Without outliers')
    ax.tick_params(axis = 'x', labelrotation = 55)