# Preprocessing and Exploration

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import datetime
import re
from ggplot import *

In [None]:
# Load the raw shot log; the path is relative to the notebook's working directory
shots_ds = pd.read_csv('../input/shot_logs.csv')

In [None]:
# List the column labels to see which fields the shot log provides
shots_ds.columns

In [None]:
# Show the dtype pandas inferred for every column
shots_ds.dtypes

In [None]:
# Preview the first rows of the raw data
shots_ds.head()

In [None]:
# Per-column flag: True where the column contains at least one missing value
pd.isnull(shots_ds).any()

In [None]:
# (number of rows, number of columns) of the loaded frame
shots_ds.shape

In [None]:
def splits(x):
    """Split one MATCHUP string into its parts and return them as a Series.

    The string is cut on the literal separators ' - ', ' @ ' and ' vs. '.
    The dot in ' vs. ' is escaped so that it only matches a real period
    (unescaped, it would match any character, e.g. ' vsx ').

    Parameters
    ----------
    x : str
        A matchup description such as 'MAR 04, 2015 - CHA @ BKN'.

    Returns
    -------
    pandas.Series
        The pieces between the separators, in their original order.
    """
    return pd.Series(re.split(r' - | @ | vs\. ', x))

In [None]:
# Apply the splitter to every MATCHUP value; because it returns a Series per row,
# the result is a frame whose integer-labelled columns hold the split pieces
res = shots_ds['MATCHUP'].apply(splits)

In [None]:
# Attach the three split pieces to the shot log under descriptive names
matchup_date, first_team, second_team = res[0], res[1], res[2]
shots_ds['date'] = matchup_date
shots_ds['team1'] = first_team
shots_ds['team2'] = second_team

In [None]:
# Every MATCHUP must have produced both team codes. Comparing the lengths of two
# columns of the same frame can never fail, so check for missing values instead;
# this runs first so a bad split is reported with a clear message.
assert shots_ds[['team1', 'team2']].notnull().all().all(), \
    'some MATCHUP values did not split into a date and two teams'

# Eyeball the distinct team codes found on each side of the matchup
print(sorted(shots_ds.team1.unique()))
print(sorted(shots_ds.team2.unique()))

In [None]:
# Now that MATCHUP has been split into date/team1/team2 the raw column is redundant.
# Rebinding the name (rather than inplace=True) together with errors='ignore'
# makes this cell safe to re-run: a second execution no longer raises a KeyError
# because the column has already been removed.
shots_ds = shots_ds.drop('MATCHUP', axis=1, errors='ignore')

# Defensive Analysis

Some things to investigate:

- Who did people shoot around the most
- Who had the best/worst shots made % within x distance
- Who had the best/worst shots made by being closest defender
    - Wing vs interior?
- Who had best ratio of minutes played to being closest defender to shot
- Who had the worst ratio of minutes played to being closest defender to a shot

In [None]:
# Number of distinct shooters, then number of distinct closest defenders
# (the author's observation: more defenders than shot takers are recorded)
print(len(shots_ds['player_id'].unique()))
print(len(shots_ds['CLOSEST_DEFENDER_PLAYER_ID'].unique()))

In [None]:
# One-column frame of the distinct closest-defender ids; going through a set
# removes duplicates but leaves the row order unspecified
players_ds = pd.DataFrame(list(set(shots_ds['CLOSEST_DEFENDER_PLAYER_ID'])))

We can see that people shoot around Big Men the most.  This makes sense because a lot of shots are from drives to the hoop and these people are rim protectors.  They have to deal with drives to the hoop by players they aren't defending, in addition to post up moves from the players they are actually defending.  One should note that being the closest defender to a shot may also show the player's ability to get close to shooters (agility).

In [None]:
# One row per distinct (defender id, defender name) pair, with shorter column names
defender_df = (
    shots_ds[['CLOSEST_DEFENDER_PLAYER_ID', 'CLOSEST_DEFENDER']]
    .rename(columns={'CLOSEST_DEFENDER_PLAYER_ID': 'PLAYER_ID',
                     'CLOSEST_DEFENDER': 'PLAYER'})
    .drop_duplicates()
)

In [None]:
# Count the shots on which each player was the closest defender
# (non-null GAME_ID entries per defender id) and attach the count to defender_df
t_df = (
    shots_ds.groupby('CLOSEST_DEFENDER_PLAYER_ID')['GAME_ID']
    .count()
    .reset_index()
    .rename(columns={'CLOSEST_DEFENDER_PLAYER_ID': 'PLAYER_ID',
                     'GAME_ID': 'SHOTS_DEFENDED'})
)
defender_df = defender_df.merge(t_df, on='PLAYER_ID')

In [None]:
# Per-defender totals of the numeric shot attributes.
# The columns are selected BEFORE summing so that only these six are aggregated;
# summing the whole frame first would also try to aggregate the text columns
# (player names, date, teams, ...), which is wasted work and depends on how the
# installed pandas version treats non-numeric columns in a groupby sum.
t_df = (
    shots_ds.groupby('CLOSEST_DEFENDER_PLAYER_ID')[
        ['DRIBBLES', 'TOUCH_TIME', 'SHOT_DIST', 'CLOSE_DEF_DIST', 'FGM', 'PTS']
    ]
    .sum()
    .reset_index()
    .rename(columns={'CLOSEST_DEFENDER_PLAYER_ID': 'PLAYER_ID',
                     'FGM': 'SHOTS_DEFENDED_MADE'})
)
# Attach the totals to the one-row-per-defender frame
defender_df = defender_df.merge(t_df, on='PLAYER_ID')

In [None]:
# Turn the per-defender totals into per-shot figures by dividing by SHOTS_DEFENDED
defender_df['FG%'] = defender_df['SHOTS_DEFENDED_MADE'] / defender_df['SHOTS_DEFENDED']
# Every remaining total X becomes AVG_X; the list order fixes the new column order
for total_col in ['DRIBBLES', 'SHOT_DIST', 'CLOSE_DEF_DIST', 'PTS', 'TOUCH_TIME']:
    defender_df['AVG_' + total_col] = defender_df[total_col] / defender_df['SHOTS_DEFENDED']

In [None]:
# Keep only defenders who were the closest defender on more than 200 shots,
# so the per-shot averages are based on a decent sample
defender_dfs = defender_df.loc[defender_df['SHOTS_DEFENDED'] > 200]

In [None]:
# List the columns available on the filtered defender table
defender_dfs.columns

In [None]:
# Top 20 defenders by average points conceded per defended shot (highest first).
# Some pretty big names on here!
defender_dfs.sort_values('AVG_PTS', ascending=False).head(20)

In [None]:
def get_defensive_stats(player_name):
    """Return the rows of the global ``defender_dfs`` for one defender.

    ``player_name`` is compared for exact equality with the PLAYER column
    (the example call below uses the 'Last, First' form). An empty frame is
    returned when no row matches.
    """
    name_matches = defender_dfs['PLAYER'] == player_name
    return defender_dfs[name_matches]
get_defensive_stats('Lillard, Damian')

In [None]:
# Pairwise correlations between the per-defender summary statistics
correlations = defender_dfs.loc[:, [
    'SHOTS_DEFENDED',
    'FG%',
    'AVG_DRIBBLES',
    'AVG_SHOT_DIST',
    'AVG_CLOSE_DEF_DIST',
    'AVG_PTS',
    'AVG_TOUCH_TIME',
]].corr()

In [None]:
# Draw the correlation matrix as an annotated heatmap on an explicit 8x8 inch axes
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(correlations, annot=True, ax=ax)

The above heatmap shows several interesting points.  Some significant insights include:

 - Worse defenders have a slightly higher chance of a shot being taken on them (shots defended versus FG%)
     - This is a broad generalization with a large assumption
 - The further away the shot is, the less chance of it going in (ok this is obvious)
 - The more dribbles, the further away the shot is likely to be from the basket
 - There is no significant relation between touch time of the basketball or dribbles and the chance the shot goes in
     - Perhaps this is because a lot of dribbles means not a 3 pt shot, so it evens out a little there
 - More dribbles means a defender was closer to the shot
 - Further away shots have further away defenders
 - The further away the shot, the more points are scored (although FG% goes down)
 - The further away the defender is, the more points scored (although FG% is not correlated with this)
 - The longer the ball is held by an offensive player, the closer the defender gets for the shot
 
While these are mostly broad generalizations, I believe they still offer some insight to the NBA game.

# Offensive Analysis

In [None]:
# One row per distinct (shooter id, shooter name) pair
shooter_df = (
    shots_ds[['player_id', 'player_name']]
    .rename(columns={'player_name': 'name'})
    .drop_duplicates()
)

In [None]:
# Count the shots taken by each player (non-null GAME_ID entries per player id)
# and attach the count to shooter_df
t_df = (
    shots_ds.groupby('player_id')['GAME_ID']
    .count()
    .reset_index()
    .rename(columns={'GAME_ID': 'SHOTS_TAKEN'})
)
shooter_df = shooter_df.merge(t_df, on='player_id')

In [None]:
# Per-shooter totals of the numeric shot attributes.
# The columns are selected BEFORE summing so that only these nine are aggregated;
# summing the whole frame first would also try to aggregate the text columns,
# which is wasted work and depends on how the installed pandas version treats
# non-numeric columns in a groupby sum.
t_df = (
    shots_ds.groupby('player_id')[
        ['DRIBBLES', 'TOUCH_TIME', 'SHOT_DIST', 'CLOSE_DEF_DIST', 'FGM', 'PTS',
         'PTS_TYPE', 'PERIOD', 'SHOT_NUMBER']
    ]
    .sum()
    .reset_index()
    .rename(columns={'FGM': 'SHOTS_MADE'})
)
# Attach the totals to the one-row-per-shooter frame
shooter_df = shooter_df.merge(t_df, on='player_id')

In [None]:
# Turn the per-shooter totals into per-shot figures by dividing by SHOTS_TAKEN
shooter_df['FG%'] = shooter_df['SHOTS_MADE'] / shooter_df['SHOTS_TAKEN']
# Every remaining total X becomes AVG_X; the list order fixes the new column order
for total_col in ['DRIBBLES', 'SHOT_DIST', 'CLOSE_DEF_DIST', 'PTS', 'TOUCH_TIME',
                  'PTS_TYPE', 'PERIOD', 'SHOT_NUMBER']:
    shooter_df['AVG_' + total_col] = shooter_df[total_col] / shooter_df['SHOTS_TAKEN']

In [None]:
# Preview the per-shooter table instead of dumping every row into the output
shooter_df.head()

In [None]:
# Restrict to shooters with more than 200 attempts so the averages are meaningful
shooter_dfs = shooter_df.loc[shooter_df['SHOTS_TAKEN'] > 200]

In [None]:
# Pairwise correlations between the per-shooter summary statistics
# (rebinds `correlations`, previously the defender matrix)
correlations = shooter_dfs.loc[:, [
    'SHOTS_TAKEN',
    'FG%',
    'AVG_DRIBBLES',
    'AVG_SHOT_DIST',
    'AVG_CLOSE_DEF_DIST',
    'AVG_PTS',
    'AVG_PTS_TYPE',
    'AVG_PERIOD',
    'AVG_SHOT_NUMBER',
    'AVG_TOUCH_TIME',
]].corr()

In [None]:
# Draw the shooter correlation matrix as an annotated heatmap on an 8x8 inch axes
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(correlations, annot=True, ax=ax)

### Initial Insights

Here are some insights from the offensive analysis heatmap:

   - The more a player dribbles, the more likely they are to take more shots
   - The more shots taken, the defender will be slightly closer on average
   - The longer a shooter dribbles or holds the ball, the less chance of them making the field goal
       - This is probably related to the fact that ball handlers take more 3 pointers
   - The longer a shooter dribbles or holds the ball, the fewer points they're likely to have per shot
       - Interesting that the more that players possess the ball before a shot, the more shots they take, but the less their average points per shot is.  This is a sign that selfishness ruins offenses.
   - FG% is higher for those who shoot more in earlier periods
   - The further away the defender is, the higher the FG%
   - The more a shooter dribbles, the more shots they take later in the game
   - Further away shots are taken by players who shoot later in the game
   - Defenders are likely to be further away from players who shoot later in the game

In [None]:
# Top 20 shooters by field-goal percentage (highest first)
shooter_dfs.sort_values('FG%', ascending=False).head(20)

In [None]:
# Top 20 shooters by average points per shot attempt (highest first)
shooter_dfs.sort_values('AVG_PTS', ascending=False).head(20)

In [None]:
# Bottom 20 shooters by average points per shot attempt (lowest first).
# Don't let these players shoot it!
shooter_dfs.sort_values('AVG_PTS', ascending=True).head(20)

In [None]:
# Summary statistics for every column whose name contains 'AVG'
avg_cols = list(filter(lambda label: 'AVG' in label, shooter_dfs.columns))
shooter_dfs.loc[:, avg_cols].describe()

In [None]:
sns.set(style="darkgrid", color_codes=True)
# Regression joint plot of each shooter's average period against average touch time.
# x and y are passed by keyword: that form is accepted by both old and new seaborn
# releases, whereas newer releases reject positional x/y.
# NOTE(review): newer seaborn spells `size` as `height`; it is left unchanged here so
# the call keeps working on the seaborn version this notebook was written for.
g = sns.jointplot(x="AVG_PERIOD", y="AVG_TOUCH_TIME",
                  data=shooter_dfs[['AVG_PERIOD', 'AVG_TOUCH_TIME']], kind="reg",
                  xlim=(2, 3), ylim=(0, 6), color="r", size=7)

In [None]:
# Look at the shot log again before deriving the time-based columns
shots_ds.head()

In [None]:
def convert_game_clock(time):
    """Express a 'minutes:seconds' clock string as a fraction of 12 minutes.

    The text before the colon is read as whole minutes and the text after it as
    whole seconds; each part is scaled so that '12:00' maps to 1.0 and '0:00'
    maps to 0.0.

    NOTE(review): the 12-minute divisor presumably stands for the length of a
    regulation period; confirm how overtime periods should be treated.
    """
    minutes_text, seconds_text = time.split(':')
    minute_part = int(minutes_text) / 12.0
    second_part = int(seconds_text) / 60.0 / 12.0
    return minute_part + second_part

In [None]:
# Convert every GAME_CLOCK string into its fraction-of-12-minutes value
shots_ds['converted_game_clock'] = shots_ds['GAME_CLOCK'].map(convert_game_clock)

In [None]:
# Combine the period number with the within-period clock fraction into one time axis.
# NOTE(review): if GAME_CLOCK counts down (time remaining in the period), this sum
# runs backwards inside each period; elapsed game time would then be
# PERIOD - converted_game_clock. Confirm the clock direction against the data source.
shots_ds['period_and_time'] = shots_ds['PERIOD'] + shots_ds['converted_game_clock']

In [None]:
# Distinct period numbers present in the data
shots_ds['PERIOD'].unique()

In [None]:
sns.set(style="darkgrid", color_codes=True)

# Regression joint plot of touch time against the combined period/clock value for
# every shot. x and y are passed by keyword: that form is accepted by both old and
# new seaborn releases, whereas newer releases reject positional x/y.
# NOTE(review): newer seaborn spells `size` as `height`; it is left unchanged here so
# the call keeps working on the seaborn version this notebook was written for.
g = sns.jointplot(x="period_and_time", y="TOUCH_TIME", data=shots_ds, kind="reg",
                  xlim=(-1, 5), ylim=(0, 24), color="r", size=7)

### Note
I can't get ggplot to draw a graph without adding an `xlab`; without it I get the following error.  Not sure what's going on here, but it worked fine in my local Python 2.7 environment.  

In [None]:
# Density of shot distance, with one curve per SHOT_RESULT value
ggplot(shots_ds, aes(x='SHOT_DIST', color='SHOT_RESULT', group='SHOT_RESULT')) + geom_density() + xlab("Shot Distance")

In [None]:
# Density of the combined period/clock value, one curve per SHOT_RESULT value.
# The plot object is only bound to `g` (replacing the earlier jointplot handle);
# because the cell ends with an assignment, nothing is displayed here.
g = ggplot(shots_ds, aes(x='period_and_time', color='SHOT_RESULT', group='SHOT_RESULT')) + \
  geom_density() +\
  xlab('Period and Time')

In [None]:
# Density plot over the combined period/clock value, with CLOSE_DEF_DIST mapped to y.
# NOTE(review): a density geom is normally driven by x alone; confirm the y mapping has any effect.
ggplot(shots_ds, aes(x='period_and_time', y='CLOSE_DEF_DIST')) + geom_density() + xlab("Period and Time") 

In [None]:
# Density plot over shot distance, with PTS mapped to y.
# NOTE(review): a density geom is normally driven by x alone; confirm the y mapping has any effect.
ggplot(shots_ds, aes(x='SHOT_DIST', y='PTS')) + \
  geom_density() + xlab("Shot Distance")

NBA teams should stop shooting free throw range shots.  As is visible on the graph above, either shoot close shots or threes.