In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
%matplotlib inline

data = pd.read_csv("../input/data.csv")

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

In [None]:
# Side-by-side scatter plots of shot locations: made shots on the left,
# missed shots on the right, saved to a PNG at the end.
df = data
court_scale, alpha = 7, 0.3
plt.figure(figsize=(2 * court_scale, court_scale*(84.0/50.0)))

# (subplot position, shot_made_flag value, point colour, panel title)
panels = [
    (121, 1, '#FDB927', 'Shots Made'),
    (122, 0, '#552582', 'Shots missed'),
]
for position, flag, point_color, panel_title in panels:
    plt.subplot(position)
    h = df.loc[df.shot_made_flag == flag]
    plt.scatter(h.loc_x, h.loc_y, color=point_color, alpha=alpha)
    plt.title(panel_title)
    ax = plt.gca()
    # Same vertical extent on both panels so they can be compared directly
    ax.set_ylim([-50, 900])

plt.savefig('shots_made_and_missed.png')

In [None]:
# combined shot types
# Scatter every shot location, one series per combined_shot_type. Each series
# gets an opacity that grows with the log of its row count.
groups = df.groupby('combined_shot_type')

fig, ax = plt.subplots(figsize=(court_scale, court_scale*(84.0/50.0)))
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
# `alpha` and `n` are not read in this cell; they are still assigned so that
# any other cell relying on them keeps seeing the same values.
alpha = 0.2
n = float(len(df.combined_shot_type))
# groups.size() yields the row count per group in the same (sorted-key) order
# that iterating over `groups` does.
alphas = [np.log1p(count) for count in groups.size()]
for (name, group), alp in zip(groups, alphas):
    # log1p(count) / 10 goes above 1 once a group has more than ~22,025 rows,
    # and matplotlib only accepts alpha values in [0, 1], so cap it.
    ax.plot(group.loc_x, group.loc_y,
            marker='.', linestyle='', ms=12,
            label=name, alpha=min(alp / 10, 1.0))
ax.legend()

In [None]:
# Keep only the rows labelled 'Jump Shot' and scatter their court locations.
is_jump_shot = df.combined_shot_type == 'Jump Shot'
jump_shot = df[is_jump_shot]

fig, ax = plt.subplots(figsize=(court_scale, court_scale*(84.0/50.0)))
ax.margins(0.05)  # 5% padding around the autoscaled data limits
point_style = dict(marker='.', linestyle='', ms=12)
ax.plot(jump_shot.loc_x, jump_shot.loc_y,
        label='Jump Shot', color='#FDB927', alpha=0.1, **point_style)

ax.legend()

In [None]:
# time to add basketball court lines for context
from matplotlib.patches import Circle, Rectangle, Arc

def draw_court(ax=None, color='black', lw=2, outer_lines=False):
    """Add the line markings of a basketball half court to a matplotlib axes.

    Parameters
    ----------
    ax : matplotlib axes or None
        Axes to draw on. When None, the current axes (``plt.gca()``) is used.
    color : color spec
        Passed as ``color`` to every court patch.
    lw : number
        Passed as ``linewidth`` to every court patch.
    outer_lines : bool
        When truthy, one extra rectangle is added for the half-court line,
        baseline and sidelines.

    Returns
    -------
    The axes the patches were added to.
    """
    target = plt.gca() if ax is None else ax

    # Styling shared by every patch below
    stroke = dict(linewidth=lw, color=color)

    patches = [
        # Hoop: an 18" diameter rim has a 9" radius, which is 7.5 in the
        # coordinate system used by the shot data
        Circle((0, 0), radius=7.5, fill=False, **stroke),
        # Backboard
        Rectangle((-30, -7.5), 60, -1, **stroke),
        # The paint, outer box: 16ft wide, 19ft high
        Rectangle((-80, -47.5), 160, 190, fill=False, **stroke),
        # The paint, inner box: 12ft wide, 19ft high
        Rectangle((-60, -47.5), 120, 190, fill=False, **stroke),
        # Free throw circle, top half
        Arc((0, 142.5), 120, 120, theta1=0, theta2=180, fill=False, **stroke),
        # Free throw circle, bottom half (dashed)
        Arc((0, 142.5), 120, 120, theta1=180, theta2=0, linestyle='dashed',
            **stroke),
        # Restricted zone: arc of 4ft radius centred on the hoop
        Arc((0, 0), 80, 80, theta1=0, theta2=180, **stroke),
        # Corner three-point lines: straight for 14ft before the arc starts
        Rectangle((-220, -47.5), 0, 140, **stroke),
        Rectangle((220, -47.5), 0, 140, **stroke),
        # Three-point arc centred on the hoop, 23'9" away; the theta values
        # were tuned by hand until the arc met the corner lines
        Arc((0, 0), 475, 475, theta1=22, theta2=158, **stroke),
        # Centre court, outer and inner arcs
        Arc((0, 422.5), 120, 120, theta1=180, theta2=0, **stroke),
        Arc((0, 422.5), 40, 40, theta1=180, theta2=0, **stroke),
    ]

    if outer_lines:
        # Half-court line, baseline and sidelines as a single rectangle
        boundary = Rectangle((-250, -47.5), 500, 470, fill=False, **stroke)
        patches.append(boundary)

    # Attach everything to the axes in the order listed above
    for patch in patches:
        target.add_patch(patch)

    return target

# Scatter every shot location on a fresh figure, then overlay the court lines
plt.figure(figsize=(12,11))
court_ax = plt.gca()
court_ax.scatter(df['loc_x'], df['loc_y'], color='#552582', alpha=0.3)
draw_court(court_ax, outer_lines=True)

# Fix the visible window; the x limits run from 300 down to -300, which
# reverses the direction of the x axis
court_ax.set_ylim(-100, 500)
court_ax.set_xlim(300, -300)
plt.show()

In [None]:
cmap = plt.cm.YlOrRd_r

# n_levels sets the number of contour lines for the main kde plot.
# x and y are passed by keyword so the call does not depend on the position
# of those parameters in jointplot's signature.
# NOTE(review): stat_func and n_levels are kept as in the original call; newer
# seaborn releases may no longer accept them — confirm against the installed
# version.
joint_shot_chart = sns.jointplot(x=df['loc_x'], y=df['loc_y'], stat_func=None,
                                 kind='kde', space=0, color=cmap(0.1),
                                 cmap=cmap, n_levels=50)

joint_shot_chart.fig.set_size_inches(12,11)

# A joint plot has 3 Axes, the first one called ax_joint
# is the one we want to draw our court onto and adjust some other settings
ax = joint_shot_chart.ax_joint
draw_court(ax)

# Adjust the axis limits and orientation of the plot in order
# to plot half court, with the hoop by the top of the plot
ax.set_xlim(-250,250)
ax.set_ylim(422.5, -47.5)

# Name the axes after the plotted columns and hide the tick labels.
# tick_params expects booleans here; the string 'off' is truthy, so it
# does not reliably switch the labels off.
ax.set_xlabel('loc_x')
ax.set_ylabel('loc_y')
ax.tick_params(labelbottom=False, labelleft=False)

plt.show()

In [None]:
# define the accuracy plotting function
def get_acc(df, against):
    """Plot the made-shot share for each value of ``df[against]``.

    Cross-tabulates ``df.shot_made_flag`` against ``df[against]``, normalises
    every column to sum to 1, and plots the second row of that table (taken
    positionally) on a new 7x5 figure. The title shows the standard deviation
    and the max-min range of the row labelled 1, both formatted to three
    decimals.

    Parameters
    ----------
    df : DataFrame with a ``shot_made_flag`` column and a column ``against``.
    against : name of the column to break accuracy down by.
    """
    counts = pd.crosstab(df.shot_made_flag, df[against])
    # Dividing by the column totals turns counts into per-column shares
    ct = counts / counts.sum()
    made_share = ct.values[1, :]

    plt.figure(figsize=(7, 5))
    plt.plot(ct.columns, made_share)
    plt.xlabel(against)
    plt.ylabel('% shots made')

    # After transposing, each outcome is a column; [1] selects the outcome 1
    by_outcome = ct.T
    std = float('%.3f' % by_outcome.std()[1])
    range1 = float('%.3f' % (by_outcome.max()[1] - by_outcome.min()[1]))
    plt.title('standard deviation: %s range: %s'%(std, range1))

In [None]:
# define the sort & enumeration function
def sort_encode(df, field):
    """Add a column that ranks the values of ``df[field]`` by made-shot share.

    The share is the second row of the column-normalised crosstab of
    ``df.shot_made_flag`` against ``df[field]``. Values are ordered by
    ascending share (ties broken by the value itself) and numbered from 0.

    Parameters
    ----------
    df : DataFrame with ``shot_made_flag`` and ``field`` columns; it is
        modified in place by adding ``field + '_sort_enumerated'``.
    field : name of the column to encode.

    Returns
    -------
    The name of the column that was added.
    """
    counts = pd.crosstab(df.shot_made_flag, df[field])
    made_share = (counts / counts.sum()).values[1, :]

    # Sorting (share, value) pairs puts the lowest share first
    ranked = sorted(zip(made_share, counts.columns))
    new_map = {value: rank for rank, (_, value) in enumerate(ranked)}

    new_field = field + '_sort_enumerated'
    df[new_field] = df[field].map(new_map)
    return new_field

In [None]:
# Rank action_type values by shot accuracy and store the rank as a new
# feature, so that a larger value means a higher accuracy.
# This has little effect on a random forest classifier,
# but it may matter for some other classifiers.
new_field = sort_encode(df=data, field='action_type')
get_acc(df=data, against=new_field)

In [None]:
# Share of attempts per combined shot type, drawn as a greyscale pie.
t = data.combined_shot_type.value_counts()
# The number of wedges comes from the counts themselves; the original divided
# by float() (which is 0.0) and iterated over an unrelated global `n`.
n_types = len(t)
# explode is a fraction of the radius per wedge, so use a small constant
# offset rather than scaling the raw counts.
plt.pie(t, explode=[0.05] * n_types, labels=t.index,
        colors=['%f' % (i / float(n_types)) for i in range(n_types)])

In [None]:
# NOTE(review): scratch cell that only prints a fixed string; nothing in the
# visible notebook reads its output, so it looks safe to delete — confirm.
print('hi')

In [None]:
# Greyscale pie demo: n wedges of equal size, except the last one is doubled
n = 20
Z = np.ones(n)
Z[-1] = Z[-1] * 2

plt.axes([0.025,0.025,0.95,0.95])

# Each wedge is pushed outwards in proportion to its size and shaded with a
# grey level that increases with its position
wedge_offsets = Z * .05
grey_levels = ['%f' % (i/float(n)) for i in range(n)]
plt.pie(Z, explode=wedge_offsets, colors=grey_levels)
plt.gca().set_aspect('equal')
plt.xticks([]), plt.yticks([])

In [None]:
# combined_shot_type
# Same treatment as action_type: rank the values by accuracy, then plot.
new_field = sort_encode(df=data, field='combined_shot_type')
get_acc(df=data, against=new_field)

'game_event_id' and 'game_id' may not be useful so far,
and neither are 'lat' and 'lon'.
'loc_x' and 'loc_y' may be useful, but they can't be used to calculate accuracies directly; maybe we can use
'shot_distance' instead later.

In [None]:
# minutes_remaining: accuracy broken down by minutes left
get_acc(df=data, against='minutes_remaining')

In [None]:
# seconds_remaining: accuracy broken down by seconds left
get_acc(df=data, against='seconds_remaining')

In [None]:
# Time seems to have little influence on Kobe's performance overall,
# but his accuracy does drop in the last minute and the last second.
# Build a flag: 0 when fewer than 10 seconds remain, 1 otherwise.
seconds_left = data['minutes_remaining'] * 60 + data['seconds_remaining']
data['time_remaining'] = seconds_left

# 99 is a placeholder for rows matched by neither comparison below
data['time_remaining_enumerated'] = 99
data.loc[seconds_left < 10, 'time_remaining_enumerated'] = 0
data.loc[seconds_left >= 10, 'time_remaining_enumerated'] = 1
get_acc(df=data, against='time_remaining_enumerated')

In [None]:
# period
# Idea: derive a flag with 0 for regular periods and 1 for period 4.
get_acc(df=data, against='period')

In [None]:
# playoffs
# Author's observation: almost no difference at all.
get_acc(df=data, against='playoffs')

In [None]:
# season
# Kobe's performance clearly dropped during his last few seasons, but since
# this is time-series data and we may not use our domain knowledge,
# I don't know whether we should use this feature.
# The part of `season` before the '-' is kept and converted to an integer.
start_year = data['season'].str.split('-').str[0]
data['season_start_year'] = start_year.astype(int)
get_acc(df=data, against='season_start_year')

In [None]:
# shot_distance: accuracy broken down by distance value
get_acc(df=data, against='shot_distance')

In [None]:
# shot_type
# Kobe does much better on 2-pointers than on 3-pointers,
# but that information may already be carried by 'shot_distance'.
# Each distinct label is numbered in order of first appearance.
shot_type_labels = data.shot_type.unique()
action_map = dict(zip(shot_type_labels, range(len(shot_type_labels))))
data['shot_type_enumerated'] = data['shot_type'].map(action_map)
get_acc(df=data, against='shot_type_enumerated')

In [None]:
# Print the distinct labels of both zone columns, then their frequencies
zone_columns = ('shot_zone_area', 'shot_zone_basic')
for column in zone_columns:
    print(data[column].unique())
for column in zone_columns:
    print(data[column].value_counts())

In [None]:
# shot_zone_area
# Author's observation: there is a lot of variance for this feature.
# Number the labels in order of first appearance, then rank by accuracy.
area_labels = data.shot_zone_area.unique()
action_map = dict(zip(area_labels, range(len(area_labels))))
data['shot_zone_area_enumerated'] = data['shot_zone_area'].map(action_map)

new_field = sort_encode(df=data, field='shot_zone_area_enumerated')
get_acc(df=data, against=new_field)

In [None]:
# shot_zone_basic
# Author's observation: there is a lot of variance for this feature.
# Number the labels in order of first appearance, then rank by accuracy.
basic_labels = data.shot_zone_basic.unique()
action_map = dict(zip(basic_labels, range(len(basic_labels))))
data['shot_zone_basic_enumerated'] = data['shot_zone_basic'].map(action_map)

new_field = sort_encode(df=data, field='shot_zone_basic_enumerated')
get_acc(df=data, against=new_field)

In [None]:
# shot_zone_range
# Author's observation: there is a lot of variance for this feature.
# Number the labels in order of first appearance, then rank by accuracy.
range_labels = data.shot_zone_range.unique()
action_map = dict(zip(range_labels, range(len(range_labels))))
data['shot_zone_range_enumerated'] = data['shot_zone_range'].map(action_map)

new_field = sort_encode(df=data, field='shot_zone_range_enumerated')
get_acc(df=data, against=new_field)

'team_id' and 'team_name' are constant and represent Lakers, so they are useless.
'game_date' is useless too

In [None]:
# opponent
# Author's observation: there is not much variance here.
# Number the labels in order of first appearance, then rank by accuracy.
opponent_labels = data.opponent.unique()
action_map = dict(zip(opponent_labels, range(len(opponent_labels))))
data['opponent_enumerated'] = data['opponent'].map(action_map)

new_field = sort_encode(df=data, field='opponent_enumerated')
get_acc(df=data, against=new_field)

In [None]:
# matchup
# Kobe's home-game performance is slightly better than away, but not by much.
# Build a flag: 1 where the matchup string has no '@', then 0 where it has
# no 'vs' (the second assignment wins when both apply).
lacks_at_sign = data.matchup.str.find('@') == -1
lacks_vs = data.matchup.str.find('vs') == -1

# 99 is a placeholder for rows matched by neither mask
data['home_or_away'] = 99
data.loc[lacks_at_sign, 'home_or_away'] = 1
data.loc[lacks_vs, 'home_or_away'] = 0
get_acc(df=data, against='home_or_away')

After the experiments above, we can group the features as below:

action: action_type, combined_shot_type

position: loc_x, loc_y, shot_distance, shot_type, shot_zone_area, shot_zone_basic, shot_zone_range

opponent: matchup, opponent

time: minutes_remaining, seconds_remaining, period, playoffs, season

unknown: lat, lon

useless: game_event_id, game_id, team_id, team_name, game_date, shot_id, shot_made_flag

And then we can do some basic analysis on the "useful" features:

1. action_type: 57 categories

2. combined_shot_type: 6 categories

3. loc_x & loc_y: the positions on the court, continuous variables

4. shot_distance: an "int continuous" variable, meaning the distance from the basket

5. shot_type: 2-points or 3-points (0-1)

6. shot_zone_area: six areas on the court, including right/left side, right/left side center, center
and back court

7. shot_zone_basic: seven areas on the court

8. shot_zone_range: five shot distance categories, including less-than-8-ft, 8-16-ft, 16-24-ft, 24+ft
and back_court_shot

9. matchup: 0-1 categories, away or home game

10. opponent: 33 opponents' names

11. minutes_remaining & seconds_remaining: time remaining in one period

12. period: 1-7 categories, meaning 1-4 periods and three OT

13. playoffs: 0-1 playoff or not

14. season: 1996-2015