# 1. Introduction

*Draft Version*

The intended purpose of this kernel is twofold:
1. Explore the shots taken by NBA players in the 2014-2015 season
2. Experiment with the Plotly data visualization library

To illustrate the findings I will be using the Plotly visualization library, whose immediate appeal is its highly interactive charts.

In [None]:
# Relevant library import

import pandas as pd
import numpy as np
from datetime import datetime

from plotly import tools
import plotly.express as px
import plotly.offline as py
import plotly.graph_objs as go

# Pandas display preferences: allow up to 999 columns to be shown and render
# floats with thousands separators and two decimal places
pd.options.display.float_format = '{:,.2f}'.format
pd.options.display.max_columns = 999

# Prepare Plotly's offline mode for rendering figures inside the notebook
py.init_notebook_mode(connected=True)

In [None]:
# Load the raw shot log CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer='../input/shot_logs.csv')


# 2. Initial Data Exploration


The purpose of this section is to examine the dataset from a high-level perspective and to build some initial intuitions that will be tested later on. This exploration will also serve as a basis for DataFrame changes in the preprocessing and feature engineering sections. 

In [None]:
# Preview the first few rows to see what the raw records look like
df.head()

Initial observations and actions:
1. Labels that will have to be encoded: *LOCATION*, *W*, *SHOT_RESULT* (complete)
2. *SHOT_RESULT* and *FGM* fields seem to be identical
3. Additional data will need to be extracted from the *MATCHUP* field
4. Change the records in the *CLOSEST_DEFENDER* column to correspond with *player_name*
5. Not sure what distance unit is used in fields like *SHOT_DIST* and *CLOSE_DEF_DIST*

In [None]:
# High-level overview of the DataFrame structure
df.info()

Initial observations:
1. *SHOT_CLOCK* variable is missing some data
2. Need to convert *GAME_CLOCK* into a time series object

In [None]:
# Summary statistics for the numeric fields, to spot implausible values
df.describe()

Initial observations:
1. *TOUCH_TIME* minimum value is -163.60 and that does not seem logical
2. The maximum value for *SHOT_NUMBER* seems to be rather small 

___

In [None]:
# Inspect the records carrying shot number 1 within a single game
in_selected_game = df['GAME_ID'] == 21400899
is_first_shot = df['SHOT_NUMBER'] == 1
df[in_selected_game & is_first_shot][:5]

The shot number does not seem to reflect only individual players

In [None]:
# Number of distinct game IDs present in the dataset
df['GAME_ID'].nunique()

There are 30 teams in the NBA and each plays 82 games in the regular season, so it should be expected to see 30 * 82 / 2 = 1,230 individual game records. This leads me to believe that the dataset does not contain the entire season or there are data quality issues.

# 3. Data Preprocessing

In [None]:
# Lowercase every column label so later cells can be typed without shifting case
df.columns = [label.lower() for label in df.columns]

In [None]:
# Encode the location ('H') and result ('W') labels as 1/0 flags
played_at_home = df['location'] == 'H'
team_won = df['w'] == 'W'

df['home_game'] = np.where(played_at_home, 1, 0)
df['game_won'] = np.where(team_won, 1, 0)

In [None]:
# Fill the shot_clock NaN values with game_clock values, expressed in seconds.
# Filling with the raw game_clock column would insert clock strings into the
# numeric shot_clock column, leaving it with mixed types that break later
# numeric use, so the clock is converted to a number of seconds first.
# NOTE(review): assumes game_clock holds 'minutes:seconds' text — confirm against the data
clock_parts = df['game_clock'].astype(str).str.split(':', expand=True).astype(float)
game_clock_seconds = clock_parts[0] * 60 + clock_parts[1]
df['shot_clock'] = df['shot_clock'].fillna(game_clock_seconds)

In [None]:
# Remove the label columns that are no longer needed
unneeded_columns = ['shot_result', 'location', 'w']
df = df.drop(unneeded_columns, axis=1)

# 4. Feature Engineering

In [None]:
# The first 12 characters of matchup hold the game date; parse them with the
# abbreviated-month / day / four-digit-year format into datetime values
date_text = df['matchup'].str[:12]
df['date'] = pd.to_datetime(date_text, format='%b %d, %Y')

In [None]:
# Extract home and away team names.
# matchup lists two team codes: one at characters 15-17 and one in the last
# three characters. Which of them is the visitor depends on the separator, so
# the roles are assigned per row instead of always labelling the first code
# "away" and the last code "home".
# NOTE(review): assumes '@' marks the first-listed team as the visitor and any
# other separator (e.g. 'vs.') marks it as the host — confirm against the data
first_team = df['matchup'].str[15:18]
last_team = df['matchup'].str[-3:]
first_team_is_away = df['matchup'].str.contains('@', regex=False, na=False)

df['away_team'] = np.where(first_team_is_away, first_team, last_team)
df['home_team'] = np.where(first_team_is_away, last_team, first_team)

# Drop the now redundant column
df = df.drop(['matchup'], axis=1)

In [None]:
# Boolean masks selecting shots by their point value
three_point_shots = df['pts_type'] == 3
two_point_shots = df['pts_type'] == 2

# Boolean masks selecting shots by their outcome (fgm: 1 = made, 0 = missed)
made = df['fgm'] == 1
missed = df['fgm'] == 0

# Subsets for every point-value / outcome combination
made_twos = df[two_point_shots & made]
missed_twos = df[two_point_shots & missed]

made_threes = df[three_point_shots & made]
missed_threes = df[three_point_shots & missed]

# 5. In Depth Data Exploration

## 5.1 Target Variable

In [None]:
# Stacked bar chart of made vs. missed field goals, one trace per shot value.
# The counts are taken once per shot value so the bar positions (index) and
# heights (values) come from the same value_counts() result.
two_point_counts = df[two_point_shots]['fgm'].value_counts()
three_point_counts = df[three_point_shots]['fgm'].value_counts()

trace1 = go.Bar(
    x=two_point_counts.index.values,
    y=two_point_counts,
    name='2-Point Field Goals',
)
trace2 = go.Bar(
    x=three_point_counts.index.values,
    y=three_point_counts,
    name='3-Point Field Goals',
)

# Relabel the 0/1 fgm values on the x axis as Missed/Made
x_axis = dict(tickmode='array', tickvals=[0, 1], ticktext=['Missed', 'Made'])
y_axis = dict(title='Number of Shots')

data = [trace1, trace2]
layout = go.Layout(
    barmode='stack',
    title='Count of Made and Missed Field Goals',
    xaxis=x_axis,
    yaxis=y_axis,
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
# Share of attempts that were made, for each shot value
two_point_rate = len(made_twos) / len(df[two_point_shots])
three_point_rate = len(made_threes) / len(df[three_point_shots])

# The rates are fractions between 0 and 1. The '%' format type multiplies by 100
# before appending the percent sign, so the printed number matches its unit
# (appending '%' to the rounded fraction showed e.g. 0.49 as "0.49%").
print(f'Average 2-point conversion rate {two_point_rate:.0%}')
print(f'Average 3-point conversion rate {three_point_rate:.0%}')

## 5.2 Shot Accuracy Over Different Time Periods

### 5.2.1 Days of the Season

In [None]:
# Report the earliest and latest game dates found in the data
earliest_date = df['date'].min()
latest_date = df['date'].max()

print(f'First day of the regular season {earliest_date!s}')
print(f'Last day in the dataset {latest_date!s}')

In [None]:
# Per-day made shots (sum of fgm) and attempts (count), plus their ratio
shots_by_day = df.groupby(['date'])['fgm'].agg(['sum','count'])
daily_made = shots_by_day['sum']
daily_attempts = shots_by_day['count']
shots_by_day['percentage'] = round((daily_made / daily_attempts), 2)

# Ratio over the whole dataset, drawn as a horizontal reference line that
# spans the full date range
overall_percentage = round(df['fgm'].sum()/df['fgm'].count(),2)
reference_line = go.layout.Shape(
    type="line",
    x0=shots_by_day.index.min(),
    x1=shots_by_day.index.max(),
    y0=overall_percentage,
    y1=overall_percentage,
    line=dict(
        color="LightSeaGreen",
        dash="dashdot"
    )
)

daily_trace = go.Scatter(
    x=shots_by_day.index,
    y=shots_by_day['percentage'].values
)

fig = go.Figure()
fig.add_trace(daily_trace)
fig.update_layout(shapes=[reference_line])

fig.show()

### 5.2.2 Quarter of the Game

In [None]:
# Made shots (sum) and attempts (count) for every period / shot-value pair
shots_by_period = df.groupby(['period','pts_type'])['fgm'].agg(['sum','count'])
made_per_group = shots_by_period['sum']
attempts_per_group = shots_by_period['count']
shots_by_period['percentage'] = round((made_per_group / attempts_per_group), 2)

# Cross-sections on the second index level (pts_type), leaving period as the index
three_point_shots_by_period = shots_by_period.xs(3, level=1)
two_point_shots_by_period = shots_by_period.xs(2, level=1)

In [None]:
# Line chart of the per-period percentage, one trace per shot value
trace1 = go.Scatter(
    name='2-Point Field Goal',
    x=two_point_shots_by_period.index,
    y=two_point_shots_by_period['percentage'].values,
)
trace2 = go.Scatter(
    name='3-Point Field Goal',
    x=three_point_shots_by_period.index,
    y=three_point_shots_by_period['percentage'].values,
)

# Label periods 1-4 as quarters and 5-7 as overtime periods
period_axis = dict(
    tickmode='array',
    tickvals=[1, 2, 3, 4, 5, 6, 7],
    ticktext=['Q1', 'Q2', 'Q3', 'Q4', 'OT1', 'OT2', 'OT3'],
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='group',
    title='Field Goal Percentage by Quarter',
    xaxis=period_axis,
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

### 5.2.3 Second of the Possession

In [None]:
# NOTE: this cell is disabled. The shot_clock distribution plot below is wrapped
# in a string literal, so none of it is executed.
"""
import plotly.figure_factory as ff
import numpy as np

df['shot_clock'] = df['shot_clock'].dropna().astype(int)

x1 = df['shot_clock']

hist_data = [[x1]]

group_labels = ['Group 1']

# Create distplot with curve_type set to 'normal'
fig = ff.create_distplot(hist_data, group_labels, show_hist=False)

# Add title
fig.update_layout(title_text='Curve and Rug Plot')
fig.show()
"""

## 5.3 Shot Accuracy Based on Player's Shot Distance and Closest Defender Distance

In [None]:
# Scatter of shot distance (x) against closest-defender distance (y) for every
# shot; markers are half-transparent so overlapping points remain visible
marker_style = dict(opacity=0.5)
trace1 = go.Scattergl(
    x=df['shot_dist'],
    y=df['close_def_dist'],
    mode='markers',
    marker=marker_style,
)

data = [trace1]
figure = go.Figure(data=data)

py.iplot(figure)

# 6. Summary