In [95]:
import pandas as pd
import numpy as np
import matplotlib as plt
import scipy as sp
import seaborn as sns
import sklearn.model_selection
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [96]:
#import dataset and look at column names
shot_orig = pd.read_csv('shot_logs.csv')
shot_orig.columns

Index(['GAME_ID', 'MATCHUP', 'LOCATION', 'W', 'FINAL_MARGIN', 'SHOT_NUMBER',
       'PERIOD', 'GAME_CLOCK', 'SHOT_CLOCK', 'DRIBBLES', 'TOUCH_TIME',
       'SHOT_DIST', 'PTS_TYPE', 'SHOT_RESULT', 'CLOSEST_DEFENDER',
       'CLOSEST_DEFENDER_PLAYER_ID', 'CLOSE_DEF_DIST', 'FGM', 'PTS',
       'player_name', 'player_id'],
      dtype='object')

In [97]:
#get rid of game_id, matchup, location, win, final margin, 
shot = shot_orig.loc[: , ['SHOT_NUMBER', 'PERIOD', 'GAME_CLOCK', 'SHOT_CLOCK', 'DRIBBLES', 'TOUCH_TIME',
                     'SHOT_DIST', 'PTS_TYPE', 'SHOT_RESULT', 'CLOSE_DEF_DIST', 'PTS']]
shot.head()

Unnamed: 0,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,PTS_TYPE,SHOT_RESULT,CLOSE_DEF_DIST,PTS
0,1,1,1:09,10.8,2,1.9,7.7,2,made,1.3,2
1,2,1,0:14,3.4,0,0.8,28.2,3,missed,6.1,0
2,3,1,0:00,,3,2.7,10.1,2,missed,0.9,0
3,4,2,11:47,10.3,2,1.9,17.2,2,missed,3.4,0
4,5,2,10:34,10.9,2,2.7,3.7,2,missed,1.1,0


In [98]:
# find if game_clock refers to the time left in the whole game, or just the period
# shot.GAME_CLOCK

# it seems that none of the time goes above 12, so game_clock refers to just the period

In [99]:
shot.dtypes

#we see the objects are game_clock and shot_result, and all else
# are integers and floats.

SHOT_NUMBER         int64
PERIOD              int64
GAME_CLOCK         object
SHOT_CLOCK        float64
DRIBBLES            int64
TOUCH_TIME        float64
SHOT_DIST         float64
PTS_TYPE            int64
SHOT_RESULT        object
CLOSE_DEF_DIST    float64
PTS                 int64
dtype: object

In [100]:
shot.describe()

Unnamed: 0,SHOT_NUMBER,PERIOD,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,PTS_TYPE,CLOSE_DEF_DIST,PTS
count,128069.0,128069.0,122502.0,128069.0,128069.0,128069.0,128069.0,128069.0,128069.0
mean,6.506899,2.469427,12.453344,2.023355,2.765901,13.571504,2.26467,4.123015,0.997314
std,4.71326,1.139919,5.763265,3.47776,3.043682,8.888964,0.441159,2.756446,1.130978
min,1.0,1.0,0.0,0.0,-163.6,0.0,2.0,0.0,0.0
25%,3.0,1.0,8.2,0.0,0.9,4.7,2.0,2.3,0.0
50%,5.0,2.0,12.3,1.0,1.6,13.7,2.0,3.7,0.0
75%,9.0,3.0,16.675,2.0,3.7,22.5,3.0,5.3,2.0
max,38.0,7.0,24.0,32.0,24.9,47.2,3.0,53.2,3.0


In [101]:
#fill the NaN in shot_clock with the mean

shot[['SHOT_CLOCK']] = shot[['SHOT_CLOCK']].fillna(shot[['SHOT_CLOCK']].mean())

In [102]:
# create and isolate list of shot_results as boolean
shot_y = shot.SHOT_RESULT == 'missed'

#create dataframe of non-object independent variables 
shot_X = shot[['SHOT_NUMBER', 'DRIBBLES', 'TOUCH_TIME', 'SHOT_CLOCK',
              'SHOT_DIST', 'CLOSE_DEF_DIST', 'PTS_TYPE']]

In [103]:
# split data into training and testing

Xtrain, Xtest, ytrain, ytest = train_test_split(shot_X, shot_y, test_size = .25,
                                               random_state = 99)

In [104]:
# attempt to find the percentage of shot_result for different kinds
# kinds of shots first

# use a logistic regression to classify whether or not shot was made

lr = LogisticRegression()
lr.fit(Xtrain, ytrain)
print(accuracy_score(lr.predict(Xtest), ytest))
print(accuracy_score(lr.predict(Xtrain), ytrain))

# simple model seems to be very inaccurate with training and testing data.
# curiously enough, the two accuracy scores are very similar, and it is
# actually higher for the test data.
# This is probably a due to the fact that basketball shots percentages are
# highly volatile in and of themselves.  Repeated basketball shots, when
# when taken in the exact same context, is still most likely comprised of
# both makes and misses.

0.6106252732837779
0.6088640409782303


In [106]:
# Fine-tuning the model
Cs = [0.001, 0.1, 1, 10, 100]

parameters = {'C' : Cs}
cgrid = GridSearchCV(lr, param_grid = parameters, cv = 5)
cgrid.fit(Xtrain, ytrain)
cgrid.best_estimator_, cgrid.best_params_, cgrid.best_score_, cgrid.grid_scores_



(LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 {'C': 0.001},
 0.6082706062404347,
 [mean: 0.60827, std: 0.00332, params: {'C': 0.001},
  mean: 0.60827, std: 0.00337, params: {'C': 0.1},
  mean: 0.60826, std: 0.00339, params: {'C': 1},
  mean: 0.60827, std: 0.00339, params: {'C': 10},
  mean: 0.60827, std: 0.00339, params: {'C': 100}])