## Predicting Kobe's shots using sklearn ##

This notebook builds on the results of exploratory data analysis, feature selection, feature engineering, and model selection from:
https://www.kaggle.com/khozzy/kobe-bryant-shot-selection/kobe-shots-show-me-your-best-model

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from subprocess import check_output
# Show which files are present in the ../input directory (shells out to `ls`).
print(check_output(["ls", "../input"]).decode("utf8"))

### Reading data

In [None]:
# Load the raw shot log and index the rows by their shot id.
data = pd.read_csv('../input/data.csv').set_index('shot_id')

The following columns should be explicitly treated as categorical (note that `action_type` and `period` are cast to `object` rather than `category`):

In [None]:
# Target dtype for each column that must not be treated as numeric/free text.
# action_type and period become plain objects; the rest become pandas categoricals.
column_dtypes = [
    ("action_type", 'object'),
    ("combined_shot_type", 'category'),
    ("period", 'object'),
    ("season", 'category'),
    ("shot_type", 'category'),
]
for column, dtype in column_dtypes:
    data[column] = data[column].astype(dtype)

Let's prepare the data for the splitting into train/test set later.

In [None]:
# Work on a copy so the raw frame stays untouched.
data_cl = data.copy()
# Labels for every shot; NaN marks the shots whose outcome must be predicted.
target = data_cl['shot_made_flag'].copy()
# True for the rows with a missing label (the submission rows).
unknown_mask = target.isnull()

The following columns are to be dropped:

In [None]:
# Columns removed before modelling, each with the reason it is dropped.
dropped_columns = [
    'team_id',         # only 1 category
    'lat',             # correlated with loc_x
    'lon',             # correlated with loc_y
    'game_id',         # should not depend on game id; also contained in opponent/match
    'game_event_id',   # independent, unique for every shot in a game
    'team_name',       # always LA Lakers
    'shot_made_flag',  # target variable
]
data_cl.drop(dropped_columns, axis=1, inplace=True)

### Feature Engineering ##

Time remaining:

In [None]:
# Collapse the minutes/seconds pair into a single "seconds left in the period" feature.
seconds_left = 60 * data_cl['minutes_remaining'] + data_cl['seconds_remaining']
data_cl['seconds_from_period_end'] = seconds_left
# Flag shots taken when fewer than 5 seconds remain in the period.
data_cl['last_5_sec_in_period'] = seconds_left < 5
# The two source columns are now redundant.
data_cl.drop(['minutes_remaining', 'seconds_remaining'], axis=1, inplace=True)

Matchup -- away/home:

In [None]:
# Take the matchup column out of the frame and keep only a 0/1 flag telling
# whether its text contains 'vs'.
matchup = data_cl.pop('matchup')
data_cl['home_play'] = matchup.str.contains('vs').astype('int')

Extract year and month from date of game:

In [None]:
# Parse the game date once, keep its year and month as features, and discard
# the raw date column.
game_dates = pd.to_datetime(data_cl.pop('game_date'))
data_cl['game_year'] = game_dates.dt.year
data_cl['game_month'] = game_dates.dt.month

Replace 20 least common action types with value 'Other'

In [None]:
# Count how often each action type occurs and take the 20 with the lowest counts.
action_type_counts = data_cl['action_type'].value_counts()
rare_action_types = action_type_counts.sort_values().index.values[:20]
# Fold those rare types into a single 'Other' bucket.
is_rare_action = data_cl['action_type'].isin(rare_action_types)
data_cl.loc[is_rare_action, 'action_type'] = 'Other'

One-hot encoding of categorical variables:

In [None]:
# Columns to expand into one indicator column per distinct value.
categorial_cols = [
    'action_type',
    'combined_shot_type',
    'period',
    'season',
    'shot_type',
    'shot_zone_area',
    'shot_zone_basic',
    'shot_zone_range',
    'game_year',
    'game_month',
    'opponent',
]

for col_name in categorial_cols:
    # Indicator columns are named "<column>_<value>".
    encoded = pd.get_dummies(data_cl[col_name]).add_prefix("{}_".format(col_name))
    # Swap the original column for its indicator columns.
    data_cl = data_cl.drop(col_name, axis=1).join(encoded)

In [None]:
# Preview the first rows of the frame after one-hot encoding.
data_cl.head()

## Train/test/validation Splitting ##

In [None]:
# Rows with a known outcome are used for training and validation.
known_mask = ~unknown_mask
X = data_cl[known_mask]
Y = target[known_mask]
# Rows with a missing outcome are the ones to predict for the submission.
data_submit = data_cl[unknown_mask]

In [None]:
# sklearn.cross_validation has been removed from scikit-learn; the same tools
# live in sklearn.model_selection (which also understands 'neg_log_loss').
from sklearn.model_selection import KFold, cross_val_score

# Shared cross-validation settings reused by the model cells below.
seed = 1999          # random seed for the fold assignment
processors = 1       # value passed as n_jobs to cross_val_score
num_folds = 5        # number of cross-validation folds
num_instances = len(X)
scoring = 'neg_log_loss'

# model_selection.KFold takes the number of splits instead of the sample count.
# random_state only has an effect when shuffling is enabled (and newer releases
# reject it otherwise), so shuffle explicitly to make the seed meaningful.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

## Logistic Regression ##

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Score a default logistic regression with the shared splitter and metric.
lr_estimator = LogisticRegression()
cv_results = cross_val_score(
    lr_estimator, X, Y,
    cv=kfold,
    scoring=scoring,
    n_jobs=processors,
)
# Report mean and spread of the per-fold scores.
cv_mean, cv_std = cv_results.mean(), cv_results.std()
print("{0}: ({1:.3f}) +/- ({2:.3f})".format('LR', cv_mean, cv_std))

In [None]:
# Fit logistic regression on every labelled shot and predict the unlabelled ones.
model = LogisticRegression()
model.fit(X, Y)
preds = model.predict_proba(data_submit)

# predict_proba orders its columns like model.classes_.  The submission needs
# the probability that the shot was MADE (class 1), not the class-0 column.
made_column = list(model.classes_).index(1)

submission = pd.DataFrame()
submission["shot_id"] = data_submit.index
submission["shot_made_flag"] = preds[:, made_column]

submission.to_csv("sub_lr.csv", index=False)

## XGBOOST

In [None]:
import xgboost as xgb

In [None]:
# Wrap the labelled rows (features X, labels Y) in an XGBoost DMatrix for training.
d_train = xgb.DMatrix(X, label=Y)
# The submission rows have no label, so only their features are wrapped.
dtest = xgb.DMatrix(data_submit)

In [None]:
# Booster configuration shared by the cross-validation and the final training run.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 4,
    'silent': 1,
    'colsample_bytree': 0.7,
    'eta': 0.3,
}

In [None]:
# Cross-validate the booster on log loss using stratified folds.  The round limit
# is set very high on purpose: early stopping (10 rounds) is what ends the run.
# Progress is reported every 10 rounds.
cvp = xgb.cv(params, d_train, num_boost_round=100000, early_stopping_rounds=10, metrics=['logloss'], verbose_eval=10, stratified=True)

In [None]:
# Train the final booster for as many rounds as the early-stopped cross-validation
# kept: the evaluation history in `cvp` ends at its best iteration, so its length
# replaces the previously hard-coded round count.
clf = xgb.train(params, d_train, num_boost_round=len(cvp))

In [None]:
# Score the submission rows; with the 'binary:logistic' objective set in params
# the booster outputs probabilities rather than class labels.
preds = clf.predict(dtest)

In [None]:
# Assemble the two-column submission (shot id, predicted probability) and save it.
submission_columns = ["shot_id", "shot_made_flag"]
submission = pd.DataFrame(
    {"shot_id": data_submit.index, "shot_made_flag": preds},
    columns=submission_columns,
)
submission.to_csv("sub_xgb.csv", index=False)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest size and the number of features considered at each split.
num_trees = 200
num_features = 20

# Seed the forest with the shared `seed` so the cross-validation score and the
# later submission are reproducible from run to run.
model = RandomForestClassifier(
    n_estimators=num_trees,
    max_features=num_features,
    random_state=seed,
)

# Score the forest with the shared splitter and metric.
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))

In [None]:
# Fit the random forest on every labelled shot and predict the unlabelled ones.
model.fit(X, Y)
preds_rf = model.predict_proba(data_submit)

# predict_proba orders its columns like model.classes_.  The submission needs
# the probability that the shot was MADE (class 1), not the class-0 column.
made_column = list(model.classes_).index(1)

submission = pd.DataFrame()
submission["shot_id"] = data_submit.index
submission["shot_made_flag"] = preds_rf[:, made_column]

submission.to_csv("sub_rf.csv", index=False)

## TensorFlow

In [None]:
import tensorflow as tf

In [None]:
# Columns fed to TensorFlow as dense numeric tensors.
CONTINUOUS_COLUMNS = [
    'loc_x',
    'loc_y',
    'shot_distance',
    'seconds_from_period_end',
]

# Columns fed to TensorFlow as sparse (categorical) tensors.
CATEGORICAL_COLUMNS = [
    'action_type',
    'combined_shot_type',
    'period',
    'season',
    'shot_type',
    'shot_zone_area',
    'shot_zone_basic',
    'shot_zone_range',
    'opponent',
    'last_5_sec_in_period',
    'playoffs',
    'home_play',
    'game_year',
    'game_month',
]

In [None]:
def input_fn(df):
  """Build the feature tensors and the label tensor for the rows of ``df``.

  Args:
    df: DataFrame containing every column listed in CONTINUOUS_COLUMNS and
      CATEGORICAL_COLUMNS.  Its index is assumed to be the same shot_id index
      as the global ``target`` series -- TODO confirm against how the frames
      passed in are built.

  Returns:
    A ``(feature_cols, label)`` tuple: a dict mapping each column name to its
    tensor, and a constant Tensor with the labels belonging to the rows of
    ``df``.
  """
  # One constant Tensor per continuous column.
  continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}

  # One single-column SparseTensor per categorical column (row i -> entry [i, 0]).
  # The shape is passed positionally because its keyword was renamed between
  # TensorFlow releases (``shape`` -> ``dense_shape``); the position is stable.
  categorical_cols = {}
  for k in CATEGORICAL_COLUMNS:
    num_rows = df[k].size
    categorical_cols[k] = tf.SparseTensor(
        [[i, 0] for i in range(num_rows)],
        df[k].values,
        [num_rows, 1])

  # Merge the two dictionaries.  Concatenating ``.items()`` only works on
  # Python 2, so copy-and-update is used instead.
  feature_cols = dict(continuous_cols)
  feature_cols.update(categorical_cols)

  # Use only the labels of the rows in ``df`` (aligned by index) rather than the
  # complete ``target`` series, whose length would not match the features.
  label = tf.constant(target.loc[df.index].values)
  return feature_cols, label

def train_input_fn():
  """Return the ``input_fn`` output (features, label) for ``df_train``.

  NOTE(review): ``df_train`` is read from the notebook's global namespace and is
  not defined in the cells above this one -- confirm it is created before use.
  """
  return input_fn(df_train)

def eval_input_fn():
  """Return the ``input_fn`` output (features, label) for ``df_test``.

  NOTE(review): ``df_test`` is read from the notebook's global namespace and is
  not defined in the cells above this one -- confirm it is created before use.
  """
  return input_fn(df_test)

We need to do some feature engineering.

In [None]:
# Sparse feature columns for the two 0/1 flags.  column_name must equal the key
# under which input_fn supplies the feature, i.e. the DataFrame column name.
# NOTE(review): the keys are integers; depending on the TensorFlow release the
# column may also need an explicit integer dtype -- confirm.
playoffs = tf.contrib.layers.sparse_column_with_keys(
  column_name="playoffs", keys=[0, 1])
# The DataFrame column (and the CATEGORICAL_COLUMNS entry) is 'home_play';
# the previous "homeplay" did not match any supplied feature.
homeplay = tf.contrib.layers.sparse_column_with_keys(
  column_name="home_play", keys=[0, 1])