In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Reference
* https://www.kaggle.com/dixhom/data-analysis-for-beginners
* https://www.kaggle.com/arjoonn/preliminary-exploration

# **Import libraries**

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# **Import data**

In [None]:
from subprocess import check_output

# Show what is available in the input directory before loading anything.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Read the shot log straight from the zipped CSV and summarise its columns.
df = pd.read_csv('../input/kobe-bryant-shot-selection/data.csv.zip')
df.info()


# **Exploratory Data**

* Court visualization of made and missed shots

In [None]:
court_scale, alpha = 7, 0.05
plt.figure(figsize=(2 * court_scale, court_scale * (84.0 / 50.0)))

# One panel per outcome: (subplot position, shot_made_flag value, colour, title).
panels = [
    (121, 1, 'green', 'Shots Made'),
    (122, 0, 'red', 'Shots missed'),
]
for position, flag, colour, heading in panels:
    ax = plt.subplot(position)
    shots = df.loc[df.shot_made_flag == flag]
    ax.scatter(shots.loc_x, shots.loc_y, color=colour, alpha=alpha)
    ax.set_title(heading)
    # Same vertical range on both panels so they can be compared side by side.
    ax.set_ylim([-50, 900])

plt.savefig('shots_made_and_missed.png')

The green plot shows that he made many shots from under the basket, while the red plot shows that he also missed a lot of shots from under the basket and from around the three-point line.

* Combined shot types

In [None]:
# Scatter every shot location, one colour per combined shot type.
# Relies on `court_scale` set by the previous plotting cell.
groups = df.groupby('combined_shot_type')

fig, ax = plt.subplots(figsize=(court_scale, court_scale*(84.0/50.0)))
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

# Opacity per shot type: log-scaled group size divided by the log of the total
# number of rows, so that every value lands in (0, 1]. Matplotlib rejects alpha
# values outside [0, 1], and the raw np.log1p(count) exceeds 1 for any group
# with two or more shots.
n = float(len(df.combined_shot_type))
# groups.size() is ordered by group key, the same order in which `groups` iterates.
alphas = (np.log1p(groups.size()) / np.log1p(n)).tolist()

for (name, group), alp in zip(groups, alphas):
    ax.plot(group.loc_x, group.loc_y,
            marker='.', linestyle='', ms=12,
            label=name, alpha=alp)
ax.legend()
plt.savefig('combined_shot_type_layout.png')

In [None]:
# Grid of made (green) / missed (red) shot locations, one panel per first word
# of `action_type`.
court_scale, alpha = 5, 0.5
df['unique_first_words'] = df.action_type.str.split(' ').str[0]
uq_count = len(df['unique_first_words'].unique())
a = int(uq_count / 2) + 1  # number of rows in the two-column grid

groups = df.groupby('unique_first_words')
# Create only the figure here: the panels are added one by one below, so no
# stray full-figure axes is left underneath the grid.
fig = plt.figure(figsize=(2 * court_scale, a * 1.1 * court_scale*(84.0/50.0)))
for index, (name, group) in enumerate(groups):
    ax = fig.add_subplot(a, 2, index + 1)
    h = group.loc[group.shot_made_flag == 1, ['loc_y', 'loc_x']]
    m = group.loc[group.shot_made_flag == 0, ['loc_y', 'loc_x']]

    ax.plot(h.loc_x, h.loc_y,
            marker='.', linestyle='', ms=12,
            label=name, alpha=alpha, color='green')

    ax.plot(m.loc_x, m.loc_y,
            marker='.', linestyle='', ms=12,
            label=name, alpha=alpha, color='red')

    # Black reference lines through the origin.
    ax.axhline(0, color='black')
    ax.axvline(0, color='black')

    # Symmetric limits at mean + 3 standard deviations. The limits are skipped
    # when they are NaN (e.g. a single-shot group has no standard deviation) or
    # not positive, because matplotlib rejects NaN limits and a non-positive
    # value would collapse or flip the axis.
    x_lim = group.loc_x.mean() + 3* group.loc_x.std()
    y_lim = group.loc_y.mean() + 3* group.loc_y.std()
    if np.isfinite(x_lim) and x_lim > 0:
        ax.set_xlim([-x_lim, x_lim])
    if np.isfinite(y_lim) and y_lim > 0:
        ax.set_ylim([-y_lim, y_lim])

    ax.set_title(name)

# Save once, after every panel has been drawn, instead of on each iteration.
fig.savefig('action_type_first_words.png')

In [None]:
court_scale, alpha = 7, 0.1

fig = plt.figure(figsize=(2 * court_scale, court_scale * (84.0 / 50.0)))

# Same shot layout twice, coloured by a different clock column in each panel.
time_panels = (
    (121, 'seconds_remaining', 'Seconds Remaining'),
    (122, 'minutes_remaining', 'Minutes Remaining'),
)
for position, clock_column, heading in time_panels:
    plt.subplot(position)
    plt.scatter(df.loc_x, df.loc_y, alpha=alpha, c=df[clock_column], cmap='Greens_r')
    plt.title(heading)

plt.savefig('time_remaining_shot_layout.png')

* Shooting accuracy with shot distance

# **Feature Engineering**

In [None]:
def get_acc(df, against):
    """Plot the share of made shots for each distinct value of ``df[against]``.

    Cross-tabulates ``shot_made_flag`` against the chosen column, rescales every
    column of the table so it sums to 1, and plots the table's second row
    against the column values. The second row is presumably the
    ``shot_made_flag == 1`` share; that holds when both outcomes occur in the
    data. The figure is also saved as ``<against>_vs_accuracy.png``.

    Parameters
    ----------
    df : DataFrame containing ``shot_made_flag`` and the ``against`` column.
    against : name of the column to compare accuracy against.
    """
    counts = pd.crosstab(df.shot_made_flag, df[against])
    shares = counts.apply(lambda col: col / col.sum(), axis=0)
    made_share = shares.values[1, :]

    plt.figure(figsize=(7, 5))
    plt.plot(shares.columns, made_share)
    plt.xlabel(against)
    plt.ylabel('% shots made')
    plt.savefig(against + '_vs_accuracy.png')


get_acc(df, 'shot_distance')

In [None]:
data = df[['loc_x', 'loc_y', 'shot_made_flag']]
data = data.dropna()


def test_it(data):
    """Return the 10-fold cross-validated ROC AUC scores of a default random forest.

    Every column of ``data`` except ``shot_made_flag`` is used as a feature and
    ``shot_made_flag`` is the target. The classifier is created without a fixed
    ``random_state``, so scores can vary slightly between runs.

    Parameters
    ----------
    data : DataFrame holding the feature columns plus ``shot_made_flag``.

    Returns
    -------
    Array of 10 ROC AUC scores, one per fold.
    """
    clf = RandomForestClassifier(n_jobs=-1)  # A super simple classifier
    # `columns=` instead of a positional axis argument: pandas 2.x no longer
    # accepts `drop(label, 1)`.
    features = data.drop(columns='shot_made_flag')
    return cross_val_score(clf, features, data.shot_made_flag,
                           scoring='roc_auc', cv=10)


test_it(data).mean()


Reference
* https://pinkwink.kr/986

In [None]:
# Joint KDE of shot coordinates (uses the `data` frame built in the cell above).
sns.jointplot(data=data, x="loc_x", y="loc_y", kind='kde')

In [None]:
# Score a model that only sees the loc_y coordinate.
loc_y_columns = ['loc_y', 'shot_made_flag']
data = df[loc_y_columns].dropna()
test_it(data).mean()

In [None]:
# Score a model that only sees the shot distance.
distance_columns = ['shot_distance', 'shot_made_flag']
data = df[distance_columns].dropna()
test_it(data).mean()

In [None]:
# Share of made shots for each value of seconds_remaining.
get_acc(df, against='seconds_remaining')

In [None]:
# Score seconds_remaining together with shot_distance.
seconds_distance_columns = ['seconds_remaining', 'shot_distance', 'shot_made_flag']
data = df[seconds_distance_columns]
data = data.dropna()
test_it(data).mean()

In [None]:
# Share of made shots for each value of minutes_remaining.
get_acc(df, against='minutes_remaining')

In [None]:
# Share of made shots for each period.
get_acc(df, against='period')

In [None]:
print(df.season.unique())
# Keep the text before the first '-' of each season label and store it as an integer.
df['season_start_year'] = df['season'].str.split('-').str[0].astype(int)
get_acc(df, against='season_start_year')

In [None]:
# Score season_start_year together with shot_distance.
season_distance_columns = ['season_start_year', 'shot_distance', 'shot_made_flag']
data = df[season_distance_columns]
data = data.dropna()
test_it(data).mean()

In [None]:
# Give each distinct action type an integer code in order of first appearance.
distinct_actions = df.action_type.unique()
action_map = dict(zip(distinct_actions, range(len(distinct_actions))))
df['action_type_enumerated'] = df['action_type'].map(action_map)
get_acc(df, against='action_type_enumerated')

In [None]:
def sort_encode(df, field):
    """Re-number the values of ``df[field]`` by ascending made-shot share.

    Uses the same column-normalised crosstab as ``get_acc`` and ranks each value
    of ``field`` by the table's second row, with ties broken by the value
    itself. The rank is written to a new column ``<field>_sort_enumerated`` on
    ``df`` (modified in place), and the accuracy curve for that new column is
    then plotted with ``get_acc``.

    Values of ``field`` that never appear in the crosstab get no rank and are
    left as NaN in the new column.
    """
    shares = pd.crosstab(df.shot_made_flag, df[field]).apply(
        lambda col: col / col.sum(), axis=0)
    # Tuples sort by share first, then by the original code.
    ranked = sorted(zip(shares.values[1, :], shares.columns))
    new_map = {old_number: rank for rank, (_share, old_number) in enumerate(ranked)}

    new_field = field + '_sort_enumerated'
    df[new_field] = df[field].map(new_map)
    get_acc(df, new_field)


sort_encode(df, 'action_type_enumerated')

In [None]:
# Compare the first-appearance encoding with the accuracy-sorted encoding,
# each paired with shot_distance.
plain_columns = ['action_type_enumerated', 'shot_distance', 'shot_made_flag']
sorted_columns = ['action_type_enumerated_sort_enumerated', 'shot_distance', 'shot_made_flag']

data = df[plain_columns].dropna()
x = test_it(data)

data = df[sorted_columns].dropna()
y = test_it(data)

print(x.mean(), y.mean())

In [None]:
# Give each distinct opponent an integer code in order of first appearance,
# then re-rank those codes by made-shot share.
distinct_opponents = df.opponent.unique()
opponent_map = dict(zip(distinct_opponents, range(len(distinct_opponents))))
df['opponent_enumerated'] = df['opponent'].map(opponent_map)

sort_encode(df, field='opponent_enumerated')


In [None]:
# Boolean flag stored as `away`: does the matchup string contain '@'?
df['away'] = df['matchup'].str.contains('@')

away_columns = ['action_type_enumerated', 'shot_distance', 'shot_made_flag', 'away']
data = df[away_columns].dropna()
test_it(data).mean()

Reference
* boxplot : https://boxnwhis.kr/2019/02/19/boxplot.html

In [None]:
# Sweep the number of trees and box-plot the 10 cross-validation AUC scores
# obtained for each setting.
data = df[['action_type_enumerated', 'shot_distance',
           'shot_made_flag', 'away']].dropna()

# Features and target are the same for every setting, so build them once.
# `columns=` replaces the positional axis argument that pandas 2.x rejects.
features = data.drop(columns='shot_made_flag')
target = data.shot_made_flag

estimators, scores = list(range(1, 100, 5)), []
for i in estimators:
    clf = RandomForestClassifier(n_jobs=-1, n_estimators=i, random_state=2016)
    x = cross_val_score(clf, features, target,
                        scoring='roc_auc', cv=10)
    scores.append(x)

# Repeat each setting once per fold so it lines up with the flattened scores.
x = [i for i in estimators for _ in range(10)]
# seaborn requires x / y to be passed by keyword.
sns.boxplot(x=x, y=np.array(scores).flatten())

In [None]:
# Sweep the maximum tree depth (70 trees) and box-plot the 10 cross-validation
# AUC scores for each depth. Uses the `data` frame built in the preceding cell.
# `columns=` replaces the positional axis argument that pandas 2.x rejects.
depth_features = data.drop(columns='shot_made_flag')
depth_target = data.shot_made_flag

depth, scores = list(range(1, 20, 1)), []
for i in depth:
    clf = RandomForestClassifier(n_jobs=-1, n_estimators=70, max_depth=i, random_state=2016)
    x = cross_val_score(clf, depth_features, depth_target,
                        scoring='roc_auc', cv=10)
    scores.append(x)

# Repeat each depth once per fold so it lines up with the flattened scores.
x = [i for i in depth for _ in range(10)]
# seaborn requires x / y to be passed by keyword.
sns.boxplot(x=x, y=np.array(scores).flatten())


In [None]:
clf = RandomForestClassifier(n_jobs=-1, n_estimators=70, max_depth=7, random_state=2016) # a more powerful classifier

# Rows with a known outcome are used for training; rows whose outcome is
# missing are the ones to predict.
train = df.loc[~df.shot_made_flag.isnull(), ['action_type_enumerated_sort_enumerated',
                                             'shot_distance', 'shot_made_flag', 'away']]
# .copy() makes `test` an independent frame, so the imputation below is
# guaranteed to land on `test` rather than on a temporary view of `df`.
test = df.loc[df.shot_made_flag.isnull(), ['action_type_enumerated_sort_enumerated',
                                           'shot_distance', 'shot_id', 'away']].copy()

# Impute: fill missing sorted action codes with the most frequent code among
# the rows to predict. Plain assignment is used instead of `fillna(inplace=True)`
# on an attribute-accessed column, which is chained assignment and may silently
# leave the NaNs in place.
mode = test.action_type_enumerated_sort_enumerated.mode()[0]
test['action_type_enumerated_sort_enumerated'] = (
    test['action_type_enumerated_sort_enumerated'].fillna(mode))

# Train and predict
# `columns=` replaces the positional axis argument that pandas 2.x rejects.
clf.fit(train.drop(columns='shot_made_flag'), train.shot_made_flag)
predictions = clf.predict_proba(test.drop(columns='shot_id'))

In [None]:
# Second column of predict_proba is used as the predicted shot_made_flag value.
made_probability = predictions[:, 1]
submission = pd.DataFrame({
    'shot_id': test.shot_id,
    'shot_made_flag': made_probability,
})

# Write the two columns in a fixed order, without the index.
submission_columns = ['shot_id', 'shot_made_flag']
submission[submission_columns].to_csv('submission_kobe.csv', index=False)

In [None]:
# Display the submission frame as the cell output for a final visual check.
submission