In [54]:
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
import os
import pandas as pd
import numpy as np

## Read Data

In [55]:
os.chdir("/home/rk9cx/Kaggle/Kobe Shot Selection/")
df = pd.read_csv("data.csv", index_col = False, low_memory=False, parse_dates=["game_date"])
test = df[df['shot_made_flag'].isna()]
train = df[~df['shot_made_flag'].isna()]

## Preprocessing

In [None]:
#converting date into different fatures
add_datepart(train, 'game_date')
add_datepart(test, 'game_date')

In [None]:
#converting categorical variables into label coding
train_cats(train)
apply_cats(test, train)

In [None]:
#test.drop(["shot_made_flag"], axis = 1, inplace= True)
df_test, y_test, nas = proc_df(test, 'shot_made_flag')

In [None]:
#imputing missing values with median
df, y, nas = proc_df(train, 'shot_made_flag')

In [60]:
#function for train-test split
def split_vals(a,n): 
    return a[:n].copy(), a[n:].copy()

n_valid = 5000  # same as Kaggle's test set size
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(train, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

## Modelling - Basic Random Forest

In [61]:
#cross validation
m = RandomForestClassifier(n_jobs=-1)
m.fit(X_train, y_train)
metrics.log_loss(y_valid,m.predict_proba(X_valid))

0.7970602575310219

# Approach 2 - One-hot Encoding

## Read Data

In [64]:
os.chdir("/home/rk9cx/Kaggle/Kobe Shot Selection/")
df = pd.read_csv("data.csv", index_col = False, low_memory=False, parse_dates=["game_date"])
test = df[df['shot_made_flag'].isna()]
train = df[~df['shot_made_flag'].isna()]

## One-hot Encoding and Preprocessing

In [65]:
def make_onehot_feat_dict(df, feat_key, feat_name):
    # Create features for each day of the week
    feat_vals = df[feat_key].values
    all_vals = np.unique(feat_vals)
    N_vals = len(all_vals)
    N_feat = N_vals - 1

    # Create keys
    keys = [0]*N_feat
    for i in range(N_feat):
        keys[i] = 'f_'+feat_name+'_'+ str(all_vals[i])

    # Create value for each training example in dict
    feat_dict = {}
    for i, k in enumerate(keys):
        this_day = all_vals[i]
        feat_dict[k] = feat_vals == this_day
    return feat_dict
#converting top 2 categorical variable to one-hot encoding
action_type_dict = make_onehot_feat_dict(df, 
                                      'action_type', 'action_type')
combined_shot_type_dict = make_onehot_feat_dict(df, 
                                      'combined_shot_type', 'combined_shot_type')

In [66]:
#convert one-hot to dictionary
all_dicts = [action_type_dict, combined_shot_type_dict]
feat_dict = all_dicts[0].copy()
for d in all_dicts[1:]:
    feat_dict.update(d)

In [67]:
#subset dataframe to eliminate categorical variables
df_new = df.drop(["action_type","combined_shot_type"], axis = 1)

In [68]:
#modify dataframe to accomodate the on-hot encoded features
df_feat = pd.DataFrame.from_dict(feat_dict)
df_feat = pd.concat([df_feat, df_new], axis=1)
df_feat.shape

(30697, 84)

In [69]:
test_new = df[df['shot_made_flag'].isna()]
train_new = df[~df['shot_made_flag'].isna()]

In [None]:
#converting date into different fatures
add_datepart(train_new, 'game_date')
add_datepart(test_new, 'game_date')

In [None]:
#converting categorical variables into label coding
train_cats(train_new)
apply_cats(test_new, train_new)

In [None]:
#imputing missing values with median
df_test, y_test, nas = proc_df(test_new, 'shot_made_flag')
df, y, nas = proc_df(train_new, 'shot_made_flag')

## Splitting Data for Cross Validation

In [52]:
#function for train-test split
def split_vals(a,n): 
    return a[:n].copy(), a[n:].copy()

n_valid = 5000  # same as Kaggle's test set size
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(train, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

In [53]:
#cross validation
m = RandomForestClassifier(n_jobs=-1)
m.fit(X_train, y_train)
metrics.log_loss(y_valid,m.predict_proba(X_valid))

0.8525022750709504