In [41]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
%matplotlib inline

# Notebook-wide display settings: show every DataFrame column and use
# seaborn's white-grid theme for plots.
pd.options.display.max_columns = None
sns.set_style("whitegrid")

In [42]:
from sklearn.decomposition import PCA, KernelPCA
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import VarianceThreshold, RFE, SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier, RandomForestClassifier, AdaBoostClassifier


In [47]:
# Read the raw dataset; every later cell transforms this `data` frame.
data = pd.read_csv(filepath_or_buffer="../input/data.csv")

In [48]:
# Collapse the minutes/seconds clock into a single "seconds left in the
# period" value, then flag shots taken with fewer than 5 seconds left.
period_clock = data['minutes_remaining'] * 60 + data['seconds_remaining']
data['seconds_from_period_end'] = period_clock
# Stored as 1.0 / 0.0 (float64), matching the dtype the masked assignments produced.
data['last_5_sec_in_period'] = (period_clock < 5).astype('float64')

In [49]:
# Columns removed before modelling, dropped in a single in-place call.
unused_columns = [
    "seconds_remaining",
    "team_name",
    "game_event_id",
    "lat",
    "lon",
    "game_id",
    "team_id",
    "matchup",
]
data.drop(unused_columns, axis=1, inplace=True)

In [50]:
# Game date: parse once, derive year / month / day-of-week as categorical
# features, then discard the raw date column.
game_date_parts = pd.to_datetime(data['game_date']).dt

data['game_year'] = game_date_parts.year.astype('category')
data['game_month'] = game_date_parts.month.astype('category')
data['game_day'] = game_date_parts.dayofweek.astype('category')

data.drop('game_date', axis=1, inplace=True)

In [7]:
# Summarise the prepared frame: dtype and non-null count for each column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30697 entries, 0 to 30696
Data columns (total 20 columns):
action_type                30697 non-null object
combined_shot_type         30697 non-null object
loc_x                      30697 non-null int64
loc_y                      30697 non-null int64
minutes_remaining          30697 non-null int64
period                     30697 non-null int64
playoffs                   30697 non-null int64
season                     30697 non-null object
shot_distance              30697 non-null int64
shot_made_flag             25697 non-null float64
shot_type                  30697 non-null object
shot_zone_area             30697 non-null object
shot_zone_basic            30697 non-null object
shot_zone_range            30697 non-null object
opponent                   30697 non-null object
shot_id                    30697 non-null int64
seconds_from_period_end    30697 non-null int64
last_5_sec_in_period       30697 non-null float64
game_year      

In [8]:
# Columns to one-hot encode. Each is replaced by indicator columns named
# "<column>#<value>", appended after the remaining columns in list order.
categorial_cols = [
    'action_type',
    'combined_shot_type',
    'season',
    'shot_type',
    "game_year",
    "game_month",
    'shot_zone_area',
    'shot_zone_basic',
    'shot_zone_range',
    'opponent',
    'loc_x',
    'loc_y',
]

data = pd.get_dummies(data, columns=categorial_cols, prefix_sep="#")

In [9]:
# Rows with a recorded shot_made_flag form the training set; rows where it
# is missing are the ones to predict.
has_label = data["shot_made_flag"].notnull()
train = data.loc[has_label]
test = data.loc[~has_label]

In [10]:
# Separate the target from the features; the unlabeled rows lose the
# (all-null) target column as well.
# NOTE(review): `shot_id` is still present in X and test, so the model is
# trained on it — presumably it is only a row identifier; confirm whether
# it should be a feature.
target_column = "shot_made_flag"
y = train[target_column]
X = train.drop(target_column, axis=1)
test = test.drop(target_column, axis=1)

In [11]:
# Feature-matrix dimensions as (rows, columns) after the one-hot expansion above.
X.shape

(25697, 1119)

In [12]:
# Cross-validation configuration shared by the modelling cells.
#
# The splitter is built with the sklearn.model_selection API. The original
# KFold(n=..., n_folds=...) signature belongs to sklearn.cross_validation,
# which was removed in scikit-learn 0.20, while the 'neg_log_loss' scorer
# used here already requires a release that ships model_selection.
# cross_val_score and GridSearchCV are re-bound from the same module because
# the legacy versions iterate over `cv` directly and cannot consume a
# model_selection splitter.
# NOTE(review): the legacy sklearn.cross_validation / sklearn.grid_search
# imports in the top import cell are untouched and still fail on
# scikit-learn >= 0.20; they need the same migration.
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

seed = 7                 # random_state for the estimators
processors = 1           # n_jobs passed to cross_val_score
num_folds = 3
num_instances = len(X)   # no longer needed by KFold; kept for any cell that reads it
scoring = 'neg_log_loss'

# Folds stay contiguous and unshuffled, exactly as before: the old call passed
# random_state without shuffle=True, which had no effect (and is rejected by
# current scikit-learn). Use shuffle=True, random_state=seed to randomise.
kfold = KFold(n_splits=num_folds, shuffle=False)

In [13]:
# Gradient-boosted trees, scored with the cross-validation settings defined
# in the configuration cell.
model = GradientBoostingClassifier(random_state=seed, n_estimators=100)

results = cross_val_score(
    model, X, y,
    scoring=scoring,
    cv=kfold,
    n_jobs=processors,
)
cv_mean, cv_std = results.mean(), results.std()
print("({0:.3f}) +/- ({1:.3f})".format(cv_mean, cv_std))

(-0.614) +/- (0.003)


In [20]:
# Fit the estimator on every labelled row so the following cells can call
# predict_proba on it.
model.fit(X, y)

In [38]:
# Column 1 of predict_proba on the training features themselves (in-sample scores).
# NOTE(review): presumably the probability of shot_made_flag == 1 — confirm
# against model.classes_.
y_pred = model.predict_proba(X)[:,1]

In [40]:
# Score the unlabeled rows and write the submission file.
import os

submission_path = "../output/gbt.csv"

y_sumbit = model.predict_proba(test)
# One row per unlabeled shot: its id plus column 1 of predict_proba
# (presumably P(shot_made_flag == 1) — confirm against model.classes_).
submission = pd.DataFrame({
    "shot_id": test["shot_id"],
    "shot_made_flag": y_sumbit[:, 1],
})

# to_csv does not create missing directories; make sure the target exists so
# a fresh checkout does not fail on the final step.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)