In [None]:
# Data Cleaning Tools
# pandas is used throughout the notebook for loading and transforming the shot log.
# NOTE(review): numpy and datetime are not referenced in the cells visible here — confirm before removing.
import pandas as pd
import numpy as np
import datetime

import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# also hides pandas assignment warnings and library deprecation notices.
warnings.filterwarnings("ignore")

In [None]:
# Read the raw shot log and immediately discard the columns that are not used as
# model inputs (the author dropped them for having extremely low correlation
# with the feature being predicted).
NBA_Shot_Logs = pd.read_csv('../input/nba-shot-logs/shot_logs.csv').drop(
    columns=[
        'player_id',
        'player_name',
        'FGM',
        'W',
        'GAME_ID',
        'MATCHUP',
        'FINAL_MARGIN',
        'CLOSEST_DEFENDER_PLAYER_ID',
        'CLOSEST_DEFENDER',
        'PTS',
    ]
)
# Preview the remaining columns
NBA_Shot_Logs.head()

In [None]:
# Convert GAME_CLOCK from a "minutes:seconds" string into a single integer number
# of seconds so it can be used as a numerical feature.
# The two parts are split once into separate columns and combined with vectorised
# arithmetic instead of a per-row Python lambda.
clock_parts = NBA_Shot_Logs['GAME_CLOCK'].str.split(':', expand=True)
NBA_Shot_Logs['GAME_CLOCK'] = clock_parts[0].astype(int) * 60 + clock_parts[1].astype(int)
# Rename to make the unit explicit; reassignment is used instead of inplace=True.
NBA_Shot_Logs = NBA_Shot_Logs.rename(columns={'GAME_CLOCK': 'GAME_CLOCK_SEC'})

In [None]:
# Display the current state of the DataFrame.
NBA_Shot_Logs

In [None]:
# Per-column flag: True when the column contains at least one missing value
NBA_Shot_Logs.isna().any()

In [None]:
# The author considers imputing the shot clock unfair to the data, so every row
# that has a missing value in any column is removed instead
# (dropna defaults: axis=0, how='any').
NBA_Shot_Logs = NBA_Shot_Logs.dropna()

# Binary encoding for LOCATION and SHOT_RESULT
* Convert categorical values of LOCATION into numerical values
* Convert categorical values of SHOT_RESULT into numerical values

In [None]:
# Binary encoding for LOCATION ('H' -> 1, 'A' -> 0) and SHOT_RESULT
# ('made' -> 1, 'missed' -> 0).
# .map() replaces the chained-indexing assignment (df[col][mask] = value), which
# pandas does not guarantee to write back to the frame. Any value outside the
# mapping becomes NaN, so the int32 cast fails loudly instead of leaving a stray
# string in the column.
NBA_Shot_Logs = NBA_Shot_Logs.assign(
    LOCATION=NBA_Shot_Logs['LOCATION'].map({'H': 1, 'A': 0}).astype('int32'),
    SHOT_RESULT=NBA_Shot_Logs['SHOT_RESULT'].map({'made': 1, 'missed': 0}).astype('int32'),
)

In [None]:
# Pairwise correlation matrix over the float64 / int64 / int32 columns
numerical_col = NBA_Shot_Logs.select_dtypes(
    include=['float64', 'int64', 'int32']
)
corr = numerical_col.corr()
corr

In [None]:
# Start from the full list of column names. The list still contains the target
# column at this point; it is removed in the following cell.
x_variables = list(NBA_Shot_Logs.columns)
x_variables

In [None]:
# Remove the target column from the predictor list and check the result.
# The target is removed by name rather than by position (the original popped
# index 9, presumably the position of SHOT_RESULT — confirm against the column
# listing above). Filtering by name also keeps the cell idempotent: re-running it
# does not remove a further feature.
x_variables = [column for column in x_variables if column != 'SHOT_RESULT']
x_variables

In [None]:
# Predictor matrix: every column listed in x_variables
X = NBA_Shot_Logs.loc[:, x_variables]
X.head()

In [None]:
# Target vector: SHOT_RESULT is the feature the models are trained to predict.
Y = NBA_Shot_Logs.loc[:, 'SHOT_RESULT']
Y.head()

In [None]:
# Hold out 20% of the rows as the test set and keep 80% for training;
# random_state pins the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=33,
)

In [None]:
# Only the last expression of a cell is rendered, so the dtypes are printed
# explicitly; otherwise they would be computed and silently discarded.
print(X_train.dtypes)
X_train.head()

In [None]:
from sklearn.impute import SimpleImputer
# Replace the missing values in X_train and X_test.
# The imputer is fitted on the training split only and then applied to the test
# split, so no statistic is learned from test data.
myimp = SimpleImputer()

# fit_transform / transform return plain arrays. Rebuild the DataFrames with the
# original column names AND the original row index so X_train / X_test stay
# label-aligned with Y_train / Y_test (the default RangeIndex would break that).
imputed_X_train = pd.DataFrame(
    myimp.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
imputed_X_test = pd.DataFrame(
    myimp.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

X_train = imputed_X_train
X_test = imputed_X_test

In [None]:
# Preview the first five rows of the imputed training features
X_train.head(5)

# Apply boosting method XGBoost
* Change the format of the train and test datasets
* Set up the parameters for the boosting method
* Train the model
* Compute the accuracy score

In [None]:
import xgboost as xgb
from sklearn import metrics

# Wrap the train / test frames in XGBoost's DMatrix container
# (labels are attached to the training matrix only).
d_train_xgboost = xgb.DMatrix(X_train, label=Y_train)
dtest = xgb.DMatrix(X_test)

# Booster settings
parameters = {
    'max_depth': 10,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'learning_rate': .05,
}

# The same settings as a list of (name, value) pairs
plst = list(parameters.items())

In [None]:
# Fit the booster for 50 boosting rounds on the training matrix
xgb_model = xgb.train(
    params=parameters,
    dtrain=d_train_xgboost,
    num_boost_round=50,
)

In [None]:
# Score the test matrix with the trained booster. With the 'binary:logistic'
# objective set in the parameters, these are probabilities rather than class
# labels; they are thresholded in the next cell.
y_pred_xgb = xgb_model.predict(dtest)
y_pred_xgb

In [None]:
# Turn the predicted probabilities into hard class labels using a 0.5 threshold:
# values >= 0.5 become 1, everything else becomes 0 (two distinct values only).
# The comparison is vectorised over the whole array instead of looping element by
# element; the array keeps its original dtype.
y_pred_xgb = (y_pred_xgb >= .5).astype(y_pred_xgb.dtype)

In [None]:
# Print out the accuracy score.
# Arguments follow sklearn's (y_true, y_pred) order, matching the LightGBM
# evaluation at the end of the notebook.
print("Accuracy with XGBoost= ", metrics.accuracy_score(Y_test, y_pred_xgb))

In [None]:
# Plot the model's feature importances so we can see which features matter more and which matter less for predicting SHOT_RESULT.
# In the author's run, GAME_CLOCK_SEC was the most important feature for predicting SHOT_RESULT and PTS_TYPE was the least important.
xgb.plot_importance(xgb_model)

# Apply boosting method LightGBM
* Change the format of X_train and Y_train
* Assign initial values to the parameters of LightGBM.
* Apply grid search to find, one group of parameters at a time, the values that give the best score.

In [None]:
#lightgbm
import lightgbm as lgb

d_train_lgbm = lgb.Dataset(X_train, label=Y_train)

# Here we set a relatively high value of 0.1 to learning_rate but we will reduce it later to verify.
params = {
          'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'auc',
          'nthread':4,
          'learning_rate':0.1,
          'num_leaves':30,
          'max_depth': 10,
          'subsample': 0.65,
          'colsample_bytree': 0.65,
    }
# Cross-validate with these parameters and stop once the AUC has not improved for
# 50 rounds; the number of rounds kept is then used as the n_estimators value.
# Early stopping is requested through the callback API because the
# early_stopping_rounds keyword of lgb.cv is not accepted by newer LightGBM
# releases; verbose=False keeps the output as quiet as the keyword form.
cv_results = lgb.cv(
    params,
    d_train_lgbm,
    num_boost_round=1000,
    nfold=5,
    stratified=False,
    shuffle=True,
    metrics='auc',
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
    seed=0,
)
# The result key is 'auc-mean' in older releases and carries a dataset prefix
# (e.g. 'valid auc-mean') in newer ones, so look it up by suffix.
auc_mean_key = next(key for key in cv_results if key.endswith('auc-mean'))
print('best n_estimators:', len(cv_results[auc_mean_key]))
print('best cv score:', pd.Series(cv_results[auc_mean_key]).max())


In [None]:
# Grid search 1: tree shape (max_depth and num_leaves).
# Every combination in the grid is scored with 5-fold cross-validated ROC AUC.
# The author reports 4 and 20 as the best max_depth and num_leaves; those values
# are reused by the later searches.
# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
# effect when bagging_freq is non-zero — confirm it is doing anything here.
from sklearn.model_selection import GridSearchCV

params_test1 = {
    'max_depth': range(4, 9),
    'num_leaves': range(5, 100, 5),
}

gsearch1 = GridSearchCV(
    estimator=lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1,
        n_estimators=42,
        max_depth=10,
        bagging_fraction=0.8,
        feature_fraction=0.8,
    ),
    param_grid=params_test1,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
gsearch1.fit(X_train, Y_train)
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
# Grid search 2: histogram / leaf size (max_bin and min_data_in_leaf).
# max_depth and num_leaves are fixed to the values found by the previous search.
# The author reports 215 and 41 as the best max_bin and min_data_in_leaf; those
# values are reused by the later searches.
params_test2 = {
    'max_bin': range(5, 256, 10),
    'min_data_in_leaf': range(1, 102, 10),
}

gsearch2 = GridSearchCV(
    estimator=lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1,
        n_estimators=42,
        max_depth=4,
        num_leaves=20,
        bagging_fraction=0.8,
        feature_fraction=0.8,
    ),
    param_grid=params_test2,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
gsearch2.fit(X_train, Y_train)
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

In [None]:
# Grid search 3: sampling (feature_fraction, bagging_fraction and bagging_freq).
# max_bin and min_data_in_leaf are fixed to the values found by the previous search.
# The author reports 0.8, 0.6 and 0 as the best feature_fraction, bagging_fraction
# and bagging_freq; those values are reused by the later searches.
params_test3 = {
    'feature_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
    'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
    'bagging_freq': range(0, 81, 10),
}

gsearch3 = GridSearchCV(
    estimator=lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1,
        n_estimators=42,
        max_depth=4,
        num_leaves=20,
        max_bin=215,
        min_data_in_leaf=41,
    ),
    param_grid=params_test3,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
gsearch3.fit(X_train, Y_train)
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Grid search 4: regularisation (lambda_l1 and lambda_l2).
# feature_fraction, bagging_fraction and bagging_freq are fixed to the values
# found by the previous search.
# The author reports 0 and 0 as the best lambda_l1 and lambda_l2; those values
# are reused by the next search.
params_test4 = {
    'lambda_l1': [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
    'lambda_l2': [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
}

gsearch4 = GridSearchCV(
    estimator=lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1,
        n_estimators=42,
        max_depth=4,
        num_leaves=20,
        max_bin=215,
        min_data_in_leaf=41,
        bagging_fraction=0.6,
        bagging_freq=0,
        feature_fraction=0.8,
    ),
    param_grid=params_test4,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
gsearch4.fit(X_train, Y_train)
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_

In [None]:
# Grid search 5: min_split_gain.
# lambda_l1 and lambda_l2 are fixed to the values found by the previous search.
# The author reports 0 as the best min_split_gain.
params_test5 = {
    'min_split_gain': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
}

gsearch5 = GridSearchCV(
    estimator=lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1,
        n_estimators=42,
        max_depth=4,
        num_leaves=20,
        max_bin=215,
        min_data_in_leaf=41,
        bagging_fraction=0.6,
        bagging_freq=0,
        feature_fraction=0.8,
        lambda_l1=0.0,
        lambda_l2=0.0,
    ),
    param_grid=params_test5,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
gsearch5.fit(X_train, Y_train)
gsearch5.cv_results_, gsearch5.best_params_, gsearch5.best_score_

In [None]:
# Final model: the learning rate is lowered from 0.1 to 0.01 and n_estimators is
# raised to 1000 to compensate, with all other parameters fixed to the values
# chosen by the grid searches above.
# The model is fitted on the training split and evaluated on the held-out test split.
# NOTE(review): per the LightGBM parameter docs, bagging_fraction has no effect
# while bagging_freq is 0 — confirm this is intended.
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    metrics='auc',
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=4,
    num_leaves=20,
    max_bin=215,
    min_data_in_leaf=41,
    bagging_fraction=0.6,
    bagging_freq=0,
    feature_fraction=0.8,
    lambda_l1=0.0,
    lambda_l2=0.0,
    min_split_gain=0,
)
model.fit(X_train, Y_train)
# Hard class labels for accuracy ...
y_pre = model.predict(X_test)
# ... and positive-class probabilities for ROC AUC. AUC is a ranking metric, so
# scoring it on 0/1 labels collapses the curve to a single threshold and
# misstates the score.
y_pre_proba = model.predict_proba(X_test)[:, 1]

import sklearn.metrics as metrics

print("acc:", metrics.accuracy_score(Y_test, y_pre))
print("auc:", metrics.roc_auc_score(Y_test, y_pre_proba))