In [None]:
# Import necessary libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import os
import warnings

# Install a global "ignore" filter so no warnings are shown by later cells.
# NOTE(review): this also hides deprecation/numerical warnings raised by the
# modelling libraries used below - presumably done to keep cell output short.
warnings.filterwarnings("ignore")

In [None]:
# Default figure size (width, height), applied to both matplotlib and seaborn
FIG_SIZE = (18, 8)
plt.rcParams["figure.figsize"] = FIG_SIZE
sns.set(rc={"figure.figsize": FIG_SIZE})

In [None]:
# Show the entries of the input directory (displayed as the cell output)
os.listdir('../input')

In [None]:
# Read the table saved at the end of Part 1
part1_csv = '../input/pubg-walkthrough/Training_Data_New.csv'
data_ind = pd.read_csv(part1_csv)
print("Done loading data from part 1")

In [None]:
# Remove the saved index column and the identifier columns; they are not used as features.
# Rebinding (instead of inplace=True) together with errors='ignore' makes the cell
# safe to re-run: a second execution no longer raises KeyError for columns that
# are already gone.
data_ind = data_ind.drop(['Unnamed: 0', 'Id', 'groupId', 'matchId'], axis=1, errors='ignore')

In [None]:
# One-hot encode matchType, then replace the raw column with the indicator columns
data_matchT = pd.get_dummies(data_ind['matchType'])
data_ind = pd.concat([data_ind.drop('matchType', axis=1), data_matchT], axis=1)
data_ind.head()

In [None]:
# (rows, columns) of the prepared table
data_ind.shape

In [None]:
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split
import tqdm

# As individuals 

I want to try out a few different algorithms with their default settings, see which one performs best, and then tune that one.

In [None]:
# Regression target: the winPlacePerc column
y = data_ind['winPlacePerc']

In [None]:
# Features: every column except the target.
# drop() without inplace returns a new frame, so data_ind is left untouched.
# The previous `X = data_ind` + in-place drop aliased the same object, which
# silently removed winPlacePerc from data_ind and made both this cell and the
# target cell fail with KeyError when executed a second time.
X = data_ind.drop('winPlacePerc', axis=1)

In [None]:
# Sanity check: number of target values
y.shape

In [None]:
# Sanity check: (rows, feature columns)
X.shape

In [None]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split repeatable
ind_split = train_test_split(X, y, test_size=0.15, random_state=12)
X_train_ind, X_test_ind, y_train_ind, y_test_ind = ind_split

# LightGBM

In [None]:
from lightgbm import LGBMRegressor
import datetime

In [None]:
# Fit a LightGBM regressor (MAE objective, otherwise default settings) on the
# training split, time the fit, and report MAE on the held-out split.
time_0 = datetime.datetime.now()

lgbm = LGBMRegressor(objective='mae', n_jobs=-1, random_state=12)

lgbm.fit(X_train_ind, y_train_ind)

time_1 = datetime.datetime.now()

# timedelta.seconds is only the seconds component of the duration (whole days
# are dropped); total_seconds() is the full elapsed time. int() keeps the
# whole-second output format.
print('Training took {} seconds.'.format(int((time_1 - time_0).total_seconds())))
print('Mean Absolute Error is {:.5f}'.format(mae(y_test_ind, lgbm.predict(X_test_ind))))

In [None]:
import shap

In [None]:
shap.initjs()

SAMPLE_SIZE = 10000
# Draw the row positions from a seeded generator so the SHAP sample - and every
# plot built from it - is identical on each run, in line with the fixed
# random_state=12 used for the split and the LightGBM model.
sample_rng = np.random.RandomState(12)
# Positions are drawn independently, so a row can appear more than once.
SAMPLE_INDEX = sample_rng.randint(0, X_test_ind.shape[0], SAMPLE_SIZE)

# NOTE: this rebinds X (until now the full feature matrix) to the sampled
# held-out rows; the SHAP cells further down read this X.
X = X_test_ind.iloc[SAMPLE_INDEX]

# SHAP values of the fitted LightGBM model for the sampled rows
explainer = shap.TreeExplainer(lgbm)
shap_values = explainer.shap_values(X)

In [None]:
# Default SHAP summary plot for the LightGBM model on the sampled test rows
shap.summary_plot(shap_values, X)

In [None]:
# Same SHAP values drawn as a bar-type summary plot
shap.summary_plot(shap_values, X, plot_type='bar', color='darkred')

# XGBoost

In [None]:
# Let's also try xgboost 
import xgboost as xgb

In [None]:
# XGBoost regressor with only the objective set explicitly (squared error).
# NOTE(review): the LightGBM model is fit with objective='mae' and every model
# in this notebook is scored with MAE, so the training objectives differ
# between the two - confirm that this is intended for the comparison.
regressor = xgb.XGBRegressor(objective = 'reg:squarederror')
regressor  # display the estimator and its parameters

In [None]:
# Train on the training split and predict the held-out split in one chain
# (fit() returns the fitted estimator itself)
y_pred = regressor.fit(X_train_ind, y_train_ind).predict(X_test_ind)

In [None]:
# Mean absolute error of the XGBoost predictions on the held-out split
Mae = mae(y_true=y_test_ind, y_pred=y_pred)
print('MAE %f' % Mae)

In [None]:
# Feature-importance chart of the fitted XGBoost model; the title is set on the
# Axes that plot_importance returns
ax = xgb.plot_importance(regressor)
ax.set_title("xgboost.plot_importance(regressor)")
plt.show()

# CatBoost

In [None]:
from catboost import CatBoostRegressor

In [None]:
# CatBoost regressor: 300 iterations, MAE as the evaluation metric,
# metric_period set to 10
cat = CatBoostRegressor(
    iterations=300,
    eval_metric='MAE',
    metric_period=10,
)

In [None]:
# Fit on the training split, evaluating on the held-out split as training proceeds.
# NOTE(review): the held-out split is also what use_best_model selects on, so
# CatBoost's reported MAE is not directly comparable to the other two models,
# which never see that split during fitting.
cat.fit(
    X_train_ind,
    y_train_ind,
    eval_set=(X_test_ind, y_test_ind),
    use_best_model=True,
)

In [None]:
# SHAP values for the CatBoost model.
# X is the sample of held-out rows created in the LightGBM SHAP cell, so that
# cell must have been run first. explainer and shap_values are overwritten
# here, replacing the LightGBM ones.
explainer = shap.TreeExplainer(cat)
shap_values = explainer.shap_values(X)

In [None]:
# SHAP summary plot for the CatBoost model, on the same sampled rows X
shap.summary_plot(shap_values, X)

Right now it appears that, of all the default models, LightGBM works best. However, before I optimize that model's parameters, I want to see how every model performs using only its top ten features, as well as the results on the group data; both will be explored in the next part or two.