In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm
import xgboost
from tensorflow import keras
import tensorflow as tf
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge

Read the CSV files and take a look at the data. A slight majority of the features are categorical, while the rest are numerical. Probably the only feature that needs special treatment is matchType, which is an unprocessed categorical.

In [None]:
# Directory that holds the three competition CSVs. Kept in one place so the
# notebook can be pointed at a different location by editing a single line.
INPUT_DIR = '../input/pubg-finish-placement-prediction'

train = pd.read_csv(f'{INPUT_DIR}/train_V2.csv')
test = pd.read_csv(f'{INPUT_DIR}/test_V2.csv')
submission = pd.read_csv(f'{INPUT_DIR}/sample_submission_V2.csv')
# Bare last expression: the notebook renders the training frame as the cell output.
train

The dataset contains 44496966 samples, so it makes sense to analyze only a subset. Pandas provides a sample method, so we can simply use that to create our subsample.

In [None]:
# Draw the 100000-row exploration subset with a fixed seed so that every
# plot built from it is identical after a kernel restart.
sample_data = train.sample(n=100000, random_state=2021)
sample_data.head()

Here we use seaborn to plot a correlation matrix to figure out which features are most correlated with the target. As expected, killPlace and winPlacePerc are negatively correlated: when someone ranks number 1 in kills, there is a high chance that they finished near the top.

In [None]:
# Correlate numeric columns only. Since pandas 2.0, DataFrame.corr() defaults
# to numeric_only=False and raises when the frame holds non-numeric columns;
# select_dtypes keeps the cell working on both old and new pandas.
# NOTE(review): assumes the identifier / matchType columns are the non-numeric ones.
corr = sample_data.select_dtypes(include='number').corr()

# Hide the upper triangle (diagonal included) because the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(25, 10))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# vmax=.3 caps the colour scale; center=0 puts zero correlation at the
# midpoint of the diverging palette.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

The following graphs plot the distribution of features from two dataframes. The first is the normal (sampled) dataframe. The second contains only the samples in which winPlacePerc is >= 0.9 (meaning they finished in the top 10%). We are doing this to see what kinds of strategies tend to win.

In [None]:
# Subset of the sample with winPlacePerc >= 0.9; reused by the "top" plots below.
data_if_top_perc = sample_data[sample_data['winPlacePerc'] >= 0.9]

# Walk-distance histogram (with KDE overlay) for the first 10576 sampled rows.
# NOTE(review): 10576 is presumably chosen to match the size of the top-finisher
# subset in the author's run; confirm.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(sample_data[:10576]['walkDistance'], kde=True, ax=ax)
# Label typo fixed ("Distanced" -> "Distance").
ax.set(xlabel='Distance Walked For Everyone')

In [None]:
# Walk-distance histogram (with KDE overlay) for the winPlacePerc >= 0.9 subset.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(data_if_top_perc['walkDistance'], kde=True, ax=ax)
# winPlacePerc >= 0.9 selects the top 10% of finishers, so the label says
# "Top 10%" rather than the original "Top 90%".
ax.set(xlabel='Distance Walked For People In Top 10%')

In [None]:
# Kill-rank histogram (with KDE overlay) for the first 10576 sampled rows.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(sample_data.head(10576)['killPlace'], kde=True, ax=ax)
ax.set(xlabel='Kill Ranking For Everyone')

In [None]:
# Kill-rank histogram (with KDE overlay) for the winPlacePerc >= 0.9 subset.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(data_if_top_perc['killPlace'], kde=True, ax=ax)
# winPlacePerc >= 0.9 is the top 10% of finishers; the original label said "Top 90%".
ax.set(xlabel='Kill Ranking For People In Top 10%')

In [None]:
# Damage-dealt histogram (with KDE overlay) for the first 10576 sampled rows.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(sample_data.head(10576)['damageDealt'], kde=True, ax=ax)
ax.set(xlabel='Damage Dealt For Everyone')

In [None]:
# Damage-dealt histogram (with KDE overlay) for the winPlacePerc >= 0.9 subset.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(data_if_top_perc['damageDealt'], kde=True, ax=ax)
# winPlacePerc >= 0.9 is the top 10% of finishers; the original label said "Top 90%".
ax.set(xlabel='Damage Dealt For People In Top 10%')

In [None]:
# Count of each assists value for the first 10576 sampled rows.
fig, ax = plt.subplots(figsize=(20, 10))
# Pass the Series as x= explicitly: newer seaborn releases accept only `data`
# positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x=sample_data[:10576]['assists'], ax=ax)
ax.set(xlabel='Assists For Everyone')

In [None]:
# Count of each assists value for the winPlacePerc >= 0.9 subset.
fig, ax = plt.subplots(figsize=(20, 10))
# Pass the Series as x= explicitly: newer seaborn releases accept only `data`
# positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x=data_if_top_perc['assists'], ax=ax)
# winPlacePerc >= 0.9 is the top 10% of finishers; the original label said "Top 90%".
ax.set(xlabel='Assists For People In Top 10%')

In [None]:
# Count of each kills value for the first 10576 sampled rows.
fig, ax = plt.subplots(figsize=(20, 10))
# Pass the Series as x= explicitly: newer seaborn releases accept only `data`
# positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x=sample_data[:10576]['kills'], ax=ax)
ax.set(xlabel='Kills For Everyone')

In [None]:
# Count of each kills value for the winPlacePerc >= 0.9 subset.
# The original cell never reassigned `ax`, so ax.set() relabelled the axes
# left over from the previous cell instead of this plot. Creating the axes
# here and passing them to seaborn ties the label to the right figure.
fig, ax = plt.subplots(figsize=(20, 10))
# Series passed as x= because newer seaborn accepts only `data` positionally.
sns.countplot(x=data_if_top_perc['kills'], ax=ax)
# winPlacePerc >= 0.9 is the top 10% of finishers; the original label said "Top 90%".
ax.set(xlabel='Kills For People In Top 10%')

In [None]:
# Count of each heals value for the first 10576 sampled rows.
fig, ax = plt.subplots(figsize=(20, 10))
# Pass the Series as x= explicitly: newer seaborn releases accept only `data`
# positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x=sample_data[:10576]['heals'], ax=ax)
ax.set(xlabel='Heals For Everyone')

In [None]:
# Count of each heals value for the winPlacePerc >= 0.9 subset.
fig, ax = plt.subplots(figsize=(20, 10))
# Series passed as x= because newer seaborn accepts only `data` positionally.
sns.countplot(x=data_if_top_perc['heals'], ax=ax)
# This plot shows the top-finisher subset, but the original label was copied
# from the "everyone" plot ('Heals For Everyone'); corrected here.
ax.set(xlabel='Heals For People In Top 10%')

In [None]:
# Count of each weaponsAcquired value for the first 10576 sampled rows.
fig, ax = plt.subplots(figsize=(20, 10))
# Pass the Series as x= explicitly: newer seaborn releases accept only `data`
# positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x=sample_data[:10576]['weaponsAcquired'], ax=ax)
ax.set(xlabel='Weapons Acquired For Everyone')

In [None]:
# Count of each weaponsAcquired value for the winPlacePerc >= 0.9 subset.
fig, ax = plt.subplots(figsize=(20, 10))
# Pass the Series as x= explicitly: newer seaborn releases accept only `data`
# positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x=data_if_top_perc['weaponsAcquired'], ax=ax)
# winPlacePerc >= 0.9 is the top 10% of finishers; the original label said "Top 90%".
ax.set(xlabel='Weapons Acquired For People In Top 10%')

In [None]:
# Kills per unit of distance walked (scaled by 10000) for the first 10576
# sampled rows. Rows with zero walk distance are skipped to avoid dividing
# by zero, and ratios above 300 are discarded.
everyone_subset = sample_data[:10576]
scaled_ratios = (
    (k / d) * 10000
    for k, d in zip(everyone_subset['kills'].values,
                    everyone_subset['walkDistance'].values)
    if d != 0
)
kills_over_dist = [ratio for ratio in scaled_ratios if ratio <= 300]

In [None]:
# Histogram of the kills-per-distance ratios currently held in kills_over_dist.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(kills_over_dist, ax=ax)
ax.set(xlabel='Kills Over Distance Walked For Everyone')

In [None]:
# Same kills-per-distance ratio (scaled by 10000), now for at most the first
# 10576 rows of the winPlacePerc >= 0.9 subset. Zero-distance rows are
# skipped and ratios above 300 are discarded. Overwrites kills_over_dist.
top_subset = data_if_top_perc[:10576]
scaled_ratios = (
    (k / d) * 10000
    for k, d in zip(top_subset['kills'].values,
                    top_subset['walkDistance'].values)
    if d != 0
)
kills_over_dist = [ratio for ratio in scaled_ratios if ratio <= 300]

In [None]:
# Histogram of the first 10230 kills-per-distance ratios for the top finishers.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(kills_over_dist[:10230], ax=ax)
# winPlacePerc >= 0.9 is the top 10% of finishers; the original label said "Top 90%".
ax.set(xlabel='Kills Over Distance Walked For Top 10%')

From the analysis, we can likely conclude that being more active and getting more kills is the key to success. Of course, the better you finish, the longer you have to get kills and do other things. Therefore, the last two plots divide the number of kills by the distance walked, so that the playing field is leveled for people who lived longer (and walked more). We still see a higher concentration of people above 0 (0 being the people who got no kills) among the top 10% than among everyone.

The next part is modelling. Here we will try to predict winPlacePerc given all the stats of a player in a given game. I use a blend of three models to make the final predictions.

In [None]:
# Impute missing values with the column mode (first mode when there are ties).
# The mode is computed only for columns that actually contain NaN. Calling
# mode() on the whole frame also processes the identifier columns, which is
# wasted work because columns without NaN are never filled. The values
# written into the frame are the same as before.
# NOTE(review): if winPlacePerc itself has missing rows, this imputes the
# target; dropping those rows may be preferable. Confirm.
cols_with_na = train.columns[train.isna().any()]
if len(cols_with_na) > 0:
    column_modes = train[cols_with_na].mode()
    # mode() yields no rows when every candidate column is entirely NaN.
    if not column_modes.empty:
        train = train.fillna(column_modes.iloc[0])

# Model inputs: everything except the identifiers, the target and the raw
# matchType column, as a plain NumPy array.
train_features = train.drop(['Id', 'groupId', 'matchId', 'winPlacePerc', 'matchType'], axis=1).values
targets = train['winPlacePerc'].values

In [None]:
# Split features and targets in a single call so row alignment is guaranteed
# by construction. The original made two separate calls that stayed aligned
# only because they shared random_state and array length. 70% train / 30%
# validation, same seed as before.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_features, targets, shuffle=True, test_size=0.3, random_state=2021)

In [None]:
# LightGBM regressor with library-default hyperparameters, fitted on the
# training split. The cell output is its MAE on the validation split.
lgb = lightgbm.LGBMRegressor().fit(X_train, y_train)
lgb_pred = lgb.predict(X_valid)
lgb_mae = mean_absolute_error(y_valid, lgb_pred)
lgb_mae

In [None]:
# XGBoost regressor with library-default hyperparameters, fitted on the
# training split. The cell output is its MAE on the validation split.
xgb = xgboost.XGBRegressor().fit(X_train, y_train)
xgb_pred = xgb.predict(X_valid)
xgb_mae = mean_absolute_error(y_valid, xgb_pred)
xgb_mae

In [None]:
# Seed TensorFlow's global RNG to reduce run-to-run variation in the network.
tf.random.set_seed(2021)

NN_EPOCHS = 3
# Patience must be smaller than the epoch budget for EarlyStopping to have
# any effect. The original patience of 5 could never be reached in 3 epochs,
# so the callback (and its best-weight restore on early stop) was inert.
NN_PATIENCE = 1

# Two hidden ReLU layers, each followed by batch normalisation, and a single
# linear output unit for the regression target.
model = keras.models.Sequential([
    keras.layers.Dense(300, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(1),
])
# MAE loss matches the metric reported for the other two models.
model.compile(optimizer='adam', loss='mae')

early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=NN_PATIENCE, restore_best_weights=True)
model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=NN_EPOCHS, batch_size=1024,
          callbacks=[early_stopping])

In [None]:
nn_pred = model.predict(X_valid)
# One column per base model: LightGBM, XGBoost, neural network.
blend_train = np.c_[lgb_pred, xgb_pred, nn_pred]

# Out-of-fold blend predictions. The original fitted Ridge on blend_train and
# then predicted the very same rows, so the reported blend MAE was an
# in-sample (optimistic) score. Here every row of rd_pred comes from a Ridge
# model that did not see that row during fitting.
rd_pred = np.zeros(len(y_valid))
blend_folds = KFold(n_splits=5, shuffle=True, random_state=2021)
for fit_idx, holdout_idx in blend_folds.split(blend_train):
    fold_rd = Ridge()
    fold_rd.fit(blend_train[fit_idx], y_valid[fit_idx])
    rd_pred[holdout_idx] = fold_rd.predict(blend_train[holdout_idx])

# Final blender fitted on all validation rows, for use on new data.
rd = Ridge()
rd.fit(blend_train, y_valid)

In [None]:
# Show up to the first 15 blended predictions next to their true targets,
# then the blend's mean absolute error over the whole validation split.
preview_rows = min(15, len(rd_pred), len(y_valid))
for row in range(preview_rows):
    print(f'Prediction: {rd_pred[row]} | Truth: {y_valid[row]}')

blend_mae = mean_absolute_error(y_valid, rd_pred)
print(f'\nBlend Loss: {blend_mae}')