### Kaggle Competition - PUBG Finish Placement Prediction EDA Only
- https://www.kaggle.com/competitions/pubg-finish-placement-prediction
- Course Work Project: Management and Machine Learning, BIM801

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
from scipy.stats import skew, kurtosis
# Silence every warning for the rest of the session (this also hides
# deprecation warnings raised by pandas / seaborn).
warnings.filterwarnings("ignore")
# Color palette: afmhot, YlOrBr, inferno
# https://intrepidgeeks.com/tutorial/python-matplotlib-package-image-color-scheme-sharing
# Do not truncate columns or rows when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# https://lsjsj92.tistory.com/604 : About data type memory capacity
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest fitting dtype.

    The frame is modified in place and also returned.

    * Signed integer columns are narrowed to int8/int16/int32/int64 and
      unsigned integer columns to uint8/uint16/uint32/uint64, choosing the
      first type whose inclusive range contains the column's min and max.
    * Float columns are narrowed to float16/float32 when the column's min and
      max lie strictly inside that type's finite range, otherwise float64.
    * Every other column (object, bool, datetime, categorical and other
      pandas extension dtypes) is left untouched.

    Args:
        df: DataFrame whose columns should be downcast.
        allow_float16: When True (default) float columns may become float16.
            float16 keeps only about 3 significant decimal digits, so pass
            False to use float32 as the smallest float type.

    Returns:
        The same DataFrame object, with narrowed column dtypes.
    """
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy integer / unsigned / float dtypes are range-checked;
        # min()/max() are not meaningful (or raise) for the other dtypes.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            # Falls back to float64 when nothing smaller fits; an all-NaN
            # column also ends up here because NaN fails every comparison.
            target = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
        else:
            # An empty column has NaN min/max, matches nothing and is kept.
            target = None
            candidates = signed_ints if col_type.kind == 'i' else unsigned_ints
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max fits.
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break

        if target is not None and col_type != target:
            df[col] = df[col].astype(target)
    return df

In [None]:
# Load the competition data. No explicit dtype mapping is passed: this
# notebook never defines one, so pandas infers the column dtypes.
# reduce_mem_usage() can be applied to the frames afterwards if memory is tight.
train = pd.read_csv("../input/pubg-finish-placement-prediction/train_V2.csv")
test = pd.read_csv("../input/pubg-finish-placement-prediction/test_V2.csv")

## General information about dataset

In [None]:
# Report the (rows, columns) shape of each split.
print(f"train size: {train.shape}")
print(f"test size: {test.shape}")

In [None]:
# Locate the rows whose target (winPlacePerc) is missing.
missing_target = train['winPlacePerc'].isna()
drop_idx = train.loc[missing_target].index
# The actual removal is left commented out:
# train.drop(drop_idx, axis='index', inplace=True)
drop_idx

## Skewness & Kurtosis

In [None]:
# Distribution of a single column together with its skewness and kurtosis.
col_target = 'damageDealt'
plt.figure(figsize=(12,6))
plt.title("%s_distribution"%col_target)
# Series.skew() / Series.kurt() skip missing values by default (skipna=True);
# `nan_policy` is a scipy.stats argument and is not accepted by pandas.
print("Skewness: %f" % train[col_target].skew())
print("Kurtosis: %f" % train[col_target].kurt())
sns.distplot(train[col_target], kde=False)

## Match / Group
- Id, matchDuration, matchId, matchType, groupId, numGroups, maxPlace

In [None]:
# Does any matchId appear in both the train and the test set?
# -> No
print(f"Total match count in train set: {train['matchId'].nunique()}")
print(f"Total match count in test set: {test['matchId'].nunique()}")

# One representative row per match in each split.
tmp_match_trn = train.drop_duplicates(subset='matchId', keep='first')
tmp_match_tst = test.drop_duplicates(subset='matchId', keep='first')
n_split_matches = len(tmp_match_trn) + len(tmp_match_tst)
print(f'Trn_match + Tst_match count is: {n_split_matches}')

# De-duplicating the union only shrinks it if a match occurs in both splits.
tmp_match_concat = (
    pd.concat([tmp_match_trn, tmp_match_tst])
    .drop_duplicates(subset='matchId', keep='first')
)
print(f'Unique match count in total: {len(tmp_match_concat)}')

In [None]:
# Does any groupId appear in both the train and the test set?
# -> No
print(f"Total group count in train set: {train['groupId'].nunique()}")
print(f"Total group count in test set: {test['groupId'].nunique()}")

# One representative row per group in each split.
tmp_group_trn = train.drop_duplicates(subset='groupId', keep='first')
tmp_group_tst = test.drop_duplicates(subset='groupId', keep='first')
n_split_groups = len(tmp_group_trn) + len(tmp_group_tst)
print(f'Trn_group + Tst_group count is: {n_split_groups}')

# De-duplicating the union only shrinks it if a group occurs in both splits.
tmp_group_concat = (
    pd.concat([tmp_group_trn, tmp_group_tst])
    .drop_duplicates(subset='groupId', keep='first')
)
print(f'Unique group count in total: {len(tmp_group_concat)}')

In [None]:
# Does the same group always share the same winPlacePerc?
# => Yes, below is a sample group
group_view_cols = ['Id', 'groupId', 'matchId', 'matchType', 'maxPlace', 'numGroups', 'winPlacePerc']
data = train.loc[:, group_view_cols].copy()
data.loc[data['groupId'] == '684d5656442f9e']

In [None]:
# If there are "K" matches, are there also "K" winners?
# Keep, for every match, the first row after sorting by winPlacePerc descending.
tmp_wnr_trn = (
    train.sort_values(by='winPlacePerc', ascending=False)
    .drop_duplicates('matchId', keep='first')
)
n_winner_ids = tmp_wnr_trn['Id'].nunique()
n_train_matches = train['matchId'].nunique()
print("There are {} winners in {} matches in trn set:".format(n_winner_ids, n_train_matches))

In [None]:
# However, in some matches the highest winPlacePerc is not 1.0
top_is_not_one = tmp_wnr_trn['winPlacePerc'].ne(1)
tmp_wnr_trn[top_is_not_one].head(3)

In [None]:
# Match Duration: histogram with a KDE curve
data = train.copy()
fig_dur, ax_dur = plt.subplots(figsize=(10, 6))
ax_dur.set_title("Match Duration", fontsize=15)
sns.distplot(data['matchDuration'], kde=True, ax=ax_dur)
plt.show()

In [None]:
# Match Type: number of rows per match type
data = train.copy()
match_type_counts = data['matchType'].value_counts()
match_type_counts

In [None]:
# playersInMatch: number of rows that share the same matchId
for frame in (train, test):
    frame['playersInMatch'] = frame.groupby('matchId')['matchId'].transform('count')

data = train.copy()
fig_pim, ax_pim = plt.subplots(figsize=(10, 6))
ax_pim.set_title("Distplot for playersInMatch", fontsize=15)
sns.distplot(data['playersInMatch'], kde=False, ax=ax_pim)
plt.show()

In [None]:
# Boxplot of the playersInMatch column (added to `train` in an earlier cell).
# trs = train.query('playersInMatch < 80')
plt.figure(figsize=(10,3))
plt.title("Boxplot for playersInMatch",fontsize=15)
# The vector is passed as `x=`: newer seaborn releases no longer accept data
# vectors positionally, and the keyword form works on older releases as well.
# Reading from `train` avoids depending on a `data` copy left by another cell.
sns.boxplot(x=train['playersInMatch'])
plt.show()

In [None]:
# External Points: preview of the three *Points columns
point_cols = ['winPoints', 'rankPoints', 'killPoints']
data = train.copy()[point_cols]
data.head(10)

## Kills & DamageDealt

In [None]:
# Summary statistics of the kills column: mean, 99th percentile and maximum.
kills_col = train['kills']
kills_mean = kills_col.mean()
kills_p99 = kills_col.quantile(0.99)
kills_max = kills_col.max()
print("The average person kills {:.4f} players, 99% of people have {} kills or less, while the most kills ever recorded is {}.".format(
    kills_mean, kills_p99, kills_max))

In [None]:
# Kill Count: bar chart of how many players reached each kill count
data = train.copy()
# Every value above the 99th percentile is lumped into one bucket labelled 8.
# The write goes through a single DataFrame.loc call; the chained form
# data['kills'].loc[mask] = 8 may assign to a temporary object and leave
# `data` unchanged (always the case under pandas copy-on-write).
above_p99 = data['kills'] > data['kills'].quantile(0.99)
data.loc[above_p99, 'kills'] = 8
plt.figure(figsize=(10,6))
# Passed as `x=`: newer seaborn releases reject positional data vectors.
sns.countplot(x=data['kills'].astype('str').sort_values(), palette='afmhot')
plt.title("Kill Count",fontsize=15)
plt.show()

In [None]:
# Joint distribution of winPlacePerc and kills.
# jointplot is a figure-level function: it creates its own figure, sized by
# `height`, so no plt.figure() call precedes it (that would only leave an
# extra, empty figure in the output).
sns.jointplot(x="winPlacePerc", y="kills", data=train, height=10, ratio=3, color="#3B9ABF")
plt.show()

- Most players do not make a single kill during the entire game.

In [None]:
# Damage dealt by the players who finished with zero kills.
data = train.copy()
data = data.loc[data['kills'] == 0]
fig_dmg, ax_dmg = plt.subplots(figsize=(10, 6))
ax_dmg.set_title("Damage Dealt by 0 killers", fontsize=15)
sns.distplot(data['damageDealt'], kde=False, ax=ax_dmg)
plt.show()

- 0-kill players didn't even deal any damage...

In [None]:
# Winners (winPlacePerc == 1) among zero-kill players and among zero-damage
# players; both shares are relative to all rows of `train`.
n_train_rows = len(train)

data = train.copy()
data = data.loc[data['kills'] == 0]
n_zero_kill_wins = len(data[data['winPlacePerc'] == 1])
print("{} players ({:.4f}%) have won without a single kill!".format(n_zero_kill_wins, 100 * n_zero_kill_wins / n_train_rows))

data1 = train.loc[train['damageDealt'] == 0].copy()
n_zero_damage_wins = len(data1[data1['winPlacePerc'] == 1])
print("{} players ({:.4f}%) have won without dealing damage!".format(n_zero_damage_wins, 100 * n_zero_damage_wins / n_train_rows))

In [None]:
# Scatterplot of kill count vs. win placement percentile
data = train.copy()
plt.figure(figsize=(10,6))
plt.title("Scatterplot with WinPerc and number of kills",fontsize=15)
# x/y are passed by keyword: newer seaborn releases raise a TypeError when
# the two vectors are given positionally; keywords work on older ones too.
sns.scatterplot(x=data['kills'], y=data['winPlacePerc'])
plt.show()

In [None]:
# Group Kill & winPlacePerc
data = train.copy()
# Per-row value: the minimum `kills` among the rows sharing the same groupId.
# NOTE(review): the plot title says "number of group kills" but the
# aggregation is 'min', not 'sum' - confirm which one is intended.
data['kills_gr'] = data.groupby('groupId').kills.transform('min')

# Group Kill & winPlacePerc scatterplot
plt.figure(figsize=(10,6))
plt.title("Scatterplot with WinPerc and number of group kills",fontsize=15)
# x/y are passed by keyword: newer seaborn releases raise a TypeError when
# the two vectors are given positionally; keywords work on older ones too.
sns.scatterplot(x=data['kills_gr'], y=data['winPlacePerc'])
plt.show()

## Destroying Vehicle

In [None]:
# Point plot of winPlacePerc for each vehicleDestroys value.
data = train.copy()
f, ax1 = plt.subplots(figsize=(10, 6))
sns.pointplot(x='vehicleDestroys', y='winPlacePerc', data=data,
              color='#606060', alpha=0.8, ax=ax1)
ax1.set_xlabel('Number of Vehicle Destroys', fontsize=12, color='black')
ax1.set_ylabel('Win Percentage', fontsize=12, color='black')
ax1.set_title('Vehicle Destroys/ Win Ratio', fontsize=16, color='blue')
ax1.grid()
plt.show()

## Correlation

In [None]:
# Heatmap of the k columns with the largest correlation to winPlacePerc
# (winPlacePerc itself is one of them).
k = 6 #number of variables for heatmap
f,ax = plt.subplots(figsize=(11, 11))
# Only numeric columns can be correlated; recent pandas raises on the string
# columns instead of silently dropping them.
numeric_train = train.select_dtypes(include='number')
cols = numeric_train.corr().nlargest(k, 'winPlacePerc')['winPlacePerc'].index
# DataFrame.corr() ignores missing values pair by pair, whereas np.corrcoef
# returns NaN for every pair involving a column that contains a NaN.
cm = train[cols].corr().values
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

In [None]:
# Heatmap of the k columns with the smallest (most negative) correlation to
# winPlacePerc.
k = 5 #number of variables for heatmap
f,ax = plt.subplots(figsize=(11, 11))
# Only numeric columns can be correlated; recent pandas raises on the string
# columns instead of silently dropping them.
numeric_train = train.select_dtypes(include='number')
cols = numeric_train.corr().nsmallest(k, 'winPlacePerc')['winPlacePerc'].index
# DataFrame.corr() ignores missing values pair by pair, whereas np.corrcoef
# returns NaN for every pair involving a column that contains a NaN.
cm = train[cols].corr().values
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

## Analysis of TOP 10% players

In [None]:
# Rows with winPlacePerc above 0.9, and a summary of their kill counts.
top10 = train.loc[train["winPlacePerc"] > 0.9]
top10_kills = top10["kills"]
print("TOP 10% overview\n")
print(
    "Average number of kills: {:.1f}\nMinimum: {}\nThe best: {}\n95% of players within: {} kills.".format(
        top10_kills.mean(),
        top10_kills.min(),
        top10_kills.max(),
        top10_kills.quantile(0.95),
    )
)

In [None]:
# Keep rows whose distance lies strictly between 0 and the upper cut-off.
ride_dist = train['rideDistance']
walk_dist = train['walkDistance']
ride = train[(ride_dist > 0) & (ride_dist < 10000)]
walk = train[(walk_dist > 0) & (walk_dist < 4000)]

In [None]:
# Walking distance: all filtered players vs. the top10 subset, on one axes.
fig, ax1 = plt.subplots(figsize=(15, 10))
walk_hist_opts = dict(bins=40, figsize=(15, 10), ax=ax1)
walk.hist('walkDistance', **walk_hist_opts)
# NOTE(review): `walk` keeps 0 < walkDistance < 4000 while the top10 subset
# uses walkDistance < 5000 (zeros included) - confirm the difference is wanted.
walk10 = top10.loc[top10['walkDistance'] < 5000]
walk10.hist('walkDistance', **walk_hist_opts)

# The average is taken over all top10 rows, not over the filtered subset.
top10_walk_mean = top10['walkDistance'].mean()
print("Average walking distance: " + str(top10_walk_mean))

In [None]:
# Riding distance: all filtered players vs. the top10 subset, on one axes.
fig, ax1 = plt.subplots(figsize=(15, 10))
ride_hist_opts = dict(bins=40, figsize=(15, 10), ax=ax1)
ride.hist('rideDistance', **ride_hist_opts)
top10_ride = top10['rideDistance']
ride10 = top10[(top10_ride > 0) & (top10_ride < 10000)]
ride10.hist('rideDistance', **ride_hist_opts)
# The average is taken over all top10 rows, not over the filtered subset.
print("Average riding distance: " + str(top10_ride.mean()))

## Anomalies

In [None]:
# Players with more than 40 kills (first two rows)
data = train.copy()
data.loc[data['kills'] > 40].head(2)

In [None]:
# walkDistance anomalies: rows with walkDistance of 10000 or more
long_walkers = train.loc[train['walkDistance'] >= 10000]
display(long_walkers.shape)
long_walkers.head(5)

In [None]:
# rideDistance anomalies: rows with rideDistance of 20000 or more
long_riders = train.loc[train['rideDistance'] >= 20000]
display(long_riders.shape)
long_riders.head(5)

In [None]:
# Players whose swimDistance is 2000 or more
swam_far = train['swimDistance'].ge(2000)
train.loc[swam_far]

In [None]:
# Players who acquired 90 or more weapons
weapon_hoarders = train.loc[train['weaponsAcquired'] >= 90]
display(weapon_hoarders.shape)
weapon_hoarders.head()

In [None]:
# 50 or more healing items used
heavy_healers = train.loc[train['heals'] >= 50]
display(heavy_healers.shape)
heavy_healers.head(10)