## PlayerUnknown's Battlegrounds - Analysis of the top 88,000 Best Players
![image.png](attachment:image.png)

### Some notes to get started
Outlier / anomaly detection

Look at players above WinRatio threshold

Prediction based on features: win percentage per game (pg)?

In [111]:
# Import the good stuff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV, cross_validate, StratifiedKFold, learning_curve
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN

from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [214]:
# Global plotting defaults for every figure in the notebook.
sns.set(style="whitegrid", color_codes=True)
# matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*" and
# later releases dropped the old names. Use whichever spelling is installed; fall
# back to the original name so an unsupported setup still fails loudly.
_colorblind_style = next(
    (name for name in ('seaborn-v0_8-colorblind', 'seaborn-colorblind')
     if name in plt.style.available),
    'seaborn-colorblind',
)
plt.style.use(_colorblind_style)

In [316]:
# Read in data: load the player statistics CSV into `df`.
# The path is relative, so the file must be in the notebook's working directory.
df = pd.read_csv('PUBG_Player_Statistics.csv')

In [113]:
# Basic size facts about the table: rows are players, columns are features.
# (The original cell began with a bare `df.head()`; it was not the last
# expression of the cell, so its result was never displayed and it is dropped.)
num_players, num_features = df.shape

# Number of columns whose name mentions each game mode.
num_solof = len(df.columns[df.columns.str.contains('solo')])
num_duof = len(df.columns[df.columns.str.contains('duo')])
num_squadf = len(df.columns[df.columns.str.contains('squad')])

In [6]:
# Column count of the raw table (df.shape[1]).
num_features

152

In [7]:
# Row count of the raw table (df.shape[0]), i.e. one row per player.
num_players

87898

In [15]:
# Number of columns whose name contains 'solo'. Shown from the stored variable,
# like the duo and squad cells, instead of recomputing the same expression.
num_solof

50

In [22]:
# Number of columns whose name contains 'duo'.
num_duof

50

In [23]:
# Number of columns whose name contains 'squad'.
num_squadf

50

In [27]:
# Preview the first ten rows of the raw table.
df.head(10)

Unnamed: 0,player_name,tracker_id,solo_KillDeathRatio,solo_WinRatio,solo_TimeSurvived,solo_RoundsPlayed,solo_Wins,solo_WinTop10Ratio,solo_Top10s,solo_Top10Ratio,...,squad_RideDistance,squad_MoveDistance,squad_AvgWalkDistance,squad_AvgRideDistance,squad_LongestKill,squad_Heals,squad_Revives,squad_Boosts,squad_DamageDealt,squad_DBNOs
0,BreakNeck,4405,3.14,17.65,18469.14,17,3,0.83,4,23.5,...,3751590.99,5194786.58,2626.97,4372.64,536.98,2186,234,1884,242132.73,1448
1,Blackwalk,8199,4.41,18.18,33014.86,33,6,0.36,11,33.3,...,4295917.3,6051783.67,2422.48,6009.73,734.85,2041,276,2340,269795.75,1724
2,mercedes_benz,4454,3.6,0.0,4330.44,5,0,0.0,1,20.0,...,3935265.63,5589608.74,1871.89,3011.87,725.44,1766,210,2193,292977.07,1897
3,DORA,7729,14.0,50.0,13421.82,8,4,0.67,6,75.0,...,2738998.0,3796916.0,2154.62,5578.41,587.28,1214,142,1252,181106.9,1057
4,n2tstar,0,10.5,33.33,9841.04,6,2,0.4,5,83.3,...,2347295.0,3220260.0,2098.47,5642.54,546.1,1245,120,923,160029.8,1077
5,coldoxygen,33313,5.73,31.25,22003.22,16,5,0.5,10,62.5,...,3218436.0,4733977.0,2078.93,4414.86,1212.76,2023,195,2307,238640.0,1450
6,Giken,15585,2.45,8.45,148051.25,142,12,0.18,34,23.9,...,2057423.12,2883099.31,1664.77,3022.51,564.2,1274,133,1320,132008.88,775
7,KoreaNo1,9454,2.71,26.32,26091.14,19,5,0.5,10,52.6,...,3586593.0,4854387.0,1635.86,4627.86,806.24,2199,163,1864,201659.0,1287
8,undor,24029,3.68,13.64,49282.41,44,6,0.23,13,29.5,...,1054263.0,1550975.0,2207.61,4685.61,455.67,617,61,585,80498.73,513
9,Fordune,11289,4.17,25.0,10280.77,8,2,0.25,4,50.0,...,2793909.91,4071039.87,1981.29,3635.28,546.95,1320,220,1338,123573.74,675


In [24]:
def getModeFeatures(df, mode):
    """Return the column labels of ``df`` whose name contains ``mode``.

    Parameters
    ----------
    df : DataFrame whose columns are searched.
    mode : pattern passed unchanged to ``Index.str.contains`` (for example
        'solo', 'duo' or 'squad'); it may match anywhere in a label.

    Returns
    -------
    The matching subset of ``df.columns``, in their original order.
    """
    labels = df.columns
    is_mode_column = labels.str.contains(mode)
    return labels[is_mode_column]

In [25]:
# List every column whose name contains 'solo'.
getModeFeatures(df, 'solo')

Index(['solo_KillDeathRatio', 'solo_WinRatio', 'solo_TimeSurvived',
       'solo_RoundsPlayed', 'solo_Wins', 'solo_WinTop10Ratio', 'solo_Top10s',
       'solo_Top10Ratio', 'solo_Losses', 'solo_Rating', 'solo_BestRating',
       'solo_DamagePg', 'solo_HeadshotKillsPg', 'solo_HealsPg', 'solo_KillsPg',
       'solo_MoveDistancePg', 'solo_RevivesPg', 'solo_RoadKillsPg',
       'solo_TeamKillsPg', 'solo_TimeSurvivedPg', 'solo_Top10sPg',
       'solo_Kills', 'solo_Assists', 'solo_Suicides', 'solo_TeamKills',
       'solo_HeadshotKills', 'solo_HeadshotKillRatio', 'solo_VehicleDestroys',
       'solo_RoadKills', 'solo_DailyKills', 'solo_WeeklyKills',
       'solo_RoundMostKills', 'solo_MaxKillStreaks', 'solo_WeaponAcquired',
       'solo_Days', 'solo_LongestTimeSurvived', 'solo_MostSurvivalTime',
       'solo_AvgSurvivalTime', 'solo_WinPoints', 'solo_WalkDistance',
       'solo_RideDistance', 'solo_MoveDistance', 'solo_AvgWalkDistance',
       'solo_AvgRideDistance', 'solo_LongestKill', 'solo_

In [None]:
# TODO: unfinished plotting cell. The original body was `plt.figure(1)` followed
# by a dangling `sns.`, which is a SyntaxError and stops Restart & Run All.
# It is left inert until the intended seaborn call is written.

In [26]:
# Sanity check on the first preview row: solo_Wins=3 over solo_RoundsPlayed=17
# gives ~0.1765, matching that row's solo_WinRatio of 17.65 (a percentage).
3  / 17

0.17647058823529413

In [29]:
# How many players have a solo win ratio strictly above 50?
df['solo_WinRatio'].gt(50).sum()

593

In [215]:
%matplotlib notebook
plt.figure(1)
df.solo_WinRatio.plot.hist();
plt.title("Distribution of Solo Win Ratio")
plt.xlabel('Solo Win Ratio Percentage')
plt.ylabel('Number of Players')
plt.show()

<IPython.core.display.Javascript object>

In [216]:
# Figure 2: the same histogram restricted to players above a 50 win ratio,
# with finer bins.
plt.figure(2)
high_ratio_ax = df.loc[df['solo_WinRatio'] > 50, 'solo_WinRatio'].plot.hist(bins=50)
high_ratio_ax.set_title("Distribution of Solo Win Ratio [> 50 Win Ratio]")
high_ratio_ax.set_xlabel('Solo Win Ratio Percentage')
high_ratio_ax.set_ylabel('Number of Players')
plt.show()

<IPython.core.display.Javascript object>

In [71]:
# For players with a 100 solo win ratio, tally how many solo rounds they played.
df.loc[df['solo_WinRatio'] == 100, 'solo_RoundsPlayed'].value_counts()

1    371
2     36
3      7
6      1
4      1
Name: solo_RoundsPlayed, dtype: int64

In [104]:
# Solo columns for players who won their only solo round (first three rows).
df.loc[
    (df['solo_WinRatio'] == 100) & (df['solo_RoundsPlayed'] == 1),
    getModeFeatures(df, 'solo'),
].head(3)

Unnamed: 0,solo_KillDeathRatio,solo_WinRatio,solo_TimeSurvived,solo_RoundsPlayed,solo_Wins,solo_WinTop10Ratio,solo_Top10s,solo_Top10Ratio,solo_Losses,solo_Rating,...,solo_RideDistance,solo_MoveDistance,solo_AvgWalkDistance,solo_AvgRideDistance,solo_LongestKill,solo_Heals,solo_Revives,solo_Boosts,solo_DamageDealt,solo_DBNOs
297,4.0,100.0,1963.65,1,1,1.0,1,100.0,0,1331.23,...,0.0,3126.37,3126.37,0.0,170.6,3,0,6,434.14,0
498,5.0,100.0,1905.0,1,1,1.0,1,100.0,0,1342.36,...,4788.12,7864.76,3076.64,4788.12,24.82,1,0,5,564.9,0
682,10.0,100.0,1905.75,1,1,1.0,1,100.0,0,1341.2,...,0.0,3165.45,3165.45,0.0,224.15,6,0,9,841.32,0


In [217]:
# Figure 3: kills per game for players who won their only solo round.
# Title and axis labels added so the figure stands alone, like figures 1 and 2.
plt.figure(3)
plt.hist(df[(df.solo_WinRatio == 100) & (df.solo_RoundsPlayed == 1)].solo_KillsPg, 10, alpha = 0.7)
plt.title("Solo Kills per Game [100 Win Ratio, 1 Round Played]")
plt.xlabel('Solo Kills per Game')
plt.ylabel('Number of Players')
plt.show()

<IPython.core.display.Javascript object>

In [138]:
# Damage per game (x) and total move distance (y) for players who won their
# only solo round; the row filter is evaluated once and reused.
_won_only_round = (df['solo_WinRatio'] == 100) & (df['solo_RoundsPlayed'] == 1)
x = df.loc[_won_only_round, 'solo_DamagePg']
y = df.loc[_won_only_round, 'solo_MoveDistance']

In [218]:
# Joint distribution of damage per game vs. move distance.
# x and y are passed by keyword: seaborn 0.12+ makes them keyword-only, so the
# positional form raises TypeError there; keywords also work on older releases.
sns.jointplot(x=x, y=y, alpha=0.4);

<IPython.core.display.Javascript object>

In [155]:
# Four solo metrics for players who won their only solo round.
m = df.loc[
    (df['solo_WinRatio'] == 100) & (df['solo_RoundsPlayed'] == 1),
    ['solo_KillsPg', 'solo_MoveDistancePg', 'solo_LongestKill', 'solo_RideDistance'],
]

In [219]:
# Pairwise scatter plots of the columns in `m`, with a KDE on the diagonal.
sns.pairplot(m, diag_kind = 'kde')
# Adjust subplot spacing for the current figure.
plt.tight_layout()

<IPython.core.display.Javascript object>

In [118]:
# Re-list the solo columns for reference before building the correlation table.
getModeFeatures(df, 'solo')

Index(['solo_KillDeathRatio', 'solo_WinRatio', 'solo_TimeSurvived',
       'solo_RoundsPlayed', 'solo_Wins', 'solo_WinTop10Ratio', 'solo_Top10s',
       'solo_Top10Ratio', 'solo_Losses', 'solo_Rating', 'solo_BestRating',
       'solo_DamagePg', 'solo_HeadshotKillsPg', 'solo_HealsPg', 'solo_KillsPg',
       'solo_MoveDistancePg', 'solo_RevivesPg', 'solo_RoadKillsPg',
       'solo_TeamKillsPg', 'solo_TimeSurvivedPg', 'solo_Top10sPg',
       'solo_Kills', 'solo_Assists', 'solo_Suicides', 'solo_TeamKills',
       'solo_HeadshotKills', 'solo_HeadshotKillRatio', 'solo_VehicleDestroys',
       'solo_RoadKills', 'solo_DailyKills', 'solo_WeeklyKills',
       'solo_RoundMostKills', 'solo_MaxKillStreaks', 'solo_WeaponAcquired',
       'solo_Days', 'solo_LongestTimeSurvived', 'solo_MostSurvivalTime',
       'solo_AvgSurvivalTime', 'solo_WinPoints', 'solo_WalkDistance',
       'solo_RideDistance', 'solo_MoveDistance', 'solo_AvgWalkDistance',
       'solo_AvgRideDistance', 'solo_LongestKill', 'solo_

In [194]:
# Correlation matrix of all solo columns over players who won their only solo
# round; `t` keeps just the solo_KillsPg column as a one-column DataFrame.
corr = df.loc[
    (df['solo_WinRatio'] == 100) & (df['solo_RoundsPlayed'] == 1),
    getModeFeatures(df, 'solo'),
].corr()
t = corr['solo_KillsPg'].to_frame()

In [222]:
# Figure 6 (8 x 10): heatmap of `t`.
plt.figure(num=6, figsize=(8, 10))
sns.heatmap(t)
# Keep the row labels horizontal so the feature names stay readable.
plt.yticks(rotation=0)
plt.tight_layout()

<IPython.core.display.Javascript object>

## Are Drivers Natural Killers?

In [350]:
# Joint distribution of solo kills per game vs. average ride distance, all players.
# x and y are passed by keyword: seaborn 0.12+ makes them keyword-only, so the
# positional form raises TypeError there; keywords also work on older releases.
sns.jointplot(x=df.solo_KillsPg, y=df.solo_AvgRideDistance, alpha=0.3);

<IPython.core.display.Javascript object>

Win ratio vs miles driven

In [359]:
# Keep only the exact win-ratio values shared by more than `threshold` players
# and turn them into an ordered categorical.
threshold = 600  # minimum number of players sharing a win-ratio value
count = df['solo_WinRatio'].value_counts()
# value_counts() orders by frequency, so the kept values are sorted numerically.
# With ordered=True the category order then runs from low to high win ratio
# rather than from most to least common.
categories_to_keep = sorted(count[count > threshold].index.tolist())
# Values of solo_WinRatio that are not in `categories_to_keep` become NaN here.
category_feature = pd.Categorical(df['solo_WinRatio'],
                                  categories=categories_to_keep, ordered=True)

In [362]:
# Pair each player's average ride distance with their win-ratio category.
s1 = df.solo_AvgRideDistance
# `category_feature` was built from df['solo_WinRatio'] in row order. Giving it
# df's index makes the column-wise concat align row-for-row even when df does
# not carry the default RangeIndex.
s2 = pd.Series(category_feature, index=df.index, name='solo_WinRatioCat')
# dropna() discards players whose win ratio fell outside the kept categories.
df_WR_drive = pd.concat([s1, s2], axis=1).dropna()

In [363]:
# Violin plot of solo_AvgRideDistance for each win-ratio category in df_WR_drive.
plt.figure()
# NOTE(review): split=True is passed without a `hue` variable; what it does in
# that case depends on the seaborn version. Confirm it is intended.
sns.violinplot(x='solo_WinRatioCat', y='solo_AvgRideDistance', data = df_WR_drive, split=True)
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>