In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import multiprocessing

# Number of worker processes for the multiprocessing pool.
# Start from a fallback and overwrite it with the detected core count
# whenever the platform is able to report one.
cpus = 2   # arbitrary default
try:
    cpus = multiprocessing.cpu_count()
except NotImplementedError:
    pass

import os
print(os.listdir("../input/"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the competition's training rows and the rows we must predict for.
dataset = pd.read_csv(filepath_or_buffer="../input/pubg-finish-placement-prediction/train_V2.csv")
test_set = pd.read_csv(filepath_or_buffer="../input/pubg-finish-placement-prediction/test_V2.csv")
# Display the first rows of the training frame as the cell output.
dataset.head(n=5)


**Group Experience**

> *If the same group of players plays in different matches, they will have a different groupId each time.*

Possibly interesting concept: number of matches played within a group, and aggregations based on group history.

If groupId was kept consistent across matches we can essentially track the group's history and experience playing together.

Sadly, it seems like groupIds are only used for a match so cross-match tracking is not possible as is.

If the player IDs were available, it might be beneficial to try to generate our own group IDs that actually mean something.

In [None]:
# Count the distinct matches, then list every groupId that appears in more
# than one match (an empty result means group ids are never reused).
match_amount = dataset["matchId"].unique().size
print('Number of matches in the dataset: %d' % match_amount)
group = dataset.groupby("groupId")["matchId"].nunique()
group.loc[group > 1]

**Match Awareness**

>*"If you know the enemy and know yourself, you need not fear the result of a hundred battles. If you know yourself but not the enemy, for every victory gained you will also suffer a defeat. If you know neither the enemy nor yourself, you will succumb in every battle.”*  ― Sun Tzu, The Art of War

As relevant advice as ever. Pretentious quotes aside, my hypothesis is that other players' performance in a match is as important as how well you perform.

It is not about being good, it is about being better than others.

To this end I propose a series of context variables:


    For individual variable in match:
         group describe of a feature
       
    

In [None]:
def get_group_values(dataframe,groupby, values):
    """Append per-group context features for every column in ``values``.

    For each ``value`` column the returned frame gains:

    * ``<value>_percentile`` - the row's percentile rank of ``value`` inside
      its ``groupby`` group, ranked in descending order (the highest value in
      a group receives the smallest percentile).
    * ``<value>_count``, ``_mean``, ``_std``, ``_min``, ``_25%``, ``_50%``,
      ``_75%`` and ``_max`` - the group's ``describe()`` statistics, repeated
      on every row belonging to that group.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows to enrich. It is not modified.
    groupby : str or list of str
        Column name(s) identifying the group of each row.
    values : iterable of str
        Columns to summarise. ``describe()`` must produce its eight numeric
        statistics for each of them, otherwise the column renaming fails.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` with the new columns appended, one row per input row
        and the same index as the input.
    """
    group_keys = [groupby] if isinstance(groupby, str) else list(groupby)
    stat_names = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']

    grouped_set = dataframe.groupby(groupby)

    pieces = [dataframe]
    for value in values:
        # rank() is already aligned with the rows of ``dataframe``, so it only
        # needs a distinct column name before being appended.
        percentile = grouped_set[value].rank(ascending=False, pct=True)
        pieces.append(percentile.rename(value + '_percentile'))

        # describe() is indexed by group key, not by row: look the statistics
        # up through the key column(s) so that every row receives the values
        # of its own group.
        description = grouped_set[value].describe()
        description.columns = [value + '_' + stat for stat in stat_names]
        per_row = dataframe[group_keys].join(description, on=group_keys)
        pieces.append(per_row.drop(columns=group_keys))

    return pd.concat(pieces, axis=1)
    
%timeit get_group_values(dataset.head(1000), 'matchId', dataset.columns)

In [None]:
def get_group_value(args):
    """Build the per-row context features of one column.

    Parameters
    ----------
    args : sequence of length 2
        ``(value, grouped_dataframe)`` where ``value`` is the column name and
        ``grouped_dataframe`` is a pandas GroupBy object over the source
        frame. A single packed argument is used so the function can be handed
        directly to ``Pool.map``.

    Returns
    -------
    pandas.DataFrame
        One row per row of the grouped frame (same index), with the columns
        ``<value>_percentile`` (descending percentile rank inside the group)
        followed by the group's ``describe()`` statistics ``<value>_count``,
        ``_mean``, ``_std``, ``_min``, ``_25%``, ``_50%``, ``_75%``, ``_max``.
    """
    value, grouped_dataframe = args

    percentile = grouped_dataframe[value].rank(ascending=False, pct=True)
    description = grouped_dataframe[value].describe()

    # describe() yields one row per group while the percentile has one row per
    # observation. ngroup() gives each observation the ordinal of its group in
    # iteration order, which is used to repeat the group statistics row-wise.
    # Observations without a group match no ordinal and receive NaN.
    group_number = grouped_dataframe.ngroup()
    per_row = description.reset_index(drop=True).reindex(group_number.values)
    per_row.index = group_number.index

    variable_description = pd.concat([percentile, per_row], axis=1)
    variable_description.columns = [value+'_percentile',value+'_count', value+'_mean', value+'_std', value+'_min', value+'_25%', value+'_50%', value+'_75%', value+'_max']
    return variable_description

def get_group_values(dataframe,groupby, values):
    """Parallel variant: compute the context features of each column in a worker pool.

    Every column in ``values`` is processed by ``get_group_value`` in a
    separate task and the resulting frames are appended to ``dataframe``
    column-wise.

    Note: this definition reuses the name of the sequential implementation
    defined earlier in the notebook and therefore replaces it once this cell
    has run.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows to enrich.
    groupby : str or list of str
        Column name(s) identifying the group of each row.
    values : iterable of str
        Columns to summarise.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` followed by the columns produced for each value.
    """
    grouped_set = dataframe.groupby(groupby)

    # Each task receives the column name together with the grouped frame.
    work = [[value, grouped_set] for value in values]

    # The context manager shuts the worker processes down once the results are
    # in; map() blocks until every task has finished, so nothing is lost.
    with multiprocessing.Pool(processes=cpus) as pool:
        per_value_frames = pool.map(get_group_value, work)

    return pd.concat([dataframe] + per_value_frames, axis=1)

%timeit get_group_values(dataset.head(1000), 'matchId', dataset.columns)

**Separating game modes**

> *The data comes from matches of all types: solos, duos, squads, and custom; there is no guarantee of there being 100 players per match, nor at most 4 player per group.*

First things first, we need to fix the game mode problem. In this dataset group sizes vary from 1 to just under 100. 

These game modes are fundamentally different, especially those nearing 100 players, as such a match is likely a 'zombie mode' match.

In [None]:
# Number of distinct players in every (match, group) pair, shown as a
# histogram with a logarithmic count axis.
group_size = dataset.groupby(["matchId", "groupId"])["Id"].nunique()
group_size.plot.hist(bins=100, logy=True, title='Group size distribution')

Different game modes by squad size (sadly, no longer necessary on V2 data):

- Solo - Only one-player groups allowed
- Duo - Only two-player groups allowed
- Squad - Between 2 and 4 player groups
- Custom - Anything goes. Several different game modes like zombies or 20vs.


In [None]:
# A match counts as "solo" when none of its groups has a size other than 1:
# take every match id and remove those owning at least one non-single group.
matches = group_size.reset_index()["matchId"].unique()
not_solo_matches = group_size.loc[group_size != 1].reset_index()["matchId"].unique()
solo_matches = np.setdiff1d(matches, not_solo_matches)
solo_matches_amount = solo_matches.size
print(
    'Number of matches in the dataset: %d' % match_amount,
    'Number of solo matches in the dataset: %d' % solo_matches_amount,
    'Percentage of solo matches in the dataset: %f%%' % ((solo_matches_amount/match_amount)*100),
    sep='\n',
)

In [None]:
# A match counts as "duo" when every one of its groups has exactly 2 players:
# remove from all matches those owning at least one group of another size.
not_duo_matches = group_size.loc[group_size != 2].reset_index()["matchId"].unique()
duo_matches = np.setdiff1d(matches, not_duo_matches)
duo_matches_amount = duo_matches.size
print(
    'Number of matches in the dataset: %d' % match_amount,
    'Number of duo matches in the dataset: %d' % duo_matches_amount,
    'Percentage of duo matches in the dataset: %f%%' % ((duo_matches_amount/match_amount)*100),
    sep='\n',
)

In [None]:
# A match counts as "squad" when it has no group of 5 or more players and was
# not already classified as duo or solo.
not_squad_matches = group_size.loc[group_size >= 5].reset_index()["matchId"].unique()
squad_matches = np.setdiff1d(matches, not_squad_matches)
squad_matches = np.setdiff1d(squad_matches, duo_matches)
squad_matches = np.setdiff1d(squad_matches, solo_matches)
squad_matches_amount = squad_matches.size
print(
    'Number of matches in the dataset: %d' % match_amount,
    'Number of squad matches in the dataset: %d' % squad_matches_amount,
    'Percentage of squad matches in the dataset: %f%%' % ((squad_matches_amount/match_amount)*100),
    sep='\n',
)

In [None]:
# Whatever is neither squad, duo nor solo is treated as a custom match.
custom_matches = np.setdiff1d(matches, squad_matches)
custom_matches = np.setdiff1d(custom_matches, duo_matches)
custom_matches = np.setdiff1d(custom_matches, solo_matches)
custom_matches_amount = custom_matches.size
print(
    'Number of matches in the dataset: %d' % match_amount,
    'Number of custom matches in the dataset: %d' % custom_matches_amount,
    'Percentage of custom matches in the dataset: %f%%' % ((custom_matches_amount/match_amount)*100),
    sep='\n',
)

In [None]:
# Pie chart of how many matches fall into each game mode.
labels = ('Duo', 'Squad', 'Solo', 'Custom')
sizes = [
    duo_matches_amount,
    squad_matches_amount,
    solo_matches_amount,
    custom_matches_amount,
]
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue']
# Pull every wedge slightly away from the centre.
explode = (0.05,) * 4
plt.pie(x=sizes, explode=explode, labels=labels, colors=colors)
plt.axis('equal')
plt.show()

**Game Mode as a categorical value**

We will now add the game mode to the entry to allow the model to learn the difference among game types. 
To this end we will encode the mode as a one-hot vector so it plays nicely with XGBoost and ANNs.

In [None]:
# Turn the per-mode arrays of match ids into sets for fast membership tests.
solo, duo, squad, custom = (
    set(match_ids)
    for match_ids in (solo_matches, duo_matches, squad_matches, custom_matches)
)

def one_hot_game_mode(Id, solo, duo, squad, custom):
    """Return the (solo, duo, squad, custom) membership flags of a match id.

    Each flag is 1 when ``Id`` is contained in the corresponding collection
    and 0 otherwise; the result is a 4-tuple of ints in that fixed order.
    """
    return tuple(int(Id in mode_ids) for mode_ids in (solo, duo, squad, custom))

# Add one 0/1 indicator column per game mode.
# Series.isin() performs the membership test for the whole column at once and
# keeps the result aligned with the rows of `dataset`, which avoids the
# row-by-row Python call and the rebuilt frame that had to line up by position.
# NOTE(review): only `dataset` receives these columns here; `test_set` is left
# without them - confirm the feature sets match before predicting on it.
for mode_column, mode_matches in (('solo', solo), ('duo', duo), ('squad', squad), ('custom', custom)):
    dataset[mode_column] = dataset['matchId'].isin(mode_matches).astype(int)

dataset.head()

**Feature Scaling**

The scales of some of these values are very different, which might lead to problems down the line with some machine learning algorithms, so we perform mean normalisation: each value has its column mean subtracted and is then divided by the column's max-min range.

This won't affect the performance of tree estimators like XGBoost or Random Forest but it is known to increase performance of gradient based machine learning methods. 

In [None]:
# Distribution of damage dealt, with a logarithmic count axis.
dataset["damageDealt"].plot.hist(logy=True)

In [None]:
norm_list = ["winPoints","killPoints","damageDealt","rideDistance","walkDistance", "swimDistance", "longestKill"]

# Take the statistics from the raw training data once, BEFORE overwriting the
# columns: the test set has to be scaled with the very same mean and range,
# not with the statistics of the already normalised training columns.
norm_mean = dataset[norm_list].mean()
norm_range = dataset[norm_list].max() - dataset[norm_list].min()

dataset[norm_list] = (dataset[norm_list] - norm_mean) / norm_range
test_set[norm_list] = (test_set[norm_list] - norm_mean) / norm_range

# Inspect two of the normalised training columns (the frame is `dataset`;
# no variable called `train` exists in this notebook).
dataset.winPoints.plot(kind='hist', logy=True)
dataset.walkDistance.plot(kind='hist', logy=True)

**Correlation Matrix**

The main correlators with success include:

1. Walking distance
2. Number of boosts
3. Number of weapons acquired

These all correlate with playtime as someone who plays for longer will end up walking more, getting more boosts (which increase life points over the cap) and acquiring more weapons.

Interestingly, winPoints doesn't correlate strongly with actually winning.

In [None]:
# Pairwise correlations between the columns, then the correlation of every
# column with the target, strongest positive first.
correlation_matrix = dataset.corr()
correlation_matrix["winPlacePerc"].sort_values(ascending=False)

**Data Augmentation**

Another PUBG statistics dataset is available on Kaggle, let's see if we can obtain any useful insights.

The data contained in the dataset comprises 150 different data points pertaining to the top players at the time of collection.

This might prove to be an issue down the line as, in my experience, the average player isn't very good, so the data might be fundamentally different depending on the sample collected.


Another possible concern is that the data contained was collected over a year ago. This, however, should prove a non-issue as anyone familiar with the game can testify all of the development efforts have gone into sketchy monetization and not gameplay, so the meta-game should still be pretty similar.

In [None]:
# Load the augmentation data under its own name only. Binding it to `dataset`
# as well would replace the training frame that the later cells still use
# (for example to pop the 'winPlacePerc' label).
aug_dataset = pd.read_csv("../input/pubgplayerstats/PUBG_Player_Statistics.csv")
print(str(aug_dataset.columns.values))

As we lack the convenient WinPlacePerc variable in our augmentation dataset, what we will do is compare everything but that.

The plan is that for every player we would be able to assign the closest player in the augmented dataset and hence have an increased number of data points. This may or may not be useful.

First we need a way to convert our augmented data to look like our normal data.

Example:

Depending on our group size (Solo, Duo, Squad) we will look at different parts of the dataframe in order to make an apples-to-apples comparison.

There are some big squads in this 

**XGBoost baseline**

XGBoost is a resilient tree-boosting machine learning algorithm known for being robust on this type of dataset without needing a lot of pre-processing.

In [None]:
# Gradient-boosted tree regressor used as the baseline model.
xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)

# Separate the target from the features and drop the identifier columns from
# both the training frame and the frame to predict for.
dataset_labels = dataset.pop('winPlacePerc')
dataset = dataset.drop(['Id','groupId','matchId'], axis=1)
test_set = test_set.drop(['Id','groupId','matchId'], axis=1)

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split, and therefore the printed scores, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(dataset, dataset_labels, test_size=0.2, random_state=0)

xgb.fit(X_train,y_train, eval_metric='rmse', verbose=True)

predictions = xgb.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). Explained variance is not
# symmetric in its arguments, so the order matters for the first score.
print(explained_variance_score(y_test, predictions))
print(mean_squared_error(y_test, predictions))


In [None]:
# Predict the placement for the competition test rows and write the submission.
pred = xgb.predict(test_set)
print(pred)
# NOTE(review): the other input files are read from a sub-directory of
# ../input/ - confirm that this path points at the sample submission file.
test = pd.read_csv("../input/sample_submission.csv")
# Assign the raw array so that a row-count mismatch raises instead of being
# padded with NaN silently.
# NOTE(review): assumes the sample submission lists the rows in the same order
# as test_set - confirm.
test['winPlacePerc'] = pred
# Write into the current directory (only results written there are saved as
# output) and leave out the row index, which is not a submission column.
test.to_csv("submission.csv", index=False)



Now that we know which matches belong to which game modes, we will split the model in three and evaluate its performance.

