In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

import seaborn as sns
import plotly
import plotly.offline as pyoff
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
import squarify # for tree maps
%matplotlib inline
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
# Enable plotly's offline notebook mode so that iplot() figures render inline
init_notebook_mode(connected = True)
import os
# Show which data files are available in the input directory
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

Data fields
- DBNOs - Number of enemy players knocked.
- assists - Number of enemy players this player damaged that were killed by teammates.
- boosts - Number of boost items used.
- damageDealt - Total damage dealt. Note: Self inflicted damage is subtracted.
- headshotKills - Number of enemy players killed with headshots.
- heals - Number of healing items used.
- Id - Player’s Id
- killPlace - Ranking in match of number of enemy players killed.
- killPoints - Kills-based external ranking of player. (Think of this as an Elo ranking where only kills matter.) If there is a value other than -1 in rankPoints, then any 0 in killPoints should be treated as a “None”.
- killStreaks - Max number of enemy players killed in a short amount of time.
- kills - Number of enemy players killed.
- longestKill - Longest distance between player and player killed at time of death. This may be misleading, as downing a player and driving away may lead to a large longestKill stat.
- matchDuration - Duration of match in seconds.
- matchId - ID to identify match. There are no matches that are in both the training and testing set.
- matchType - String identifying the game mode that the data comes from. The standard modes are “solo”, “duo”, “squad”, “solo-fpp”, “duo-fpp”, and “squad-fpp”; other modes are from events or custom matches.
- rankPoints - Elo-like ranking of player. This ranking is inconsistent and is being deprecated in the API’s next version, so use with caution. Value of -1 takes place of “None”.
- revives - Number of times this player revived teammates.
- rideDistance - Total distance traveled in vehicles measured in meters.
- roadKills - Number of kills while in a vehicle.
- swimDistance - Total distance traveled by swimming measured in meters.
- teamKills - Number of times this player killed a teammate.
- vehicleDestroys - Number of vehicles destroyed.
- walkDistance - Total distance traveled on foot measured in meters.
- weaponsAcquired - Number of weapons picked up.
- winPoints - Win-based external ranking of player. (Think of this as an Elo ranking where only winning matters.) If there is a value other than -1 in rankPoints, then any 0 in winPoints should be treated as a “None”.
- groupId - ID to identify a group within a match. If the same group of players plays in different matches, they will have a different groupId each time.
- numGroups - Number of groups we have data for in the match.
- maxPlace - Worst placement we have data for in the match. This may not match with numGroups, as sometimes the data skips over placements.
- winPlacePerc - The target of prediction. This is a percentile winning placement, where 1 corresponds to 1st place, and 0 corresponds to last place in the match. It is calculated off of maxPlace, not numGroups, so it is possible to have missing chunks in a match.

In [None]:
# Load the training set into a DataFrame
train = pd.read_csv('../input/train_V2.csv')

In [None]:
# Load the test set into a DataFrame
test = pd.read_csv('../input/test_V2.csv')

In [None]:
# Column names of the training set
train.columns

In [None]:
# (rows, columns) of the training set
train.shape

In [None]:
# (rows, columns) of the test set
test.shape

In [None]:
# Number of missing values in each training column
train.isnull().sum()

> The NA value in winPlacePerc does not seem right; we will remove it and continue our analysis.

In [None]:
# Drop every row that contains at least one missing value.
# NOTE: this rebinds `train`, so the unfiltered frame is no longer available afterwards.
train = train.dropna()

In [None]:
# Number of missing values in each test column
test.isnull().sum()

In [None]:
def extractColTypes(dataset):
    """Split the columns of ``dataset`` into four groups according to their dtype.

    Each column lands in at most one group:

    * numeric     - integer and float columns of any width (e.g. int32 or
                    float32 after down-casting), not only int64 / float64
    * categorical - ``object``, string and ``category`` columns
    * datetime    - any datetime64 column, timezone-aware or not
    * boolean     - ``bool`` columns

    The size of every group is printed as a short summary.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns should be classified.

    Returns
    -------
    tuple of four lists of column names, in column order:
    ``(num_cols_list, cat_cols_list, date_cols_list, bool_cols_list)``
    """
    dtype_checks = pd.api.types
    num_cols_list, cat_cols_list, date_cols_list, bool_cols_list = [], [], [], []

    for col_name, col_dtype in dataset.dtypes.items():
        if str(col_dtype) == 'category':
            cat_cols_list.append(col_name)
        elif dtype_checks.is_bool_dtype(col_dtype):
            # must be tested before numeric: pandas also reports bool as numeric
            bool_cols_list.append(col_name)
        elif dtype_checks.is_numeric_dtype(col_dtype):
            num_cols_list.append(col_name)
        elif dtype_checks.is_datetime64_any_dtype(col_dtype):
            date_cols_list.append(col_name)
        elif dtype_checks.is_object_dtype(col_dtype) or dtype_checks.is_string_dtype(col_dtype):
            cat_cols_list.append(col_name)

    print ("Numeric Columns:", len(num_cols_list))
    print ("Categorical/Character Columns:", len(cat_cols_list))
    print ("Date Columns:",len(date_cols_list))
    print ("Boolean Columns:",len(bool_cols_list))
    return(num_cols_list,cat_cols_list,date_cols_list,bool_cols_list)

In [None]:
# Classify the training columns by dtype; the resulting lists are reused by later cells
num_cols_list,cat_cols_list,date_cols_list,bool_cols_list = extractColTypes(train)

In [None]:
def generateLayoutBar(col_name):
    """Build the plotly layout shared by the bar charts in this notebook.

    Arguments:
    col_name -- column name inserted into the figure title

    Output:
    go.Layout with a fixed 800x600 size, black monospace title/tick fonts,
    a y axis titled 'Percentage' and white monospace text for bar labels
    """
    monospace = 'Courier New, monospace'
    # tick labels on both axes use the same font settings
    tick_font = {'family': monospace, 'size': 14, 'color': 'black'}

    x_axis = {'tickfont': dict(tick_font)}
    y_axis = {
        'title': 'Percentage',
        'titlefont': {'size': 14, 'color': 'black'},
        'tickfont': dict(tick_font),
    }

    return go.Layout(
        autosize=False,  # size is fixed explicitly by width / height below
        width=800,  # width of the figure in pixels
        height=600,  # height of the figure in pixels
        title="Distribution of {} column".format(col_name),
        titlefont={'family': monospace, 'size': 14, 'color': 'black'},
        xaxis=x_axis,
        yaxis=y_axis,
        # default font, which also applies to the text drawn on the bars
        font={'family': monospace, 'color': "white", 'size': 12},
    )

In [None]:
def plotBar(dataframe_name, col_name):
    """
    Plot a bar chart with the percentage share of every value of a categorical column

    Arguments:
    dataframe name -- DataFrame that holds the column
    categorical column name -- column whose value frequencies are plotted

    Output:
    Plot (rendered inline through iplot; the return value is whatever iplot returns)
    """
    # frequency of every distinct value, most frequent first
    counts = dataframe_name[col_name].value_counts()

    # Convert to percent BEFORE rounding. Rounding the fraction first and multiplying by
    # 100 afterwards brings float noise back (0.07 * 100 == 7.000000000000001), which
    # then leaks into the labels printed on the bars.
    percentages = np.round(counts.values.astype(float) / counts.values.sum() * 100, 2)

    # creating a Bar chart object of plotly
    data = [go.Bar(
            x=counts.index.astype(str),  # x axis values
            y=percentages,  # y axis values
            # text displayed on the bar: the percentage followed by a '%' symbol
            text=['{}%'.format(i) for i in percentages],
            textposition='auto',  # specify at which position on the bar the text should appear
            marker=dict(color='#0047AB'),)]  # bar colour: Cobalt Blue

    layout_bar = generateLayoutBar(col_name=col_name)

    fig = go.Figure(data=data, layout=layout_bar)
    return iplot(fig)


### MatchType distribution

In [None]:
# For every categorical column from position 3 onwards, draw the train distribution
# followed by the test distribution of the same column
for i in cat_cols_list[3:]:
    for caption, frame in (("Train Distribution", train), ("Test Distribution", test)):
        print (caption)
        plotBar(frame, i)

> Observations:
  
  - squad-fpp is the most played type in train and test while normal-duo is the least

### Correlation among numeric columns

In [None]:
# Compute the correlation matrix of the numeric (and boolean) columns only.
# Non-numeric columns such as the matchType string cannot be correlated; recent pandas
# versions raise on them instead of silently skipping them, so select explicitly.
corr = train.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Large square canvas so that every annotated cell stays readable
f, ax = plt.subplots(figsize=(20, 20))

# Diverging colormap between hues 220 and 10
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Full (unmasked) annotated heatmap; the colour scale is centred on 0 and capped at 0.3
heatmap_kwargs = dict(
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    annot=True,
)
sns.heatmap(corr, **heatmap_kwargs)

ax.set_title('Correlation Matrix', size=20)
plt.show()

> Observations:

Positive Correlations:
- Walking distance and killplace.
-  Weapons acquired and killplace
- DBNO is highly correlated to killplace

Negative Correlations:
- Headshots and damage dealt
-  DBNO's and damage dealt are highly negatively correlated

Interestingly enough, KillStreaks is negatively correlated with kills.

### Answering some general questions about the data

In [None]:
# Mean match duration (in seconds) for every match type, as a one-column DataFrame.
# The aggregation is given by name: passing np.mean to agg() is deprecated in recent pandas.
temp = train.groupby(['matchType']).agg({'matchDuration': 'mean'})

In [None]:
# temp is a one-column DataFrame, so temp.values would be 2-D with shape (n, 1) and every
# label would render like '[1234.5]%'. Take the 1-D column of mean durations instead.
mean_durations = temp['matchDuration'].values

data = [go.Bar(
            x=temp.index.astype(str),  # x axis values: match types
            y=mean_durations,  # y axis values: mean duration in seconds
            # text displayed on the bar: a duration in seconds, not a percentage
            text=['{:.1f} s'.format(i) for i in mean_durations],
            textposition='auto',  # specify at which position on the bar the text should appear
            marker=dict(color='#0047AB'),)]  # bar colour: Cobalt Blue

layout_bar = generateLayoutBar(col_name='matchDuration')
# The shared layout titles the y axis 'Percentage'; this chart shows seconds.
# The title font is set again so the axis title keeps the styling of the other charts.
layout_bar.yaxis['title'] = 'Mean match duration (seconds)'
layout_bar.yaxis['titlefont'] = dict(size=14, color='black')

fig = go.Figure(data=data, layout=layout_bar)
iplot(fig)

In [None]:
# One box plot of matchDuration per match type.
# Each trace is named after its match type; without a name plotly labels the boxes
# 'trace 0', 'trace 1', ... and they cannot be matched to a match type.
data = []
for i in train.matchType.unique():
    trace = go.Box(y = train.matchDuration[train.matchType==i], name = str(i))
    data.append(trace)
iplot(data)

 ###### 1. Team which played most matches
 (In the data available)


In [None]:
# groupId that occurs in the largest number of rows.
# NOTE(review): per the field description, a group gets a different groupId in every match,
# so this looks like the largest group rather than the team with the most matches - confirm.
train.groupId.value_counts().index[0]

###### 2. Player with the maximum number of kills

In [None]:
# Total kills per player Id.
# The aggregation is given by name: passing np.sum to agg() is deprecated in recent pandas.
player_kills_df = train.groupby(['Id']).agg({'kills': 'sum'})

In [None]:
# Show the single row with the highest kill total
player_kills_df.sort_values(['kills'],ascending= False).head(1)

###### 3. Lifesaver

In [None]:
# Total revives per player Id, then the single row with the highest total.
# The aggregation is given by name: passing np.sum to agg() is deprecated in recent pandas.
player_revives_df = train.groupby(['Id']).agg({'revives': 'sum'})
player_revives_df.sort_values(['revives'],ascending= False).head(1)

In [None]:
# Max, min, mean and median of every numeric column, kept in four dicts whose keys
# follow the pattern '<column>_<statistic>' (e.g. 'rideDistance_max')
def _column_stats(key_template, stat_func):
    """Apply stat_func to every numeric train column; keys come from key_template."""
    return {key_template.format(col): stat_func(train[col]) for col in num_cols_list}

max_num_dict = _column_stats('{}_max', np.max)
min_num_dict = _column_stats('{}_min', np.min)
mean_num_dict = _column_stats('{}_mean', np.mean)
median_num_dict = _column_stats('{}_median', np.median)

 ###### 4. Longest drive

In [None]:
# Id of the row(s) whose rideDistance equals the column maximum, followed by that maximum
print(train.Id[train.rideDistance == max_num_dict['rideDistance_max']])
print(max_num_dict['rideDistance_max'])

> 40Km seems to be a lot of driving.. Let us dive a little deeper, I am curious.

In [None]:
# All rows that share the groupId of the (first) row with the maximum rideDistance
train[train.groupId == train.groupId[train.rideDistance == max_num_dict['rideDistance_max']].values[0]]

> Observations:
- So this guy was a solo player, who just drove around and walked around without collecting anything and without killing anyone
- Interestingly enough, he placed in the 84th percentile.

 ###### 5. Longest swim

In [None]:
# Id of the row(s) whose swimDistance equals the column maximum, followed by that maximum
print(train.Id[train.swimDistance == max_num_dict['swimDistance_max']])
print(max_num_dict['swimDistance_max'])

> 3 Km of swimming...

In [None]:
# All rows that share the groupId of the (first) row with the maximum swimDistance
train[train.groupId == train.groupId[train.swimDistance == max_num_dict['swimDistance_max']].values[0]]

> I am not sure how there can be 8 players in this particular group in the same match, but judging by the winPlacePerc we can be sure that they all played as one group! Let's see how many such groups there are with more than 4 players.

In [None]:
# Number of rows (players) per (groupId, matchId) pair; the count is stored in the 'Id' column
group_mem_df = train.groupby(['groupId','matchId']).agg({'Id': len})

In [None]:
# Shape of the subset of groups with more than 4 members (first entry = number of such groups)
group_mem_df[group_mem_df['Id']>4].shape

1. From my understanding, these might be the custom games that PUBG allows; for further reading, see https://pubg.gamepedia.com/Game_Modes#Custom

#### Feature Engineering

> Let's create new features
1. Total distance travelled.

In [None]:
# Distance columns of the raw data (names containing 'Dist').
# The derived 'totalDistance' column is excluded, so re-running this cell after that
# feature has been added to train does not make the total count itself.
dist_columns = [i for i in train.columns if 'Dist' in i and i != 'totalDistance']

In [None]:
# Show which columns were picked up as distance columns
dist_columns

In [None]:
# Preview: row-wise sum of the distance columns for the first 10 rows
train[dist_columns].head(10).sum(axis = 1)

In [None]:
# New feature: row-wise sum of the distance columns
train['totalDistance'] = train[dist_columns].sum(axis = 1)

In [None]:
# Add the same feature to the test set, using the column list derived from train
test['totalDistance'] = test[dist_columns].sum(axis = 1)

 ###### 6. Max distance travelled

In [None]:
# Row(s) whose totalDistance equals the column maximum
train[train.totalDistance == np.max(train.totalDistance)]