In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Show which entries exist in the input directory before loading anything.
input_files = os.listdir("../input")
print(input_files)

In [None]:
# Switch the working directory to the input folder so the CSV files can be
# referenced by bare file name, then load the training and test sets.
os.chdir("../input")
train, test = (
    pd.read_csv(csv_name) for csv_name in ('train_V2.csv', 'test_V2.csv')
)

In [None]:
# Report the on-disk size of both CSV files in megabytes (bytes / 10**6).
statinfo_train, statinfo_test = os.stat('train_V2.csv'), os.stat('test_V2.csv')

BYTES_PER_MB = 10**6
print("size of train file in mb's is", statinfo_train.st_size / BYTES_PER_MB)
print("size of test file in mb's is", statinfo_test.st_size / BYTES_PER_MB)

In [None]:
# Preview the training data: display its first 5 rows.
train.head(n=5)

In [None]:
# Preview the test data: display its first 5 rows.
test.head(n=5)

In [None]:
# Report the dimensions of both datasets as (rows, columns) tuples.
train_shape, test_shape = train.shape, test.shape
print("the shape of our training dataset format(#rows, #columns) =", train_shape)
print("the shape of our test dataset format(#rows, #columns) =", test_shape)

# Explanation of the data contained in each field:
## Data fields
* **DBNOs** - Number of enemy players knocked.
* **assists** - Number of enemy players this player damaged that were killed by teammates.
* **boosts** - Number of boost items used.
* **damageDealt** - Total damage dealt. Note: Self inflicted damage is subtracted.
* **headshotKills** - Number of enemy players killed with headshots.
* **heals** - Number of healing items used.
* **Id** - Player’s Id
* **killPlace** - Ranking in match of number of enemy players killed.
* **killPoints** - Kills-based external ranking of player. (Think of this as an Elo ranking where only kills matter.) If there is a value other than -1 in rankPoints, then any 0 in killPoints should be treated as a “None”.
* **killStreaks** - Max number of enemy players killed in a short amount of time.
* **kills** - Number of enemy players killed.
* **longestKill** - Longest distance between player and player killed at time of death. This may be misleading, as downing a player and driving away may lead to a large longestKill stat.
* **matchDuration** - Duration of match in seconds.
* **matchId** - ID to identify match. There are no matches that are in both the training and testing set.
* **matchType** - String identifying the game mode that the data comes from. The standard modes are “solo”, “duo”, “squad”, “solo-fpp”, “duo-fpp”, and “squad-fpp”; other modes are from events or custom matches.
* **rankPoints** - Elo-like ranking of player. This ranking is inconsistent and is being deprecated in the API’s next version, so use with caution. Value of -1 takes place of “None”.
* **revives** - Number of times this player revived teammates.
* **rideDistance** - Total distance traveled in vehicles measured in meters.
* **roadKills** - Number of kills while in a vehicle.
* **swimDistance** - Total distance traveled by swimming measured in meters.
* **teamKills** - Number of times this player killed a teammate.
* **vehicleDestroys** - Number of vehicles destroyed.
* **walkDistance** - Total distance traveled on foot measured in meters.
* **weaponsAcquired** - Number of weapons picked up.
* **winPoints** - Win-based external ranking of player. (Think of this as an Elo ranking where only winning matters.) If there is a value other than -1 in rankPoints, then any 0 in winPoints should be treated as a “None”.
* **groupId** - ID to identify a group within a match. If the same group of players plays in different matches, they will have a different groupId each time.
* **numGroups** - Number of groups we have data for in the match.
* **maxPlace** - Worst placement we have data for in the match. This may not match with numGroups, as sometimes the data skips over placements.
* **winPlacePerc** - The target of prediction. This is a percentile winning placement, where 1 corresponds to 1st place, and 0 corresponds to last place in the match. It is calculated off of maxPlace, not numGroups, so it is possible to have missing chunks in a match.

**Let's check the number of missing values in our training data, known as `NaN` in Python**

In [None]:
# Count the missing (NaN) entries in each column of the training data.
train.isnull().sum()

**As we see above, only 1 column contains missing values, and it has just 1 `NaN`**

In [None]:
# Display the row(s) whose target value winPlacePerc is missing.
train.loc[train["winPlacePerc"].isna()]

**Since we have 4.4+ million rows, we can drop the rows with missing values without worrying about losing valuable information, so that's what we are going to do**

In [None]:
# Remove every row that contains at least one missing value, modifying
# `train` in place, then display the resulting (rows, columns) shape.
train.dropna(axis="index", how="any", inplace=True)
train.shape

<b>As we see below, we have 4 object columns (columns of type string), and ML models can only work with numbers (they are number-crunching algorithms). <br>
 This means that we have to decide what we are going to do with these columns:
 1. are we going to exclude them, or
 2. are we going to encode them as integers?</b>
 
 **Looking at these columns, only one of them is likely to contain information that we can use, and that is: `matchType`**  
 **So we will convert this column to integers and use it in our models later**

In [None]:
# List the columns whose dtype is `object`.
column_dtypes = train.dtypes
column_dtypes[column_dtypes == 'object']

**To see if we can encode this in a decent way, we have to know the number of __unique__ values in the `matchType` column**

In [None]:
# Tally the rows per matchType once and reuse the result: its length is the
# number of distinct match types, and the Series itself (displayed as the
# cell output) lists each type with its row count.
match_type_counts = train.matchType.value_counts()
print(f"amount of unique values in the matchType column is: {len(match_type_counts)}")
print("these are the unique values:")
match_type_counts

**So now we know that we have 16 unique values, we can easily encode this column using `one hot encoding`**

In [None]:
# One-hot encode the matchType series: every unique value gets its own
# column, so we end up with 16 new columns (squad-fpp, duo-fpp, squad,
# solo-fpp, etc.) that replace the original string column.
one_hot = pd.get_dummies(train['matchType'])
train.drop(columns='matchType', inplace=True)
train = train.join(one_hot)
train.head()

**Now that we have encoded our matchType column, we have to drop the other 3 `object` columns: Id, groupId, matchId**


In [None]:
# Drop the three identifier columns in place; they are not used as features.
train.drop(columns=['Id', 'groupId', 'matchId'], inplace=True)

In [None]:
# Compute the pairwise correlation matrix only once (the original evaluated
# train.corr() twice over the full frame) and keep the rows of the columns
# whose correlation with winPlacePerc, rounded to 2 decimals, exceeds 0.5.
correlations = train.corr()
correlations[correlations.winPlacePerc.round(2) > 0.5]

In [None]:
# Plot winPlacePerc against walkDistance, one 'o' marker per row, on a
# 10x10 inch figure.
fig = plt.figure(num=1, figsize=(10, 10))
plt.plot(train['walkDistance'], train['winPlacePerc'], 'o')
plt.show()

In [None]:
# Show the 5 largest walkDistance values, in descending order.
train['walkDistance'].sort_values(ascending=False).head(5)

In [None]:
# Inspect the row(s) with the largest walkDistance. The maximum is derived
# from the data instead of comparing against a hard-coded float literal,
# which would silently select nothing if the data or its precision changed.
train[train.walkDistance == train.walkDistance.max()]