In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading the data

In [None]:
# Read the three competition CSV files from the read-only Kaggle input folder.
DATA_DIR = '/kaggle/input/pubg-finish-placement-prediction'
train = pd.read_csv(DATA_DIR + '/train_V2.csv')
test = pd.read_csv(DATA_DIR + '/test_V2.csv')
sample_submission = pd.read_csv(DATA_DIR + '/sample_submission_V2.csv')


In [None]:
# Peek at the first five rows to see the expected submission layout.
sample_submission.head(n=5)

In [None]:
# First five rows of the training data.
train.head(n=5)

In [None]:
# First five rows of the test data.
test.head(n=5)

In [None]:
# Count missing values per column in the training data.
train.isna().sum()

In [None]:
# Remove every training row that contains at least one missing value.
train = train.dropna(axis="index")

In [None]:
# Count missing values per column in the test data.
test.isna().sum()

In [None]:
# List the column names of the training frame (features plus target).
train.columns

## EDA

# **Description of the features**

DBNOs - Number of enemy players knocked.<br>

assists - Number of enemy players this player damaged that were killed by teammates.<br>

boosts - Number of boost items used.<br>

damageDealt - Total damage dealt. Note: Self inflicted damage is subtracted.<br>

headshotKills - Number of enemy players killed with headshots.<br>

heals - Number of healing items used.<br>

Id - Player’s Id<br>

killPlace - Ranking in match of number of enemy players killed.<br>

killPoints - Kills-based external ranking of player. (Think of this as an Elo ranking where only kills matter.) If there is a value other than -1 in rankPoints, then any 0 in killPoints should be treated as a “None”.<br>

killStreaks - Max number of enemy players killed in a short amount of time.<br>

kills - Number of enemy players killed.<br>

longestKill - Longest distance between player and player killed at time of death. This may be misleading, as downing a player and driving away may lead to a large longestKill stat.<br>

matchDuration - Duration of match in seconds.<br>

matchId - ID to identify match. There are no matches that are in both the training and testing set.<br>

matchType - String identifying the game mode that the data comes from. The standard modes are “solo”, “duo”, “squad”, “solo-fpp”, “duo-fpp”, and “squad-fpp”; other modes are from events or custom matches.<br>

rankPoints - Elo-like ranking of player. This ranking is inconsistent and is being deprecated in the API’s next version, so use with caution. Value of -1 takes place of “None”.<br>

revives - Number of times this player revived teammates.<br>

rideDistance - Total distance traveled in vehicles measured in meters.<br>

roadKills - Number of kills while in a vehicle.<br>

swimDistance - Total distance traveled by swimming measured in meters.<br>

teamKills - Number of times this player killed a teammate.<br>

vehicleDestroys - Number of vehicles destroyed.<br>

walkDistance - Total distance traveled on foot measured in meters.<br>

weaponsAcquired - Number of weapons picked up.<br>

winPoints - Win-based external ranking of player. (Think of this as an Elo ranking where only winning matters.) If there is a value other than -1 in rankPoints, then any 0 in winPoints should be treated as a “None”.<br>

groupId - ID to identify a group within a match. If the same group of players plays in different matches, they will have a different groupId each time.<br>

numGroups - Number of groups we have data for in the match.<br>

maxPlace - Worst placement we have data for in the match. This may not match with numGroups, as sometimes the data skips over placements.<br>

winPlacePerc - The target of prediction. This is a percentile winning placement, where 1 corresponds to 1st place, and 0 corresponds to last place in the match. It is calculated off of maxPlace, not numGroups, so it is possible to have missing chunks in a match.<br>


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Distribution plot of the DBNOs column on an explicit 10x6 axes.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot/displot once the target seaborn version is confirmed.
fig, ax = plt.subplots(figsize=(10, 6))
sns.distplot(train["DBNOs"], hist=True, ax=ax)
plt.show()

From the above graph, it can be clearly seen that the values of DBNOs vary from 0 to 50, but most values lie between 0 and 15.

In [None]:
# Box-and-whisker plot of DBNOs, used to eyeball outliers.

dbnos = train["DBNOs"]
dbnos.plot(kind="box", figsize=(10, 6))
plt.show()

From the above boxplot, we can clearly see the presence of outliers in the DBNOs variable.

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the kills column.
kills = train["kills"]
kills.describe()

In [None]:
# Distribution of the number of kills per player.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot/displot once the target seaborn version is confirmed.

fig, ax = plt.subplots(figsize=(10, 6))
sns.distplot(train["kills"], hist=True, ax=ax)
ax.set_title("No. of Kills")
plt.show()

In [None]:
# Mean number of kills for each matchType, shown as a bar chart.

mean_kills_by_type = train.groupby("matchType")["kills"].mean()

fig, ax = plt.subplots(figsize=(10, 6))
mean_kills_by_type.plot(kind="bar", color="lightgreen", ax=ax)
ax.set_title("kills wrt matchType")
plt.show()

The mean number of kills is highest for **normal-solo** and **normal-solo-fpp**.

In [None]:
# Correlation heatmap of the numeric columns.
# At this point `train` still contains non-numeric columns (e.g. matchType,
# which holds game-mode strings). Calling DataFrame.corr() on such a frame
# raises on pandas >= 2.0, so restrict to numeric dtypes explicitly; this also
# works unchanged on older pandas versions.
numeric_corr = train.select_dtypes(include="number").corr()

fig, ax = plt.subplots(figsize=(20, 15))
ax = sns.heatmap(numeric_corr, annot=True, ax=ax)

Inferences that can be drawn from the above heatmap are as follows:

* killPoints and winPoints are highly correlated with each other having the correlation coefficient of 0.98.
* walkDistance and winPlacePerc are strongly correlated having the correlation coefficient of 0.81.
* kills and killStreaks are strongly correlated having the correlation coefficient of 0.80.
* kills and damageDealt are strongly correlated with the correlation coefficient of 0.89.

In [None]:
# Mean match duration (in seconds) for each matchType, shown as a bar chart.
match_dur = train.groupby('matchType')['matchDuration'].agg('mean')

fig, ax = plt.subplots(figsize=(9, 7))
sns.barplot(x=match_dur.index, y=match_dur, ax=ax)
# Rotate the existing tick labels instead of re-setting them with
# set_xticklabels; the rotation is passed as a number, not the string '45'.
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('mean match-type duration')

# Save BEFORE showing: with the notebook's inline backend, plt.show() renders
# and closes the figure, so a later plt.savefig() would write a blank image.
fig.savefig('duration')
plt.show()


## Feature Engineering

In [None]:
def add_engineered_features(df):
    """Add the engineered feature columns to ``df`` and return it.

    The same transformation is applied to the training and the test frame so
    that both end up with identical columns in identical order.

    Added columns:
      * sums/differences: ``boosts+heals``, ``matchDuration_min``, ``teamwork``,
        ``revives-teamKills``, ``total_distance``
      * ratios named ``"<numerator>/<denominator>"`` for every pair listed in
        ``ratio_pairs`` below.

    Division by zero produces ``inf`` (x/0) or ``NaN`` (0/0) in pandas; every
    ratio column has those values replaced with 0. The original cells forgot to
    do this for ``killStreaks/kills`` (they cleaned ``walkDistance/kills`` a
    second time instead), leaving NaN/inf in that feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw columns used below. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns appended.
    """
    df['boosts+heals'] = df['boosts'] + df['heals']
    df['matchDuration_min'] = df['matchDuration'] / 60
    df['teamwork'] = df['assists'] + df['revives']
    df['revives-teamKills'] = df['revives'] - df['teamKills']
    df['total_distance'] = df['swimDistance'] + df['rideDistance'] + df['walkDistance']

    # (numerator, denominator) pairs; order matters because it fixes the
    # column order seen by the model.
    ratio_pairs = [
        ('headshotKills', 'kills'),
        ('killPlace', 'maxPlace'),
        ('walkDistance', 'heals'),
        ('walkDistance', 'kills'),
        ('killStreaks', 'kills'),
        ('total_distance', 'weaponsAcquired'),
        ('heals', 'walkDistance'),
        ('kills', 'walkDistance'),
        ('killPlace', 'kills'),
        ('walkDistance', 'matchDuration'),
    ]
    for numerator, denominator in ratio_pairs:
        ratio = df[numerator] / df[denominator]
        # Assign the cleaned Series back rather than calling fillna/replace
        # with inplace=True on a column selection, which is not guaranteed to
        # update the parent frame (pandas copy-on-write).
        df[f'{numerator}/{denominator}'] = (
            ratio.replace([np.inf, -np.inf], np.nan).fillna(0)
        )
    return df


train = add_engineered_features(train)
test = add_engineered_features(test)


In [None]:
# Identifier columns and the matchType string are removed from both frames
# before modeling.
dropped_cols = ["Id", "matchId", "groupId", "matchType"]
train = train.drop(columns=dropped_cols)
test = test.drop(columns=dropped_cols)

## Modeling

In [None]:
# Separate the prediction target from the feature matrix.
target_col = 'winPlacePerc'
y = train[target_col]
X = train.drop(columns=[target_col])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible.
test_size, seed = 0.20, 42

split_parts = train_test_split(X, y, test_size=test_size, random_state=seed)
X_train, X_test, y_train, y_test = split_parts

Deleting train dataframe as it will help free up space in RAM

In [None]:
import gc
# Drop the name `train`; X / y (and the split parts) were already derived from it.
del train
# Ask the garbage collector to run now; the returned count is shown as the cell output.
gc.collect()

Using light-gbm with the following parameters

In [None]:
# LightGBM hyper-parameters for the regressor built in the next cell.
# NOTE(review): per the LightGBM docs, bagging_fraction only takes effect when
# bagging_freq is non-zero; bagging_freq is not set here, so bagging may be
# inactive -- confirm whether that is intended.
params2 = dict(
    objective="regression",   # regression objective
    metric="mae",             # evaluate with mean absolute error
    num_leaves=150,
    learning_rate=0.03,
    bagging_fraction=0.9,
    bagging_seed=0,
    num_threads=4,
    colsample_bytree=0.5,
    min_data_in_leaf=1900,
    lambda_l2=9,
)

In [None]:
import lightgbm as lgb

# Gradient-boosted regressor with 2000 trees, configured from params2.
reg2 = lgb.LGBMRegressor(n_estimators=2000, **params2)

In [None]:
# Train on the 80% training split; the fitted estimator is the cell output.
reg2.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out split.
# NOTE(review): no early stopping is configured in fit(), so best_iteration_
# presumably does not limit the number of trees used -- confirm.
best_iter = reg2.best_iteration_
pred2 = reg2.predict(X_test, num_iteration=best_iter)

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error on the held-out split (shown as the cell output).
mean_absolute_error(y_true=y_test, y_pred=pred2)

That's a good score. So, I will continue with prediction on the test set.

In [None]:
# Predict winPlacePerc for the competition test frame, which carries the same
# engineered columns as the training features.
test_best_iter = reg2.best_iteration_
predictions = reg2.predict(test, num_iteration=test_best_iter)

In [None]:
# Display the predicted values for a quick visual sanity check.
predictions

In [None]:
# Overwrite the placeholder target column with the model's predictions.
# Values are assigned positionally, so this assumes the rows of
# sample_submission are in the same order as the rows of test.
sample_submission = sample_submission.assign(winPlacePerc=predictions)

In [None]:
# Display the submission frame to check it before writing it to disk.
sample_submission

In [None]:
# Write the submission file to the working directory, without the index column.
sample_submission.to_csv(path_or_buf='submission.csv', index=False)