In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# Any results you write to the current directory are saved as output.

**Data fields**
- DBNOs - Number of enemy players knocked.
- assists - Number of enemy players this player damaged that were killed by teammates.
- boosts - Number of boost items used.
- damageDealt - Total damage dealt. Note: Self inflicted damage is subtracted.
- headshotKills - Number of enemy players killed with headshots.
- heals - Number of healing items used.
- Id - Player’s Id
- killPlace - Ranking in match of number of enemy players killed.
- killPoints - Kills-based external ranking of player. (Think of this as an Elo ranking where only kills matter.) If there is a value other than -1 in rankPoints, then any 0 in killPoints should be treated as a “None”.
- killStreaks - Max number of enemy players killed in a short amount of time.
- kills - Number of enemy players killed.
- longestKill - Longest distance between player and player killed at time of death. This may be misleading, as downing a player and driving away may lead to a large longestKill stat.
- matchDuration - Duration of match in seconds.
- matchId - ID to identify match. There are no matches that are in both the training and testing set.
- matchType - String identifying the game mode that the data comes from. The standard modes are “solo”, “duo”, “squad”, “solo-fpp”, “duo-fpp”, and “squad-fpp”; other modes are from events or custom matches.
- rankPoints - Elo-like ranking of player. This ranking is inconsistent and is being deprecated in the API’s next version, so use with caution. Value of -1 takes place of “None”.
- revives - Number of times this player revived teammates.
- rideDistance - Total distance traveled in vehicles measured in meters.
- roadKills - Number of kills while in a vehicle.
- swimDistance - Total distance traveled by swimming measured in meters.
- teamKills - Number of times this player killed a teammate.
- vehicleDestroys - Number of vehicles destroyed.
- walkDistance - Total distance traveled on foot measured in meters.
- weaponsAcquired - Number of weapons picked up.
- winPoints - Win-based external ranking of player. (Think of this as an Elo ranking where only winning matters.) If there is a value other than -1 in rankPoints, then any 0 in winPoints should be treated as a “None”.
- groupId - ID to identify a group within a match. If the same group of players plays in different matches, they will have a different groupId each time.
- numGroups - Number of groups we have data for in the match.
- maxPlace - Worst placement we have data for in the match. This may not match with numGroups, as sometimes the data skips over placements.
- winPlacePerc - The target of prediction. This is a percentile winning placement, where 1 corresponds to 1st place, and 0 corresponds to last place in the match. It is calculated off of maxPlace, not numGroups, so it is possible to have missing chunks in a match.

In [1]:
# Load the training CSV; the "Id" column becomes the DataFrame index.
train_path = "../input/train_V2.csv"
train_data = pd.read_csv(train_path, index_col="Id")
# Show the first rows as the cell output.
train_data.head()

In [1]:
# Load the test CSV; the "Id" column becomes the DataFrame index.
test_path = "../input/test_V2.csv"
test_data = pd.read_csv(test_path, index_col="Id")
# Show the first rows as the cell output.
test_data.head()

In [1]:
# Print a concise summary (columns and dtypes) of the raw training frame.
train_data.info()

In [1]:
# Count the missing values in each training column, then report one
# "<column>: <count>" line per column.
columns = train_data.columns
null_counts = {column: train_data[column].isna().sum() for column in columns}
for column, n_missing in null_counts.items():
    print(f"{column}: {n_missing}")

Since only one row contains a null value, we can safely drop that row.

In [1]:
# Drop every row that contains a missing value.
# dropna() returns a new DataFrame and leaves train_data untouched, so no
# explicit .copy() is needed first (it would only duplicate the frame in memory).
data = train_data.dropna()

In [1]:
# Re-check missing values after the drop: one "<column>: <count>" line per column.
columns = data.columns
missing_after_drop = data.isna().sum()
for column in columns:
    print(f"{column}: {missing_after_drop[column]}")

In [1]:
# Keep only the numeric columns of the cleaned training frame.
num_data = data.select_dtypes(include='number')

In [1]:
# Print a concise summary (columns and dtypes) of the numeric-only frame.
num_data.info()

In [1]:
# plt.figure(figsize=(15, 15))
# sns.heatmap(data=num_data.corr(), annot=True, fmt='.1f')
# plt.show()

In [1]:
# plt.figure(figsize=(15, 15))
# sns.scatterplot(x=num_data.walkDistance, y=num_data.winPlacePerc)
# plt.show()

From the walkDistance vs. winPlacePerc scatter plot above (currently commented out), we can see that as a player's walking distance increases, their chances of a better placement increase. However, some players achieve a good placement despite walking very little. This may be because they were playing in squad or duo mode.

In [1]:
# plt.figure(figsize=(15, 15))
# sns.scatterplot(x=num_data.damageDealt, y=num_data.kills)
# plt.show()

In [1]:
# Average number of kills across the cleaned training rows
print(data["kills"].mean())

In [1]:
# Highest kill count recorded in any single row of the cleaned training data
print(data["kills"].max())

In [1]:
# Split the numeric frame into the prediction target (y) and the feature matrix (X).
y = num_data['winPlacePerc']
X = num_data.drop('winPlacePerc', axis=1)

In [1]:
# Build the test feature matrix from exactly the columns the model is trained
# on (X.columns), in the same order. Selecting by name rather than by dtype
# makes a train/test schema mismatch fail here with a KeyError instead of
# reaching predict() with missing, extra or reordered features.
test_X = test_data[X.columns]

In [1]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision-tree regressor on the numeric training features.
# random_state is fixed because scikit-learn permutes features at each split,
# so an unseeded tree (and the resulting submission) can differ between runs.
model = DecisionTreeRegressor(random_state=42)
model.fit(X, y)

In [1]:
# Predict the target for every row of the test feature matrix.
output = model.predict(test_X)

In [1]:
# Assemble the submission table: one row per test index value with its prediction.
submission_columns = {'Id': test_X.index, 'winPlacePerc': output}
out = pd.DataFrame(submission_columns)

In [1]:
# Write the submission file to the working directory, without the positional row index.
out.to_csv('submission.csv', index=False)