In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training data from the competition input directory
train = pd.read_csv('../input/train_V2.csv')

# Preview the first rows
train.head()

In [None]:
# Summarise the columns and dtypes of the training frame
train.info()

# Missing Data

In [None]:
# Count the missing values in each column
train.isnull().sum()

In [None]:
# Inspect the row(s) whose target winPlacePerc is missing
train[train['winPlacePerc'].isnull()]

In [None]:
# The record with a missing target looks incorrect, so remove it.
# Dropping by null-ness rather than by a hard-coded index label keeps this
# cell re-runnable (a second run is a no-op instead of a KeyError) and
# independent of the row order of the CSV.
train.dropna(subset=['winPlacePerc'], inplace=True)

# EDA

## Kills

In [None]:
# Summary statistics of kills per player
train['kills'].describe()

In [None]:
# Headline numbers for kills: mean, 99th percentile and maximum
kills_col = train['kills']
kills_stats = (kills_col.mean(), kills_col.quantile(0.99), kills_col.max())
print('The average number of kills is {:.4f}. 99% players have {} kills or less. The most kills ever is {:.0f}.'
      .format(*kills_stats))

In [None]:
# Count plot of kills with the top 1% collapsed into a single tail bucket.
# Only the kills column is copied and relabelled; assigning through
# `data.loc[mask] = ...` without a column would overwrite every column of
# the matched rows.
data = train[['kills']].copy()
kills_cap = data['kills'].quantile(0.99)
# kills are whole numbers, so "greater than the cap" starts at floor(cap) + 1
tail_label = '{:.0f}+'.format(np.floor(kills_cap) + 1)
in_tail = data['kills'] > kills_cap
# explicit bar order: numeric kill counts ascending, tail bucket last
kill_order = [str(k) for k in np.sort(data.loc[~in_tail, 'kills'].unique())]
if in_tail.any():
    kill_order.append(tail_label)
data['kills'] = data['kills'].astype('str').where(~in_tail, tail_label)
plt.figure(figsize=(15, 10))
sns.countplot(x=data['kills'], order=kill_order)
plt.title('Kill Count', fontsize=20)

Most players can't even get a kill in the game.
Let's check how much damage they did before they died.

In [None]:
# Players without a single kill; `data` is reused by the following cells
no_kill_mask = train['kills'] == 0
data = train[no_kill_mask].copy()
plt.figure(figsize=(15, 10))
sns.distplot(data['damageDealt'])
plt.title('Damage dealt by Player who killed 0 enemy')

In [None]:
# Summary statistics of damageDealt for the current `data` frame
data['damageDealt'].describe()

About 50% of them dealt 10 damage or less.

In [None]:
# Rows in `data` that finished first, and the subset of those with no damage dealt
winners = data[data['winPlacePerc'] == 1]
print('{:.0f} zero-killers ({:.4%}) won the game.'
      .format(len(winners), len(winners) / len(data)))

harmless_winners = winners[winners['damageDealt'] == 0]
print('{:.0f} zero-killers ({:.4%}) won the game with 0 damage dealt.'
      .format(len(harmless_winners), len(harmless_winners) / len(data)))

Surprisingly, a few of them managed to win the game even though they didn't get a single kill.

In [None]:
# Joint plot of kills against the target winPlacePerc
sns.jointplot(data=train, x='winPlacePerc', y='kills', size=10, ratio=3)

Kills are correlated with winPlacePerc, as expected.

In [None]:
# Bucket kills into ordered categories and compare winPlacePerc across buckets
kills = train.copy()

kill_bin_edges = [-1, 0, 2, 5, 10, 100]
kill_bin_labels = ['0_kills','1-2_kills', '3-5_kills', '6-10_kills', '10+_kills']
kills['killsCategories'] = pd.cut(kills['kills'], bins=kill_bin_edges, labels=kill_bin_labels)

plt.figure(figsize=(15,8))
sns.boxplot(data=kills, x="killsCategories", y="winPlacePerc")

## Running

In [None]:
# Summary statistics of walkDistance
train['walkDistance'].describe()

In [None]:
# Headline numbers for walkDistance: mean, 99th percentile and maximum
walk_col = train['walkDistance']
walk_stats = (walk_col.mean(), walk_col.quantile(0.99), walk_col.max())
print('The average walking distance is {:.1f}m. 99% players walked {:.1f}m or less. The max distance is {:.1f}m.'
     .format(*walk_stats))

In [None]:
# Work on a full copy of train; the zero-walk-distance cell below reads this `data`
data = train.copy()
plt.figure(figsize=(15, 10))
# distribution of walkDistance over all players
sns.distplot(data['walkDistance'])
plt.title('Walking Distance')

In [None]:
# Count and share of rows in `data` with a walkDistance of exactly zero
stationary_rows = data[data['walkDistance'] == 0]
print('{:.0f} ({:.4%}) players walked 0 meter. That means they died just reached the ground or afk.'
     .format(len(stationary_rows), len(stationary_rows) / len(data)))

In [None]:
# Joint plot of walkDistance against the target winPlacePerc
sns.jointplot(data=train, x='winPlacePerc', y='walkDistance', size=10, ratio=3)

Walk Distance also has a high correlation with winPlacePerc.

## Heals

In [None]:
# Headline numbers (mean, 99th percentile, maximum) for heals, then for boosts
heals_col = train['heals']
boosts_col = train['boosts']
print('The average number of heal items used is {:.1f}. 99% players used {:.1f} or less. Max number of used heal items is {:.0f}.'
     .format(heals_col.mean(), heals_col.quantile(0.99), heals_col.max()))
print('')
print('The average number of boost items used is {:.1f}. 99% players used {:.1f} or less. Max number of used boost items is {:.0f}.'
     .format(boosts_col.mean(), boosts_col.quantile(0.99), boosts_col.max()))

In [None]:
# Summary statistics of heals and boosts side by side
train[['heals', 'boosts']].describe()

In [None]:
# this will take a few minutes

# data = train.copy()

# f, ax1 = plt.subplots(figsize=(20, 10))
# l1 = sns.pointplot(data=data, x='heals', y='winPlacePerc', color='lime', alpha=0.8)
# l2 = sns.pointplot(data=data, x='boosts', y='winPlacePerc', color='blue', alpha=0.8)
# plt.legend([l1, l2], ['heals', 'boosts'], loc='lower center')
# plt.xlabel('Number of heal/boost items', fontsize=20, color='blue')
# plt.ylabel('Win Percentage', fontsize=20, color='blue')
# plt.title('Heals vs Boosts', fontsize=20, color='blue')
# plt.grid()

In [None]:
# Joint plot of heals against the target winPlacePerc.
# Plot from `train` directly: the only live assignment of `data` before this
# cell is in the walking-distance section, so depending on it makes this cell
# sensitive to execution order.
sns.jointplot(data=train, x='winPlacePerc', y='heals', size=10, ratio=3, color='lime')

In [None]:
# Joint plot of boosts against the target winPlacePerc.
# Plot from `train` directly instead of a `data` variable left over from an
# earlier section, so the cell does not depend on execution order.
sns.jointplot(data=train, x='winPlacePerc', y='boosts', size=10, ratio=3)

## Pearson Correlation between variables

In [None]:
# Pearson correlation between all numeric columns.
# The numeric columns are selected explicitly: the frame still holds string
# columns (Id, groupId, matchId, matchType) and DataFrame.corr() no longer
# drops non-numeric columns silently in pandas >= 2.0.
f, ax = plt.subplots(figsize=(15, 15))
numeric_corr = train.select_dtypes(include=[np.number]).corr()
sns.heatmap(numeric_corr, annot=True, linewidth=0.5, fmt='.1f')

In [None]:
# Top 5 variables most positively correlated with winPlacePerc.
# k is 6 because the target itself (correlation 1.0) is always among the largest.
k = 6
f, ax = plt.subplots(figsize=(10, 10))
# Compute the correlation matrix once, on numeric columns only (DataFrame.corr()
# raises on string columns in pandas >= 2.0), and slice it rather than running
# a second correlation pass over the selected columns.
numeric_corr = train.select_dtypes(include=[np.number]).corr()
cols = numeric_corr.nlargest(k, 'winPlacePerc')['winPlacePerc'].index
cm = numeric_corr.loc[cols, cols]
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', 
                 annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)

walkDistance, boosts, weaponsAcquired, damageDealt, heals have the highest correlation with winPlacePerc.

# Feature Engineering

## Number of Players

In [None]:
# Number of rows sharing each matchId, broadcast back onto every row of that match
train['num_players'] = train.groupby('matchId')['matchId'].transform('count')

In [None]:
# Distribution of match sizes, restricted to matches with at least 75 players
large_match_mask = train['num_players'] >= 75
data = train[large_match_mask].copy()
plt.figure(figsize=(15, 10))
sns.countplot(data['num_players'])
plt.title('Number of Players', fontsize=20)

The majority of games have nearly 100 players.

In [None]:
# Scale kills, damageDealt, maxPlace and matchDuration by a per-row factor
# derived from the match size: 1.0 for a 100-player match, larger for smaller ones.
player_scale = (100 - train['num_players']) / 100 + 1
for raw_col in ['kills', 'damageDealt', 'maxPlace', 'matchDuration']:
    train[raw_col + 'Norm'] = train[raw_col] * player_scale

# raw columns next to their scaled counterparts
norm_preview_cols = ['Id', 'kills', 'killsNorm', 'damageDealt', 'damageDealtNorm',
                     'maxPlace', 'maxPlaceNorm', 'matchDuration', 'matchDurationNorm']
train[norm_preview_cols].head()

## heals, boosts, travel distance

In [None]:
# Combined item usage and combined distance travelled (walk + ride + swim)
train['healsAndBoosts'] = train['heals'].add(train['boosts'])
train['totalDistance'] = train['walkDistance'].add(train['rideDistance']).add(train['swimDistance'])

In [None]:
# Flag players who scored at least one kill while their total distance is zero
has_kill = train['kills'] > 0
never_moved = train['totalDistance'] == 0
train['killsWithoutMoving'] = has_kill & never_moved

In [None]:
# Share of kills that were headshots; a 0/0 division yields NaN, which becomes 0
train['headshot_rate'] = (train['headshotKills'] / train['kills']).fillna(0)

# Outlier Detection

## kills without movement
no movement & no kill => maybe just AFK<br>
no movement & get kills => likely a cheater

In [None]:
# Size of the flagged set, followed by a sample of its rows
stationary_killers = train[train['killsWithoutMoving']]
display(stationary_killers.shape)
stationary_killers.head(10)

In [None]:
# Remove, in place, every row flagged as killsWithoutMoving
stationary_killer_idx = train.loc[train['killsWithoutMoving']].index
train.drop(index=stationary_killer_idx, inplace=True)

## Anomalies in roadKills

In [None]:
# Summary statistics of roadKills
train['roadKills'].describe()

In [None]:
# Number of players for each distinct roadKills value
train['roadKills'].value_counts()

In [None]:
# Remove, in place, players with more than 10 roadKills
road_kill_idx = train.loc[train['roadKills'] > 10].index
train.drop(index=road_kill_idx, inplace=True)

## Anomalies in aim (much more kills than others)

In [None]:
# Number of players per kill count
plt.figure(figsize=(15, 5))
sns.countplot(data=train, x='kills')
plt.title('Kills', fontsize=20)

In [None]:
# Players with more than 30 kills: how many there are, then a sample
many_kills = train[train['kills'] > 30]
display(many_kills.shape)
many_kills.head(10)

In [None]:
# Remove, in place, players with more than 30 kills
many_kills_idx = train.loc[train['kills'] > 30].index
train.drop(index=many_kills_idx, inplace=True)

## Anomalies in aim (headshot script)

In [None]:
# Distribution of headshot_rate in 10 bins
plt.figure(figsize=(15, 5))
sns.distplot(train['headshot_rate'], bins=10)

Most players score in the 0%-10% region.<br>
Some players got 100% headshot rate.

In [None]:
# Players with a 100% headshot rate over more than 9 kills: count, then a sample
perfect_aim = train[(train['headshot_rate'] == 1) & (train['kills'] > 9)]
display(perfect_aim.shape)
perfect_aim.head(10)

Other data fields look normal. It's hard to distinguish if these players are cheating.

## Anomalies in aim (longest kill)

In [None]:
# Distribution of longestKill in 10 bins
plt.figure(figsize=(15, 5))
sns.distplot(train['longestKill'], bins=10)

Most kills happened within 500 meters

In [None]:
# Players whose longestKill is 1000 or more: count, then a sample
long_range = train[train['longestKill'] >= 1000]
display(long_range.shape)
long_range.head(10)

In [None]:
# Remove, in place, players whose longestKill is 1000 or more
long_range_idx = train.loc[train['longestKill'] >= 1000].index
train.drop(index=long_range_idx, inplace=True)

## Anomalies in traveling

In [None]:
# Summary statistics of the three distance columns and their total
train[['walkDistance', 'rideDistance', 'swimDistance', 'totalDistance']].describe()

In [None]:
# walkDistance: distribution in 10 bins
plt.figure(figsize=(15, 5))
sns.distplot(train['walkDistance'], bins=10)

In [None]:
# Players with a walkDistance of 10000 or more: count, then a sample
far_walkers = train[train['walkDistance'] >= 10000]
display(far_walkers.shape)
far_walkers.head(10)

In [None]:
# Remove, in place, players with a walkDistance of 10000 or more
far_walker_idx = train.loc[train['walkDistance'] >= 10000].index
train.drop(index=far_walker_idx, inplace=True)

In [None]:
# rideDistance: distribution in 10 bins
plt.figure(figsize=(15, 5))
sns.distplot(train['rideDistance'], bins=10)

In [None]:
# Players with a rideDistance of 20000 or more: count, then a sample
far_riders = train[train['rideDistance'] >= 20000]
display(far_riders.shape)
far_riders.head(10)

In [None]:
# Remove, in place, players with a rideDistance of 20000 or more
far_rider_idx = train.loc[train['rideDistance'] >= 20000].index
train.drop(index=far_rider_idx, inplace=True)

In [None]:
# swimDistance: distribution in 10 bins
plt.figure(figsize=(15, 5))
sns.distplot(train['swimDistance'], bins=10)

In [None]:
# Players with a swimDistance of 2000 or more: count, then a sample
far_swimmers = train[train['swimDistance'] >= 2000]
display(far_swimmers.shape)
far_swimmers.head(10)

In [None]:
# Remove, in place, players with a swimDistance of 2000 or more
far_swimmer_idx = train.loc[train['swimDistance'] >= 2000].index
train.drop(index=far_swimmer_idx, inplace=True)

## Anomalies in supplies (weaponsAcquired)

In [None]:
# Distribution of weaponsAcquired in 100 bins
plt.figure(figsize=(15, 5))
sns.distplot(train['weaponsAcquired'], bins=100)

Most players acquired less than 10 weapons.

In [None]:
# Players who acquired 80 or more weapons: count, then a sample
weapon_hoarders = train[train['weaponsAcquired'] >= 80]
display(weapon_hoarders.shape)
weapon_hoarders.head(10)

In [None]:
# Remove, in place, players who acquired 80 or more weapons
weapon_hoarder_idx = train.loc[train['weaponsAcquired'] >= 80].index
train.drop(index=weapon_hoarder_idx, inplace=True)

## Anomalies in supplies (heals)

In [None]:
# Distribution of heals in 10 bins
plt.figure(figsize=(15, 5))
sns.distplot(train['heals'], bins=10)

In [None]:
# Players who used 40 or more heals: count, then a sample
heavy_healers = train[train['heals'] >= 40]
display(heavy_healers.shape)
heavy_healers.head(10)

In [None]:
# Remove, in place, players who used 40 or more heals
heavy_healer_idx = train.loc[train['heals'] >= 40].index
train.drop(index=heavy_healer_idx, inplace=True)

# Categorical Variables

In [None]:
# Number of distinct matchType values
n_match_types = train['matchType'].nunique()
print('There are {} different Match types in the dataset.'
     .format(n_match_types))

In [None]:
# Replace matchType with one indicator column per match type
train = pd.get_dummies(train, columns=['matchType'])

# Preview the generated indicator columns (every column whose name contains 'matchType')
matchType_encoding = train.filter(like='matchType')
matchType_encoding.head()

In [None]:
# Convert the two id columns to categorical dtype
id_cols = ['groupId', 'matchId']
for id_col in id_cols:
    train[id_col] = train[id_col].astype('category')

# Store the integer category code of each id in a new '<name>_cat' column
for id_col in id_cols:
    train[id_col + '_cat'] = train[id_col].cat.codes

# The original id columns are no longer needed
train.drop(columns=id_cols, inplace=True)

train[['groupId_cat', 'matchId_cat']].head()

In [None]:
# Remove the Id column in place
train.drop('Id', axis=1, inplace=True)

# Preparation for Random Forest

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target column from the feature columns
target_col = 'winPlacePerc'
y_train = train[target_col]
X_train = train.drop(columns=[target_col])

# Remove the name `train`; only X_train / y_train are used from here on
del train

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

In [None]:
# Hold out 20% of the rows for validation. The fixed random_state makes the
# split, and every score computed from it, reproducible across kernel restarts.
# NOTE: this cell rebinds X_train / y_train, so running it twice shrinks the
# training set again; re-run from the cell that builds X_train instead.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
# Report the shape of each split
split_shapes = [('X_train', X_train), ('y_train', y_train),
                ('X_test', X_test), ('y_test', y_test)]
for split_name, split_obj in split_shapes:
    print(split_name + ' shape:', split_obj.shape)

In [None]:
from sklearn.metrics import mean_absolute_error

def print_score(m: RandomForestRegressor):
    """Print the mean absolute error of a fitted model on the train and validation splits.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    at call time, so the result reflects whatever those names are currently
    bound to.

    Args:
        m: a fitted estimator exposing ``predict``. If it has an ``oob_score_``
           attribute, that value is reported as well.

    Returns:
        None. The scores are printed as a single list.
    """
    # sklearn metrics take (y_true, y_pred); MAE is symmetric, so the values
    # match the swapped call, but the conventional order avoids surprises if
    # the metric is ever replaced by an asymmetric one.
    res = ['mae train: ', mean_absolute_error(y_train, m.predict(X_train)),
           'mae val: ', mean_absolute_error(y_test, m.predict(X_test))]
    # label the out-of-bag score so it cannot be mistaken for another MAE
    if hasattr(m, 'oob_score_'): res.extend(['oob score: ', m.oob_score_])
    print(res)

In [None]:
# base model
# random_state fixes the bootstrap samples and feature sub-sampling, so the
# scores and the feature importances derived from this model are reproducible.
m1 = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features='sqrt',
                           n_jobs=-1, random_state=42)
m1.fit(X_train, y_train)
print_score(m1)

## Feature Importance

In [None]:
# Rank the features by the importance the base model assigned to them
importances = pd.DataFrame({'importance': m1.feature_importances_}, index=X_train.columns)
importances = importances.sort_values('importance', ascending=False)
# move the feature names out of the index into an 'index' column
importances = importances.reset_index()

# top 10 most predictive features
importances[:10]

In [None]:
# Horizontal bars for the first 20 rows of `importances`
plt.figure(figsize=(20, 10))
sns.barplot(data=importances[:20], x='importance', y='index')

In [None]:
# Keep only the features whose importance is at least 0.005
is_significant = importances['importance'] >= 0.005
to_keep = importances[is_significant]
print('Significant features:', len(to_keep))
to_keep

In [None]:
# Restrict both splits to the columns named in to_keep['index']
X_train, X_test = (split[to_keep['index']] for split in (X_train, X_test))

## Second Model

In [None]:
# train with significant features only
# random_state fixes the bootstrap samples and feature sub-sampling, so the
# reported scores and the submission produced from this model are reproducible.
m2 = RandomForestRegressor(n_estimators=50, min_samples_leaf=3, max_features='sqrt',
                           n_jobs=-1, random_state=42)
m2.fit(X_train, y_train)
print_score(m2)

In [None]:
# Rank the retained features by the importance the second model assigned to them
importances_to_keep = pd.DataFrame({'importance': m2.feature_importances_}, index=to_keep['index'])
importances_to_keep = importances_to_keep.sort_values('importance', ascending=False)
importances_to_keep = importances_to_keep.reset_index()

importances_to_keep[:10]

In [None]:
# Horizontal bars for every row of `importances_to_keep`
plt.figure(figsize=(20, 10))
sns.barplot(data=importances_to_keep, x='importance', y='index')

## Correlations

In [None]:
from scipy.cluster import hierarchy as hc
from scipy.stats import spearmanr

In [None]:
# Dendrogram: hierarchical clustering of the X_train columns by rank correlation
# Spearman correlation matrix of the columns, rounded to 4 decimals
# (presumably so tiny float asymmetries do not trip squareform's checks — TODO confirm)
corr = np.round(spearmanr(X_train).correlation, 4)
# turn correlation into a distance (1 - corr) and convert the square matrix to condensed form
corr_condensed = hc.distance.squareform(1 - corr)
# average-linkage clustering on the condensed distances
z = hc.linkage(corr_condensed, method='average')
fig = plt.figure(figsize=(15, 10))
# leaves are labelled with the column names; the tree is drawn with orientation='left'
dendrogram = hc.dendrogram(z, labels=X_train.columns, orientation='left', leaf_font_size=15)

In [None]:
# Heatmap of the pairwise correlations between the X_train columns
plt.figure(figsize=(10, 10))
corr = X_train.corr()
sns.heatmap(corr)

# Prediction

In [None]:
# Apply to the test set the same feature engineering that was applied to train
test = pd.read_csv('../input/test_V2.csv')

# headshot rate; a 0/0 division yields NaN, which becomes 0
test['headshot_rate'] = (test['headshotKills'] / test['kills']).fillna(0)
test['totalDistance'] = test['rideDistance'] + test['walkDistance'] + test['swimDistance']
test['num_players'] = test.groupby('matchId')['matchId'].transform('count')

# normalize by match size, with the same scaling factor used for train
player_scale = (100 - test['num_players']) / 100 + 1
for raw_col in ['kills', 'damageDealt', 'maxPlace', 'matchDuration']:
    test[raw_col + 'Norm'] = test[raw_col] * player_scale

test['healsAndBoosts'] = test['heals'] + test['boosts']
test['killsWithoutMoving'] = ((test['kills'] > 0) & (test['totalDistance'] == 0))

# one hot encode matchType exactly as was done for train; without this step any
# matchType_* column that survives feature selection raises a KeyError below
test = pd.get_dummies(test, columns=['matchType'])

# turn groupId, matchId into categorical types and take their integer codes
# NOTE(review): these codes are assigned independently of the training set, so
# the same integer does not refer to the same group/match in train and test.
for id_col in ['groupId', 'matchId']:
    test[id_col] = test[id_col].astype('category')
    test[id_col + '_cat'] = test[id_col].cat.codes

# Keep exactly the features the model was trained on, in the same order
feature_cols = list(to_keep['index'])
missing_cols = [col for col in feature_cols if col not in test.columns]
# A match type absent from the test set is legitimately all zeros; any other
# missing column means a feature-engineering step was forgotten above.
unexpected = [col for col in missing_cols if not col.startswith('matchType_')]
if unexpected:
    raise KeyError('Test set is missing engineered features: {}'.format(unexpected))
for col in missing_cols:
    test[col] = 0
test_pred = test[feature_cols].copy()

# Fill NaN with 0 (temporary)
test_pred.fillna(0, inplace=True)
test_pred.head()

In [None]:
# Predict with the second model and clamp the predictions to the range [0, 1]
raw_predictions = m2.predict(test_pred)
predictions = np.clip(raw_predictions, 0.0, 1.0)

# submission: one row per test Id, written without the index column
pred_df = pd.DataFrame({'Id': test['Id'], 'winPlacePerc': predictions})
pred_df.to_csv('submission.csv', index=False)