In [None]:
import pandas as pd

# Read the training and test CSV files (paths are relative to the notebook) into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train_V2.csv', '../input/test_V2.csv')
)

In [None]:
# Summary statistics of the training frame, transposed so each column becomes a row.
train_df.describe().transpose()

In [None]:
# Load libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

%matplotlib inline

import warnings
# Install a blanket "ignore" filter: every warning category is suppressed from here on.
# NOTE(review): this also hides deprecation and library-configuration warnings
# (e.g. conflicting model parameters) — consider narrowing the filter.
warnings.filterwarnings("ignore")

## Data Description

* **DBNOs** - Number of enemy players knocked.
* **assists** - Number of enemy players this player damaged that were killed by teammates.
* **boosts** - Number of boost items used.
* **damageDealt** - Total damage dealt. Note: Self inflicted damage is subtracted.
* **headshotKills** - Number of enemy players killed with headshots.
* **heals** - Number of healing items used.
* **Id** - Player’s Id
* **killPlace** - Ranking in match of number of enemy players killed.
* **killPoints** - Kills-based external ranking of player. (Think of this as an Elo ranking where only kills matter.) If there is a value other than -1 in rankPoints, then any 0 in killPoints should be treated as a “None”.
* **killStreaks** - Max number of enemy players killed in a short amount of time.
* **kills** - Number of enemy players killed.
* **longestKill** - Longest distance between player and player killed at time of death. This may be misleading, as downing a player and driving away may lead to a large longestKill stat.
* **matchDuration** - Duration of match in seconds.
* **matchId** - ID to identify match. There are no matches that are in both the training and testing set.
* **matchType** - String identifying the game mode that the data comes from. The standard modes are “solo”, “duo”, “squad”, “solo-fpp”, “duo-fpp”, and “squad-fpp”; other modes are from events or custom matches.
* **rankPoints** - Elo-like ranking of player. This ranking is inconsistent and is being deprecated in the API’s next version, so use with caution. Value of -1 takes place of “None”.
* **revives** - Number of times this player revived teammates.
* **rideDistance** - Total distance traveled in vehicles measured in meters.
* **roadKills** - Number of kills while in a vehicle.
* **swimDistance** - Total distance traveled by swimming measured in meters.
* **teamKills** - Number of times this player killed a teammate.
* **vehicleDestroys** - Number of vehicles destroyed.
* **walkDistance** - Total distance traveled on foot measured in meters.
* **weaponsAcquired** - Number of weapons picked up.
* **winPoints** - Win-based external ranking of player. (Think of this as an Elo ranking where only winning matters.) If there is a value other than -1 in rankPoints, then any 0 in winPoints should be treated as a “None”.
* **groupId** - ID to identify a group within a match. If the same group of players plays in different matches, they will have a different groupId each time.
* **numGroups** - Number of groups we have data for in the match.
* **maxPlace** - Worst placement we have data for in the match. This may not match with numGroups, as sometimes the data skips over placements.
* **winPlacePerc** - The target of prediction. This is a percentile winning placement, where 1 corresponds to 1st place, and 0 corresponds to last place in the match. It is calculated off of maxPlace, not numGroups, so it is possible to have missing chunks in a match.

In [None]:
# Change categorical variable into dummy variables
# An explicit integer dtype keeps the indicator columns numeric, so zero-filling
# a category that is absent from the test set cannot change their dtype.
train_matchType = pd.get_dummies(train_df['matchType'], dtype='uint8')

# Encode the test set against the *training* categories: a match type missing
# from the test data becomes an all-zero column, a match type unseen in training
# is dropped, and the column order always equals the training order.
test_matchType = (
    pd.get_dummies(test_df['matchType'], dtype='uint8')
    .reindex(columns=train_matchType.columns, fill_value=0)
)

# Replace the raw 'matchType' column with its indicator columns (placed first).
train_df2 = pd.concat([train_matchType, train_df.drop(columns=['matchType'])], axis=1)
test_df2 = pd.concat([test_matchType, test_df.drop(columns=['matchType'])], axis=1)

In [None]:
# Drop outliers
# A row is kept only if, for EVERY listed column, its value lies no more than
# three standard deviations above that column's mean (one-sided: only unusually
# large values are removed). The per-column masks are combined so that all four
# filters apply; mean and std are always taken from the unfiltered train_df2.
outlier_cols = ['damageDealt', 'walkDistance', 'rideDistance', 'swimDistance']

keep_rows = pd.Series(True, index=train_df2.index)
for col in outlier_cols:
    col_values = train_df2[col]
    keep_rows &= (col_values - col_values.mean()) <= (3 * col_values.std())

train_df3 = train_df2[keep_rows]

In [None]:
# Remove every row that contains at least one missing value.
train_df3 = train_df3.dropna(axis=0, how='any')

In [None]:
# Apply seaborn's styling (with color_codes enabled), then plot the
# distribution of the prediction target.
sns.set(color_codes=True)
target_values = train_df3['winPlacePerc']
sns.distplot(target_values)

In [None]:
# Annotated correlation heatmap over every column from position 19 onward.
# NOTE(review): the offset 19 presumably skips the one-hot matchType columns and
# the identifier columns — confirm against the frame's actual column order.
fig, ax = plt.subplots(figsize=(20, 20))
corr_matrix = train_df3.iloc[:, 19:].corr()
sns.heatmap(corr_matrix, annot=True, linewidths=.5, fmt= '.1f', ax=ax)
plt.show()

In [None]:
# Splitting training dataset into train/validation set (ratio = 7:3)
from sklearn.model_selection import train_test_split

# Identifier columns carry no predictive signal; keep them aside in id_cols and
# drop them from the modelling frame by their column labels.
id_cols = train_df3.loc[:, ["Id", "groupId", "matchId"]]
train_df4 = train_df3.drop(columns=id_cols.columns).dropna()

# Select the target by name rather than by position, so the split does not
# depend on 'winPlacePerc' happening to be the last column.
y = train_df4["winPlacePerc"]
X = train_df4.drop(columns=["winPlacePerc"])

# 30% of the rows are held out for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=2015195017)

In [None]:
import lightgbm as lgb

# The validation Dataset is built with the training Dataset as its reference,
# as LightGBM expects for validation data.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

# Model hyper-parameters only. The iteration budget and early stopping are
# passed to lgb.train() below so each setting is defined in exactly one place
# (a second value inside `params` would conflict with the call arguments).
params = {'objective': 'regression',
          'metric': 'mae',
          'num_leaves': 31,
          'learning_rate': 0.05,
          'bagging_fraction': 0.7,
          'bagging_freq': 1,  # bagging_fraction only takes effect when bagging_freq is non-zero
          'bagging_seed': 20181219,
          'num_threads': 5,
          'colsample_bytree': 0.7}

# Train for at most 20000 rounds, stopping once the validation metric has not
# improved for 200 rounds; evaluation results are logged every 1000 rounds.
lgr = lgb.train(params,
                lgb_train,
                num_boost_round=20000,
                valid_sets=[lgb_train, lgb_val],
                early_stopping_rounds=200,
                verbose_eval=1000)

In [None]:
# Remove the identifier columns from the test features. errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone is a
# no-op instead of a KeyError.
test_df2 = test_df2.drop(columns=['Id', 'groupId', 'matchId'], errors='ignore')

In [None]:
# Feed the model exactly the training feature columns, in training order.
# Selecting by name means a missing feature raises KeyError here instead of
# features being silently shifted into the wrong positions.
test_features = test_df2.loc[:, X_train.columns]
y_pred_lgb = lgr.predict(test_features, num_iteration=lgr.best_iteration)

In [None]:
# winPlacePerc is a percentile in [0, 1], but the regressor's raw output is
# unbounded, so clamp predictions to the valid range before building the
# submission frame (one row per test player Id).
winplace_pred = np.clip(y_pred_lgb, 0, 1)
submit_lgb = pd.DataFrame({'Id': test_df['Id'], "winPlacePerc": winplace_pred},
                          columns=['Id', 'winPlacePerc'])
submit_lgb.head()

In [None]:
# Write the submission to disk without the DataFrame index column.
submit_lgb.to_csv(path_or_buf="submission_lgb.csv", index=False)