# A Neural Network approach with simple Feature Engineering.

In [None]:
import pandas as pd
import re
import nltk
import keras
import tensorflow as tf
import gensim
import numpy as np
from xgboost.sklearn import XGBRegressor
import xgboost as xgb
import math
from sklearn import metrics
import matplotlib.pyplot as plt
#####################################################3
import plotly.offline as py
py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()

This dataset can be thought of as a numerical dataset in which only three factors are categorical. However, matchId and groupId cannot be ignored, because the game's ranking mechanism makes each player's rank depend on the other players' performance. So, besides using the factors given here, we also need to explore features that describe each match and each group, which is the 'Measurement'.

# Measurement For Match and Group.
The key is to define an appropriate measurement that describes the level of each match and each group.
As you can see, matchId and groupId are just keys that distinguish matches and groups from one another; it makes no sense to include them in our model directly. Finding features that are sufficient to represent the character of a match and of a group is therefore a key factor in the regression.

Id in this case doesn't make sense at all.

One idea is to define a number that describes how good a player is in a match compared with the other players. Another idea is to define how good a group is in a match compared with the other groups. The problem here is removing the duplicated information that comes from the 'single' battle mode, but we will do this later.

In [None]:
# Load the training and test tables from the input directory.
train_csv, test_csv = '../input/train.csv', '../input/test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Pearson correlation heatmap of the training features.
# Correlate only the numeric/bool columns and take the axis labels from the
# correlation matrix itself, so the labels always line up with the cells.
# Labelling with train.columns would shift the labels whenever the frame holds
# a non-numeric column (older pandas silently drops such columns from the
# matrix; newer pandas raises on them).
corr = train.select_dtypes(include=['number', 'bool']).corr()

data = [
    go.Heatmap(
        z=corr.values,
        x=corr.columns.values,
        y=corr.index.values,
        colorscale='Viridis',
        reversescale=False,
        opacity=1.0,
    )
]

layout = go.Layout(
    title='Pearson Correlation of features',
    xaxis=dict(ticks='', nticks=36),
    yaxis=dict(ticks=''),
    width=900,
    height=700,
    # Wide left margin so the feature names on the y axis are not clipped.
    margin=dict(l=240),
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='labelled-heatmap')

From what is shown here, I expect a better result from a linear model than from a tree-based model. Look at some of the factors, such as killPlace and walkDistance. A simple linear regression won't perform well because of the collinearity, and even ridge or lasso will just give you an underestimate, as most of the factors don't really have a linear relationship with our target.

In [None]:
# Measurement on teamwork.
# For each stat: average it over the rows of every (matchId, groupId) group,
# then scale that average by the largest group average found in the same match.
# NOTE: a match whose largest average is 0 divides by zero (0 / 0 gives NaN).
group_keys = ['matchId', 'groupId']
for stat, perf_col in (('assists', 'assists_perf'), ('revives', 'revives_perf')):
    group_total = train.groupby(group_keys)[stat].transform('sum')
    group_size = train.groupby(group_keys)['Id'].transform('count')
    train[perf_col] = group_total / group_size
    match_best = train.groupby('matchId')[perf_col].transform('max')
    train[perf_col] = train[perf_col] / match_best

In [None]:
# Measurement on damage.
# Average damageDealt over the rows of every (matchId, groupId) group, then
# scale that average by the largest group average found in the same match.
# NOTE: a match whose largest average is 0 divides by zero (0 / 0 gives NaN).
by_group = train.groupby(['matchId', 'groupId'])
group_damage_total = by_group['damageDealt'].transform('sum')
group_row_count = by_group['Id'].transform('count')
train['dmgdealt_perf'] = group_damage_total / group_row_count
match_best_damage = train.groupby('matchId')['dmgdealt_perf'].transform('max')
train['dmgdealt_perf'] = train['dmgdealt_perf'] / match_best_damage

In [None]:
# Teamwork measurements for the test table, built the same way as for train:
# per-group row average of the stat, scaled by the best group average in the
# same match.
# NOTE: a match whose largest average is 0 divides by zero (0 / 0 gives NaN).
test_group_keys = ['matchId', 'groupId']
for stat, perf_col in (('assists', 'assists_perf'), ('revives', 'revives_perf')):
    group_total = test.groupby(test_group_keys)[stat].transform('sum')
    group_size = test.groupby(test_group_keys)['Id'].transform('count')
    test[perf_col] = group_total / group_size
    match_best = test.groupby('matchId')[perf_col].transform('max')
    test[perf_col] = test[perf_col] / match_best

In [None]:
# Damage measurement for the test table: per-group row average of damageDealt,
# scaled by the best group average in the same match.
# NOTE: a match whose largest average is 0 divides by zero (0 / 0 gives NaN).
test_by_group = test.groupby(['matchId', 'groupId'])
test_damage_total = test_by_group['damageDealt'].transform('sum')
test_row_count = test_by_group['Id'].transform('count')
test['dmgdealt_perf'] = test_damage_total / test_row_count
test_match_best_damage = test.groupby('matchId')['dmgdealt_perf'].transform('max')
test['dmgdealt_perf'] = test['dmgdealt_perf'] / test_match_best_damage

First, let's see if our direction is right before doing formal feature engineering.

In [None]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit ``alg`` on ``dtrain`` and print its mean absolute error on that data.

    Parameters
    ----------
    alg
        Estimator exposing ``get_xgb_params``, ``get_params``, ``set_params``,
        ``fit`` and ``predict`` (called here with an ``XGBRegressor``).
    dtrain
        Frame holding the predictor columns and the ``'winPlacePerc'`` target.
    predictors
        Names of the columns of ``dtrain`` used as model inputs.
    useTrainCV
        When true, run ``xgb.cv`` first and set ``n_estimators`` on ``alg`` to
        the number of rows in the returned result before fitting.
    cv_folds
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds
        Early-stopping patience passed to ``xgb.cv``.

    Returns
    -------
    None
        Results are printed. The MAE is measured on the rows the model was
        trained on, so it is an in-sample score.
    """
    # DMatrix gets a plain list of names rather than a pandas Index.
    feature_names = list(predictors)
    target = dtrain['winPlacePerc']

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(
            dtrain[feature_names].values,
            label=target.values,
            feature_names=feature_names,
        )
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='mae',
            early_stopping_rounds=early_stopping_rounds,
        )
        best_rounds = cvresult.shape[0]
        alg.set_params(n_estimators=best_rounds)
        print('Best n_estimator = ' + str(best_rounds))

    # NOTE(review): eval_metric is passed to fit() as before; newer xgboost
    # releases may expect it on the estimator instead - confirm against the
    # installed version.
    alg.fit(dtrain[feature_names], target, eval_metric='mae')

    dtrain_predictions = alg.predict(dtrain[feature_names])

    print('\nModel Report:')
    # Report the MAE itself. The previous code printed sqrt(MAE) under the
    # "MAE" label, which is neither MAE nor RMSE.
    print('MAE: %f' % metrics.mean_absolute_error(target.values, dtrain_predictions))

In [None]:
# List the column names of the training table (now including the *_perf columns
# added by the feature cells above, if those cells were run).
train.columns

In [None]:
# Columns taken as-is from the input tables.
raw_features = [
    'assists', 'boosts', 'damageDealt', 'DBNOs',
    'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
    'killStreaks', 'longestKill', 'maxPlace', 'numGroups', 'revives',
    'rideDistance', 'roadKills', 'swimDistance', 'teamKills',
    'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints',
]
# Group-level measurements computed in the cells above.
engineered_features = ['assists_perf', 'revives_perf', 'dmgdealt_perf']

# The training frame additionally carries the target, placed between the raw
# and the engineered columns; the test frame has no target column.
ready_train = train[raw_features + ['winPlacePerc'] + engineered_features]
ready_test = test[raw_features + engineered_features]

In [None]:
# Every column of ready_train except the target is a model input.
is_input_column = ready_train.columns != 'winPlacePerc'
predictors = ready_train.columns[is_input_column]

In [None]:
# Number of input columns selected for the model.
len(predictors)

In [None]:
# Settings for the first XGBoost regressor, collected in one place and then
# unpacked into the constructor.
xgb1_params = {
    'objective': 'reg:logistic',
    'learning_rate': 0.1,
    'n_estimators': 50,
    'max_depth': 5,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,
    'seed': 2018,
}
xgb1 = XGBRegressor(**xgb1_params)

In [None]:
# Fit xgb1 on the prepared training frame, skipping the cross-validation step
# (useTrainCV = False), and print the model report.
modelfit(xgb1, ready_train, predictors, useTrainCV = False)

In [None]:
# Plot the feature importances of xgb1 and render the figure.
xgb.plot_importance(xgb1)
plt.show()