# Overview
The target variable here -- winPlacePerc -- is a float. Floats are usually assumed to be continuous, but that's not the case here: if we multiply winPlacePerc by (maxPlace - 1), we get an integer.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from matplotlib import pyplot as plt

import tensorflow as tf

from keras.models import Model, load_model
from keras.layers import Input, Cropping2D, BatchNormalization, Dropout
from keras.layers.core import Lambda, Dense
from keras.layers.merge import concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.losses import binary_crossentropy

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import svm

import xgboost as xgb

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import gc
# Show which files are present in the input directory before trying to read them.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Options
# small_train:   when True, only a random subset (about 40%) of the matches is
#                kept for training, which limits run time and memory.
# only_imp_cols: not referenced by any other cell visible in this notebook.
small_train = True
only_imp_cols = False

# Seed NumPy's global RNG so the match subsampling and the train/validation
# split, which both draw from np.random.rand, give the same rows on every run.
# This does not make the Keras training itself deterministic.
SEED = 42
np.random.seed(SEED)

In [None]:
# Load the training data. The Kaggle kernel path is tried first; if that file
# does not exist, fall back to a copy under ~/.kaggle.
# Only FileNotFoundError triggers the fallback, so unrelated failures (parse
# errors, running out of memory, a keyboard interrupt) are no longer hidden.
try:
    train = pd.read_csv('../input/train_V2.csv')
except FileNotFoundError:
    train = pd.read_csv('~/.kaggle/competitions/pubg-finish-placement-prediction/train_V2.csv')

In [None]:
# Rescale the target by (maxPlace - 1) and preview the first ten values.
train['maxPlace'].sub(1).mul(train['winPlacePerc']).head(10)

Now normally discrete variables make for simple classification problems. But here we don't have a fixed number of categories. In this notebook, I'll deal with that by making it 3 different classification problems. To see why this helps, let's see what goes wrong when we simply run a normal classification. First we do some feature engineering, then set up a simple NN in keras.

Next we have some details - we'll take only 40% of the training set to make it run faster, do some feature engineering, and do some memory reduction. Input cells are hidden because these aren't important to the main point of this notebook.

In [None]:
if small_train:
    # Sample whole matches, not individual rows: each match id is kept with
    # probability 0.4 and every row of a kept match stays in the frame.
    candidate_match_ids = train['matchId'].unique()
    keep_match = np.random.rand(len(candidate_match_ids)) < 0.4
    train = train[train['matchId'].isin(candidate_match_ids[keep_match])].reset_index(drop=True)

In [None]:
def feature_engineering(df, train=True):
    """Add target bins (training only), derived features and grouped statistics.

    The frame passed in is modified in place while the derived columns are
    added and ``matchType`` is label-encoded. The grouped statistics are then
    attached through a fixed sequence of left merges and the merged frame is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Player rows. The code reads ``Id``, ``groupId``, ``matchId``,
        ``maxPlace``, ``matchDuration``, ``matchType``, ``kills``, ``assists``,
        ``revives``, ``boosts``, ``heals``, ``walkDistance``, ``swimDistance``
        and ``rideDistance``; with ``train=True`` it also reads ``winPlacePerc``.
    train : bool, default True
        When True, the discrete targets ``wppBin``, ``wppBin2`` and ``wppBin3``
        are created and, together with ``winPlacePerc``, kept out of the
        aggregated columns.

    Returns
    -------
    pandas.DataFrame
        The input columns plus, for every aggregated column, copies suffixed
        ``match_min``, ``match_max``, ``match_mean``, ``match_rank``,
        ``group_min``, ``group_max``, ``group_mean`` and ``group_rank``, and
        the two count columns ``group_size`` and ``match_size``.
    """
    # Columns that are never aggregated: identifiers, per-match constants and
    # (for training data) everything derived from the target.
    excluded = ['Id', 'groupId', 'matchId', 'maxPlace', 'numGroups', 'matchDuration', 'matchType']

    if train:
        # wppBin:  0-based bin, round(winPlacePerc * (maxPlace - 1)).
        # wppBin2: the same bin shifted up by (100 - maxPlace).
        # wppBin3: the same bin shifted up by half of (100 - maxPlace), rounded.
        unused_bins = 100 - df['maxPlace']
        df['wppBin'] = np.uint16(np.round(df['winPlacePerc'] * (df['maxPlace'] - 1)))
        df['wppBin2'] = df['wppBin'] + unused_bins
        df['wppBin3'] = df['wppBin'] + np.uint16(np.round(unused_bins * .5))
        excluded.extend(['winPlacePerc', 'wppBin', 'wppBin2', 'wppBin3'])

    # Per-row derived features.
    df['killrate'] = df['kills'] / df['matchDuration']
    df['totalDistance'] = df['walkDistance'] + df['swimDistance'] + df['rideDistance']
    df['avgSpeed'] = df['totalDistance'] / df['matchDuration']
    df['healsPlusBoosts'] = df['boosts'] + df['heals']
    df['kar'] = df['kills'] + df['assists'] + df['revives']

    # Encode matchType against a fixed vocabulary so the integer codes do not
    # depend on which match types happen to be present in this frame.
    match_type_names = ['squad-fpp', 'duo', 'solo-fpp', 'squad', 'duo-fpp', 'solo',
                        'normal-squad-fpp', 'crashfpp', 'flaretpp', 'normal-solo-fpp',
                        'flarefpp', 'normal-duo-fpp', 'normal-duo', 'crashtpp',
                        'normal-squad', 'normal-solo']
    encoder = LabelEncoder()
    encoder.fit(match_type_names)
    df['matchType'] = encoder.transform(df['matchType'])

    feature_cols = [name for name in df.columns if name not in excluded]

    # Both groupby objects are bound to the frame as it is at this point; the
    # merges below rebind ``df`` but always aggregate these original columns.
    match_keys = ['matchId', 'groupId']
    player_keys = ['Id', 'groupId']
    per_match_group = df.groupby(match_keys)[feature_cols]
    # NOTE(review): this grouping keys on (groupId, Id). If Id is unique per
    # row (TODO confirm), its min/max/mean just repeat each player's own
    # values and group_size is always 1; only the rank adds information.
    per_group_player = df.groupby(['groupId', 'Id'])[feature_cols]

    def _attach(frame, stats, suffix, keys):
        # Left-merge one table of statistics; overlapping columns get `suffix`.
        return frame.merge(stats.reset_index(), suffixes=['', suffix], on=keys, how='left')

    # Statistics per (matchId, groupId), plus the percentile rank of each
    # group's mean within its match.
    df = _attach(df, per_match_group.agg('min'), 'match_min', match_keys)
    print('Checkpoint 1/7')
    df = _attach(df, per_match_group.agg('max'), 'match_max', match_keys)
    print('Checkpoint 2/7')
    df = _attach(df, per_match_group.agg('mean'), 'match_mean', match_keys)
    print('Checkpoint 3/7')
    match_rank = per_match_group.agg('mean').groupby('matchId').rank(pct=True)
    df = _attach(df, match_rank, 'match_rank', match_keys)

    print('Checkpoint 4/7')
    # Statistics per (groupId, Id), plus the percentile rank within the group.
    df = _attach(df, per_group_player.agg('min'), 'group_min', player_keys)
    print('Checkpoint 5/7')
    df = _attach(df, per_group_player.agg('max'), 'group_max', player_keys)
    print('Checkpoint 6/7')
    df = _attach(df, per_group_player.agg('mean'), 'group_mean', player_keys)
    print('Checkpoint 7/7')

    group_rank = per_group_player.agg('mean').groupby('groupId').rank(pct=True)
    df = _attach(df, group_rank, 'group_rank', player_keys)

    # Row counts of each grouping.
    df = df.merge(per_group_player.size().reset_index(name='group_size'), how='left', on=player_keys)
    df = df.merge(per_match_group.size().reset_index(name='match_size'), how='left', on=match_keys)

    return df

In [None]:
def reduce_mem_usage(props, verbose=False):
    """Downcast the non-object columns of ``props`` to smaller dtypes.

    For every column whose dtype is not ``object``:

    * if it contains non-finite values, the column name is recorded and its
      NaNs are replaced by ``(column minimum - 1)``;
    * if every value is (within 0.01 in total) a whole number, the column is
      cast to the smallest unsigned or signed integer type that holds its
      range;
    * otherwise it is cast to ``float32`` (which may lose precision).

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. Its columns are reassigned, so the caller's frame is
        modified.
    verbose : bool, default False
        Print the dtype of every processed column before and after.

    Returns
    -------
    (pandas.DataFrame, list)
        ``props`` itself and the list of columns in which NaNs were filled.
    """
    # slightly adapted (but mostly just copied) from here:
    # https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65

    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        if props[col].dtype == object:  # Exclude strings
            continue

        # Print current column type
        if verbose:
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",props[col].dtype)

        mx = props[col].max()
        mn = props[col].min()

        # Integer does not support NA, therefore, NA needs to be filled
        if not np.isfinite(props[col]).all():
            NAlist.append(col)
            # Assign the filled column back instead of calling fillna(inplace=True)
            # on props[col]: the chained in-place form is not guaranteed to
            # write through to the frame.
            props[col] = props[col].fillna(mn - 1)
            # The sentinel just inserted is below the old minimum. Refresh mn so
            # a column that used to be non-negative is not cast to an unsigned
            # type, which would wrap the sentinel around.
            mn = props[col].min()

        # Test if the column can be converted to an integer. Absolute
        # differences are summed so that positive and negative fractional parts
        # (e.g. +0.5 and -0.5) cannot cancel out and make a fractional column
        # look integral.
        asint = props[col].fillna(0).astype(np.int64)
        IsInt = (props[col] - asint).abs().sum() < 0.01

        # Make Integer/unsigned Integer datatypes
        if IsInt:
            if mn >= 0:
                if mx < 255:
                    props[col] = props[col].astype(np.uint8)
                elif mx < 65535:
                    props[col] = props[col].astype(np.uint16)
                elif mx < 4294967295:
                    props[col] = props[col].astype(np.uint32)
                else:
                    props[col] = props[col].astype(np.uint64)
            else:
                if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                    props[col] = props[col].astype(np.int8)
                elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                    props[col] = props[col].astype(np.int16)
                elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                    props[col] = props[col].astype(np.int32)
                elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                    props[col] = props[col].astype(np.int64)

        # Make float datatypes 32 bit
        else:
            props[col] = props[col].astype(np.float32)

        # Print new column type
        if verbose:
            print("dtype after: ",props[col].dtype)
            print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [None]:
# Downcast the numeric columns of the raw training frame; the list of
# NaN-filled columns returned alongside it is discarded.
train, _ = reduce_mem_usage(train)

This is where we split the training set into a train and a validate set. Note that we can't take the simple approach of randomly taking out 20% -- we need to take out whole matches.

In [None]:
# Split by match, not by row: each match id goes to the training part with
# probability 0.8, and all rows of a match end up on the same side.
all_match_ids = train['matchId'].unique()
msk = all_match_ids[np.random.rand(len(all_match_ids)) < 0.8]
in_training_matches = train['matchId'].isin(msk)
train_train = train[in_training_matches].reset_index(drop=True)
train_val = train[~in_training_matches].reset_index(drop=True)

# Drop the references that are no longer needed before the memory-heavy
# feature engineering starts.
train = None
msk = None
all_match_ids = None
in_training_matches = None

gc.collect()
train_train = feature_engineering(train_train)
train_train, _ = reduce_mem_usage(train_train)
train_val = feature_engineering(train_val)
train_val, _ = reduce_mem_usage(train_val)

In [None]:
# Model inputs are all columns except the identifiers and the target-derived columns.
non_feature_cols = {'Id', 'groupId', 'matchId', 'winPlacePerc', 'wppBin', 'wppBin2', 'wppBin3'}
train_cols = [name for name in train_train.columns if name not in non_feature_cols]
target = 'winPlacePerc'

In [None]:
# Classifier 1 graph: BatchNorm on the raw inputs, then four
# Dense(128, relu) + BatchNorm stages with Dropout(0.75) after the second and
# fourth, ending in a 100-way softmax (one output per bin).
m1inputs = Input(shape=(len(train_cols),))

m1hidden = BatchNormalization()(m1inputs)
for m1_with_dropout in (False, True, False, True):
    m1hidden = Dense(128, activation='relu')(m1hidden)
    m1hidden = BatchNormalization()(m1hidden)
    if m1_with_dropout:
        m1hidden = Dropout(0.75)(m1hidden)

m1output = Dense(100, activation='softmax')(m1hidden)

In [None]:
# Wrap the graph in a Model and compile it.
# The output is a 100-way softmax trained on one-hot targets, i.e. a
# single-label multi-class problem, so the matching loss is categorical
# cross-entropy. With 'binary_crossentropy' each of the 100 outputs is scored
# as an independent yes/no problem and 'acc' becomes a per-output binary
# accuracy, which can look very high even when the wrong bin is predicted.
model1 = Model(inputs=[m1inputs], outputs=[m1output])
adam = Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=None, amsgrad=False)
model1.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['acc'])

In [None]:
# One-hot encoder for the bin targets. It is fitted on the full range 0..99
# rather than on the bins that happen to occur in the data, so the encoded
# targets always have 100 columns, matching the 100-unit softmax outputs.
enc = OneHotEncoder()
enc.fit(np.arange(100).reshape(-1,1))

In [None]:
# Train classifier 1 on the one-hot encoded wppBin targets and monitor it on
# the held-out matches.
m1_fit_targets = enc.transform(train_train['wppBin'].values.reshape(-1, 1))
m1_val_targets = enc.transform(train_val['wppBin'].values.reshape(-1, 1))
model1.fit(train_train[train_cols], m1_fit_targets,
           validation_data=(train_val[train_cols], m1_val_targets),
           epochs=20, batch_size=4096,
           verbose=2)

In [None]:
# Softmax outputs of classifier 1 for every validation row (one column per bin).
val_preds = model1.predict(train_val[train_cols], verbose=True)

In [None]:
# Turn the most probable bin back into a placement fraction.
# The bins were built as round(winPlacePerc * (maxPlace - 1)), so the inverse
# divides by (maxPlace - 1); dividing by (maxPlace + 1) shrinks every
# prediction. The denominator is floored at 1 so that a match with
# maxPlace == 1 does not divide by zero.
val_preds_float = np.argmax(val_preds, axis=1) / (train_val['maxPlace'] - 1).clip(lower=1)

In [None]:
# Predicted vs. actual placement for classifier 1; the red diagonal marks a
# perfect prediction.
fig, ax = plt.subplots()
ax.scatter(val_preds_float, train_val[target], marker='.')
ax.plot([0, 1], [0, 1], color='r')
ax.set_xlabel('predicted')
ax.set_ylabel('actual')

In [None]:
# Mean absolute error of classifier 1 on the validation matches.
print('estimated score from verify set:')
(val_preds_float - train_val[target]).abs().mean()

That's not a good score. In addition, we see a couple of problems. First, we have predicted values over 1, which we know shouldn't happen. This occurs because the network hasn't figured out that it should only look at bins up to maxPlace. Second, it almost always undershoots the actual value, but the effect is smallest at the bottom -- for the worst players. This makes sense, because the worst player will always be a 0, while a mid-level player may be anywhere from 10 to 50. So to fix these problems, let's make 2 more networks like this, except we'll change how maxPlace is turned into a discrete variable. For this network, the worst player was always in bin 0. For the second, the best player will always be in the top bin (99), and for the third, the middle player will always be near bin 50.

In [None]:
# Classifier 2 graph: BatchNorm on the raw inputs, then four
# Dense(128, relu) + BatchNorm stages with Dropout(0.75) after the second and
# fourth, ending in a 100-way softmax (one output per bin).
m2inputs = Input(shape=(len(train_cols),))

m2hidden = BatchNormalization()(m2inputs)
for m2_with_dropout in (False, True, False, True):
    m2hidden = Dense(128, activation='relu')(m2hidden)
    m2hidden = BatchNormalization()(m2hidden)
    if m2_with_dropout:
        m2hidden = Dropout(0.75)(m2hidden)

m2output = Dense(100, activation='softmax')(m2hidden)

In [None]:
# Compile and train classifier 2 on the wppBin2 targets.
# The output is a 100-way softmax trained on one-hot targets, i.e. a
# single-label multi-class problem, so the matching loss is categorical
# cross-entropy. With 'binary_crossentropy' each of the 100 outputs is scored
# as an independent yes/no problem and 'acc' becomes a per-output binary
# accuracy, which can look very high even when the wrong bin is predicted.
model2 = Model(inputs=[m2inputs], outputs=[m2output])
adam = Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=None, amsgrad=False)
model2.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['acc'])
model2.fit(train_train[train_cols], enc.transform(train_train['wppBin2'].values.reshape(-1,1)), 
           epochs=20, 
           batch_size=4096, 
           validation_data=(train_val[train_cols], enc.transform(train_val['wppBin2'].values.reshape(-1,1))),
           verbose=2)

In [None]:
# Classifier 3 graph: BatchNorm on the raw inputs, then four
# Dense(128, relu) + BatchNorm stages with Dropout(0.75) after the second and
# fourth, ending in a 100-way softmax (one output per bin).
m3inputs = Input(shape=(len(train_cols),))

m3hidden = BatchNormalization()(m3inputs)
for m3_with_dropout in (False, True, False, True):
    m3hidden = Dense(128, activation='relu')(m3hidden)
    m3hidden = BatchNormalization()(m3hidden)
    if m3_with_dropout:
        m3hidden = Dropout(0.75)(m3hidden)

m3output = Dense(100, activation='softmax')(m3hidden)

In [None]:
# Compile and train classifier 3 on the wppBin3 targets.
# The output is a 100-way softmax trained on one-hot targets, i.e. a
# single-label multi-class problem, so the matching loss is categorical
# cross-entropy. With 'binary_crossentropy' each of the 100 outputs is scored
# as an independent yes/no problem and 'acc' becomes a per-output binary
# accuracy, which can look very high even when the wrong bin is predicted.
model3 = Model(inputs=[m3inputs], outputs=[m3output])
adam = Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=None, amsgrad=False)
model3.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['acc'])
model3.fit(train_train[train_cols], enc.transform(train_train['wppBin3'].values.reshape(-1,1)), 
           epochs=20, 
           batch_size=4096, 
           validation_data=(train_val[train_cols], enc.transform(train_val['wppBin3'].values.reshape(-1,1))),
           verbose=2)

In [None]:
# Bin probabilities of all three classifiers on the validation matches.
predsbin1, predsbin2, predsbin3 = (
    bin_model.predict(train_val[train_cols], verbose=True)
    for bin_model in (model1, model2, model3)
)

In [None]:
# Bin probabilities of all three classifiers on the training matches; these
# become the training inputs of the stacker.
tpredsbin1, tpredsbin2, tpredsbin3 = (
    bin_model.predict(train_train[train_cols], verbose=True)
    for bin_model in (model1, model2, model3)
)

In [None]:
# Keep only the columns still needed for stacking, then release both feature
# frames. The columns are copied so the kept Series do not hold references
# into the large frames.
trainMaxPlace = train_train['maxPlace'].copy()
valMaxPlace = train_val['maxPlace'].copy()
trainTar = train_train[target].copy()
valTar = train_val[target].copy()
train_train = None
# The validation frame is named train_val; assigning None to the undefined
# name train_verify left it alive and freed nothing.
train_val = None
gc.collect()

In [None]:
# Convert each classifier's most probable bin on the validation set into a
# placement fraction, undoing the shift used for wppBin2 and wppBin3.
# NOTE(review): feature_engineering builds wppBin as
# round(winPlacePerc * (maxPlace - 1)), so the exact inverse would divide by
# (maxPlace - 1), not (maxPlace + 1). The same formula is applied to the
# training and test predictions that feed the stacker, so a change here has to
# be made in all three places together.
val_denominator = valMaxPlace + 1
val_bin2_shift = 100 - valMaxPlace
val_bin3_shift = np.uint16(np.round((100 - valMaxPlace) * .5))

preds1 = np.argmax(predsbin1, axis=1) / val_denominator
preds2 = (np.argmax(predsbin2, axis=1) - val_bin2_shift) / val_denominator
preds3 = (np.argmax(predsbin3, axis=1) - val_bin3_shift) / val_denominator

In [None]:
# Convert each classifier's most probable bin on the training set into a
# placement fraction, undoing the shift used for wppBin2 and wppBin3.
# NOTE(review): feature_engineering builds wppBin as
# round(winPlacePerc * (maxPlace - 1)), so the exact inverse would divide by
# (maxPlace - 1), not (maxPlace + 1). The same formula is applied to the
# validation and test predictions, so a change here has to be made in all
# three places together.
train_denominator = trainMaxPlace + 1
train_bin2_shift = 100 - trainMaxPlace
train_bin3_shift = np.uint16(np.round((100 - trainMaxPlace) * .5))

tpreds1 = np.argmax(tpredsbin1, axis=1) / train_denominator
tpreds2 = (np.argmax(tpredsbin2, axis=1) - train_bin2_shift) / train_denominator
tpreds3 = (np.argmax(tpredsbin3, axis=1) - train_bin3_shift) / train_denominator

In [None]:
# Mean absolute error of each classifier, first on the training matches and
# then on the validation matches (order: model 1, 2, 3).
print('Training scores:')
for model_train_preds in (tpreds1, tpreds2, tpreds3):
    print(np.mean(np.abs(model_train_preds - trainTar)))

print('Validation scores')

for model_val_preds in (preds1, preds2, preds3):
    print(np.mean(np.abs(model_val_preds - valTar)))

In [None]:
# Assemble the stacker inputs (three classifier outputs, maxPlace and the
# target) for the training and validation matches and save both to disk.
stack_column_names = ['p1', 'p2', 'p3', 'maxPlace', 'winPlacePerc']

train_outs = pd.DataFrame(dict(zip(
    stack_column_names,
    [tpreds1, tpreds2, tpreds3, trainMaxPlace, trainTar],
)))

val_outs = pd.DataFrame(dict(zip(
    stack_column_names,
    [preds1, preds2, preds3, valMaxPlace, valTar],
)))

train_outs.to_csv('train_out.csv')
val_outs.to_csv('validate_out.csv')

# Combining the 3 models

For this we'll use a simple XGB regressor, which trains very quickly since we only have 4 attributes. The performance improvement is considerable.

In [None]:
# Stacker: a gradient-boosted regressor over the three classifier outputs
# plus maxPlace.
xgb_settings = {
    'max_depth': 4,
    'silent': False,
    'learning_rate': 0.6,
    'n_estimators': 100,
    'n_jobs': -1,
}
clf = xgb.XGBRegressor(**xgb_settings)

boost_cols = ['p1', 'p2', 'p3', 'maxPlace']
target = 'winPlacePerc'

stack_train = (train_outs[boost_cols], train_outs[target])
stack_val = (val_outs[boost_cols], val_outs[target])

# MAE is tracked on both sets; training stops once it has not improved for
# 5 consecutive rounds.
clf.fit(stack_train[0], stack_train[1],
        eval_set=[stack_train, stack_val],
        verbose=False, eval_metric='mae', early_stopping_rounds=5)

In [None]:
# Mean absolute error of the stacked model on the validation matches.
xgb_val_preds = clf.predict(val_outs[boost_cols])
xgb_val_mae = np.mean(np.abs(xgb_val_preds - val_outs[target]))
print('validation set mae: {}'.format(xgb_val_mae))

In [None]:
# Load the test data. The Kaggle kernel path is tried first; if that file
# does not exist, fall back to a copy under ~/.kaggle.
# Only FileNotFoundError triggers the fallback, so unrelated failures (parse
# errors, running out of memory, a keyboard interrupt) are no longer hidden.
try:
    test = pd.read_csv('../input/test_V2.csv')
except FileNotFoundError:
    test = pd.read_csv('~/.kaggle/competitions/pubg-finish-placement-prediction/test_V2.csv')

# Same preparation as the training data, minus the target bins.
test, _ = reduce_mem_usage(test)
test = feature_engineering(test, train=False)
# Keep the ids for the submission, then restrict the frame to the model inputs.
test_ids = test['Id']
test = test[train_cols]

In [None]:
# Second downcast pass, this time over the engineered feature columns.
test, _ = reduce_mem_usage(test)

In [None]:
# Bin probabilities of all three classifiers on the test set.
preds1, preds2, preds3 = (
    bin_model.predict(test, verbose=True)
    for bin_model in (model1, model2, model3)
)

In [None]:
# Keep maxPlace for rescaling the bin predictions, then drop the reference to
# the feature frame and run the garbage collector.
testMaxPlace = test['maxPlace']
test = None
gc.collect()

In [None]:
# Convert each classifier's most probable bin on the test set into a placement
# fraction, undoing the shift used for wppBin2 and wppBin3. This overwrites
# the probability matrices in preds1..preds3, so the cell cannot be re-run
# without re-running the prediction cell first.
# NOTE(review): feature_engineering builds wppBin as
# round(winPlacePerc * (maxPlace - 1)), so the exact inverse would divide by
# (maxPlace - 1), not (maxPlace + 1). The same formula is applied to the
# training and validation predictions the stacker was fitted on, so a change
# here has to be made in all three places together.
test_denominator = testMaxPlace + 1
test_bin2_shift = 100 - testMaxPlace
test_bin3_shift = np.uint16(np.round((100 - testMaxPlace) * .5))

preds1 = np.argmax(preds1, axis=1) / test_denominator
preds2 = (np.argmax(preds2, axis=1) - test_bin2_shift) / test_denominator
preds3 = (np.argmax(preds3, axis=1) - test_bin3_shift) / test_denominator

In [None]:
# Assemble the stacker inputs for the test set and save them to disk.
test_outs = pd.DataFrame(dict(zip(
    ['p1', 'p2', 'p3', 'maxPlace'],
    [preds1, preds2, preds3, testMaxPlace],
)))

test_outs.to_csv('test_out.csv')

In [None]:
# Final prediction: run the fitted XGBoost stacker on the test-set features.
test_preds = clf.predict(test_outs[boost_cols])

In [None]:
# winPlacePerc is a fraction between 0 and 1, but the regressor's output is
# not bounded. Clipping moves any out-of-range prediction to the nearest valid
# value, which can only bring it closer to a target inside [0, 1].
submission = pd.DataFrame({'Id': test_ids, 'winPlacePerc': np.clip(test_preds, 0, 1)})
submission.to_csv('submission.csv', index=False)

# Conclusion

Thanks for reading! You may notice that I only used a small portion of the training set - this is because if I use all of it, the notebook runs out of memory. If you have suggestions on this (or any other way to improve this), please comment below! Thank you!