# PUBG finish placement prediction

## This is a fully functioning Python notebook that reaches an MAE of 0.02. The GridSearchCV and Keras NN parts were excluded from execution due to RAM limitations. However, their code is included as commented-out blocks.

In [0]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
print(os.listdir("../input"))

import warnings
warnings.filterwarnings('ignore')
import gc, sys
gc.enable()
import time

# Feature engineering

In [0]:
# Directory holding the competition CSV files; used further down to load test_V2.csv for the submission.
INPUT_DIR = "../input/"

In [0]:
# This helper downcasts column dtypes so that the feature engineering fits in memory; without it we run out of RAM.
# At least 16 GB of RAM is required.
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, display=False):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Every signed-integer, unsigned-integer and float column is cast, in place,
    to the narrowest NumPy dtype of the same kind whose range contains the
    column's minimum and maximum (bounds are inclusive). Columns of any other
    dtype (object/string, bool, datetime, categorical, ...) are left untouched.

    NOTE: float columns may be cast to float16, which is lossy for values that
    need more precision than half floats provide.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    display : bool, default False
        When true, print the memory footprint before/after and the time taken.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_time = time.time()
    if display:
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes, narrowest first, per NumPy dtype kind.
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        # Extension dtypes (categorical, nullable, string, ...) are not np.dtype
        # instances and are skipped together with every non-numeric kind.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind == 'f':
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)
        else:
            for candidate in int_candidates[kind]:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate fits only when min/max is NaN (empty column); the
            # column is then left as it is.

    if display:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
        print('\nTime elapsed %.0f sec\n'%(time.time()-start_time))

    return df

In [0]:
# without feature engineering, our model will not be predicting very well
def feature_engineering(is_train=True,debug=True):
    """Load the train or test CSV and build group- and match-level features.

    A player's placement depends on how their group compares with the other
    groups in the same match, so the raw per-player statistics (plus 11
    hand-crafted features) are aggregated per (matchId, groupId) and per
    matchId.

    Columns produced for every base feature ``f``:
      * ``f_mean``, ``f_max``, ``f_min``: group aggregates,
      * ``f_mean_rank``, ``f_max_rank``, ``f_min_rank``: percentile rank of
        the group aggregate within its match,
      * the per-match mean of ``f``,
    plus ``group_size`` and ``match_size`` (row counts per group / per match).

    Parameters
    ----------
    is_train : bool, default True
        True reads ``../input/train_V2.csv`` (rows with ``maxPlace <= 1`` are
        dropped) and also returns the target; False reads
        ``../input/test_V2.csv``.
    debug : bool, default True
        Only used when ``is_train`` is true: read just the first 10000 rows.

    Returns
    -------
    (df_out, y)
        ``df_out`` is the feature frame without ``matchId`` / ``groupId``.
        In train mode it has one row per (matchId, groupId) and ``y`` is the
        float64 array of the group-mean ``winPlacePerc`` in the same order.
        In test mode it has one row per input row and ``y`` is None.
    """
    start_time = time.time()
    if is_train:
        print("processing train.csv")
        # nrows=None makes pandas read the whole file.
        nrows = 10000 if debug else None
        df = reduce_mem_usage(pd.read_csv('../input/train_V2.csv', nrows=nrows), True)

        df = df[df['maxPlace'] > 1]
    else:
        print("processing test.csv")
        df = reduce_mem_usage(pd.read_csv('../input/test_V2.csv'), True)

    print("remove some columns")
    target = 'winPlacePerc'

    print("Adding Features")

    # NOTE(review): 'headshotrate' is kills / headshotKills, i.e. the inverse of
    # what the name suggests and of 'headshotKills_over_kills' below. Left
    # unchanged so the feature values stay the same; confirm the intent.
    df['headshotrate'] = df['kills']/df['headshotKills']
    df['killStreakrate'] = df['killStreaks']/df['kills']
    df['healthitems'] = df['heals'] + df['boosts']
    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]
    df['killPlace_over_maxPlace'] = df['killPlace'] / df['maxPlace']
    df['headshotKills_over_kills'] = df['headshotKills'] / df['kills']
    df['distance_over_weapons'] = df['totalDistance'] / df['weaponsAcquired']
    df['walkDistance_over_heals'] = df['walkDistance'] / df['heals']
    df['walkDistance_over_kills'] = df['walkDistance'] / df['kills']
    df['killsPerWalkDistance'] = df['kills'] / df['walkDistance']
    df["skill"] = df["headshotKills"] + df["roadKills"]

    # Divisions by zero above yield +/-inf (or NaN for 0/0); map the infinities
    # to NaN so that they are zero-filled below. The lowercase constants exist
    # in every NumPy release (np.Inf / np.NINF / np.NaN were removed in 2.0).
    df[df == np.inf] = np.nan
    df[df == -np.inf] = np.nan

    print("Replacing Na's in DF")
    df.fillna(0, inplace=True) # this is important for making group means, min,max.

    y = None
    # Identifier columns are never aggregated; the target is excluded too when present.
    excluded = {"Id", "matchId", "groupId", "matchType"}

    if is_train:
        print("get target")
        y = np.array(df.groupby(['matchId','groupId'])[target].agg('mean'), dtype=np.float64)
        excluded.add(target)

    features = [col for col in df.columns if col not in excluded]

    df_out = None
    for stat in ('mean', 'max', 'min'):
        print("get group %s feature" % stat)
        agg = df.groupby(['matchId','groupId'])[features].agg(stat)
        agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()

        if df_out is None:
            # Train: one output row per group (same order as `y`).
            # Test: one output row per input row, so predictions line up with the file.
            if is_train:
                df_out = agg.reset_index()[['matchId','groupId']]
            else:
                df_out = df[['matchId','groupId']]

        # First merge adds the plain feature names (no overlap yet, so the empty
        # suffixes are unused); the second merge overlaps on every feature and
        # renames the pair to `<f>_<stat>` / `<f>_<stat>_rank`.
        df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
        df_out = df_out.merge(agg_rank, suffixes=["_" + stat, "_" + stat + "_rank"], how='left', on=['matchId', 'groupId'])

        gc.collect()

    print("get group size feature")
    agg = df.groupby(['matchId','groupId']).size().reset_index(name='group_size')
    df_out = df_out.merge(agg, how='left', on=['matchId', 'groupId'])

    gc.collect()

    print("get match mean feature")
    agg = df.groupby(['matchId'])[features].agg('mean').reset_index()
    # NOTE(review): no plain feature columns remain in df_out at this point, so
    # the "_match_mean" suffix appears never to be applied and the match means
    # keep the bare feature names.
    df_out = df_out.merge(agg, suffixes=["", "_match_mean"], how='left', on=['matchId'])

    print("get match size feature")
    agg = df.groupby(['matchId']).size().reset_index(name='match_size')
    df_out = df_out.merge(agg, how='left', on=['matchId'])

    df_out.drop(["matchId", "groupId"], axis=1, inplace=True)

    del df, agg, agg_rank
    gc.collect()
    print('\nTime elapsed for feature engineering: %.0f sec'%(time.time()-start_time))
    return df_out, y #, feature_names, test_idx

In [0]:
# Process the training data: is_train=True, debug=False (read the full train_V2.csv rather than the 10000-row debug sample)
x_train, y_train = feature_engineering(True, False)

In [0]:
# Downcast the engineered feature columns and print the memory report (display=True).
x_train = reduce_mem_usage(x_train, True)

In [0]:
# Sanity check: dimensions of the feature matrix and its first rows.
print(x_train.shape)
x_train.head()

In [0]:
# Shape of the target array returned together with x_train.
y_train.shape

# GridSearchCV

In [0]:
'''
# GridSearch parameters
from sklearn.model_selection import train_test_split, GridSearchCV
gridParams = {
    'learning_rate': [ 0.03, 0.04, 0.05],
    'num_leaves': [ 33, 36, 39]
}
X_tr, X_test, y_tr, y_test = train_test_split(x_train, y_train, test_size=0.40, random_state=46)
del x_train, y_train
mdl = lgb.LGBMRegressor(metric='mae',
     objective="regression", 
    n_estimators=20000, 
    bagging_fraction=0.7,
    bagging_seed=0, 
    num_threads=4,
    colsample_bytree=0.7, 
    num_boost_round=100)

grid = GridSearchCV(mdl, gridParams, verbose=4, cv=None, n_jobs=-1, scoring='neg_mean_absolute_error')
'''

In [0]:
#grid.fit(X_test,y_test)

In [0]:
#grid.best_params_

In [0]:
#grid.best_score_

# Keras NN

In [0]:
# scaling does not help in gradient boosting algorithms most of the time!!!
# check performance without scaling 
#from sklearn import preprocessing
# Scale the data to be in the range (-1 , 1)
#scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1), copy=False).fit(x_train)

In [0]:
'''
# For NN training, the following lines of code are required, then skip to submission part instead of training on GBM: 
from sklearn.model_selection import train_test_split
# splitting data into test and train dataset
X_tr, X_test, y_tr, y_test = train_test_split(x_train, y_train, test_size=0.20,random_state=34)
del x_train, y_train
gc.collect()

from sklearn import preprocessing
#1 Scale the data to be in the range (0 , 1)
scaler = preprocessing.QuantileTransformer().fit(X_tr)
X_tr=scaler.transform(X_tr)
X_test = scaler.transform(X_test)
'''

In [0]:
'''
 model = keras.Sequential([
 keras.layers.Dense(2048,kernel_initializer='he_normal', activation=tf.nn.relu, input_shape=(X_tr.shape[1],)), # neurons with relu activation, first layer with input 
 keras.layers.BatchNormalization(),
 keras.layers.Dropout(0.3), # dropout for reducing the overfitting problem
 keras.layers.Dense(1024, kernel_initializer='he_normal', activation=tf.nn.relu), # 2nd hidden layer
 keras.layers.BatchNormalization(),   
 keras.layers.Dropout(0.3),
 keras.layers.Dense(512,kernel_initializer='he_normal', activation=tf.nn.relu), # 3rd hidden layer
 keras.layers.BatchNormalization(),
 keras.layers.Dropout(0.3),
 keras.layers.Dense(256,kernel_initializer='he_normal', activation=tf.nn.relu),
 keras.layers.BatchNormalization(),
 keras.layers.Dropout(0.3),   
 keras.layers.Dense(64, kernel_initializer='he_normal', activation=tf.nn.relu),
 keras.layers.BatchNormalization(),
 keras.layers.Dropout(0.3),   
 keras.layers.Dense(1, kernel_initializer='normal', activation='sigmoid')]) #  output layer 

model.compile(loss='mse', #this loss method is useful for numeric prediction
 optimizer=tf.train.AdamOptimizer(learning_rate=0.001), metrics=['mae'])
model.summary()
history1 = model.fit(X_tr, y_tr, epochs = 200, batch_size = 20480, verbose=1, validation_data = (X_test, y_test))
'''

In [0]:
'''
df_test = pd.read_csv('../input/test_V2.csv')
df_test = feature_engineering(False, False)
df_test = scaler.transform(df_test)
preds = model.predict(df_test)
'''


# Light GBM model

In [0]:
from sklearn.model_selection import train_test_split
# splitting data into test and train dataset: 10% of the rows are held out for validation (fixed seed 46)
X_tr, X_test, y_tr, y_test = train_test_split(x_train, y_train, test_size=0.10, random_state=46)
# x_train / y_train are not used again after the split, so drop them to free memory
del x_train, y_train
gc.collect()

In [0]:
# the most important part is to tune the parameters of the model correctly. GridSearchCV is a good start
import lightgbm as lgb

# Training configuration passed to lgb.train below.
# NOTE(review): 'n_estimators' (20000) and 'num_boost_round' (60000) both look like
# settings for the number of boosting iterations - confirm against the LightGBM
# parameter docs which of the two is honoured and drop the other.
# NOTE(review): 'bagging_fraction' is set without a 'bagging_freq' - verify that
# bagging is actually active with this configuration.
parameters = {"objective" : "regression", "metric" : "mae", 'n_estimators':20000, 'early_stopping_rounds':350,
              "num_leaves" : 35, "learning_rate" : 0.03, "bagging_fraction" : 0.7,
               "bagging_seed" : 0, "num_threads" : 4,"colsample_bytree" : 0.7, 'num_boost_round': 60000
             } 

In [0]:
# training the model
# Wrap the training and validation splits in LightGBM Dataset objects.
train_data = lgb.Dataset(X_tr, label=y_tr)
test_data = lgb.Dataset(X_test, label=y_test)
# Drop the notebook-level names for the split arrays now that the Datasets have been created.
del X_tr, X_test, y_tr, y_test
gc.collect()

In [0]:
# Train with both the training and the validation Dataset as evaluation sets.
# NOTE(review): `verbose_eval` is a keyword of older LightGBM releases - confirm it is
# accepted by the installed version.
model = lgb.train(parameters, train_set = train_data, valid_sets=[train_data, test_data], verbose_eval=500) 
# overfitting does not happen as long as we keep num_leaves low
# (the two numbers below are presumably scores noted from earlier runs - TODO confirm)
#0.0312363
#0.0307479

In [0]:
# The Datasets are not used after training; release them before the test features are built.
del test_data, train_data
gc.collect()

In [0]:

# Build the test features (is_train=False, so no target is returned) and downcast their dtypes.
# No scaling is applied here.
x_test, _ = feature_engineering(False, False)
x_test = reduce_mem_usage(x_test, True)

In [0]:
# predicting test dataset, using the iteration stored in model.best_iteration
preds = model.predict(x_test, num_iteration = model.best_iteration)
# the test features are not used after prediction
del x_test
gc.collect()

In [0]:
# Display the raw predictions.
preds

In [0]:
# Flatten the predictions to a 1-D array.
# NOTE: the clipping to the [0, 1] range below is commented out, so values are NOT clipped in this cell.
preds = preds.reshape(-1)
#preds[preds > 1] = 1
#preds[preds < 0] = 0

# Submission

In [0]:
# Reload the raw test file: its Id, maxPlace and numGroups columns are needed to build the submission.
df_sub = pd.read_csv(INPUT_DIR + 'test_V2.csv')

In [0]:
# Submission dealing with edge cases based on MaxPlace
# Credits: https://www.kaggle.com/anycode/simple-nn-baseline-4
df_sub['winPlacePerc'] = preds
df_sub.loc[df_sub.maxPlace == 0, "winPlacePerc"] = 0
df_sub.loc[df_sub.maxPlace == 1, "winPlacePerc"] = 1

# For matches with more than one place, snap each prediction to the nearest
# multiple of 1 / (maxPlace - 1).
multi_place = df_sub.maxPlace > 1
subset = df_sub.loc[multi_place]
gap = 1.0 / (subset.maxPlace.values - 1)
new_perc = np.around(subset.winPlacePerc.values / gap) * gap
# The regressor output is unbounded, so a prediction outside [0, 1] would
# otherwise be snapped to a grid point outside [0, 1]; clamp after rounding.
new_perc = np.clip(new_perc, 0.0, 1.0)
df_sub.loc[multi_place, "winPlacePerc"] = new_perc

# A match with a single group is set to 0.
df_sub.loc[multi_place & (df_sub.numGroups == 1), "winPlacePerc"] = 0
assert df_sub["winPlacePerc"].isnull().sum() == 0

In [0]:
# Preview the post-processed submission frame.
df_sub.head()

In [0]:
# Keep only the two submission columns and write them to submission1.csv without the index.
my_submission = pd.DataFrame({'Id': df_sub.Id, 'winPlacePerc': df_sub['winPlacePerc']})
my_submission.to_csv('submission1.csv', index=False)

In [0]:
# Show the first ten submitted values.
my_submission['winPlacePerc'].head(10)