# **Final Notebook for BIM801 Project**
## *Kaggle competition / 22 Spring*
### PUBG Finish Placement Prediction
- Includes feature engineering, model training, and prediction
- Uses an XGBoost (XGB) model

In [None]:
# Analysis * Visualize * Processing
import pandas as pd
import numpy as np
import itertools
import sys
import time
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.options.display.float_format = '{:.5g}'.format
import warnings
warnings.filterwarnings(action='ignore')
from tqdm import tqdm 
import gc
from timeit import default_timer as timer

# Train * Test * Model 
from lightgbm import LGBMRegressor
from sklearn import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import GridSearchCV   # Performing grid search

In [None]:
""" UDF for Reduce memory use of dataframe by change data type
"""
# Thanks and credited to GUILLAUME MARTIN
# https://www.kaggle.com/code/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
#     print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
#         start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
# Directory the competition CSVs are read from ('train_V2.csv' / 'test_V2.csv' are appended to it).
INPUT_DIR = "../input/pubg-finish-placement-prediction/"

In [None]:
""" UDF for transform inf values into certain value
"""
def fillInf(df, val):
    numcols = df.select_dtypes(include='number').columns
    cols = numcols[numcols != 'winPlacePerc']
    df[df == np.Inf] = np.NaN
    df[df == np.NINF] = np.NaN
    for c in cols: df[c].fillna(val, inplace=True)

In [None]:
""" UDF for generate Group aggregated features 
"""
# Need to predict the order of places for groups within each match.
# Train on group-level instead of the user-level
def grouping(df, agg_col, sum_col):
    group = df.groupby(['matchId','groupId','matchType'])
    # group size, mean, sum, max, min
    gSize = group.size().to_frame('gSize') # players
    gMean = group.mean()
    gSum = group[sum_col].sum().rename(columns=lambda s: '_gSum.' + s)
    gMax = group[agg_col].max().rename(columns=lambda s: '_gMax.' + s)
    gMin = group[agg_col].min().rename(columns=lambda s: '_gMin.' + s)
    return pd.concat([gSize, gMean, gSum, gMax, gMin], axis=1).reset_index()

## **Feature Engineering**

In [None]:
def feature_engineering(is_train=True):
    if is_train:
        print("processing TRAIN set")
        df = reduce_mem_usage(pd.read_csv(INPUT_DIR + 'train_V2.csv'))
        # Take the matches that have more than 1 player
        df = df[df['maxPlace'] > 1]
        # Anormal data row drop
        df.drop(df.query('rideDistance == 0 and roadKills > 0').index, inplace=True)
    else:
        print("processing TEST set")
        df = reduce_mem_usage(pd.read_csv(INPUT_DIR + 'test_V2.csv'))
    
# LOG Transform features
    log_target = ['assists', 'boosts', 'DBNOs', 'headshotKills', 'heals', 
              'kills', 'revives', 'roadKills', 'teamKills', 'vehicleDestroys', 
              'weaponsAcquired', 'walkDistance']
    for col in log_target:
        df[col] = df[col].apply(lambda x: np.log1p(x))
    
# Rank to percentile
    match = df.groupby('matchId')
    df['killPlacePerc'] = match['kills'].rank(pct=True).values
    df['walkDistancePerc'] = match['walkDistance'].rank(pct=True).values
    df['damageDealtPerc'] = match['damageDealt'].rank(pct=True).values
    del match
    gc.collect()

# Drop external point features
    df.drop(['rankPoints','killPoints','winPoints'], axis=1, inplace=True)

# Linear combination features
    print("i am doing lcf")
    df['_totalDistance'] = (df['rideDistance']*0.5 + df["walkDistance"]*0.2 + df["swimDistance"]*0.3) / df['matchDuration']
    df['_healthItems'] = df['heals'] + df['boosts']
    df['_teamWork'] = df['revives'] + df['assists']
    df['_over1km'] = df['longestKill'].apply(lambda x: 1 if x > 1000 else 0)
    df['_headshotKillRate'] = df['headshotKills'] / df['kills']
    df['_killsOverWalkDistance'] = df['kills'] / df['walkDistance']
    df['_killsOverDistance'] = df['kills'] / df['_totalDistance']
    df['_killPlacePerc'] = df['killPlace'] / df['maxPlace']
    df['killStreakrate'] = df['killStreaks'] / df['kills']
    df['headshotKills_over_kills'] = df['headshotKills'] / df['kills']
    df['distance_over_weapons'] = df['_totalDistance'] / df['weaponsAcquired']
    df['walkDistance_over_heals'] = df['walkDistance'] / df['heals']
    df["skill"] = df["headshotKills"] + df["roadKills"]
    fillInf(df, 0)
    
    
    y = None
    target = 'winPlacePerc'
    
# Grouping features (size, mean, max, min)
    sum_col = ['kills','assists','teamKills','revives','damageDealt','walkDistance', '_totalDistance', '_healthItems']
    agg_col = list(df.columns)
    exclude_agg_col = ['Id','matchId','groupId','matchType','matchDuration','maxPlace','numGroups']
    for c in exclude_agg_col:
        agg_col.remove(c)
        
    if is_train:
        y = pd.DataFrame(np.array(df.groupby(['matchId','groupId'])[target].agg('mean'), dtype=np.float64))
        df.drop(target, axis=1, inplace=True)
        agg_col.remove(target)
    
    df = reduce_mem_usage(grouping(df, agg_col, sum_col))
    for c in sum_col:
        df['_perc.gMean_gMax.' + c] = df[c] / df['_gMax.' + c]
        
# Match feature - NumCols
    numcols = df.select_dtypes(include='number').columns.values
    numcols = numcols[numcols != target]
    cols = np.r_[numcols,['matchId']]   
    
    # Match Rank
    match = df[cols].groupby('matchId')
    matchRank = match.rank(pct=True).rename(columns=lambda s: '_rank.' + s)
    df = reduce_mem_usage(pd.concat([df, matchRank], axis=1))
    del matchRank
    gc.collect()

    # Match Sum
    cols = np.r_[agg_col,['matchId','gSize']]
    match = df[cols].groupby('matchId')
    matchSum = match.sum().rename(columns=lambda s: '_mSum.' + s).reset_index()
    df = reduce_mem_usage(pd.merge(df, matchSum))
    del matchSum
    gc.collect()
    
    # Ranking of Kills & killPlace in each match
    minKills = df.sort_values(['matchId','groupId','kills','killPlace']).groupby(['matchId','groupId','kills']).first().reset_index().copy()
    for n in np.arange(4):
        c = 'kills_' + str(n) + '_Place'
        nKills = (minKills['kills'] == n)
        minKills.loc[nKills, c] = minKills[nKills].groupby(['matchId'])['killPlace'].rank().values
        df = pd.merge(df, minKills[nKills][['matchId','groupId',c]], how='left')
        df[c].fillna(0, inplace=True)
    df = reduce_mem_usage(df)
    del minKills, nKills

    
# Enemy info
    df['_enemy.sum.gSize'] = df['_mSum.gSize'] - df['gSize'] # 해당 매치에서 우리팀을 뺀 플레이어 수
    df['_enemy.kills'] = (df['_mSum.kills'] - df['_gSum.kills']) / df['_enemy.sum.gSize'] # 해당 매치에서 에너미 한 명이 평균적으로 몇 명을 죽였는지
    df['_enemy.damageDealt'] = (df['_mSum.damageDealt'] - df['_gSum.damageDealt']) / df['_enemy.sum.gSize'] # 해당 매치에서 에너미 한 명이 평균적으로 넣은 딜량
    for c in agg_col:
        df['_perc.gMax_mSum.' + c] = df['_gMax.' + c] / df['_mSum.' + c]  # 그룹 맥스 / 매치 총량  (for agg_col)
        if c in sum_col:
            df['_perc.gSum_mSum.' + c] = df['_gSum.' + c] / df['_mSum.' + c]  #그룹 총 / 매치 총량 (for sum_col)
    fillInf(df, 0)
    
# Match Max
    matchMax = match.max().rename(columns=lambda s: '_mMax.' + s).reset_index()
    df = reduce_mem_usage(pd.merge(df, matchMax))
    del matchMax
    gc.collect()
    for c in agg_col:
        df['_perc.gMax_mMax.' + c] = df['_gMax.' + c] / df['_mMax.' + c] # 그룹 맥스 / 매치 맥스 (for agg_col)
        df.drop(['_mMax.' + c], axis=1, inplace=True)
    fillInf(df, 0)
    
# Rank of Top / bottom player of each group in match
    killBottomPlayer = df[['matchId','_gMin.kills','_gMax.killPlace']].copy()
    group = killBottomPlayer.groupby(['matchId','_gMin.kills'])
    killBottomPlayer['_rank.bottomPlayer'] = group.rank().values
    df = pd.merge(df, killBottomPlayer)

    killTopPlayer = df[['matchId','_gMax.kills','_gMin.killPlace']].copy()
    group = killTopPlayer.groupby(['matchId','_gMax.kills'])
    killTopPlayer['_rank.topPlayer'] = group.rank().values
    df = pd.merge(df, killTopPlayer)

    del killBottomPlayer, killTopPlayer
    gc.collect()

# killPlace rank of group and kills
# MatchType mapping
    mapper = lambda x: 'solo' if ('solo' in x) else 'duo' if ('duo' in x) or ('crash' in x) else 'squad'
    df['matchTypeCat'] = df['matchType'].map(mapper)
    
# Drop constant feature
    const_column = [col for col in df.columns if df[col].nunique() == 1]

# Label Encoding
    cols = [col for col in df.columns if col not in ['Id','matchId','groupId']]
    for i, t in df.loc[:, cols].dtypes.iteritems():
        if t == object:
            df[i] = pd.factorize(df[i])[0]
    print('Final df shape', df.shape)
    return df, y, const_column

In [None]:
# Build the engineered train and test tables (feature engineering runs inside)
X_train, y, trn_cc = feature_engineering(is_train=True)
X_test, _, tst_cc = feature_engineering(is_train=False)

In [None]:
# Remove, from both tables, every column reported constant in train or in test
const_cols = np.r_[trn_cc, tst_cc]
for frame in (X_train, X_test):
    frame.drop(columns=const_cols, inplace=True)

## Model & Predict

In [None]:
# Keep the test ids aside so predictions can be joined back to them later,
# then remove the id columns from both feature tables.
id_cols = ['matchId','groupId']
X_test_id = X_test[id_cols].copy()
for frame in (X_train, X_test):
    frame.drop(columns=id_cols, inplace=True)

print(X_train.shape, X_test.shape)

In [None]:
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import minmax_scale
import lightgbm as lgb

# params for xgb
# 'num_leaves' was dropped: it is a LightGBM parameter that XGBoost does not define.
params={'eta': 0.05, # learning rate
        'n_estimators' : 4000,
        'objective': 'reg:squarederror',  # current name of the deprecated 'reg:linear'
        'eval_metric':'mae',
        'min_child_weight': 2,        # default = 1, minimum sum of instance weight required in a child node
        'subsample': 0.8,             # default = 1, row sampling ratio -> controls overfitting
                                      # (was misspelled 'sub_sample', which XGBoost does not recognize)
        'verbosity': 0,
        'gamma' : 0,
        'max_depth' : 7,
        'random_state' : 42,
       }
mts = []  # match-type codes, one per fitted model
fis = []  # feature importances, one array per fitted model
pred = np.zeros(len(X_test))  # one prediction slot per test row

In [None]:
# Notice that, the prediction was executed grouped by matchType (solo, duo, squad):
# one model is fitted per 'matchTypeCat' value and predicts only the test rows
# of that same value.
for mt in tqdm(X_train['matchTypeCat'].unique()):
    # Positional boolean masks, so the selection stays correct whatever index
    # labels the frames carry (index labels are not used as array positions).
    train_mask = (X_train['matchTypeCat'] == mt).values
    reg = xgb.XGBRegressor(**params)
    reg.fit(X_train.loc[train_mask], y.loc[train_mask])

    test_mask = (X_test['matchTypeCat'] == mt).values
    # Skip prediction when the test set has no rows of this match type.
    if test_mask.any():
        pred[test_mask] = reg.predict(X_test.loc[test_mask])
    mts.append(mt)
    fis.append(reg.feature_importances_)

In [None]:
for mt, feature_importance in zip(mts, fis):
    # Plot the (at most) 30 most important features of each per-match-type model
    top_importance = feature_importance.max()
    # Scale to 0-100 relative to the best feature; skip when every importance is 0.
    if top_importance > 0:
        feature_importance = 100.0 * (feature_importance / top_importance)
    # [-30:] keeps every feature when there are fewer than 30; the former
    # `len(...) - 30` start index went negative and dropped features in that case.
    sorted_idx = np.argsort(feature_importance)[-30:]
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.figure(figsize=(12,6))
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, X_train.columns[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance [matchTypeCat:' + str(mt) + ']')
    plt.show()
# print columns sort by feature importance
# X_train.columns[np.argsort(-feature_importance)].values

In [None]:
# Attach the predictions to the ids and rank them inside each match
X_test_id['winPlacePerc'] = pred
group = X_test_id.groupby('matchId')
X_test_id['_rank.winPlacePerc'] = group['winPlacePerc'].rank(method='min')
# Put the id / prediction columns back next to the features
X_test = pd.concat([X_test, X_test_id], axis=1)
# One row per matchId, joined back onto the test table
sub_group = group.count().reset_index()[['matchId']]
X_test = X_test.merge(sub_group)

## Post processing 
### Adjust winPlacePerc to follow the scoring rule of winPlacePerc

In [None]:
# Snap the predicted winPlacePerc onto values that can actually occur in a match.
# Rows with maxPlace == 1 divide by zero here; they are overwritten by the
# edge-case handling that follows this step.

fullgroup = (X_test['numGroups'] == X_test['maxPlace'])
# full group (201,366 rows per the original run) --> calculate from rank
subset = X_test.loc[fullgroup]
X_test.loc[fullgroup, 'winPlacePerc'] = (subset['_rank.winPlacePerc'].values - 1) / (subset['maxPlace'].values - 1)

# not full group (684,872 rows per the original run) --> align with maxPlace
subset = X_test.loc[~fullgroup]
gap = 1.0 / (subset['maxPlace'].values - 1)
new_perc = np.around(subset['winPlacePerc'].values / gap) * gap  # np.around rounds exact halves to the even neighbour
# Raw regression output can fall outside [0, 1]; winPlacePerc cannot.
X_test.loc[~fullgroup, 'winPlacePerc'] = np.clip(new_perc, 0.0, 1.0)

In [None]:
# Check that the adjustment above works as intended, on one sample match
not_full = ~fullgroup
X_test.loc[not_full, '_pred.winPlace'] = np.around(X_test.loc[not_full, 'winPlacePerc'].values / gap) + 1
check_cols = ['matchId','groupId','winPlacePerc','maxPlace','numGroups','_pred.winPlace','_rank.winPlacePerc']
X_test.loc[not_full & (X_test['matchId'] == '000b598b79aa5e'), check_cols].sort_values(['matchId','_pred.winPlace'])

In [None]:
# edge cases handling: overwrite the prediction where maxPlace / numGroups fix the value
max_place = X_test['maxPlace']
X_test.loc[max_place == 0, 'winPlacePerc'] = 0
X_test.loc[max_place == 1, 'winPlacePerc'] = 1  # nothing
only_one_group = (max_place > 1) & (X_test['numGroups'] == 1)
X_test.loc[only_one_group, 'winPlacePerc'] = 0
X_test['winPlacePerc'].describe()

## Generate the submission file

In [None]:
import os

# Read the raw test players from the same directory the features were built from.
test = pd.read_csv(INPUT_DIR + 'test_V2.csv')

# Group-level predictions are joined back to every player of the group.
# how='left' keeps every test Id even if a group has no prediction, instead
# of silently dropping those players from the submission.
submission = pd.merge(test, X_test[['matchId','groupId','winPlacePerc']],
                      on=['matchId','groupId'], how='left')
submission = submission[['Id','winPlacePerc']]
sub_file_name = "xgb_0609_ne4k"
sub_path = "../build/{}.csv".format(sub_file_name)
# to_csv does not create missing directories.
os.makedirs(os.path.dirname(sub_path), exist_ok=True)
submission.to_csv(sub_path, index=False)