# In [None]:
import numpy as np
import pandas as pd
import gc
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor


# Location of the competition CSV files.
DATASET_ROOT = '/kaggle/input/pubg-finish-placement-prediction'
TRAIN_PATH = f'{DATASET_ROOT}/train_V2.csv'
TEST_PATH = f'{DATASET_ROOT}/test_V2.csv'

# Columns that the per-group and per-match aggregation helpers summarise.
features = [
  'kills',
  'killsPerc',
  'killPlace',
  'killPlacePerc',
  'walkDistance',
  'walkDistancePerc',
  'totalDistance',
  'totalDistancePerc',
  'killPlace_maxPlace_Ratio',
  'killsPerc_walkDistancePerc_Ratio',
  'killPlacePerc_walkDistancePerc_Ratio',
]


def reduce_mem_usage(df):
  """Downcast the numeric columns of ``df`` in place to smaller dtypes.

  Signed and unsigned integer columns are cast to the narrowest integer type
  of the same signedness whose range contains the column's min and max
  (bounds inclusive). Floating-point columns are cast to ``float32`` when
  their min and max lie strictly inside the ``float32`` range, otherwise to
  ``float64``; a column whose min/max is infinite or NaN therefore ends up
  as ``float64``. Every other column (strings, booleans, categoricals,
  datetimes, pandas extension dtypes) is left untouched.

  Args:
    df: DataFrame to shrink. Its columns are reassigned in place.

  Returns:
    The same ``df`` object, so calls can be chained.
  """
  signed_types = (np.int8, np.int16, np.int32, np.int64)
  unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
  float32_limits = np.finfo(np.float32)

  for col in df.columns:
    col_type = df[col].dtype

    # Only plain numpy integer/float columns are downcast. Extension dtypes
    # and kinds such as bool ('b'), datetime ('M') or object ('O') have no
    # meaningful numeric range here and are skipped.
    if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
      continue

    c_min = df[col].min()
    c_max = df[col].max()

    if col_type.kind == 'f':
      if c_min > float32_limits.min and c_max < float32_limits.max:
        df[col] = df[col].astype(np.float32)
      else:
        df[col] = df[col].astype(np.float64)
      continue

    # Keep the signedness of the column: unsigned data must not be pushed
    # through a signed (or float) type.
    candidates = signed_types if col_type.kind == 'i' else unsigned_types
    for candidate in candidates:
      limits = np.iinfo(candidate)
      # An empty column has NaN min/max, so no candidate matches and the
      # dtype is left as it is.
      if limits.min <= c_min and c_max <= limits.max:
        df[col] = df[col].astype(candidate)
        break

  return df


def preprocess_point(df):
  """Add combined point features to ``df`` (modified in place).

  Negative ``rankPoints`` / ``killPoints`` values are overwritten with 0,
  ``points`` is their sum, and ``pointsPerc`` is the percentile rank of
  ``points`` among the rows sharing the same ``matchId``.

  Args:
    df: DataFrame with ``matchId``, ``rankPoints`` and ``killPoints``.

  Returns:
    The same ``df`` object with the new columns added.
  """
  # Clamp negatives to zero, one source column at a time.
  for col in ('rankPoints', 'killPoints'):
    negative = df[col] < 0
    df.loc[negative, col] = 0

  df['points'] = df['rankPoints'].add(df['killPoints'])

  points_by_match = df.groupby('matchId')['points']
  df['pointsPerc'] = points_by_match.rank(pct=True)

  return df


def preprocess_teamplay(df):
  """Add ``DBNOsPerc``: the percentile rank of ``DBNOs`` within each match.

  Args:
    df: DataFrame with ``matchId`` and ``DBNOs``; modified in place.

  Returns:
    The same ``df`` object with the new column added.
  """
  dbnos_by_match = df.groupby('matchId')['DBNOs']
  df['DBNOsPerc'] = dbnos_by_match.rank(pct=True)

  return df


def preprocess_kill(df):
  """Add kill-related features to ``df`` (modified in place).

  Adds, in this order:
    * ``killsPerc``, ``longestKillPerc``, ``killPlacePerc`` - percentile
      rank of the source column within each ``matchId``;
    * ``killStreaksPerKill``, ``headShotPerKill``, ``damageDealtPerKill``,
      ``matchDurationPerKill`` - the source column divided by ``kills``;
    * ``killPlace_maxPlace_Ratio`` - ``killPlace`` divided by ``maxPlace``.

  Division by zero is not handled here, so rows with ``kills == 0`` get
  inf/NaN in the per-kill columns.

  Args:
    df: DataFrame holding the raw columns listed above plus ``matchId``.

  Returns:
    The same ``df`` object with the new columns added.
  """
  # Within-match percentile ranks.
  for source in ('kills', 'longestKill', 'killPlace'):
    df[f'{source}Perc'] = df.groupby('matchId')[source].rank(pct=True)

  # Per-kill rates: (new column, numerator column).
  per_kill_columns = (
    ('killStreaksPerKill', 'killStreaks'),
    ('headShotPerKill', 'headshotKills'),
    ('damageDealtPerKill', 'damageDealt'),
    ('matchDurationPerKill', 'matchDuration'),
  )
  for target, numerator in per_kill_columns:
    df[target] = df[numerator] / df['kills']

  df['killPlace_maxPlace_Ratio'] = df['killPlace'].div(df['maxPlace'])

  return df


def preprocess_distance(df):
  """Add distance-related features to ``df`` (modified in place).

  ``totalDistance`` is ride + swim + walk distance. ``walkDistancePerc`` and
  ``totalDistancePerc`` are percentile ranks within each ``matchId``. The
  remaining columns are plain ratios of two existing columns; division by
  zero is not handled here, so those ratios may contain inf/NaN.

  Args:
    df: DataFrame with the raw distance columns, ``matchId``,
      ``matchDuration``, ``kills`` and ``heals``, plus the ``killsPerc`` and
      ``killPlacePerc`` columns that must already be present.

  Returns:
    The same ``df`` object with the new columns added.
  """
  ride_and_swim = df['rideDistance'] + df['swimDistance']
  df['totalDistance'] = ride_and_swim + df['walkDistance']

  for source in ('walkDistance', 'totalDistance'):
    df[f'{source}Perc'] = df.groupby('matchId')[source].rank(pct=True)

  # (new column, numerator column, denominator column), in creation order.
  ratio_columns = (
    ('totalDistancePerDuration', 'totalDistance', 'matchDuration'),
    ('walkDistancePerDuration', 'walkDistance', 'matchDuration'),
    ('walkDistancePerKill', 'walkDistance', 'kills'),
    ('totalDistancePerKill', 'totalDistance', 'kills'),
    ('totalDistancePerHeal', 'totalDistance', 'heals'),
    ('killsPerc_walkDistancePerc_Ratio', 'killsPerc', 'walkDistancePerc'),
    ('killsPerc_totalDistancePerc_Ratio', 'killsPerc', 'totalDistancePerc'),
    ('killPlacePerc_walkDistancePerc_Ratio', 'killPlacePerc', 'walkDistancePerc'),
    ('killPlacePerc_totalDistancePerc_Ratio', 'killPlacePerc', 'totalDistancePerc'),
  )
  for target, numerator, denominator in ratio_columns:
    df[target] = df[numerator] / df[denominator]

  return df


def preprocess_item(df):
  """Add item-related features to ``df`` (modified in place).

  ``items`` is ``boosts + heals``. ``itemsPerc`` and ``weaponsAcquiredPerc``
  are percentile ranks within each ``matchId``. ``totalDistancePerItem`` and
  ``totalDistancePerWeapon`` divide ``totalDistance`` by ``items`` and
  ``weaponsAcquired``; division by zero is not handled here.

  Args:
    df: DataFrame with ``matchId``, ``boosts``, ``heals``,
      ``weaponsAcquired`` and an existing ``totalDistance`` column.

  Returns:
    The same ``df`` object with the new columns added.
  """
  df['items'] = df['boosts'].add(df['heals'])

  for source in ('items', 'weaponsAcquired'):
    df[f'{source}Perc'] = df.groupby('matchId')[source].rank(pct=True)

  total_distance = df['totalDistance']
  df['totalDistancePerItem'] = total_distance / df['items']
  df['totalDistancePerWeapon'] = total_distance / df['weaponsAcquired']

  return df


def preprocess_inf(df):
  """Replace infinities and missing values in ``df`` with 0, in place.

  Positive and negative infinity are first turned into NaN, then every NaN
  in the frame is filled with 0.

  Args:
    df: DataFrame to clean; modified in place.

  Returns:
    The same ``df`` object.
  """
  # The lowercase np.inf / np.nan spellings are used because the capitalised
  # aliases (np.Inf, np.NINF, np.NaN) no longer exist in NumPy 2.
  # ``replace`` also avoids building a full boolean copy of the frame for
  # each of the two infinities.
  df.replace([np.inf, -np.inf], np.nan, inplace=True)
  df.fillna(0, inplace=True)

  return df


def preprocess_groupby_groupId(df, fn):
  """Attach per-group aggregates of ``features`` and their in-match ranks.

  The module-level ``features`` columns are aggregated with ``fn`` for every
  (``matchId``, ``groupId``) pair and merged back as ``<col>_group_<fn>``.
  The aggregated values are then percentile-ranked across the groups of the
  same match and merged back as ``<col>_group_<fn>_rank``.

  Args:
    df: DataFrame holding ``matchId``, ``groupId`` and all ``features``.
    fn: Aggregation passed to ``agg`` (the callers here pass ``'mean'``,
      ``'min'`` and ``'max'``); it is also interpolated into the new
      column suffixes.

  Returns:
    A new DataFrame: ``df`` left-merged with both sets of columns.
  """
  group_keys = ['matchId', 'groupId']

  # Both helper frames depend only on the incoming ``df``, so build them
  # before merging anything back.
  group_stats = df.groupby(group_keys)[features].agg(fn)
  group_stat_ranks = group_stats.groupby('matchId')[features].rank(pct=True)

  df = df.merge(
    group_stats,
    how='left',
    on=group_keys,
    suffixes=('', f'_group_{fn}'),
  )
  df = df.merge(
    group_stat_ranks,
    how='left',
    on=group_keys,
    suffixes=('', f'_group_{fn}_rank'),
  )

  # Drop the temporaries eagerly; these frames can be large.
  del group_stats, group_stat_ranks
  gc.collect()

  return df


def preprocess_group_size(df):
  """Add ``groupSize``: the number of rows per (``matchId``, ``groupId``).

  Args:
    df: DataFrame with ``matchId`` and ``groupId`` columns.

  Returns:
    A new DataFrame: ``df`` left-merged with the ``groupSize`` column.
  """
  group_keys = ['matchId', 'groupId']

  group_sizes = df.groupby(group_keys).size().reset_index(name='groupSize')
  df = df.merge(group_sizes, how='left', on=group_keys)

  # Release the temporary count table right away.
  del group_sizes
  gc.collect()

  return df


def preprocess_groupby_matchId(df, fn):
  """Attach per-match aggregates of the ``features`` columns.

  The module-level ``features`` columns are aggregated with ``fn`` for every
  ``matchId`` and merged back as ``<col>_match_<fn>``.

  Args:
    df: DataFrame holding ``matchId`` and all ``features``.
    fn: Aggregation passed to ``agg`` (the callers here pass ``'mean'``,
      ``'min'`` and ``'max'``); it is also interpolated into the new
      column suffix.

  Returns:
    A new DataFrame: ``df`` left-merged with the aggregated columns.
  """
  match_key = ['matchId']

  match_stats = df.groupby(match_key)[features].agg(fn)
  df = df.merge(
    match_stats,
    how='left',
    on=match_key,
    suffixes=('', f'_match_{fn}'),
  )

  # Release the temporary aggregate table right away.
  del match_stats
  gc.collect()

  return df


def preprocess_match_size(df):
  """Add ``matchSize``: the number of rows sharing each ``matchId``.

  Args:
    df: DataFrame with a ``matchId`` column.

  Returns:
    A new DataFrame: ``df`` left-merged with the ``matchSize`` column.
  """
  match_key = ['matchId']

  match_sizes = df.groupby(match_key).size().reset_index(name='matchSize')
  df = df.merge(match_sizes, how='left', on=match_key)

  # Release the temporary count table right away.
  del match_sizes
  gc.collect()

  return df


def preprocess_data(isTrain=True):
  """Load one of the competition CSVs and build the model input frame.

  The CSV is read from ``TRAIN_PATH`` or ``TEST_PATH``, run through the
  row-level feature helpers, then through the per-group and per-match
  aggregation helpers (``'mean'``, ``'min'``, ``'max'``), with
  ``reduce_mem_usage`` applied between stages. Finally every column named
  in ``except_attribs`` that exists in the frame is dropped.

  Args:
    isTrain: ``True`` to load the training file, ``False`` for the test file.

  Returns:
    A ``(df, y, Id)`` tuple:
      * ``df`` - the feature DataFrame after dropping excluded columns;
      * ``y`` - the ``winPlacePerc`` column when ``isTrain``, else ``None``;
      * ``Id`` - a one-column DataFrame with ``Id`` when not ``isTrain``,
        else ``None``.
  """
  csv_path = TRAIN_PATH if isTrain else TEST_PATH
  df = reduce_mem_usage(pd.read_csv(csv_path))

  # Row-level features. Order matters: the distance and item steps read
  # columns created by earlier steps, and preprocess_inf cleans up after all
  # of the divisions.
  row_level_steps = (
    preprocess_point,
    preprocess_teamplay,
    preprocess_kill,
    preprocess_distance,
    preprocess_item,
    preprocess_inf,
    reduce_mem_usage,
  )
  for step in row_level_steps:
    df = step(df)

  aggregations = ('mean', 'min', 'max')

  # Per-group aggregates.
  for aggregation in aggregations:
    df = preprocess_groupby_groupId(df, aggregation)
  df = preprocess_group_size(df)
  df = reduce_mem_usage(df)

  # Per-match aggregates.
  for aggregation in aggregations:
    df = preprocess_groupby_matchId(df, aggregation)
  df = preprocess_match_size(df)
  df = reduce_mem_usage(df)

  # Columns removed from the returned feature frame (identifiers, the
  # target, and features excluded from the model).
  except_attribs = [
    'Id', 'groupId', 'matchId',

    'matchType',
    'winPlacePerc',

    'DBNOs',
    'assists',
    'cat_0',
    'cat_1',
    'cat_10',
    'cat_11',
    'cat_12',
    'cat_13',
    'cat_14',
    'cat_15',
    'cat_2',
    'cat_3',
    'cat_4',
    'cat_5',
    'cat_6',
    'cat_7',
    'cat_8',
    'cat_9',
    'damageDealt',
    'headShotPerKill',
    'headshotKills',
    'heals',
    'killPlacePerc_group_max_rank',
    'killPlacePerc_group_min_rank',
    'killPlacePerc_match_max',
    'killPlacePerc_match_mean',
    'killPlacePerc_match_min',
    'killPlacePerc_totalDistancePerc_Ratio',
    'killPlacePerc_walkDistancePerc_Ratio',
    'killPlace_match_max',
    'killPlace_match_mean',
    'killPlace_match_min',
    'killPlace_maxPlace_Ratio_group_max_rank',
    'killPlace_maxPlace_Ratio_group_min_rank',
    'killPoints',
    'killsPerc_group_max_rank',
    'killsPerc_group_min_rank',
    'killsPerc_match_max',
    'killsPerc_walkDistancePerc_Ratio',
    'kills_match_max',
    'kills_match_min',
    'longestKillPerc',
    'matchSize',
    'points',
    'pointsPerc',
    'rankPoints',
    'roadKills',
    'teamKills',
    'totalDistance',
    'totalDistancePerHeal',
    'totalDistancePerItem',
    'totalDistancePerc_group_max_rank',
    'totalDistancePerc_group_min_rank',
    'totalDistancePerc_match_max',
    'totalDistance_match_min',
    'vehicleDestroys',
    'walkDistancePerDuration',
    'walkDistancePerc_group_max_rank',
    'walkDistancePerc_group_min_rank',
    'walkDistancePerc_match_max',
    'walkDistancePerc_match_mean',
    'walkDistance_match_max',
    'walkDistance_match_min',
  ]

  # Capture the target / identifier before those columns are dropped.
  if isTrain:
    y, Id = df['winPlacePerc'], None
  else:
    y, Id = None, df[['Id']]  # double brackets keep a DataFrame

  # Names absent from the frame are skipped rather than raising.
  present = [attrib for attrib in except_attribs if attrib in df.columns]
  df.drop(present, axis=1, inplace=True)

  df = reduce_mem_usage(df)

  return df, y, Id


def main():
  """Train the LightGBM regressor and write ``submission.csv``.

  Fits ``LGBMRegressor(n_estimators=200, num_leaves=100)`` on the
  preprocessed training data, prints the mean absolute error measured on
  that same training data, then predicts ``winPlacePerc`` for the
  preprocessed test data and saves it next to the test ``Id`` column.
  """
  X_train, y_train, _ = preprocess_data()  # training set

  model = LGBMRegressor(n_estimators=200, num_leaves=100)
  model.fit(X_train, y_train)

  # Error on the data the model was fitted on, not a validation score.
  train_mae = mean_absolute_error(y_train, model.predict(X_train))
  print(f'{train_mae:.6f}')

  # Free the training frames before the test set is loaded.
  del X_train, y_train
  gc.collect()

  X_test, _, X_Id = preprocess_data(False)  # test set

  # Build the submission from the Id frame plus the predictions.
  submission = X_Id.assign(winPlacePerc=model.predict(X_test))
  submission.to_csv('submission.csv', index=False)


# Run the pipeline only when this file is executed directly (script or
# notebook cell), so importing it does not start training or write files.
if __name__ == '__main__':
  main()