In [None]:
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

# Show the pandas version the notebook is running against.
print(pd.__version__)

In [None]:
# Column dtype overrides handed to `pd.read_csv`: matchType is read as a categorical.
dtype = dict(matchType='category')

In [None]:
%%time
# Input CSV locations (labelled rows and unlabelled test rows).
train_val_path = '/kaggle/input/pubg-finish-placement-prediction/train_V2.csv'
test_path = '/kaggle/input/pubg-finish-placement-prediction/test_V2.csv'

# Read the labelled rows and discard every row that contains a missing value.
train_val_data = pd.read_csv(train_val_path, dtype=dtype).dropna()

print(train_val_data.shape)

In [None]:
# Per-player stat columns. `aggregate` summarises each of them with
# mean / max / min per group and per match, plus within-match ranks.
player_columns = [
    'DBNOs', 'kills', 'assists',
    'revives', 'teamKills',
    'damageDealt', 'killStreaks', 'headshotKills', 'longestKill', 'roadKills',
    'boosts', 'heals', 'weaponsAcquired',
    'killPlace',
    'killPoints', 'winPoints', 'rankPoints',
    'rideDistance', 'swimDistance', 'walkDistance',
    'vehicleDestroys',
]

def aggregate(data, player_columns, drop_killPlace=False):
    """Collapse per-player rows into one feature row per group.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per player. Must contain ``matchId``, ``groupId``,
        ``matchType``, ``matchDuration``, ``numGroups``, ``maxPlace`` and every
        column named in ``player_columns``. ``winPlacePerc`` is carried over
        (first value per group) when present.
    player_columns : iterable of str
        Per-player stat columns to summarise. The argument is not modified.
    drop_killPlace : bool, default False
        When true, ``'killPlace'`` is left out of the summarised columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``groupId``. Holds the group bookkeeping columns
        (``matchId``, ``groupPlayers``, ``matchType``, optional
        ``winPlacePerc``), ``group_{mean,max,min}_<col>``, the match-level
        columns (``matchPlayers``, ``matchDuration``, ``numGroups``,
        ``maxPlace``, ``match_{mean,max,min}_<col>``) and
        ``rank_group_{mean,max,min}_<col>``: the within-match average rank of
        the group statistic divided by ``numGroups``.
    """
    # Work on a private list so the caller's sequence is never mutated.
    player_columns = list(player_columns)

    # Remove the column by name. The flag itself used to be handed to
    # `list.remove`, which raised ValueError whenever the flag was set.
    if drop_killPlace and 'killPlace' in player_columns:
        player_columns.remove('killPlace')

    stats = ('mean', 'max', 'min')

    # ---- group-level aggregation -------------------------------------------
    group_agg_dict = dict(
        matchId = ('matchId', 'first'),
        groupPlayers = ('groupId', 'size'),
        matchType = ('matchType', 'first'),
    )
    if 'winPlacePerc' in data.columns:
        group_agg_dict.update(winPlacePerc = ('winPlacePerc', 'first'))
    for column in player_columns:
        for stat in stats:
            group_agg_dict[f'group_{stat}_{column}'] = (column, stat)

    group_agg = data.groupby('groupId').agg(**group_agg_dict)

    # ---- match-level aggregation -------------------------------------------
    match_agg_dict = dict(
        matchPlayers = ('matchId', 'size'),
        matchDuration = ('matchDuration', 'first'),
        numGroups = ('numGroups', 'first'),
        maxPlace = ('maxPlace', 'first'),
    )
    for column in player_columns:
        for stat in stats:
            match_agg_dict[f'match_{stat}_{column}'] = (column, stat)

    match_agg = data.groupby('matchId').agg(**match_agg_dict)

    # Attach the match-level columns to every group of that match; the result
    # keeps the groupId index of `group_agg`.
    result = group_agg.merge(
        right = match_agg,
        left_on = 'matchId',
        right_index = True,
    )

    # Free the intermediates before building the rank columns.
    del group_agg, match_agg
    gc.collect()

    # ---- within-match rank features ----------------------------------------
    # The grouping is identical for every column, so build it once.
    by_match = result.groupby('matchId')
    num_groups = result['numGroups']
    rank_columns = [
        (by_match[f'group_{stat}_{column}'].rank(method='average') / num_groups)
        .rename(f'rank_group_{stat}_{column}')
        for column in player_columns
        for stat in stats
    ]

    # Only rank features are appended; ratio features (group statistic divided
    # by the match statistic) are not produced.
    return pd.concat([result, *rank_columns], axis=1)

In [None]:
from sklearn.model_selection import GroupShuffleSplit

# Single shuffled split with a 0.2 test fraction, grouped on matchId so that
# all players of a match land on the same side of the split.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(
    splitter.split(train_val_data, groups=train_val_data.matchId)
)

train_data = train_val_data.iloc[train_idx]
val_data = train_val_data.iloc[val_idx]

In [None]:
import heapq as hq
import pandas as pd

def _topological_sort(match_data):

    # out_edge[groupA] = [groupB, ...]
    # : for some member a in groupA, b in groupB
    #   kills[a] = kills[b] and killPlace[a] < killPlace[b]

    groupIds = match_data.groupId.unique()
    out_edge = {groupId: [] for groupId in groupIds}
    in_degree = {groupId: 0 for groupId in groupIds}
    winPlacePercRegr = match_data.groupby('groupId').winPlacePercRegr.first()
    winPlace = pd.Series(0, index=winPlacePercRegr.index)
    priority_queue = []

    for kills, group in match_data.groupby('kills'):
        iterator = iter(group.sort_values('killPlace').groupId)
        prev_group = next(iterator)
        for curr_group in iterator:
            if prev_group != curr_group:
                out_edge[prev_group].append(curr_group)
                in_degree[curr_group] += 1
            prev_group = curr_group
    
    for group in groupIds:
        if in_degree[group] == 0:
            hq.heappush(
                priority_queue,
                (-winPlacePercRegr[group], group),
            )
    
    curr_winPlace = 1
    while priority_queue:
        _, top_group = hq.heappop(priority_queue)
        winPlace[top_group] = curr_winPlace
        curr_winPlace += 1
        for group in out_edge[top_group]:
            in_degree[group] -= 1
            if in_degree[group] == 0:
                hq.heappush(
                    priority_queue,
                    (-winPlacePercRegr[group], group),
                )
    
    winPlacePerc = 1 - (winPlace-1) / (winPlace.max()-1)

    result = match_data.merge(
        right = winPlacePerc.rename('winPlacePerc'),
        left_on = 'groupId',
        right_index = True,
    )
    result.winPlacePerc.fillna(0, inplace=True)

    return result

def winplace_by_killPlace_topological_sort(matchIds, groupIds, kills, killPlaces, winPlacePercRegr, winPlacePerc_agg, tqdm_progress=False):
    """
    Derive per-player winPlacePerc by topologically sorting the groups of each
    match on the killPlace ordering, using the regressed winPlacePerc as the
    priority among groups that are free to be placed.

    `winPlacePercRegr` is first reduced to one value per group with
    `winPlacePerc_agg` (passed to `transform`). `tqdm_progress` is accepted
    but not used.
    """
    # One regression value per group, broadcast back to every player row.
    per_group_regr = winPlacePercRegr.groupby(groupIds).transform(winPlacePerc_agg)

    players = pd.DataFrame(dict(
        matchId=matchIds,
        groupId=groupIds,
        kills=kills,
        killPlace=killPlaces,
    ))
    players['winPlacePercRegr'] = per_group_regr

    # Each match is ranked independently of the others.
    ranked = players.groupby('matchId', group_keys=False).apply(_topological_sort)
    return ranked.winPlacePerc

In [None]:
def fit(regressor, train_data, train_agg):
    """Fit `regressor` on the group-level table `train_agg`.

    Features are every column of `train_agg` except `matchId` and
    `winPlacePerc`; the target is `winPlacePerc`. `train_data` is accepted
    but not used. Returns None.
    """
    features = train_agg.drop(columns=['matchId', 'winPlacePerc'])
    target = train_agg.winPlacePerc
    regressor.fit(X=features, y=target)

def predict(regressor, val_data, val_agg):
    """Predict a per-player `winPlacePerc` for the rows of `val_data`.

    The regressor scores each row of `val_agg` (indexed by groupId; `matchId`
    and, when present, `winPlacePerc` are excluded from the features). The
    scores are joined onto the player rows by groupId and then turned into
    final placements by `winplace_by_killPlace_topological_sort`.
    """
    feature_frame = val_agg.drop(columns=['matchId', 'winPlacePerc'], errors='ignore')
    group_scores = pd.Series(
        regressor.predict(feature_frame),
        index=val_agg.index,
        name='winPlacePercRegr',
    )

    # Broadcast each group's score to its players.
    players = val_data[['matchId', 'groupId', 'kills', 'killPlace']].merge(
        group_scores,
        left_on='groupId',
        right_index=True,
    )

    return winplace_by_killPlace_topological_sort(
        matchIds=players.matchId,
        groupIds=players.groupId,
        kills=players.kills,
        killPlaces=players.killPlace,
        winPlacePercRegr=players.winPlacePercRegr,
        winPlacePerc_agg='first',
    )

In [None]:
# LightGBM regressor settings, gathered in one place.
lgbm_params = {
    'n_estimators': 500,
    'num_leaves': 63,
    'random_state': 42,
    'learning_rate': 0.1,
}
regressor = lgb.LGBMRegressor(**lgbm_params)
regressor.get_params()

In [None]:
%%time
# Group-level feature table for the training split.
train_agg = aggregate(train_data, player_columns)
gc.collect()

In [None]:
%%time
# Train the regressor on the group-level training table, then collect garbage.
fit(regressor, train_data, train_agg)
gc.collect()

In [None]:
# The training feature table is no longer needed once the model is fitted.
del train_agg
gc.collect()

In [None]:
%%time
# Group-level feature table for the validation split.
val_agg = aggregate(val_data, player_columns)
gc.collect()

In [None]:
%%time
# Predict without the label column, then score against the held-out labels
# (predictions are re-ordered to the validation row order first).
pred = predict(regressor, val_data.drop(columns='winPlacePerc'), val_agg)
score = mean_absolute_error(
    y_true=val_data.winPlacePerc,
    y_pred=pred.reindex(val_data.index),
)
print(score)

In [None]:
# Release the validation feature table before the test data is loaded.
del val_agg
gc.collect()

In [None]:
%%time
# Read the test rows with the same dtype overrides as the training data.
test_data = pd.read_csv(test_path, dtype=dtype)
print(test_data.shape)

In [None]:
%%time
# Group-level feature table for the test set.
test_agg = aggregate(test_data, player_columns)
gc.collect()

In [None]:
%%time
# Per-player winPlacePerc predictions for the test set.
pred = predict(regressor, test_data, test_agg)

In [None]:
# Pair every test Id with its prediction; the two Series are aligned on the row index.
submission = pd.concat(objs=[test_data['Id'], pred], axis='columns')

In [None]:
# Write the submission file without the row index.
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)