In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import forest, RandomForestRegressor
import lightgbm as lgb
import gc
import feather
import warnings; 
warnings.filterwarnings('ignore')
INPUT_DIR = "../input/"

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
#Function to display all
def display_all(df):
    """Show ``df`` with pandas' row and column display limits temporarily set to 500.

    The limits are applied through ``pd.option_context`` and therefore only
    hold for this one ``display`` call.
    """
    display_limits = ("display.max_rows", 500, "display.max_columns", 500)
    with pd.option_context(*display_limits):
        display(df)

#Function to reduce memory usage
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to the narrowest fitting dtype.

    Signed-integer columns are cast to the first of int8/int16/int32/int64
    whose range contains the column's min and max (bounds inclusive).
    Floating-point columns are cast to float16 or float32 when their min and
    max fit, otherwise to float64 (which is also what happens when min/max are
    NaN, because every comparison with NaN is false).

    Columns of any other dtype (object, bool, unsigned integer, datetime,
    categorical, pandas extension dtypes, ...) are left untouched, so they are
    never converted to a float by accident.

    Note that float16 only keeps about three significant decimal digits, so
    the float downcast is lossy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy signed-int ('i') and float ('f') columns are resized.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == 'i':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or min/max are NaN/inf).
                df[col] = df[col].astype(np.float64)
    return df

In [3]:
#Dataframe pre-processing
def dataproc1(df):
    """Add player-level engineered features to ``df`` in place and return it.

    The function appends ratio (``*_over_*``), product (``*_x_*``), sum and
    "norm" columns derived from the raw per-player statistics, replaces
    +/-infinity (produced by divisions by zero) and NaN with 0, and finally
    downcasts the frame with ``reduce_mem_usage``.

    Side effects worth knowing:

    * ``df`` is mutated; the returned object is the same frame.
    * ``rideDistance``, ``walkDistance`` and ``swimDistance`` are overwritten
      with their value divided by 500, so calling this function twice on the
      same frame rescales them twice.
    * The ``walkDistance_*`` features are computed *before* that rescaling,
      the ``total_time_by_distance`` / ``totalDistance`` features *after* it.

    Parameters
    ----------
    df : pandas.DataFrame
        Player-level frame holding the raw columns referenced below
        (``matchId``, ``kills``, ``heals``, ``walkDistance``, ...).

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the additional feature columns.
    """
    # Number of rows (players) that share a matchId in this frame.
    df['playersJoined'] = df.groupby('matchId')['matchId'].transform('count')
    # NOTE(review): this is kills per headshot kill, i.e. the inverse of what
    # the column name suggests -- confirm before changing, it is a model input.
    df['headshotrate'] = df['kills']/df['headshotKills']
    df['killStreakrate'] = df['killStreaks']/df['kills']
    df['DBNOs_over_kills'] = df['DBNOs']/df['kills']
    df['healthitems'] = df['heals'] + df['boosts']
    df['heals_over_boosts'] = df['heals'] / df['boosts']
    # 1.37 / 1.1 (and 3.66 further down) are fixed scaling constants; their
    # origin is not shown here -- presumably dataset-wide averages, TODO confirm.
    df['healthitems_norm'] = df['heals']/1.37 + df['boosts']/1.1
    df['healthitems_over_kills'] = df['healthitems'] / df['kills']
    df['healthitems_norm_over_kills'] = df['healthitems_norm'] / df['kills']
    df['walkDistance_over_heals'] = df['walkDistance'] / df['heals']
    df['walkDistance_x_heals'] = df['walkDistance'] * df['heals']
    df['walkDistance_over_kills'] = df['walkDistance'] / df['kills']
    df['walkDistance_x_kills'] = df['walkDistance'] * df['kills']
    df['walkDistance_over_healthitems'] = df['walkDistance'] / df['healthitems']
    df['walkDistance_x_healthitems'] = df['walkDistance'] * df['healthitems']
    df["skill"] = df["headshotKills"] + df["roadKills"]
    df['killPlace_over_maxPlace'] = df['killPlace'] / df['maxPlace']
    df['killPlace_over_numGroups'] = df['killPlace'] / df['numGroups']
    # From here on the three distance columns are expressed in units of 500.
    df['rideDistance'] = (df['rideDistance']/500)
    df['walkDistance'] = (df['walkDistance']/500)
    df['swimDistance'] = (df['swimDistance']/500)
    # Weighted distance: riding is divided by 4.5 and swimming multiplied by 1.5.
    df["total_time_by_distance"] = (df["rideDistance"]/4.5+df["walkDistance"]+df["swimDistance"]*1.5)
    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]
    df['distance_over_weapons'] = df['totalDistance'] / df['weaponsAcquired']
    df['distance_x_weapons'] = df['totalDistance'] * df['weaponsAcquired']
    df['total_time_by_distance_over_weapons'] = df['total_time_by_distance'] / df['weaponsAcquired']
    df['total_time_by_distance_x_weapons'] = df['total_time_by_distance'] * df['weaponsAcquired']
    df['killPlace_over_total_time_by_distance'] = df['killPlace'] / df['total_time_by_distance']
    df['killPlace_x_total_time_by_distance'] = df['killPlace'] * df['total_time_by_distance']
    df['killPlace_over_totalDistance'] = df['killPlace'] / df['totalDistance']
    df['killPlace_x_totalDistance'] = df['killPlace'] * df['totalDistance']
    df['boosts_over_total_time_by_distance'] = df['boosts'] / df['total_time_by_distance']
    df['boosts_x_total_time_by_distance'] = df['boosts'] * df['total_time_by_distance']
    df['boosts_over_totalDistance'] = df['boosts'] / df['totalDistance']
    df['boosts_x_totalDistance'] = df['boosts'] * df['totalDistance']
    df['teamwork'] = df['assists'] + df['revives'] - df['teamKills']
    df['total_items_acquired'] = (df["boosts"] + df["heals"] + df["weaponsAcquired"])
    df['total_items_acquired_norm'] = (df["boosts"]/1.1 + df["heals"]/1.37 + df["weaponsAcquired"]/3.66)
    df['total_items_acquired_over_total_time_by_distance'] = df['total_items_acquired'] / df['total_time_by_distance']
    df['total_items_acquired_x_total_time_by_distance'] = df['total_items_acquired'] * df['total_time_by_distance']
    df['total_items_acquired_norm_over_total_time_by_distance'] = df['total_items_acquired_norm'] / df['total_time_by_distance']
    df['total_items_acquired_norm_x_total_time_by_distance'] = df['total_items_acquired_norm'] * df['total_time_by_distance']
    df['heals_over_total_time_by_distance'] = df['heals'] / df['total_time_by_distance']
    df['heals_x_total_time_by_distance'] = df['heals'] * df['total_time_by_distance']
    df['heals_over_totalDistance'] = df['heals'] / df['totalDistance']
    df['heals_x_totalDistance'] = df['heals'] * df['totalDistance']
    df['kills_over_total_time_by_distance'] = df['kills'] / df['total_time_by_distance']
    df['kills_x_total_time_by_distance'] = df['kills'] * df['total_time_by_distance']
    df['kills_over_totalDistance'] = df['kills'] / df['totalDistance']
    df['kills_x_totalDistance'] = df['kills'] * df['totalDistance']
    # "Norm" features scale a value by (100 - playersJoined)/100 + 1, i.e. the
    # factor is 1 for a 100-player match and grows as fewer players joined.
    df['killsNorm'] = df['kills']*((100-df['playersJoined'])/100 + 1)
    df['damageDealtNorm'] = df['damageDealt']*((100-df['playersJoined'])/100 + 1)
    df['maxPlaceNorm'] = df['maxPlace']*((100-df['playersJoined'])/100 + 1)
    df['killPlace_over_maxPlaceNorm'] = df['killPlace'] / df['maxPlaceNorm']
    df['killPlace_over_playersJoined'] = df['killPlace'] / df['playersJoined']
    df['matchDurationNorm'] = df['matchDuration']*((100-df['playersJoined'])/100 + 1)
    df['killPlace_over_matchDuration'] = df['killPlace'] / df['matchDuration']
    df['killPlace_over_matchDurationnorm'] = df['killPlace'] / df['matchDurationNorm']
    # Divides by zero when playersJoined == 1; the resulting inf is zeroed below.
    df['killPlacePerc'] = (df['playersJoined'] - df['killPlace']) / (df['playersJoined'] - 1)
    df['L1'] = df['roadKills'] + df['vehicleDestroys'] + df['teamKills']
    df['L2'] = df['revives'] + df['headshotKills'] + df['assists']
    df['L3'] = df['killStreaks'] + df['DBNOs'] + df['kills'] + df['boosts'] + df['heals']
    df['points'] = df['killPoints']+df['rankPoints'] + df['winPoints']
    df['L1_over_total_time_by_distance'] = df['L1'] / df['total_time_by_distance']
    df['L1_x_total_time_by_distance'] = df['L1'] * df['total_time_by_distance']
    df['L1_over_totalDistance'] = df['L1'] / df['totalDistance']
    df['L1_x_totalDistance'] = df['L1'] * df['totalDistance']
    df['L2_over_total_time_by_distance'] = df['L2'] / df['total_time_by_distance']
    df['L2_x_total_time_by_distance'] = df['L2'] * df['total_time_by_distance']
    df['L2_over_totalDistance'] = df['L2'] / df['totalDistance']
    df['L2_x_totalDistance'] = df['L2'] * df['totalDistance']
    df['L3_over_total_time_by_distance'] = df['L3'] / df['total_time_by_distance']
    df['L3_x_total_time_by_distance'] = df['L3'] * df['total_time_by_distance']
    df['L3_over_totalDistance'] = df['L3'] / df['totalDistance']
    df['L3_x_totalDistance'] = df['L3'] * df['totalDistance']
    # Divisions by zero above yield +/-inf (x/0) or NaN (0/0); map all of them
    # to 0. Lower-case np.inf / np.nan are used because the np.Inf, np.NINF and
    # np.NaN aliases no longer exist in NumPy 2.0.
    df[df == np.inf] = np.nan
    df[df == -np.inf] = np.nan
    df.fillna(0, inplace=True)
    reduce_mem_usage(df)
    return df

#List of features
def dataproc2(df, is_train='TRUE'):
    """Aggregate player-level features to one row per (matchId, groupId).

    For every feature column the function computes group-level ``mean``,
    ``median``, ``max``, ``min`` and ``sum`` (plus the percentile rank of each
    of those within the match) and match-level ``mean``, ``median`` and
    ``max``. After every merge the frame is cut back to the hand-picked
    column list ``imp_cols`` so that only the selected features are kept in
    memory. Finally ``agg_group_size`` and ``agg_match_size`` are added and
    the ``matchId`` / ``groupId`` key columns are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Player-level frame as produced by ``dataproc1``.
    is_train : str or bool, default 'TRUE'
        ``'TRUE'`` (or ``True``) when ``df`` contains the target column; any
        other value (e.g. ``'FALSE'``) for unlabeled data.

    Returns
    -------
    pandas.DataFrame
        One row per (matchId, groupId), ordered by those keys, holding the
        selected features and -- in training mode -- the group mean of the
        target. The frame has been downcast by ``reduce_mem_usage``.

    Notes
    -----
    In training mode the name of the target column is read from the
    module-level variable ``target``, which must be defined before the call.
    """
    print("Starting dataproc2")
    training = is_train is True or is_train == 'TRUE'
    features = list(df.columns)
    if training:
        features.remove(target)
    # Identifiers and columns that are not aggregated.
    features.remove("Id")
    features.remove("matchId")
    features.remove("groupId")
    features.remove("matchType")
    features.remove("numGroups")
    features.remove("playersJoined")
    features.remove("roadKills")
    features.remove("vehicleDestroys")
    imp_cols=['matchId', 'groupId']
    print("Step 1")
    tmp = df.groupby(['matchId','groupId'])[features].agg('mean')
    # Skeleton of the output: one row per group, keys only.
    df_out = tmp.reset_index()[['matchId','groupId']]
    if training:
        imp_cols.extend([target])
        tmp3 = df.groupby(['matchId','groupId'])[target].agg('mean')
        df_out = df_out.merge(tmp3.reset_index(), how='left', on=['matchId','groupId'])
        del tmp3
        gc.collect()
    print("Step 2")
    # maxPlace / matchDuration are still present in the group means computed
    # above, but are excluded from the rank and from all later aggregations.
    features.remove("maxPlace")
    features.remove("matchDuration")
    # Percentile rank of each group's aggregate among the groups of its match.
    tmp2 = tmp.groupby('matchId')[features].rank(pct=True)
    df_out = df_out.merge(tmp.add_suffix('_mean').reset_index(), how='left', on=['matchId', 'groupId'])
    imp_cols.extend(['killPlace_mean', 'matchDuration_mean', 'killPlace_over_maxPlace_mean', 'killPlace_over_numGroups_mean', 'boosts_x_total_time_by_distance_mean', 'killsNorm_mean', 'killPlace_over_playersJoined_mean', 'killPlacePerc_mean'])
    df_out = df_out[imp_cols]
    df_out = df_out.merge(tmp2.add_suffix('_mean_rank').reset_index(), how='left', on=['matchId', 'groupId'])
    imp_cols.extend(['DBNOs_mean_rank', 'kills_mean_rank', 'killStreaks_mean_rank', 'longestKill_mean_rank', 'walkDistance_mean_rank', 'killStreakrate_mean_rank', 'walkDistance_x_kills_mean_rank', 'total_time_by_distance_mean_rank', 'total_time_by_distance_x_weapons_mean_rank', 'killPlace_over_total_time_by_distance_mean_rank', 'killPlace_x_total_time_by_distance_mean_rank', 'boosts_x_total_time_by_distance_mean_rank', 'total_items_acquired_x_total_time_by_distance_mean_rank', 'total_items_acquired_norm_x_total_time_by_distance_mean_rank', 'kills_over_total_time_by_distance_mean_rank', 'kills_x_total_time_by_distance_mean_rank', 'kills_x_totalDistance_mean_rank', 'killsNorm_mean_rank', 'killPlacePerc_mean_rank'])
    df_out = df_out[imp_cols]
    del tmp
    del tmp2
    gc.collect()
    print("Step 3")
    tmp = df.groupby(['matchId','groupId'])[features].agg('median')
    tmp2 = tmp.groupby('matchId')[features].rank(pct=True)
    df_out = df_out.merge(tmp.add_suffix('_median').reset_index(), how='left', on=['matchId', 'groupId'])
    imp_cols.extend(['killPlace_over_numGroups_median', 'killsNorm_median', 'killPlace_over_playersJoined_median', 'killPlacePerc_median'])
    df_out = df_out[imp_cols]
    df_out = df_out.merge(tmp2.add_suffix('_median_rank').reset_index(), how='left', on=['matchId', 'groupId'])
    imp_cols.extend(['assists_median_rank', 'killPlace_median_rank', 'kills_median_rank', 'killStreaks_median_rank', 'longestKill_median_rank', 'revives_median_rank', 'walkDistance_median_rank', 'killStreakrate_median_rank', 'DBNOs_over_kills_median_rank', 'walkDistance_over_kills_median_rank', 'walkDistance_x_kills_median_rank', 'killPlace_over_numGroups_median_rank', 'total_time_by_distance_median_rank', 'total_time_by_distance_x_weapons_median_rank', 'killPlace_over_total_time_by_distance_median_rank', 'killPlace_x_total_time_by_distance_median_rank', 'killPlace_over_totalDistance_median_rank', 'total_items_acquired_x_total_time_by_distance_median_rank', 'total_items_acquired_norm_x_total_time_by_distance_median_rank', 'kills_over_total_time_by_distance_median_rank', 'kills_x_total_time_by_distance_median_rank', 'kills_over_totalDistance_median_rank', 'kills_x_totalDistance_median_rank', 'killsNorm_median_rank', 'killPlace_over_matchDuration_median_rank', 'killPlacePerc_median_rank', 'L3_over_total_time_by_distance_median_rank'])
    df_out = df_out[imp_cols]
    del tmp
    del tmp2
    gc.collect()
    print("Step 4")
    tmp = df.groupby(['matchId','groupId'])[features].agg('max')
    tmp2 = tmp.groupby('matchId')[features].rank(pct=True)
    df_out = df_out.merge(tmp.add_suffix('_max').reset_index(), how='left', on=['matchId', 'groupId'])
    imp_cols.extend(['damageDealt_max', 'killPlace_max', 'kills_max', 'walkDistance_max', 'walkDistance_over_kills_max', 'killPlace_over_maxPlace_max', 'killPlace_over_numGroups_max', 'boosts_x_total_time_by_distance_max', 'killsNorm_max', 'killPlace_over_maxPlaceNorm_max', 'killPlace_over_playersJoined_max', 'killPlacePerc_max'])
    df_out = df_out[imp_cols]
    df_out = df_out.merge(tmp2.add_suffix('_max_rank').reset_index(), how='left', on=['matchId', 'groupId'])
    imp_cols.extend(['boosts_max_rank', 'DBNOs_max_rank', 'killPlace_max_rank', 'kills_max_rank', 'killStreaks_max_rank', 'longestKill_max_rank', 'walkDistance_max_rank', 'weaponsAcquired_max_rank', 'killStreakrate_max_rank', 'walkDistance_over_kills_max_rank', 'killPlace_over_maxPlace_max_rank', 'killPlace_over_numGroups_max_rank', 'total_time_by_distance_max_rank', 'totalDistance_max_rank', 'distance_x_weapons_max_rank', 'total_time_by_distance_x_weapons_max_rank', 'killPlace_over_total_time_by_distance_max_rank', 'killPlace_x_total_time_by_distance_max_rank', 'boosts_x_total_time_by_distance_max_rank', 'boosts_x_totalDistance_max_rank', 'total_items_acquired_x_total_time_by_distance_max_rank', 'total_items_acquired_norm_x_total_time_by_distance_max_rank', 'kills_x_total_time_by_distance_max_rank', 'kills_x_totalDistance_max_rank', 'killsNorm_max_rank', 'killPlace_over_maxPlaceNorm_max_rank', 'killPlace_over_playersJoined_max_rank', 'killPlace_over_matchDuration_max_rank', 'killPlace_over_matchDurationnorm_max_rank', 'killPlacePerc_max_rank', 'points_max_rank', 'L3_x_totalDistance_max_rank'])
    df_out = df_out[imp_cols]
    del tmp
    del tmp2
    gc.collect()
    print("Step 5")
    tmp = df.groupby(['matchId','groupId'])[features].agg('min')
    tmp2 = tmp.groupby('matchId')[features].rank(pct=True)
    df_out = df_out.merge(tmp.add_suffix('_min').reset_index(), how='left', on=['matchId', 'groupId'])
    imp_cols.extend(['kills_min', 'longestKill_min', 'killStreakrate_min', 'walkDistance_over_kills_min', 'killPlace_over_maxPlace_min', 'killPlace_over_numGroups_min', 'killPlace_over_total_time_by_distance_min', 'kills_over_total_time_by_distance_min', 'killsNorm_min', 'killPlace_over_maxPlaceNorm_min', 'killPlace_over_playersJoined_min', 'killPlace_over_matchDuration_min', 'killPlace_over_matchDurationnorm_min', 'killPlacePerc_min'])
    df_out = df_out[imp_cols]
    df_out = df_out.merge(tmp2.add_suffix('_min_rank').reset_index(), how='left', on=['matchId', 'groupId'])
    imp_cols.extend(['assists_min_rank', 'DBNOs_min_rank', 'kills_min_rank', 'killStreaks_min_rank', 'longestKill_min_rank', 'rankPoints_min_rank', 'revives_min_rank', 'swimDistance_min_rank', 'walkDistance_min_rank', 'weaponsAcquired_min_rank', 'killStreakrate_min_rank', 'DBNOs_over_kills_min_rank', 'heals_over_boosts_min_rank', 'healthitems_over_kills_min_rank', 'healthitems_norm_over_kills_min_rank', 'walkDistance_over_kills_min_rank', 'walkDistance_x_kills_min_rank', 'killPlace_over_maxPlace_min_rank', 'total_time_by_distance_min_rank', 'total_time_by_distance_x_weapons_min_rank', 'killPlace_over_total_time_by_distance_min_rank', 'killPlace_x_total_time_by_distance_min_rank', 'killPlace_over_totalDistance_min_rank', 'teamwork_min_rank', 'total_items_acquired_x_total_time_by_distance_min_rank', 'kills_over_total_time_by_distance_min_rank', 'kills_x_total_time_by_distance_min_rank', 'kills_over_totalDistance_min_rank', 'kills_x_totalDistance_min_rank', 'killsNorm_min_rank', 'damageDealtNorm_min_rank', 'killPlace_over_maxPlaceNorm_min_rank', 'killPlace_over_matchDuration_min_rank', 'killPlacePerc_min_rank', 'L3_min_rank', 'points_min_rank', 'L3_over_total_time_by_distance_min_rank'])
    df_out = df_out[imp_cols]
    del tmp
    del tmp2
    gc.collect()
    print("Step 6")
    # Match-level aggregates over all players of the match, merged on matchId only.
    tmp = df.groupby(['matchId'])[features].agg('mean')
    df_out = df_out.merge(tmp.add_suffix('_match_mean').reset_index(), how='left', on=['matchId'])
    imp_cols.extend(['assists_match_mean', 'boosts_match_mean', 'damageDealt_match_mean', 'headshotKills_match_mean', 'heals_match_mean', 'kills_match_mean', 'killStreaks_match_mean', 'longestKill_match_mean', 'swimDistance_match_mean', 'weaponsAcquired_match_mean', 'headshotrate_match_mean', 'killStreakrate_match_mean', 'DBNOs_over_kills_match_mean', 'heals_over_boosts_match_mean', 'healthitems_over_kills_match_mean', 'healthitems_norm_over_kills_match_mean', 'walkDistance_over_heals_match_mean', 'walkDistance_over_kills_match_mean', 'walkDistance_x_kills_match_mean', 'walkDistance_over_healthitems_match_mean', 'skill_match_mean', 'distance_over_weapons_match_mean', 'total_time_by_distance_over_weapons_match_mean', 'killPlace_over_total_time_by_distance_match_mean', 'killPlace_x_total_time_by_distance_match_mean', 'killPlace_over_totalDistance_match_mean', 'killPlace_x_totalDistance_match_mean', 'boosts_over_total_time_by_distance_match_mean', 'boosts_over_totalDistance_match_mean', 'teamwork_match_mean', 'total_items_acquired_norm_match_mean', 'total_items_acquired_over_total_time_by_distance_match_mean', 'total_items_acquired_norm_over_total_time_by_distance_match_mean', 'heals_over_total_time_by_distance_match_mean', 'heals_over_totalDistance_match_mean', 'kills_over_total_time_by_distance_match_mean', 'kills_x_total_time_by_distance_match_mean', 'kills_over_totalDistance_match_mean', 'kills_x_totalDistance_match_mean', 'killsNorm_match_mean', 'damageDealtNorm_match_mean', 'L2_match_mean', 'points_match_mean', 'L1_over_totalDistance_match_mean', 'L2_over_total_time_by_distance_match_mean', 'L2_x_total_time_by_distance_match_mean', 'L2_over_totalDistance_match_mean', 'L2_x_totalDistance_match_mean', 'L3_over_total_time_by_distance_match_mean', 'L3_over_totalDistance_match_mean'])
    df_out = df_out[imp_cols]
    del tmp
    gc.collect()
    tmp = df.groupby(['matchId'])[features].agg('median')
    df_out = df_out.merge(tmp.add_suffix('_match_median').reset_index(), how='left', on=['matchId'])
    imp_cols.extend(['damageDealt_match_median', 'killStreaks_match_median', 'walkDistance_match_median', 'killStreakrate_match_median', 'distance_over_weapons_match_median', 'total_time_by_distance_over_weapons_match_median', 'killPlace_over_total_time_by_distance_match_median', 'killPlace_over_totalDistance_match_median', 'killPlace_x_totalDistance_match_median', 'total_items_acquired_over_total_time_by_distance_match_median', 'total_items_acquired_norm_over_total_time_by_distance_match_median', 'kills_over_total_time_by_distance_match_median', 'kills_x_totalDistance_match_median', 'damageDealtNorm_match_median', 'L3_match_median', 'L3_over_total_time_by_distance_match_median', 'L3_x_total_time_by_distance_match_median', 'L3_over_totalDistance_match_median', 'L3_x_totalDistance_match_median'])
    df_out = df_out[imp_cols]
    del tmp
    gc.collect()
    tmp = df.groupby(['matchId'])[features].agg('max')
    df_out = df_out.merge(tmp.add_suffix('_match_max').reset_index(), how='left', on=['matchId'])
    imp_cols.extend(['damageDealt_match_max', 'longestKill_match_max', 'rankPoints_match_max', 'rideDistance_match_max', 'swimDistance_match_max', 'walkDistance_match_max', 'heals_over_boosts_match_max', 'healthitems_norm_over_kills_match_max', 'walkDistance_over_heals_match_max', 'walkDistance_x_heals_match_max', 'walkDistance_over_kills_match_max', 'walkDistance_x_kills_match_max', 'walkDistance_over_healthitems_match_max', 'walkDistance_x_healthitems_match_max', 'total_time_by_distance_match_max', 'totalDistance_match_max', 'distance_over_weapons_match_max', 'distance_x_weapons_match_max', 'total_time_by_distance_over_weapons_match_max', 'total_time_by_distance_x_weapons_match_max', 'killPlace_over_total_time_by_distance_match_max', 'killPlace_x_total_time_by_distance_match_max', 'killPlace_over_totalDistance_match_max', 'killPlace_x_totalDistance_match_max', 'boosts_over_total_time_by_distance_match_max', 'boosts_x_total_time_by_distance_match_max', 'boosts_over_totalDistance_match_max', 'boosts_x_totalDistance_match_max', 'total_items_acquired_norm_match_max', 'total_items_acquired_over_total_time_by_distance_match_max', 'total_items_acquired_x_total_time_by_distance_match_max', 'total_items_acquired_norm_over_total_time_by_distance_match_max', 'heals_over_total_time_by_distance_match_max', 'heals_x_total_time_by_distance_match_max', 'heals_over_totalDistance_match_max', 'heals_x_totalDistance_match_max', 'kills_over_total_time_by_distance_match_max', 'kills_x_total_time_by_distance_match_max', 'kills_over_totalDistance_match_max', 'kills_x_totalDistance_match_max', 'killsNorm_match_max', 'damageDealtNorm_match_max', 'L3_match_max', 'points_match_max', 'L1_x_total_time_by_distance_match_max', 'L1_over_totalDistance_match_max', 'L2_over_total_time_by_distance_match_max', 'L2_x_total_time_by_distance_match_max', 'L2_over_totalDistance_match_max', 'L2_x_totalDistance_match_max', 'L3_over_total_time_by_distance_match_max', 'L3_x_total_time_by_distance_match_max',
                     'L3_over_totalDistance_match_max', 'L3_x_totalDistance_match_max'])
    df_out = df_out[imp_cols]
    del tmp
    gc.collect()
    print("Step 7")
    tmp = df.groupby(['matchId','groupId'])[features].agg('sum')
    tmp2 = tmp.groupby('matchId')[features].rank(pct=True)
    df_out = df_out.merge(tmp.add_suffix('_sum').reset_index(), how='left', on=['matchId', 'groupId'])
    imp_cols.extend(['longestKill_sum', 'walkDistance_over_kills_sum', 'killPlace_over_maxPlace_sum', 'killPlace_over_numGroups_sum', 'damageDealtNorm_sum', 'maxPlaceNorm_sum', 'killPlace_over_playersJoined_sum', 'killPlacePerc_sum'])
    df_out = df_out[imp_cols]
    df_out = df_out.merge(tmp2.add_suffix('_sum_rank').reset_index(), how='left', on=['matchId', 'groupId'])
    imp_cols.extend(['kills_sum_rank', 'killStreaks_sum_rank', 'longestKill_sum_rank', 'rankPoints_sum_rank', 'walkDistance_sum_rank', 'winPoints_sum_rank', 'killStreakrate_sum_rank', 'killPlace_over_maxPlace_sum_rank', 'total_time_by_distance_sum_rank', 'totalDistance_sum_rank', 'distance_x_weapons_sum_rank', 'total_time_by_distance_x_weapons_sum_rank', 'killPlace_over_total_time_by_distance_sum_rank', 'killPlace_x_total_time_by_distance_sum_rank', 'killPlace_over_totalDistance_sum_rank', 'boosts_x_total_time_by_distance_sum_rank', 'boosts_x_totalDistance_sum_rank', 'total_items_acquired_x_total_time_by_distance_sum_rank', 'total_items_acquired_norm_x_total_time_by_distance_sum_rank', 'kills_x_total_time_by_distance_sum_rank', 'killsNorm_sum_rank', 'killPlacePerc_sum_rank', 'L3_sum_rank', 'points_sum_rank'])
    df_out = df_out[imp_cols]
    del tmp
    del tmp2
    gc.collect()
    print("Step 8")
    # Sizes are computed per key and merged on that key. Assigning a
    # player-indexed Series directly would be aligned by row label against the
    # group-indexed df_out and attach each size to an unrelated group.
    group_sizes = df.groupby(['matchId','groupId']).size().rename('agg_group_size').reset_index()
    df_out = df_out.merge(group_sizes, how='left', on=['matchId', 'groupId'])
    match_sizes = df.groupby('matchId')['Id'].nunique().rename('agg_match_size').reset_index()
    df_out = df_out.merge(match_sizes, how='left', on=['matchId'])
    del group_sizes
    del match_sizes
    print("Step 9")
    # The keys were only needed for merging; they are not model features.
    del df_out["matchId"]
    del df_out["groupId"]
    reduce_mem_usage(df_out)
    gc.collect()
    return df_out

In [4]:
%%time
# Load a 10,000-row sample of the training data.
df = pd.read_csv(INPUT_DIR + 'train_V2.csv', nrows=10000)
# Keep only rows with maxPlace > 1. The explicit .copy() makes the filtered
# frame independent of the one returned by read_csv, so the in-place edits done
# by reduce_mem_usage / dataproc1 act on a real frame rather than on a slice.
df = df[df['maxPlace'] > 1].copy()
reduce_mem_usage(df)
# dataproc2 reads this module-level name in training mode.
target = 'winPlacePerc'
df = dataproc1(df)
#df.reset_index().to_feather('tmp/20181217_df_308_features')
df_out = dataproc2(df)
del df
gc.collect()
#df_out.to_feather('tmp/20181217_df_out_308_features')
# Split the group-level frame into target and feature matrix.
y_train = df_out[target]
x_train = df_out.drop(target, axis=1)
del df_out
gc.collect()

Starting dataproc2
Step 1
Step 2
Step 3
Step 4
Step 5
Step 6
Step 7
Step 8
Step 9
CPU times: user 10 s, sys: 428 ms, total: 10.5 s
Wall time: 3.24 s


In [5]:
%%time
def run_lgb(train_X, train_y):
    """Fit a LightGBM regression model (MAE metric) and return it.

    Parameters
    ----------
    train_X : pandas.DataFrame
        Feature matrix.
    train_y : pandas.Series or array-like
        Target values aligned with ``train_X``.

    Returns
    -------
    The model object returned by ``lgb.train``.
    """
    # 'n_estimators': 4 limits training to four boosting rounds.
    # NOTE(review): per the LightGBM parameter docs, "bagging_fraction" only
    # takes effect when "bagging_freq" is non-zero, which is not set here --
    # confirm whether row subsampling was intended.
    params = {"objective" : "regression", "metric" : "mae", 'n_estimators':4,
              "num_leaves" : 51, "learning_rate" : 0.05, "bagging_fraction" : 0.7,
               "bagging_seed" : 0, "num_threads" : -1,"colsample_bytree" : 0.7
             }

    lgtrain = lgb.Dataset(train_X, label=train_y)
    # No validation set is supplied, so there is nothing to log per iteration.
    # `verbose_eval` is not passed because lgb.train() no longer accepts that
    # keyword in LightGBM >= 4.0.
    model = lgb.train(params, lgtrain)

    #pred_test_y = model.predict(x_test, num_iteration=model.best_iteration)
    return model

# Training the model #
m = run_lgb(x_train, y_train)

CPU times: user 1.69 s, sys: 77.8 ms, total: 1.76 s
Wall time: 315 ms


In [6]:
# Build the test features with the same two-stage pipeline used for training.
# reduce_mem_usage returns the frame it was given, so it can wrap the read.
df = reduce_mem_usage(pd.read_csv(INPUT_DIR + 'test_V2.csv', nrows=10000))
df = dataproc1(df)
# is_train='FALSE': the test data has no target column to aggregate.
df_out = dataproc2(df, is_train='FALSE')
# One prediction per (matchId, groupId) row of df_out.
y_pred = m.predict(df_out)

Starting dataproc2
Step 1
Step 2
Step 3
Step 4
Step 5
Step 6
Step 7
Step 8
Step 9


In [26]:
# Show the first rows of the player-level test frame (after dataproc1) with all columns visible.
display_all(df.head())

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,playersJoined,headshotrate,killStreakrate,DBNOs_over_kills,healthitems,heals_over_boosts,healthitems_norm,healthitems_over_kills,healthitems_norm_over_kills,walkDistance_over_heals,walkDistance_x_heals,walkDistance_over_kills,walkDistance_x_kills,walkDistance_over_healthitems,walkDistance_x_healthitems,skill,killPlace_over_maxPlace,killPlace_over_numGroups,total_time_by_distance,totalDistance,distance_over_weapons,distance_x_weapons,total_time_by_distance_over_weapons,total_time_by_distance_x_weapons,killPlace_over_total_time_by_distance,killPlace_x_total_time_by_distance,killPlace_over_totalDistance,killPlace_x_totalDistance,boosts_over_total_time_by_distance,boosts_x_total_time_by_distance,boosts_over_totalDistance,boosts_x_totalDistance,teamwork,total_items_acquired,total_items_acquired_norm,total_items_acquired_over_total_time_by_distance,total_items_acquired_x_total_time_by_distance,total_items_acquired_norm_over_total_time_by_distance,total_items_acquired_norm_x_total_time_by_distance,heals_over_total_time_by_distance,heals_x_total_time_by_distance,heals_over_totalDistance,heals_x_totalDistance,kills_over_total_time_by_distance,kills_x_total_time_by_distance,kills_over_totalDistance,kills_x_totalDistance,killsNorm,damageDealtNorm,maxPlaceNorm,killPlace_over_maxPlaceNorm,killPlace_over_playersJoined,matchDurationNorm,killPlace_over_matchDuration,killPlace_over_matchDurationnorm,killPlacePerc,L1,L2,L3,points,L1_over_total_time_by_distance,L1_x_total_time_by_distance,L1_over_totalDistance,L1_x_totalDistance,L2_over_total_time_by_distance,L2_x_total_time_by_distance,L2_over_totalDistance,L2_x_totalDistance,L3_over_total_time_by_distance,L3_x_total_time_by_distance,L3_over_
totalDistance,L3_x_totalDistance
0,9329eb41e215eb,676b23c24e70d6,45b576ab7daa7f,0,0,51.46875,0,0,0,73,0,0,0,0.0,1884,squad-fpp,28,28,1500,0,0.0,0,0.0,0,0,1.175781,1,0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2.607422,2.607422,1.175781,1.175781,1.175781,1.175781,1.175781,1.175781,62.074829,85.875,62.074829,85.875,0.0,0.0,0.0,0.0,0,1,0.273193,0.850098,1.175781,0.2323,0.321289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,102.4375,55.71875,1.310547,73.0,3750.0,0.038757,0.01947,0.0,0,0,0,1500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,639bd0dcd7bda8,430933124148dd,42a9a0b906c928,0,4,179.125,0,0,2,11,0,2,1,362.0,1811,duo-fpp,48,47,1503,2,9.335938,0,0.0,0,0,4.035156,6,0,1,0.0,0.5,0.0,6,0.5,5.097656,3.0,2.548828,1008.5,4034.0,1008.5,4034.0,336.25,12104.0,0,0.229126,0.234009,6.109375,13.367188,2.228516,80.25,1.018555,36.65625,1.80072,67.1875,0.822737,147.125,0.654785,24.4375,0.299072,53.46875,2,12,6.734375,1.964844,73.3125,1.102539,41.15625,0.327393,12.21875,0.149536,26.734375,0.327393,12.21875,0.149536,26.734375,3.980469,356.5,95.5,0.115173,11.0,3604.0,0.006073,0.003052,0.0,0,2,9,1503,0.0,0.0,0.0,0.0,0.327393,12.21875,0.149536,26.734375,1.473633,54.96875,0.67334,120.3125
2,63d5c8ef8dfe91,0b45f5db20ba99,87e7e4477a048e,1,0,23.40625,0,0,4,49,0,0,0,0.0,1793,squad-fpp,28,27,1565,0,0.0,0,0.0,0,0,1.576172,4,0,1,0.0,0.0,0.0,4,0.0,2.919922,0.0,0.0,197.0,3152.0,0.0,0.0,197.0,3152.0,0,1.75,1.814453,1.576172,1.576172,0.394043,6.304688,0.394043,6.304688,31.091372,77.25,31.091372,77.25,0.0,0.0,0.0,0.0,1,8,4.011719,5.074219,12.609375,2.546875,6.324219,2.537109,6.304688,2.537109,6.304688,0.0,0.0,0.0,0.0,0.0,46.59375,55.71875,0.879395,49.0,3568.0,0.027328,0.013733,0.0,0,1,4,1565,0.0,0.0,0.0,0.0,0.634277,1.576172,0.634277,1.576172,2.537109,6.304688,2.537109,6.304688
3,cf5b81422591d1,b7497dbdc77f4a,1b9a94f1af67f1,0,0,65.5,0,0,0,54,0,0,0,0.0,1834,duo-fpp,45,44,1465,0,0.0,0,0.0,0,0,3.623047,3,0,2,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.200195,1.227539,3.623047,3.623047,1.208008,10.875,1.208008,10.875,14.900662,195.75,14.900662,195.75,0.0,0.0,0.0,0.0,0,3,0.819824,0.827637,10.875,0.226196,2.970703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,129.75,89.125,0.605957,27.0,3632.0,0.029449,0.01487,-52.0,0,0,0,1465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ee6a295187ba21,6604ce20a1d230,40754a93016066,0,4,330.25,1,2,1,7,0,3,1,60.0625,1326,squad-fpp,28,27,1480,1,0.0,0,0.0,0,0,5.929688,4,0,3,1.5,0.333252,0.333252,5,0.25,4.367188,1.666992,1.455078,2964.0,2964.0,988.0,8896.0,593.0,14816.0,2,0.25,0.259277,5.929688,5.929688,1.482422,23.71875,1.482422,23.71875,1.180837,41.5,1.180837,41.5,0.674805,23.71875,0.674805,23.71875,1,9,5.460938,1.518555,53.34375,0.920898,32.375,0.168701,5.929688,0.168701,5.929688,0.505859,17.78125,0.505859,17.78125,5.910156,650.5,55.15625,0.126953,2.333984,2612.0,0.00528,0.00268,-2.0,0,3,10,1480,0.0,0.0,0.0,0.0,0.505859,17.78125,0.505859,17.78125,1.686523,59.28125,1.686523,59.28125


In [24]:
# Player-level frame: only identifiers plus the two match-size columns are needed.
long_file = df[['Id', 'matchId', 'groupId', 'maxPlace', 'numGroups']]
# Group-level frame, one row per (matchId, groupId) in sorted key order.
# y_pred is attached positionally, so this relies on the prediction rows being
# in that same order.
short_file = long_file.groupby(['matchId', 'groupId']).first().reset_index()
short_file['winPlacePerc'] = y_pred
# Turn raw predictions into a within-match rank, then rescale the rank by numGroups - 1.
short_file['rank'] = short_file.groupby(['matchId'])['winPlacePerc'].rank()
short_file['winPlacePerc'] = (short_file['rank'] - 1) / (short_file['numGroups'] - 1)
# Broadcast the group-level value back to every player of the group.
long_file = (
    long_file
    .merge(short_file[['matchId', 'groupId', 'winPlacePerc', 'rank']],
           suffixes=["", "_y"], on=["matchId", "groupId"], how="left")
    .set_index('Id')
)
# numGroups == 1 made the rescaling above divide by zero; force those rows to 0.
long_file.loc[long_file['numGroups'] == 1, "winPlacePerc"] = 0
# Snap every value to the nearest multiple of 1 / (maxPlace - 1).
gap = 1/(long_file['maxPlace'].values - 1)
long_file['winPlacePerc'] = np.around(long_file['winPlacePerc'] / gap) * gap
# Degenerate maxPlace values get fixed outputs: 0 -> 0, 1 -> 1.
for place, forced in ((0, 0), (1, 1)):
    long_file.loc[long_file['maxPlace'] == place, "winPlacePerc"] = forced
# Round to four decimals.
long_file['winPlacePerc'] = np.around(long_file['winPlacePerc']*10000) / 10000
long_file = long_file.reset_index()[["Id", "winPlacePerc"]]
display_all(long_file.head())


Unnamed: 0,Id,winPlacePerc
0,9329eb41e215eb,0.0
1,639bd0dcd7bda8,0.0
2,63d5c8ef8dfe91,0.0
3,cf5b81422591d1,0.0227
4,ee6a295187ba21,0.0741


In [25]:
# Write the per-player predictions (columns Id, winPlacePerc) to submission.csv without the index column.
long_file.to_csv("submission.csv", index=False)