In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import forest, RandomForestRegressor
import lightgbm as lgb
import gc
import feather
import warnings; 
warnings.filterwarnings('ignore')
# Directory (relative to the notebook) from which the input CSV is read, e.g. train_V2.csv.
INPUT_DIR = "../input/"

In [31]:
#Show a dataframe without pandas' default row/column truncation
def display_all(df):
    """Render `df` with the display limits temporarily raised to 500 rows and 500 columns.

    The pandas options are only changed inside the context manager, so the
    global display settings are restored once the frame has been shown.
    `display` is the rich-display function provided by the notebook kernel.
    """
    widened_limits = pd.option_context("display.max_rows", 500, "display.max_columns", 500)
    with widened_limits:
        display(df)

#Function to reduce memory usage
def reduce_mem_usage(df):
    """Downcast the numeric columns of `df` in place to the smallest dtype that holds their values.

    * Signed integer columns become the narrowest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds inclusive).
    * Unsigned integer columns become the narrowest of uint8/uint16/uint32/uint64.
    * Float columns become float32 when min and max lie strictly inside the
      float32 range, otherwise float64 (this also covers columns that are all
      NaN or contain +/-inf, because those comparisons are False).
    * Every other dtype (object, bool, datetime, categorical and other pandas
      extension dtypes) is left untouched; casting those to float would either
      fail or silently change their meaning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, for call chaining.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float32_info = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain numpy integer/float columns can be downcast safely.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            if c_min > float32_info.min and c_max < float32_info.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
            continue

        candidates = signed_types if col_type.kind == 'i' else unsigned_types
        for candidate in candidates:
            bounds = np.iinfo(candidate)
            # For an empty column min/max are NaN, every comparison is False
            # and the column keeps its dtype.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break
    return df

In [32]:
#Dataframe pre-processing
def dataproc1(df):
    """Add the player-level engineered features to `df` and return it.

    The frame is modified in place:

    * many ratio / product / sum columns are appended;
    * `rideDistance`, `walkDistance` and `swimDistance` are overwritten with
      their value divided by 500, so calling this function twice on the same
      frame rescales them twice (it is not idempotent);
    * +/-inf produced by divisions by zero are turned into NaN and every NaN
      in the frame is then replaced by 0;
    * `reduce_mem_usage` is applied at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Player-level rows. The columns read here are matchId, kills,
        headshotKills, killStreaks, DBNOs, heals, boosts, walkDistance,
        rideDistance, swimDistance, roadKills, vehicleDestroys, killPlace,
        maxPlace, numGroups, weaponsAcquired, assists, revives, teamKills,
        damageDealt, matchDuration, killPoints, rankPoints and winPoints.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the extra columns.
    """
    # Number of player rows in each match.
    df['playersJoined'] = df.groupby('matchId')['matchId'].transform('count')

    # --- kill / health item ratios -------------------------------------------
    # NOTE(review): 'headshotrate' is kills per headshot kill, not headshot
    # kills per kill as the name suggests - confirm this is intended.
    df['headshotrate'] = df['kills']/df['headshotKills']
    df['killStreakrate'] = df['killStreaks']/df['kills']
    df['DBNOs_over_kills'] = df['DBNOs']/df['kills']
    # NOTE(review): the integer sums in this function are evaluated in the
    # input dtype; if the counters were downcast to int8 beforehand a large
    # enough sum would wrap around - verify against the data ranges.
    df['healthitems'] = df['heals'] + df['boosts']
    df['heals_over_boosts'] = df['heals'] / df['boosts']
    # The numeric divisors used by the *_norm features are fixed constants;
    # presumably per-column scale factors of the training data - TODO confirm.
    df['healthitems_norm'] = df['heals']/2.679982 + df['boosts']/1.715794
    df['healthitems_over_kills'] = df['healthitems'] / df['kills']
    df['healthitems_norm_over_kills'] = df['healthitems_norm'] / df['kills']

    # --- walk distance interactions (use the RAW walkDistance, before /500) ---
    df['walkDistance_over_heals'] = df['walkDistance'] / df['heals']
    df['walkDistance_x_heals'] = df['walkDistance'] * df['heals']
    df['walkDistance_over_kills'] = df['walkDistance'] / df['kills']
    df['walkDistance_x_kills'] = df['walkDistance'] * df['kills']
    df['walkDistance_over_healthitems'] = df['walkDistance'] / df['healthitems']
    df['walkDistance_x_healthitems'] = df['walkDistance'] * df['healthitems']
    df["skill"] = df["headshotKills"] + df["roadKills"]
    df['killPlace_over_maxPlace'] = df['killPlace'] / df['maxPlace']
    df['killPlace_over_numGroups'] = df['killPlace'] / df['numGroups']

    # --- distances: rescaled in place, every feature below uses these values ---
    df['rideDistance'] = (df['rideDistance']/500)
    df['walkDistance'] = (df['walkDistance']/500)
    df['swimDistance'] = (df['swimDistance']/500)
    df["total_time_by_distance"] = (df["rideDistance"]/4.5+df["walkDistance"]+df["swimDistance"]*1.5)
    # Within-match percentile rank -> place -> 0..1 score (1 player => x/0, zeroed below).
    df["total_time_by_distance_rank"] = df.groupby('matchId')['total_time_by_distance'].rank(pct=True)
    df["total_time_by_distance_place"] = df['total_time_by_distance_rank'] * df['playersJoined']
    df["total_time_by_distance_perc"] = (df['playersJoined'] - df['total_time_by_distance_place'])/(df['playersJoined'] - 1)
    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]
    df['distance_over_weapons'] = df['totalDistance'] / df['weaponsAcquired']
    df['distance_x_weapons'] = df['totalDistance'] * df['weaponsAcquired']
    df['total_time_by_distance_over_weapons'] = df['total_time_by_distance'] / df['weaponsAcquired']
    df['total_time_by_distance_x_weapons'] = df['total_time_by_distance'] * df['weaponsAcquired']
    df['killPlace_over_total_time_by_distance'] = df['killPlace'] / df['total_time_by_distance']
    df['killPlace_x_total_time_by_distance'] = df['killPlace'] * df['total_time_by_distance']
    df['killPlace_over_totalDistance'] = df['killPlace'] / df['totalDistance']
    df['killPlace_x_totalDistance'] = df['killPlace'] * df['totalDistance']
    df['boosts_over_total_time_by_distance'] = df['boosts'] / df['total_time_by_distance']
    df['boosts_x_total_time_by_distance'] = df['boosts'] * df['total_time_by_distance']
    df['boosts_over_totalDistance'] = df['boosts'] / df['totalDistance']
    df['boosts_x_totalDistance'] = df['boosts'] * df['totalDistance']
    df['teamwork'] = df['assists'] + df['revives'] - df['teamKills']

    # --- items acquired vs. time/distance ----------------------------------------
    df['total_items_acquired'] = (df["boosts"] + df["heals"] + df["weaponsAcquired"])
    df['total_items_acquired_norm'] = (df["boosts"]/1.715794 + df["heals"]/2.679982 + df["weaponsAcquired"]/2.456543)
    df['total_items_acquired_over_total_time_by_distance'] = df['total_items_acquired'] / df['total_time_by_distance']
    df['total_items_acquired_x_total_time_by_distance'] = df['total_items_acquired'] * df['total_time_by_distance']
    df['total_items_acquired_+_total_time_by_distance'] = df['total_items_acquired'] + df['total_time_by_distance']
    df['total_items_acquired_+_total_time_by_distance_norm'] = df['total_items_acquired']/5.346595 + df['total_time_by_distance']/2.661833
    df['total_items_acquired_norm_over_total_time_by_distance'] = df['total_items_acquired_norm'] / df['total_time_by_distance']
    df['total_items_acquired_norm_x_total_time_by_distance'] = df['total_items_acquired_norm'] * df['total_time_by_distance']
    df['total_items_acquired_norm_+_total_time_by_distance'] = df['total_items_acquired_norm'] + df['total_time_by_distance']
    df['total_items_acquired_norm_+_total_time_by_distance_norm'] = df['total_items_acquired_norm']/3.417951 + df['total_time_by_distance']/2.661833
    df['total_items_acquired_x_total_time_by_distance_rank'] = df.groupby('matchId')['total_items_acquired_x_total_time_by_distance'].rank(pct=True)
    df['total_items_acquired_x_total_time_by_distance_place'] = df['total_items_acquired_x_total_time_by_distance_rank'] * df['playersJoined']
    df['total_items_acquired_x_total_time_by_distance_perc'] = (df['playersJoined'] - df['total_items_acquired_x_total_time_by_distance_place']) / (df['playersJoined'] - 1)

    # --- heals / kills vs. time/distance -----------------------------------------
    df['heals_over_total_time_by_distance'] = df['heals'] / df['total_time_by_distance']
    df['heals_x_total_time_by_distance'] = df['heals'] * df['total_time_by_distance']
    df['heals_over_totalDistance'] = df['heals'] / df['totalDistance']
    df['heals_x_totalDistance'] = df['heals'] * df['totalDistance']
    df['kills_over_total_time_by_distance'] = df['kills'] / df['total_time_by_distance']
    df['kills_x_total_time_by_distance'] = df['kills'] * df['total_time_by_distance']
    df['kills_over_totalDistance'] = df['kills'] / df['totalDistance']
    df['kills_x_totalDistance'] = df['kills'] * df['totalDistance']

    # --- scaling by match size: factor is 1 at 100 players and grows as the match shrinks ---
    df['killsNorm'] = df['kills']*((100-df['playersJoined'])/100 + 1)
    df['damageDealtNorm'] = df['damageDealt']*((100-df['playersJoined'])/100 + 1)
    df['maxPlaceNorm'] = df['maxPlace']*((100-df['playersJoined'])/100 + 1)
    df['killPlace_over_maxPlaceNorm'] = df['killPlace'] / df['maxPlaceNorm']
    df['killPlace_over_playersJoined'] = df['killPlace'] / df['playersJoined']
    df['killPlace_-_playersJoined'] = df['killPlace'] - df['playersJoined']
    df['killPlace_-_playersJoined_norm'] = df['killPlace']/27.46293 - df['playersJoined']/6.686392
    df['matchDurationNorm'] = df['matchDuration']*((100-df['playersJoined'])/100 + 1)
    df['killPlace_over_matchDuration'] = df['killPlace'] / df['matchDuration']
    df['killPlace_over_matchDurationnorm'] = df['killPlace'] / df['matchDurationNorm']
    df['killPlace_-_matchDurationnorm'] = df['killPlace'] - df['matchDurationNorm']
    df['killPlace_-_matchDurationnorm_rank'] = df.groupby('matchId')['killPlace_-_matchDurationnorm'].rank(pct=True)
    df['killPlace_-_matchDurationnorm_place'] = df['killPlace_-_matchDurationnorm_rank'] * df['playersJoined']
    df['killPlace_-_matchDurationnorm_perc'] = (df['playersJoined'] - df['killPlace_-_matchDurationnorm_place']) / (df['playersJoined'] - 1)
    df['killPlace_-_matchDurationnorm_norm'] = df['killPlace']/27.46293 - df['matchDurationNorm']/303.5284
    df['killPlacePerc'] = (df['playersJoined'] - df['killPlace']) / (df['playersJoined'] - 1)

    # --- grouped counters L1/L2/L3 and their distance interactions -----------------
    df['L1'] = df['roadKills'] + df['vehicleDestroys'] + df['teamKills']
    df['L2'] = df['revives'] + df['headshotKills'] + df['assists']
    df['L3'] = df['killStreaks'] + df['DBNOs'] + df['kills'] + df['boosts'] + df['heals']
    df['points'] = df['killPoints']+df['rankPoints'] + df['winPoints']
    df['L1_over_total_time_by_distance'] = df['L1'] / df['total_time_by_distance']
    df['L1_x_total_time_by_distance'] = df['L1'] * df['total_time_by_distance']
    df['L1_over_totalDistance'] = df['L1'] / df['totalDistance']
    df['L1_x_totalDistance'] = df['L1'] * df['totalDistance']
    df['L2_over_total_time_by_distance'] = df['L2'] / df['total_time_by_distance']
    df['L2_x_total_time_by_distance'] = df['L2'] * df['total_time_by_distance']
    df['L2_over_totalDistance'] = df['L2'] / df['totalDistance']
    df['L2_x_totalDistance'] = df['L2'] * df['totalDistance']
    df['L3_over_total_time_by_distance'] = df['L3'] / df['total_time_by_distance']
    df['L3_x_total_time_by_distance'] = df['L3'] * df['total_time_by_distance']
    df['L3_over_totalDistance'] = df['L3'] / df['totalDistance']
    df['L3_x_totalDistance'] = df['L3'] * df['totalDistance']

    # Multiply by a float: with kills/DBNOs stored as int8 (as they are after
    # reduce_mem_usage), `kills * 100` is evaluated in int8 and wraps around
    # for 2 or more kills.
    df['damage1'] = df['damageDealt'] + df['kills'] * 100.0
    df['damage2'] = df['damageDealt'] + df['kills'] * 100.0 + df['DBNOs'] * 100.0

    # Divisions by zero above yield +/-inf or NaN; map all of them to 0.
    # (np.Inf / np.NINF / np.NaN no longer exist in NumPy 2.0, hence the
    # lowercase spellings.)
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0, inplace=True)
    reduce_mem_usage(df)
    return df

#Group-level aggregation of the player-level features
_GROUP_KEYS = ['matchId', 'groupId']

# Identifier columns and player-level columns that are left out of every aggregate.
_DATAPROC2_SKIPPED = (
    "Id", "matchId", "groupId", "matchType", "numGroups", "playersJoined",
    "roadKills", "vehicleDestroys", "healthitems", "healthitems_norm",
    "killPoints", "L1", "L1_over_total_time_by_distance", "L1_x_totalDistance",
    "matchDurationNorm", "teamKills", "total_items_acquired",
)


def _group_stat_frames(df, agg_cols, rank_cols, stat):
    """Return (per-group `stat` of agg_cols, within-match percentile rank of that stat for rank_cols)."""
    per_group = df.groupby(_GROUP_KEYS)[agg_cols].agg(stat)
    ranked = per_group.groupby('matchId')[rank_cols].rank(pct=True)
    return (per_group.add_suffix('_' + stat).reset_index(),
            ranked.add_suffix('_' + stat + '_rank').reset_index())


def _match_stat_frame(df, cols, stat):
    """Return the per-match `stat` of cols, suffixed `_match_<stat>`, with matchId as a column."""
    per_match = df.groupby(['matchId'])[cols].agg(stat)
    return per_match.add_suffix('_match_' + stat).reset_index()


def dataproc2(df, is_train='TRUE', target_col=None):
    """Collapse player-level rows into one row per (matchId, groupId) with aggregated features.

    For every feature column the output contains:

    * `<f>_mean`, `<f>_median`, `<f>_max`, `<f>_min`, `<f>_sum` computed per
      group, each with a `<...>_rank` companion holding the percentile rank of
      the group value within its match;
    * `<f>_match_mean`, `<f>_match_median`, `<f>_match_max` computed per match;
    * `agg_group_size` (player rows in the group) and `agg_match_size`
      (distinct `Id` values in the match).

    `matchDuration` only contributes `matchDuration_mean`; it is excluded from
    all other aggregates. The columns listed in `_DATAPROC2_SKIPPED` are never
    aggregated. Progress ("Step N") and intermediate shapes are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Player-level frame containing at least `Id`, `matchId` and `groupId`.
    is_train : str or bool, default 'TRUE'
        When 'TRUE' (or True) the target column is carried over as its
        per-group mean and is not treated as a feature.
    target_col : str, optional
        Name of the target column. When omitted the module-level variable
        `target` is used, which must then be defined before the call.

    Returns
    -------
    pandas.DataFrame
        One row per (matchId, groupId), ordered by those keys; the key columns
        themselves are dropped from the result.
    """
    print("Starting dataproc2")
    train_mode = is_train == 'TRUE' or is_train is True
    skipped = set(_DATAPROC2_SKIPPED)
    label = None
    if train_mode:
        label = target if target_col is None else target_col
        skipped.add(label)
    features = [c for c in df.columns if c not in skipped]

    print("Step 1")
    # One row per group; also provides the group size that is merged in at Step 8.
    group_sizes = df.groupby(_GROUP_KEYS).size().rename('agg_group_size').reset_index()
    df_out = group_sizes[_GROUP_KEYS].copy()
    if train_mode:
        group_label = df.groupby(_GROUP_KEYS)[label].agg('mean').reset_index()
        df_out = df_out.merge(group_label, how='left', on=_GROUP_KEYS)
        del group_label
        gc.collect()

    print("Step 2")
    print(df_out.shape)
    reduce_mem_usage(df_out)
    # matchDuration is kept for the group mean only.
    rank_features = [c for c in features if c != "matchDuration"]
    stat_frame, rank_frame = _group_stat_frames(df, features, rank_features, 'mean')
    df_out = df_out.merge(stat_frame, how='left', on=_GROUP_KEYS)
    df_out = df_out.merge(rank_frame, how='left', on=_GROUP_KEYS)
    del stat_frame, rank_frame
    gc.collect()

    for step, stat in enumerate(('median', 'max', 'min'), start=3):
        print("Step %d" % step)
        print(df_out.shape)
        reduce_mem_usage(df_out)
        stat_frame, rank_frame = _group_stat_frames(df, rank_features, rank_features, stat)
        df_out = df_out.merge(stat_frame, how='left', on=_GROUP_KEYS)
        df_out = df_out.merge(rank_frame, how='left', on=_GROUP_KEYS)
        del stat_frame, rank_frame
        gc.collect()

    print("Step 6")
    print(df_out.shape)
    reduce_mem_usage(df_out)
    for stat in ('mean', 'median', 'max'):
        match_frame = _match_stat_frame(df, rank_features, stat)
        df_out = df_out.merge(match_frame, how='left', on=['matchId'])
        del match_frame
        gc.collect()

    print("Step 7")
    print(df_out.shape)
    reduce_mem_usage(df_out)
    stat_frame, rank_frame = _group_stat_frames(df, rank_features, rank_features, 'sum')
    df_out = df_out.merge(stat_frame, how='left', on=_GROUP_KEYS)
    del stat_frame
    gc.collect()
    print(df_out.shape)
    df_out = df_out.merge(rank_frame, how='left', on=_GROUP_KEYS)
    print(df_out.shape)
    del rank_frame
    gc.collect()

    print("Step 8")
    reduce_mem_usage(df_out)
    print(df_out.shape)
    # Sizes are merged on the keys. Assigning a player-indexed Series to this
    # group-level frame would align on unrelated row labels and give wrong
    # (or NaN) sizes.
    df_out = df_out.merge(group_sizes, how='left', on=_GROUP_KEYS)
    print(df_out.shape)
    match_sizes = df.groupby('matchId')['Id'].nunique().rename('agg_match_size').reset_index()
    df_out = df_out.merge(match_sizes, how='left', on='matchId')
    print(df_out.shape)

    print("Step 9")
    df_out = df_out.drop(columns=_GROUP_KEYS)
    reduce_mem_usage(df_out)
    gc.collect()
    return df_out

In [34]:
%%time
# Load the training CSV and keep only the rows whose maxPlace is greater than 1.
df = pd.read_csv(INPUT_DIR + 'train_V2.csv')
df = df[df['maxPlace'] > 1]
reduce_mem_usage(df)
# `target` is read as a module-level variable inside dataproc2 (training mode).
target = 'winPlacePerc'
# Player-level feature engineering (mutates and returns df).
df = dataproc1(df)
# Checkpoint the player-level frame; reset_index() turns the filtered,
# non-contiguous row labels into a regular column before writing.
df.reset_index().to_feather('tmp/20190109_df')
# Aggregate to one row per (matchId, groupId).
df_out = dataproc2(df)
del df
gc.collect()
# Checkpoint the group-level frame, then split it into target and features.
df_out.to_feather('tmp/20190109_df_out')
y_train = df_out[target]
x_train = df_out.drop(target, axis=1)
del df_out
gc.collect()

Starting dataproc2
Step 1
Step 2
(2026744, 3)
Step 3
(2026744, 206)
Step 4
(2026744, 408)
Step 5
(2026744, 610)
Step 6
(2026744, 812)
Step 7
(2026744, 1115)
(2026744, 1216)
(2026744, 1317)
Step 8
(2026744, 1317)
(2026744, 1318)
(2026744, 1319)
Step 9
CPU times: user 41min 21s, sys: 14min 59s, total: 56min 20s
Wall time: 26min 29s


In [19]:
#df_out = feather.read_dataframe('tmp/20190109_df_out')
#target = 'winPlacePerc'
#y_train = df_out[target]
#x_train = df_out.drop(target, axis=1)
#del df_out
#gc.collect()

In [1]:
m0 = RandomForestRegressor(n_jobs=-1, n_estimators=1, bootstrap=False, verbose=True, max_depth=3, scoring="roc_auc")
%time m0.fit(x_train, y_train)
print(mean_absolute_error(m0.predict(x_train), y_train))

NameError: name 'RandomForestRegressor' is not defined

In [36]:
def dectree_max_depth(tree):
    """Return the number of nodes on the longest root-to-leaf path of `tree`.

    `tree` must expose `children_left` and `children_right`, indexable by node
    id, with node 0 as the root. A node is treated as a leaf when both entries
    are equal. A tree consisting of only the root therefore yields 1.
    """
    left_child = tree.children_left
    right_child = tree.children_right

    deepest = 0
    # Explicit stack of (node id, number of nodes from the root down to it).
    pending = [(0, 1)]
    while pending:
        node_id, level = pending.pop()
        if left_child[node_id] != right_child[node_id]:
            pending.append((left_child[node_id], level + 1))
            pending.append((right_child[node_id], level + 1))
        elif level > deepest:  # leaf
            deepest = level
    return deepest

# Depth of the only tree in m0, counted in nodes along the longest path
# (dectree_max_depth returns 1 for a leaf).
t=m0.estimators_[0].tree_
dectree_max_depth(t)

4

In [37]:
def rf_feat_importance(m, df):
    """Return a two-column frame ('cols', 'imp') of feature importances, largest first.

    `m` is a fitted model exposing `feature_importances_`; `df` is the frame
    whose columns were used for fitting, in the same order.
    """
    importance_table = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    return importance_table.sort_values('imp', ascending=False)
# Feature importances of m0, sorted descending, displayed in full and saved.
fi = rf_feat_importance(m0, x_train); display_all(fi)
fi.to_csv("tmp/20190109_fi_2.csv")
# Column names of the training frame, in their original order.
col = pd.Series(x_train.columns)
col.to_csv("tmp/20190109_col_2.csv")
# First 359 rows of the sorted table, i.e. the 359 highest importances.
fi_imp=fi[:359]
fi_imp.to_csv("tmp/20190109_fi_imp_2.csv")

Unnamed: 0,cols,imp
159,total_items_acquired_norm_x_total_time_by_dist...,0.793644
512,killPlace_max_rank,0.090806
565,total_items_acquired_norm_+_total_time_by_dist...,0.083481
346,killPlace_over_total_time_by_distance_median_rank,0.011606
539,total_time_by_distance_max_rank,0.011082
581,killPlace_over_playersJoined_max_rank,0.009381
0,assists_mean,0.000000
875,heals_x_totalDistance_match_mean,0.000000
881,damageDealtNorm_match_mean,0.000000
880,killsNorm_match_mean,0.000000


In [38]:
# Same single-tree forest as m0, but without a max_depth argument.
m00 = RandomForestRegressor(n_jobs=-1, n_estimators=1, bootstrap=False, verbose=True)
%time m00.fit(x_train, y_train)
# Mean absolute error measured on the training data itself (no hold-out set here).
print(mean_absolute_error(m00.predict(x_train), y_train))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 180.0min finished


CPU times: user 1h 19min 33s, sys: 1min 22s, total: 1h 20min 55s
Wall time: 3h 1min 13s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.1s finished


1.2206105987112095e-05


In [39]:
# Depth of the only tree in m00, counted in nodes along the longest path
# (dectree_max_depth returns 1 for a leaf).
t=m00.estimators_[0].tree_
dectree_max_depth(t)

87

In [40]:
# rf_feat_importance is already defined in an earlier cell; the byte-identical
# duplicate definition that used to sit here has been dropped.
# Feature importances of m00, sorted descending, displayed in full and saved.
fi = rf_feat_importance(m00, x_train); display_all(fi)
# NOTE(review): these three paths are the same ones the m0 cell writes to, so
# the m0 exports are overwritten here - confirm that this is intended.
fi.to_csv("tmp/20190109_fi_2.csv")
col = pd.Series(x_train.columns)
col.to_csv("tmp/20190109_col_2.csv")
# First 359 rows of the sorted table, i.e. the 359 highest importances.
fi_imp=fi[:359]
fi_imp.to_csv("tmp/20190109_fi_imp_2.csv")

Unnamed: 0,cols,imp
159,total_items_acquired_norm_x_total_time_by_dist...,7.135310e-01
584,killPlace_over_matchDuration_max_rank,8.180896e-02
565,total_items_acquired_norm_+_total_time_by_dist...,7.512592e-02
486,killPlace_-_matchDurationnorm_rank_max,1.329984e-02
346,killPlace_over_total_time_by_distance_median_rank,1.053825e-02
539,total_time_by_distance_max_rank,1.023182e-02
538,killPlace_over_numGroups_max_rank,1.016219e-02
690,killPlace_-_matchDurationnorm_perc_min,3.909288e-03
580,killPlace_over_maxPlaceNorm_max_rank,3.757532e-03
775,kills_over_total_time_by_distance_min_rank,3.696052e-03
