In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import gc, sys
gc.enable()

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
def feature_engineering(is_train=True):
    if is_train: 
        print("processing train.csv")
        df = pd.read_csv("../input/pubg-finish-placement-prediction/train_V2.csv")

        df = df[df['maxPlace'] > 1]
    else:
        print("processing test.csv")
        df = pd.read_csv("../input/pubg-finish-placement-prediction/test_V2.csv")
        
    
    # df = reduce_mem_usage(df)
    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]
    
    # df = df[:100]
    
    print("remove some columns")
    target = 'winPlacePerc'
    features = list(df.columns)
    features.remove("Id")
    features.remove("matchId")
    features.remove("groupId")
    
    features.remove("matchType")
    
    
    y = None
    
    print("get target")
    if is_train: 
        y = np.array(df.groupby(['matchId','groupId'])[target].agg('mean'), dtype=np.float64)
        features.remove(target)

    print("get group mean feature")
    agg = df.groupby(['matchId','groupId'])[features].agg('mean')
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    
    if is_train: df_out = agg.reset_index()[['matchId','groupId']]
    else: df_out = df[['matchId','groupId']]

    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_mean", "_mean_rank"], how='left', on=['matchId', 'groupId'])
    
    print("get group sum feature")
    agg = df.groupby(['matchId','groupId'])[features].agg('sum')
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_sum", "_sum_rank"], how='left', on=['matchId', 'groupId'])
    
    df_out=reduce_mem_usage(df_out)
    
    print("get group max feature")
    agg = df.groupby(['matchId','groupId'])[features].agg('max')
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_max", "_max_rank"], how='left', on=['matchId', 'groupId'])
    
    print("get group min feature")
    agg = df.groupby(['matchId','groupId'])[features].agg('min')
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_min", "_min_rank"], how='left', on=['matchId', 'groupId'])
    
    df_out=reduce_mem_usage(df_out)
    
    print("get group size feature")
    agg = df.groupby(['matchId','groupId']).size().reset_index(name='group_size')
    df_out = df_out.merge(agg, how='left', on=['matchId', 'groupId'])
    
    print("get match mean feature")
    agg = df.groupby(['matchId'])[features].agg('mean').reset_index()
    df_out = df_out.merge(agg, suffixes=["", "_match_mean"], how='left', on=['matchId'])
    
    print("get match size feature")
    agg = df.groupby(['matchId']).size().reset_index(name='match_size')
    df_out = df_out.merge(agg, how='left', on=['matchId'])
    
    df_out=reduce_mem_usage(df_out)
    
    df_out.drop(["matchId", "groupId"], axis=1, inplace=True)
    
    

    X = np.array(df_out)
    
    feature_names = list(df_out.columns)

    del df, df_out, agg, agg_rank
    gc.collect()

    return X, y, feature_names

In [4]:
x_train, y, feature_names = feature_engineering(True)

processing train.csv
remove some columns
get target
get group mean feature
get group sum feature
Memory usage of dataframe is 1670037056.00 MB
Memory usage after optimization is: 549704888.00 MB
Decreased by 67.1%
get group max feature
get group min feature
Memory usage of dataframe is 2085519576.00 MB
Memory usage after optimization is: 898304856.00 MB
Decreased by 56.9%
get group size feature
get match mean feature
get match size feature
Memory usage of dataframe is 1250501048.00 MB
Memory usage after optimization is: 1003695544.00 MB
Decreased by 19.7%


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split (x_train, y, test_size=0.33, random_state=42)

In [6]:
import os
import time
import warnings
warnings.filterwarnings("ignore")
# data manipulation

import numpy as np
import pandas as pd
# plot
import matplotlib.pyplot as plt
# model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

In [7]:
train_data=lgb.Dataset(X_train, label=y_train)
val_data= lgb.Dataset(X_val, label=y_val)

params = {
    'num_leaves': 144,
    'learning_rate': 0.1,
    'n_estimators': 1500,
    'max_depth':12,
    'max_bin':55,
    'bagging_fraction':0.8,
    'bagging_freq':5,
    'feature_fraction':0.9,
    'verbose':50, 
    'early_stopping_rounds':100
    }

params['metric'] = 'auc'
lgb_model= lgb.train(params, train_data, valid_sets=val_data, num_boost_round=3000, early_stopping_rounds=100)
y_pred=lgb_model.predict(X_val)
print(mean_absolute_error(y_val,y_pred))

[1]	valid_0's auc: 0.997994
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.997862
[3]	valid_0's auc: 0.995971
[4]	valid_0's auc: 0.996317
[5]	valid_0's auc: 0.997389
[6]	valid_0's auc: 0.997481
[7]	valid_0's auc: 0.997897
[8]	valid_0's auc: 0.997941
[9]	valid_0's auc: 0.997963
[10]	valid_0's auc: 0.997965
[11]	valid_0's auc: 0.998006
[12]	valid_0's auc: 0.998047
[13]	valid_0's auc: 0.9981
[14]	valid_0's auc: 0.998088
[15]	valid_0's auc: 0.998087
[16]	valid_0's auc: 0.998102
[17]	valid_0's auc: 0.998102
[18]	valid_0's auc: 0.998088
[19]	valid_0's auc: 0.998053
[20]	valid_0's auc: 0.998043
[21]	valid_0's auc: 0.998058
[22]	valid_0's auc: 0.998085
[23]	valid_0's auc: 0.998109
[24]	valid_0's auc: 0.998097
[25]	valid_0's auc: 0.998106
[26]	valid_0's auc: 0.998126
[27]	valid_0's auc: 0.998111
[28]	valid_0's auc: 0.998155
[29]	valid_0's auc: 0.998185
[30]	valid_0's auc: 0.998198
[31]	valid_0's auc: 0.998228
[32]	valid_0's auc: 0.998225
[33]	valid_0's auc: 0

In [8]:
y_pred=lgb_model.predict(X_val)
print(mean_absolute_error(y_val,y_pred))
del X_val
del y_val

0.029730648535462194


In [9]:
x_test, y_test, feature_names = feature_engineering(False)

processing test.csv
remove some columns
get target
get group mean feature
get group sum feature
Memory usage of dataframe is 1593759376.00 MB
Memory usage after optimization is: 461831814.00 MB
Decreased by 71.0%
get group max feature
get group min feature
Memory usage of dataframe is 1978660002.00 MB
Memory usage after optimization is: 792575568.00 MB
Decreased by 59.9%
get group size feature
get match mean feature
get match size feature
Memory usage of dataframe is 1179846140.00 MB
Memory usage after optimization is: 893152616.00 MB
Decreased by 24.3%


In [10]:
y_test_pred=lgb_model.predict(x_test)

In [11]:
df=pd.read_csv("../input/pubg-finish-placement-prediction/test_V2.csv")
var=pd.DataFrame(columns=['Id','winPlacePerc'])
var['Id']= df['Id']

var['winPlacePerc'] = y_test_pred

submission = var[['Id', 'winPlacePerc']]
submission.to_csv('submission.csv', index=False)