This fork adds feature engineering that compares individual stats to the average for that particular match.

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

import os
# Show which files are available in the input directory.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_absolute_error
import gc

from sklearn.model_selection import GridSearchCV

In [None]:
# Load the train and test CSVs from the input directory.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

In [None]:
# Thanks and credited to https://www.kaggle.com/gemartin who created this wonderful mem reducer
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    Handling per column dtype:

    * ``object`` columns become ``category``.
    * Signed / unsigned NumPy integer columns are cast to the narrowest
      integer type of the same signedness (8, 16 or 32 bits) whose range
      contains the column's min and max; otherwise they are left unchanged.
    * NumPy float columns are cast to ``float16`` or ``float32`` when their
      min and max fit that type's finite range, else to ``float64``. Only
      the value range is checked, so narrowing a float can lose precision.
    * Every other dtype (bool, datetime, category, extension dtypes) is
      left untouched.

    Memory usage before and after is printed in MB.

    Args:
        df: DataFrame to shrink; it is modified in place.

    Returns:
        The same ``df`` object.
    """
    signed_ints = (np.int8, np.int16, np.int32)
    unsigned_ints = (np.uint8, np.uint16, np.uint32)
    small_floats = (np.float16, np.float32)

    def _narrowest(lo, hi, candidates, limits):
        # First candidate whose [min, max] range contains [lo, hi], else None.
        # Bounds are inclusive: a value equal to the type's min/max still fits.
        for candidate in candidates:
            bounds = limits(candidate)
            if lo >= bounds.min and hi <= bounds.max:
                return candidate
        return None

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy numeric dtypes are range-checked; classifying by
        # dtype kind keeps unsigned ints and bools out of the float branch.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind == 'f':
            target = _narrowest(c_min, c_max, small_floats, np.finfo)
            if target is None:
                # Also reached when min/max are NaN or infinite.
                target = np.float64
            df[col] = df[col].astype(target)
        else:
            candidates = signed_ints if kind == 'i' else unsigned_ints
            target = _narrowest(c_min, c_max, candidates, np.iinfo)
            if target is not None:
                df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes does not divide by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# Stack train and test so every feature is engineered once on both.
concat = pd.concat([train, test])
# The separate frames are no longer needed; drop the names and reclaim memory.
del train, test
gc.collect()

In [None]:
# Downcast numeric columns and turn object columns into categoricals to save memory.
concat = reduce_mem_usage(concat)

In [None]:
def count_transform(df, cols):
    """Add a ``<col>_count`` column to ``df`` for every name in ``cols``.

    For each row the new column holds the ``groupby(col)`` count of the group
    that row belongs to. ``df`` is modified in place and also returned.
    """
    for key in cols:
        grouped_key = df.groupby(key)[key]
        df[key + "_count"] = grouped_key.transform('count')

    return df
        

Count the number of people in each group and each match.

In [None]:
# Adds 'groupId_count' and 'matchId_count': per row, the number of rows sharing its groupId / matchId.
concat = count_transform(concat, ['groupId', 'matchId'])

From my experience with the game, there are people who jump into hotly contested zones and face the action right away, and others who jump into sparsely populated areas where they are safe from other players (at least initially). To try to tease out these two types of players, I track several stats divided by the log of their walking distance. The idea is that people who got kills or acquired a lot of weapons without walking far probably dropped into a highly contested zone.

In [None]:
# Stats that are divided by 'LogWalk' to build the '<stat>_perLogWalk' features.
per_dist_stats = ['assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'kills',
       'teamKills', 'vehicleDestroys', 'weaponsAcquired']

In [None]:
# log1p keeps a walkDistance of 0 finite (log1p(0) == 0).
concat['LogWalk'] = np.log1p(concat['walkDistance'])

In [None]:
# LogWalk is log1p(walkDistance), so it is exactly 0 for rows with no walking
# distance. Dividing by it would give +/-inf for any non-zero stat, and a single
# inf makes the per-match mean of that feature infinite later on. Mask the zero
# denominators so those rows get NaN (missing) instead.
nonzero_log_walk = concat['LogWalk'].where(concat['LogWalk'] != 0)
for stat in per_dist_stats:
    concat[stat + '_perLogWalk'] = concat[stat] / nonzero_log_walk
# Do not keep the helper series alive at module level.
del nonzero_log_walk

In [None]:
# Group size relative to the match's average rows per group (matchId_count / numGroups).
concat['grpSizeMult'] = concat['groupId_count'] / (concat['matchId_count'] / concat['numGroups'])

For this kernel, I am tracking relative stats for each match. A person may have a high winPoints, but that means less if everyone else in the match also has high winPoints.

In [None]:
# Columns that get a 'matchRel_' counterpart (value divided by the match mean):
# the raw stats, LogWalk, and every per-LogWalk ratio built from per_dist_stats.
match_stats = [
    'DBNOs',
    'assists',
    'boosts',
    'damageDealt',
    'headshotKills',
    'heals',
    'killPlace',
    'killPoints',
    'killStreaks',
    'kills',
    'longestKill',
    'revives',
    'rideDistance',
    'roadKills',
    'swimDistance',
    'vehicleDestroys',
    'walkDistance',
    'weaponsAcquired',
    'winPoints',
    'LogWalk',
] + [name + '_perLogWalk' for name in per_dist_stats]

In [None]:
# Build the matchId grouping once and reuse it for every stat instead of
# regrouping the whole frame on each iteration.
by_match = concat.groupby('matchId')
for stat in match_stats:
    # Each value relative to the mean of that stat within the same match.
    concat['matchRel_' + stat] = concat[stat] / by_match[stat].transform('mean')
# Do not keep the groupby object alive at module level.
del by_match

In [None]:
# Everything except the target (winPlacePerc) and the Id/groupId/matchId columns is a feature.
drop_features = ["winPlacePerc", "Id", "groupId", "matchId"]
feats = [c for c in concat.columns if c not in drop_features]

In [None]:
# Aggregation spec for the per-group frame. These four columns only need
# their mean.
aggs = {
    'grpSizeMult': ['mean'],
    'groupId_count': ['mean'],
    'matchId_count': ['mean'],
    'winPlacePerc': ['mean'],
}

# Every remaining feature gets four summary statistics.
for c in feats:
    aggs.setdefault(c, ['mean', 'min', 'max', 'std'])

# Flat '<column>_<aggregation>' names, in the same order as the spec.
new_cols = [col + '_' + func for col, funcs in aggs.items() for func in funcs]

In [None]:
# Collapse to one row per groupId using the aggregation spec in `aggs`.
groups = concat.groupby('groupId').agg(aggs)


In [None]:
# Flatten the (column, aggregation) column index into '<column>_<aggregation>'.
# The names come from the columns actually present, so they cannot end up
# attached to the wrong data if agg() emits columns in a different order than
# the spec. The isinstance guard makes re-running this cell a no-op.
if isinstance(groups.columns, pd.MultiIndex):
    groups.columns = ['_'.join(col) for col in groups.columns]

In [None]:
# The row-level frame is not used after the per-group frame is built; release it.
del concat
gc.collect()

In [None]:
# Downcast the aggregated per-group columns as well.
groups = reduce_mem_usage(groups)

In [None]:
# Hyper-parameters read by kfold_lightgbm when it constructs each LGBMRegressor.
params = {
    'num_leaves': 144,
    'learning_rate': 0.1,
    'n_estimators': 800,
    'max_depth':13,
    'max_bin':55,
    'bagging_fraction':0.8,
    'bagging_freq':5,
    'feature_fraction':0.9
    }

In [None]:
# LightGBM with KFold or Stratified KFold
def kfold_lightgbm(df, num_folds, stratified = False, debug= False):
    """Train one LightGBM regressor per CV fold and average their test predictions.

    Rows of ``df`` with a non-null ``winPlacePerc_mean`` form the training
    set; rows where it is null form the test set. Model hyper-parameters are
    read from the module-level ``params`` dict.

    Args:
        df: Frame holding the feature columns plus ``winPlacePerc_mean``.
        num_folds: Number of cross-validation splits.
        stratified: Use ``StratifiedKFold`` instead of ``KFold``.
        debug: Kept for backward compatibility. It used to skip attaching
            the predictions, which made the return statement raise
            ``KeyError``; predictions are now returned in both modes.

    Returns:
        A tuple ``(feature_importance_df, predictions)``.
        ``feature_importance_df`` has one row per (feature, fold) with
        columns ``feature``, ``importance`` and ``fold``. ``predictions`` is
        a one-column frame (``winPlacePerc``) indexed like the test rows of
        ``df``, holding the mean prediction over all folds.
    """
    target = 'winPlacePerc_mean'

    # Divide in training/validation and test data
    train_df = df[df[target].notnull()]
    test_df = df[df[target].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    # Cross validation model
    if stratified:
        # NOTE(review): StratifiedKFold expects class labels; with a continuous
        # target this branch probably raises - confirm before relying on it.
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()

    drop_features = [target, "Id", "groupId", "matchId"]
    feats = [f for f in train_df.columns if f not in drop_features]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df[target])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[target].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[target].iloc[valid_idx]

        # LightGBM parameters
        reg = LGBMRegressor(num_leaves=params['num_leaves'], learning_rate=params['learning_rate'],
                            n_estimators=params['n_estimators'], max_depth=params['max_depth'],
                            max_bin = params['max_bin'], bagging_fraction = params['bagging_fraction'],
                            bagging_freq = params['bagging_freq'], feature_fraction = params['feature_fraction']
                           )

        # NOTE(review): newer LightGBM releases are understood to replace the
        # `verbose` / `early_stopping_rounds` fit arguments with callbacks -
        # confirm against the installed version.
        reg.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric= 'mae', verbose= 50, early_stopping_rounds= 200)

        oof_preds[valid_idx] = reg.predict(valid_x)
        # Each fold contributes an equal share of the final test prediction.
        sub_preds += reg.predict(test_df[feats]) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = reg.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d MAE : %.6f' % (n_fold + 1, mean_absolute_error(valid_y, oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full MAE score %.6f' % mean_absolute_error(train_df[target], oof_preds))

    # Build the prediction frame explicitly instead of adding a column to the
    # filtered slice `test_df`: no chained assignment, and the return value
    # exists whether or not `debug` is set.
    predictions = pd.DataFrame({'winPlacePerc': sub_preds}, index=test_df.index)
    return feature_importance_df, predictions

In [None]:
# 5-fold, unstratified CV. `test_df` receives the per-group predictions, indexed like `groups`.
feat_importances, test_df = kfold_lightgbm(groups, num_folds=5, stratified=False, debug=False)

In [None]:
def display_importances(feature_importance_df_):
    """Plot the 40 features with the highest mean importance and save the figure.

    ``feature_importance_df_`` needs ``feature`` and ``importance`` columns.
    The bar plot is written to ``lgbm_importances.png``.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance[:40].index

    # Keep every per-fold row that belongs to one of the top features.
    is_top = feature_importance_df_.feature.isin(top_features)
    plot_rows = feature_importance_df_.loc[is_top].sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=plot_rows)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

In [None]:
# Plot the top features and save the figure to lgbm_importances.png.
display_importances(feat_importances)

In [None]:
# Average each feature's rows across folds and sort by importance, highest first.
feat_gb = feat_importances.groupby('feature').mean().sort_values(by="importance", ascending=False)

In [None]:
# Save the averaged importances next to the other outputs.
feat_gb.to_csv("feature_importance.csv")

In [None]:
# Reload the row-level test data; the earlier `test` frame was deleted after concatenation.
test = pd.read_csv("../input/test.csv")

In [None]:
# Give every test row its group's prediction: test['groupId'] is matched against the index of test_df.
# NOTE(review): merge defaults to an inner join, so a row whose groupId is missing from test_df
# would be dropped from the submission - confirm none are.
test = test.merge(test_df, right_index=True, left_on='groupId')

In [None]:
# Write the submission file: one winPlacePerc per test Id, without the index column.
test[['Id', 'winPlacePerc']].to_csv("submission.csv", index=False)