In [1]:
# Row limits for reading the train and test CSV files (None reads every row).
# Lower them for a fast turn-around; the two limits are independent.
max_events_trn = None
max_events_tst = None
# Number of cross-validation folds
n_cv = 3

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# Ignore every warning category (Warning is the base class of all of them),
# so deprecation notices from the libraries used below are hidden as well.
warnings.simplefilter(action='ignore', category=Warning)

from sklearn.metrics import mean_squared_error, mean_absolute_error

import os
# Show which entries exist in the ../input directory
print(os.listdir("../input"))

['train.csv', 'sample_submission.csv', 'test.csv']


Define a function to reduce the memory footprint

In [3]:
def _smallest_fitting_dtype(c_min, c_max, candidates, limits_of):
    """ Return the first dtype of `candidates` whose range contains [c_min, c_max].

        `limits_of` is np.iinfo or np.finfo. None is returned when no candidate fits
        (e.g. the column contains +-inf).
    """
    for candidate in candidates:
        limits = limits_of(candidate)
        # Inclusive bounds: a value equal to the type's min/max is still representable
        if limits.min <= c_min and c_max <= limits.max:
            return candidate
    return None


def reduce_mem_usage(df):
    """ Iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.

        The frame is modified IN PLACE and also returned.

        - signed / unsigned integer columns are downcast to the smallest integer
          type of the same signedness that holds their min and max;
        - float columns are downcast to the smallest float type whose range holds
          their min and max. NOTE: float16 has an 11-bit significand, so values
          are rounded when a column ends up as float16;
        - object columns are converted to 'category';
        - every other dtype (bool, datetime, timedelta, category and other
          extension dtypes) is left untouched.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    floats = (np.float16, np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns are downcast. Deciding on dtype.kind
        # (instead of the dtype name prefix) keeps unsigned ints and bools from being
        # silently turned into float16.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # Empty or all-NaN column: there is no value range to base a decision on
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        if col_type.kind == 'f':
            target = _smallest_fitting_dtype(c_min, c_max, floats, np.finfo)
        else:
            candidates = signed_ints if col_type.kind == 'i' else unsigned_ints
            # Python ints avoid any mixed-width numpy comparison surprises
            target = _smallest_fitting_dtype(int(c_min), int(c_max), candidates, np.iinfo)

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

## Read in the data

In [4]:
# Read each CSV (optionally truncated to the configured row limit) and
# shrink its dtypes straight away.
df_trn = reduce_mem_usage(pd.read_csv('../input/train.csv', nrows=max_events_trn))

df_tst = reduce_mem_usage(pd.read_csv('../input/test.csv', nrows=max_events_tst))

Memory usage of dataframe is 864.34 MB
Memory usage after optimization is: 178.69 MB
Decreased by 79.3%
Memory usage of dataframe is 356.28 MB
Memory usage after optimization is: 73.04 MB
Decreased by 79.5%


In [5]:
# Preview the first rows of the training frame
df_trn.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,maxPlace,numGroups,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,0,24,0,0,5,247.25,2,0,4,17,1050,2,1,65.3125,29,28,1,591.5,0,0.0,0,0,782.5,4,1458,0.856934
1,1,440875,1,1,0,37.65625,1,1,0,45,1072,1,1,13.546875,26,23,0,0.0,0,0.0,0,0,119.625,3,1511,0.040009
2,2,878242,2,0,1,93.75,1,0,2,54,1404,0,0,0.0,28,28,1,0.0,0,0.0,0,0,3248.0,5,1583,0.740723
3,3,1319841,3,0,0,95.875,0,0,0,86,1069,0,0,0.0,97,94,0,0.0,0,0.0,0,0,21.484375,1,1489,0.114624
4,4,1757883,4,0,1,0.0,0,0,1,58,1034,0,0,0.0,47,41,0,0.0,0,0.0,0,0,641.0,4,1475,0.521484


In [6]:
# Short summary of the training frame (index range, column count, dtypes, memory usage)
df_trn.info(memory_usage='deep', verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4357336 entries, 0 to 4357335
Columns: 26 entries, Id to winPlacePerc
dtypes: float16(6), int16(2), int32(3), int8(15)
memory usage: 178.7 MB


In [7]:
# Short summary of the test frame (index range, column count, dtypes, memory usage)
df_tst.info(memory_usage='deep', verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1867913 entries, 0 to 1867912
Columns: 25 entries, Id to winPoints
dtypes: float16(5), int16(2), int32(3), int8(15)
memory usage: 73.0 MB


- The training dataset has 4.3M entries, which is not small and allows advanced models like GBM and NN to dominate.
- The test dataset is only 1.9M entries
- There are 25 features (+ the target in the train dataset)

## Missing data

In [8]:
# Count the missing values in every column of the training frame
df_trn.isnull().sum()

Id                 0
groupId            0
matchId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
maxPlace           0
numGroups          0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       0
dtype: int64

No missing data

## Prepare the data

In [10]:
# Detach the target column from the training frame: pop() returns the column
# and removes it from df_trn in place.
y = df_trn.pop('winPlacePerc')

We will NOT use Id, groupId, matchId. The first one is a unique identifier and can be useful only in the case of data leakage. The other two would be useful in feature engineering with grouped stats per match and per team.

In [11]:
# Identifier columns that are removed from both the train and the test frame
features_not2use = ['Id', 'groupId', 'matchId']

In [12]:
# Remove the unused identifier columns from both frames, modifying each in place
for df in (df_trn, df_tst):
    df.drop(columns=features_not2use, inplace=True)

## Train and evaluate a model
Start by defining handy helper functions...

In [13]:
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.base import clone, ClassifierMixin, RegressorMixin
import lightgbm as lgb


def learning_rate_decay_power(current_iter, base_learning_rate=0.10, min_lr=5e-2, decay=.996):
    '''
    Exponentially decaying learning-rate schedule for LGBM.

    The rate starts at `base_learning_rate`, is multiplied by `decay` once per
    iteration and is never allowed to fall below `min_lr`. Called with only
    `current_iter` it reproduces the original fixed schedule
    (0.10 * 0.996**iter, floored at 0.05).

    Parameters
    ----------
    current_iter : index of the current boosting iteration
    base_learning_rate : learning rate at iteration 0
    min_lr : lower bound of the returned learning rate
    decay : per-iteration multiplicative factor

    Returns
    -------
    The learning rate to use at `current_iter`.
    '''
    lr = base_learning_rate * np.power(decay, current_iter)
    return lr if lr > min_lr else min_lr


def train_single_model(clf_, X_, y_, random_state_=314, opt_parameters_={}, fit_params_={}):
    '''
    Fit a fresh copy of `clf_` on (X_, y_) and return the result of its fit() call.

    The copy is made with clone(), so `clf_` itself is never fitted or altered.
    `opt_parameters_` are applied through set_params() first; `random_state_` is
    set afterwards and therefore wins over a 'random_state' entry in
    `opt_parameters_`. `fit_params_` are forwarded to fit() as keyword arguments.
    '''
    estimator = clone(clf_)
    estimator.set_params(**opt_parameters_)
    estimator.set_params(random_state=random_state_)
    fitted = estimator.fit(X_, y_, **fit_params_)
    return fitted

def train_model_in_CV(model, X, y, metric, metric_args={},
                            model_name='xmodel',
                            seed=31416, n=5,
                            opt_parameters_={}, fit_params_={},
                            verbose=True):
    '''
    Train `model` in an n-fold shuffled KFold cross-validation and evaluate it
    on the out-of-fold (oof) predictions.

    Parameters
    ----------
    model : unfitted estimator; a fresh copy is trained per fold via train_single_model
    X, y : pandas DataFrame / Series (accessed through .iloc, .index and .shape)
    metric : callable invoked as metric(y_true, y_pred, **metric_args)
    metric_args : extra keyword arguments for `metric`. Two keys are special:
        'sample_weight' : passed to Series.map() on the TRUE target values to
                          build the per-sample weights handed to `metric`;
        'sqrt'          : if the key is present (whatever its value) the square
                          root of every score is reported; the key itself is
                          not forwarded to `metric`.
    model_name : prefix for the names of the fitted per-fold models
    seed : random state of the KFold splitter
    n : number of folds
    opt_parameters_ : model parameters, forwarded to train_single_model
    fit_params_ : fit() keyword arguments. When non-empty, 'eval_set'
        (train fold, validation fold) and 'verbose' are added for every fold.
    verbose : value of the 'verbose' fit parameter (only used if fit_params_ is non-empty)

    Neither `metric_args` nor `fit_params_` is modified: the function works on
    private copies, so the caller's dicts do not end up holding fold data.

    Returns
    -------
    clfs : list of ('<model_name><fold>', fitted model) tuples
    perf_eval : dict with 'score_i_oof' (metric on the full oof prediction),
                'score_i_ave' / 'score_i_std' (mean / std of the fold scores)
                and 'score_i' (the individual fold scores)
    y_oof : Series of oof predictions, indexed like X

    Raises
    ------
    TypeError if the fitted model is neither a RegressorMixin nor a ClassifierMixin.
    '''
    # Private copy: the caller's dict (and the shared default) must stay untouched
    metric_kwargs = dict(metric_args or {})
    do_sqrt = 'sqrt' in metric_kwargs
    metric_kwargs.pop('sqrt', None)
    use_weights = 'sample_weight' in metric_kwargs
    sample_weight = metric_kwargs.pop('sample_weight', None)

    cv = KFold(n, shuffle=True, random_state=seed)
    # The out-of-fold (oof) prediction, filled fold by fold
    y_oof = pd.Series(np.zeros(shape=(X.shape[0],)),
                      index=X.index)
    scores = []
    # the list of fitted models, usable for a voting ensemble
    clfs = []

    for n_fold, (trn_idx, val_idx) in enumerate(cv.split(X, (y!=0).astype(np.int8))):
        X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        # Per-fold copy so that the fold data never leaks into the caller's dict
        fold_fit_params = dict(fit_params_ or {})
        if fold_fit_params:
            # the validation fold doubles as the early-stopping sample
            fold_fit_params["eval_set"] = [(X_trn,y_trn), (X_val,y_val)]
            fold_fit_params['verbose'] = verbose

        clf = train_single_model(model, X_trn, y_trn, 314+n_fold, opt_parameters_, fold_fit_params)

        clfs.append(('{}{}'.format(model_name,n_fold), clf))
        # evaluate performance
        if isinstance(clf, RegressorMixin):
            y_oof.iloc[val_idx] = clf.predict(X_val)
        elif isinstance(clf, ClassifierMixin):
            y_oof.iloc[val_idx] = clf.predict_proba(X_val)[:,1]
        else:
            raise TypeError('Provided model does not inherit neither from a regressor nor from classifier')

        fold_kwargs = dict(metric_kwargs)
        if use_weights:
            fold_kwargs['sample_weight'] = y_val.map(sample_weight)
        scores.append(metric(y_val, y_oof.iloc[val_idx], **fold_kwargs))
        # cleanup: release the fold copies before the next fold allocates its own
        del X_trn, y_trn, X_val, y_val, fold_fit_params

    # Score of the full oof prediction. The weights are derived from the true
    # targets, consistently with the per-fold scores above.
    oof_kwargs = dict(metric_kwargs)
    if use_weights:
        oof_kwargs['sample_weight'] = y.map(sample_weight)
    score_oof = metric(y, y_oof, **oof_kwargs)

    if do_sqrt:
        score_oof = np.sqrt(score_oof)
        scores = np.sqrt(scores)

    # Store performance info for this CV
    perf_eval = {'score_i_oof': score_oof,
                 'score_i_ave': np.mean(scores),
                 'score_i_std': np.std(scores),
                 'score_i': scores
                }

    return clfs, perf_eval, y_oof

def print_perf_clf(name, perf_eval):
    '''
    Print a summary of the cross-validation performance stored in `perf_eval`:
    mean +- std of the fold scores, their min / max, the out-of-fold score
    and the individual fold scores.
    '''
    print('Performance of the model:')

    mean_score = perf_eval['score_i_ave']
    std_score = perf_eval['score_i_std']
    print('Mean(Val) score inner {} Classifier: {:.4f}+-{:.4f}'.format(name, mean_score, std_score))

    fold_scores = perf_eval['score_i']
    lowest, highest = np.min(fold_scores), np.max(fold_scores)
    print('Min/max scores on folds: {:.4f} / {:.4f}'.format(lowest, highest))

    print('OOF score inner {} Classifier: {:.4f}'.format(name, perf_eval['score_i_oof']))
    print('Scores in individual folds: {}'.format(perf_eval['score_i']))

Now let's define the parameters and models in a scalable fashion (further models can be added to the list later on and will work out of the box).

The format is a dictionary whose keys are user-chosen model names and whose values are tuples (or arrays) of:

- model to be fitted;
- additional model parameters to be set;
- model fit parameters (they are passed to model.fit() call);
- target variable.

In [14]:
def _lgbm_reg_entry(objective):
    """Build the (model, model parameters, fit parameters, target) tuple for one
    LightGBM regressor trained with the given `objective`.

    Every call creates its own estimator, dictionaries and callback list, so the
    entries of mdl_inputs do not share any mutable object.
    """
    estimator = lgb.LGBMRegressor(max_depth=-1, min_child_samples=400, random_state=314,
                                  silent=True, metric='None', n_jobs=4,
                                  n_estimators=5000, learning_rate=0.05)
    model_params = {'objective': objective,
                    'colsample_bytree': 0.75,
                    'min_child_weight': 10.0,
                    'num_leaves': 30,
                    'reg_alpha': 1,
                    'subsample': 0.75}
    fit_params = {"early_stopping_rounds": 100,
                  "eval_metric": 'mae',
                  'eval_names': ['train', 'early_stop'],
                  'verbose': False,
                  'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_decay_power)],
                  'categorical_feature': 'auto'}
    return (estimator, model_params, fit_params, y)


mdl_inputs = {
        # This will be with MAE loss
        'lgbm1_reg': _lgbm_reg_entry('mae'),
        # This will be with FAIR loss
        'lgbm2_reg': _lgbm_reg_entry('fair'),
       }

Do the actual model training

In [None]:
%%time
# Per-model containers: fitted fold models, performance summaries and oof predictions
mdls, results, y_oofs = {}, {}, {}

for name, (mdl, mdl_pars, fit_pars, y_) in mdl_inputs.items():
    print('--------------- {} -----------'.format(name))
    # Cross-validate the model and score it with the mean absolute error
    mdl_, perf_eval_, y_oof_ = train_model_in_CV(
        mdl, df_trn, y_, mean_absolute_error,
        metric_args={},
        model_name=name,
        opt_parameters_=mdl_pars,
        fit_params_=fit_pars,
        n=n_cv,
        verbose=False)
    mdls[name] = mdl_
    results[name] = perf_eval_
    y_oofs[name] = y_oof_
    print_perf_clf(name, perf_eval_)

--------------- lgbm1_reg -----------
