This notebook contains my own implementation of a pipeline for testing different feature engineering strategies. A linear regression model is trained for each added feature or group of features in order to evaluate its effectiveness.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from scipy.stats import skew
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from math import sqrt
import gc

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Import train and test data
train = pd.read_csv('../input/train_V2.csv')
test = pd.read_csv('../input/test_V2.csv')

# Preview the first few rows of the training data

train.head()

The following memory-saving function from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage works well but causes overflow errors in this case, so it will not be used until I get around to modifying it.

In [None]:
# Memory saving function adapted from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, allow_float16=False):
    """Downcast the numeric columns of a dataframe to smaller dtypes.

    Every signed-integer, unsigned-integer and floating-point column is
    converted to the narrowest dtype of the same family whose range contains
    the column's minimum and maximum.  Columns are never widened, and
    columns of any other dtype (object, bool, datetime, categorical,
    pandas extension dtypes, ...) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.  It is modified in place.
    allow_float16 : bool, default False
        Whether float columns may be stored as ``float16``.  Half precision
        has only ~3 significant decimal digits and a maximum of 65504, so
        later sums / means over such columns overflow or lose precision;
        it is therefore opt-in.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float32, np.float64)
    if allow_float16:
        float_types = (np.float16,) + float_types

    # dtype.kind -> (candidate dtypes from narrowest to widest, limits lookup)
    families = {
        'i': (signed_types, np.iinfo),
        'u': (unsigned_types, np.iinfo),
        'f': (float_types, np.finfo),
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (nullable ints, categoricals, strings, ...) have no
        # NumPy limits to compare against, so they are skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in families:
            continue

        candidates, limits_of = families[col_type.kind]
        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the dtype limit still fits.
            # NaN / inf extremes fail every comparison, leaving the column as is.
            if limits.min <= c_min and c_max <= limits.max:
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    return df

It is always important to examine data for missing values.

In [None]:
# Count the missing entries in every column of the training data
train.isna().sum()

We could find the index of the observation with the missing value, but it's simpler to just use the dropna() function, even though it takes a little bit longer.

In [None]:
# Remove every row that contains at least one missing value
train = train.dropna(axis=0, how='any')

In [None]:
# Identifier, categorical and target columns that must not be used as predictors
dropCols = ['Id', 'groupId', 'matchId', 'matchType', 'winPlacePerc']
# Every remaining training column is a predictor
predictors = [col for col in train.columns if col not in dropCols]

It is generally recommended to center and scale predictors in order to produce a stable linear regression model.

In [None]:
# Center and scale the predictors; the fitted scaler stays available as `trans`
trans = preprocessing.RobustScaler()
train[predictors] = trans.fit_transform(train[predictors])

The following function is used to build and grade a linear regression model for the specified set of predictors. Different sets of engineered features will be passed into the function in order to assess their usefulness.

In [None]:
from sklearn.model_selection import train_test_split

# Index-aligned records of every evaluated model:
# its score, its label, and its score minus the first recorded score
accuracies, feature_names, base_improve = [], [], []

# Fit a linear regression model to the data and output the results
def test_feature_accuracy(data, feature_name):
    """Train and score a linear regression on ``data`` and record the result.

    All columns of ``data`` that are not listed in the module-level
    ``dropCols`` are used as predictors and ``winPlacePerc`` is the target.
    The model is fitted on an 80% split and scored on the remaining 20%.
    The score is the value returned by ``LinearRegression.score`` (the R^2
    coefficient of determination), which the printed message labels as
    "accuracy".

    Side effects: prints progress, and appends the score, the label and the
    score's difference to the first recorded score to the module-level lists
    ``accuracies``, ``feature_names`` and ``base_improve``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the predictors and the ``winPlacePerc`` column.
    feature_name : str
        Label used in the printed messages and stored with the score.
    """
    # Predictors
    use_preds = [col for col in data.columns if col not in dropCols]
    data_X = data[use_preds]
    # Target
    data_Y = data['winPlacePerc']

    print('Training model with feature: ' + feature_name)

    # 80/20 train test split
    train_X, test_X, train_Y, test_Y = train_test_split(data_X, data_Y, test_size=0.20, random_state=69)

    # `LinearRegression(normalize=True)` was deprecated in scikit-learn 1.0
    # and removed in 1.2, so the predictors are standardised explicitly.
    # The scaler is fitted on the training split only, so nothing about the
    # held-out rows leaks into the model.
    scaler = preprocessing.StandardScaler().fit(train_X)
    model = linear_model.LinearRegression()

    # Train model
    model.fit(scaler.transform(train_X), train_Y)

    # Evaluate model
    score_val = model.score(scaler.transform(test_X), test_Y)

    print('Model with feature: ' + feature_name + ' scored accuracy: ' + str(score_val * 100) + '%')

    # Save score
    accuracies.append(score_val)
    feature_names.append(feature_name)

    # Save improvement over the first recorded (baseline) score
    base_improve.append(score_val - accuracies[0])
    
# Score the untouched predictors first; this becomes the reference entry
test_feature_accuracy(data=train, feature_name='Baseline')

In [None]:
# Engineer some features
def _add_simple_features(df):
    """Add the hand-crafted match / team features to ``df`` in place.

    The same formulas are applied to the train and the test frame so both
    always end up with identical feature columns.  Ratios whose result is
    NaN (0 / 0) are replaced by 0.  The replacement is assigned back to the
    column explicitly, because ``df[col].fillna(0, inplace=True)`` operates
    on a temporary object under pandas Copy-on-Write and leaves the NaNs in
    place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``matchId``, ``groupId``, ``kills``,
        ``headshotKills``, ``boosts``, ``rideDistance``, ``swimDistance``
        and ``walkDistance``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns appended.
    """
    team_keys = ['matchId', 'groupId']

    df['PlayersInMatch'] = df.groupby('matchId')['matchId'].transform('count')
    df['TotalDistance'] = df['rideDistance'] + df['swimDistance'] + df['walkDistance']
    df['TeamSize'] = df.groupby(team_keys)['groupId'].transform('count')
    df['TeamTotalKills'] = df.groupby(team_keys)['kills'].transform('sum')
    # +1 keeps the denominator away from zero for players who never moved
    df['BoostsOverDistance'] = df['boosts'] / (df['TotalDistance'] + 1)
    df['HeadshotKillsPerKill'] = (df['headshotKills'] / df['kills']).fillna(0)
    df['MatchKills'] = df.groupby('matchId')['kills'].transform('sum')
    df['PercMatchKills'] = (df['kills'] / df['MatchKills']).fillna(0)
    df['TeamPercMatchKills'] = (df['TeamTotalKills'] / df['MatchKills']).fillna(0)
    return df


# Evaluation order of the individual features
simple_features = [
    'PlayersInMatch',
    'TotalDistance',
    'TeamSize',
    'TeamTotalKills',
    'BoostsOverDistance',
    'HeadshotKillsPerKill',
    'MatchKills',
    'PercMatchKills',
    'TeamPercMatchKills',
]

train = _add_simple_features(train)
test = _add_simple_features(test)

# Score each new feature on its own: base predictors + that feature + target
for feature in simple_features:
    keep = set(predictors) | {feature, 'winPlacePerc'}
    # Filter in the frame's own column order so the design matrix layout
    # does not depend on the order in which `keep` was built
    selected = [col for col in train.columns if col in keep]
    test_feature_accuracy(train[selected], feature)

# Test all new features
test_feature_accuracy(train, 'All Simple Features')

Most features provide only a marginal increase in accuracy, but all together they provide about a 2% accuracy increase, which is more than negligible. 

In [None]:
# Add the new predictors to list
predictors = [col for col in train.columns if col not in dropCols]

# Engineer some more: per-team aggregates of every predictor and the
# percentile rank of each team aggregate within its match
team_keys = ['matchId', 'groupId']
team_aggregates = (
    # (groupby method, feature suffix, message after merge, message after rank merge)
    ('mean', 'Mean', 'Merged mean', 'Merged mean rank'),
    ('max', 'Max', 'merged max', 'merged max rank'),
    ('min', 'Min', 'merged min', 'merged minrank'),
)

for agg_name, suffix, merged_msg, merged_rank_msg in team_aggregates:
    team_stats = getattr(train.groupby(team_keys)[predictors], agg_name)()
    team_ranks = team_stats.groupby('matchId')[predictors].rank(pct=True)

    # Aggregate itself
    merged = train.merge(team_stats, suffixes=['', suffix], how='left', on=team_keys)
    print(merged_msg)

    # Release the aggregate before training to keep peak memory down
    del team_stats
    gc.collect()

    # Test the model with the aggregate features added
    test_feature_accuracy(merged, suffix)

    del merged
    gc.collect()

    # Within-match rank of the aggregate
    rank_suffix = suffix + 'Rank'
    merged = train.merge(team_ranks, suffixes=['', rank_suffix], how='left', on=team_keys)
    print(merged_rank_msg)

    del team_ranks
    gc.collect()

    # Test the model with the rank features added
    test_feature_accuracy(merged, rank_suffix)

    del merged
    gc.collect()

# Match mean
match_stats = train.groupby('matchId')[predictors].mean()

merged = train.merge(match_stats, suffixes=['', 'MatchMean'], how='left', on=['matchId'])
print('merged match_mean')

del match_stats
gc.collect()

# Test model with feature MatchMean added
test_feature_accuracy(merged, 'MatchMean')

del merged
gc.collect()

Rank features provide a much more noticeable increase in performance, with MeanRank and MaxRank improving the accuracy by roughly 10% each, from about 83% to about 93%. It's clear that rank features are very important in this case.

In [None]:
# Compare to baseline
def compare_to_baseline(pos):
    """Print the stored score and baseline improvement of one evaluated model.

    ``pos`` is the index of the model in the module-level lists
    ``feature_names``, ``accuracies`` and ``base_improve``.
    """
    label = feature_names[pos]

    print(f'{label} score: {accuracies[pos]!s}')
    print(f'{label} improvement over baseline: {base_improve[pos]!s}')

The following loop compares each model to the baseline.

In [None]:
# Report every recorded model, in the order it was evaluated
for position, _ in enumerate(feature_names):
    compare_to_baseline(position)