In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb

In [None]:
# Folder under the Kaggle input mount from which the train/test CSV files are read below
data_path = '/kaggle/input/pubg-finish-placement-prediction'

## Load data

In [None]:
# Read the training CSV from the data folder and preview its first rows
train_csv_path = os.path.join(data_path, 'train_V2.csv')
train_data = pd.read_csv(train_csv_path)
train_data.head()

## Basic exploration

In [None]:
# Basic summary statistics of the training columns
train_data.describe()

In [None]:
# Compare the row count with the number of distinct Ids:
# if both numbers match, every Id is unique
print(train_data.shape)
print(train_data['Id'].nunique())

In [None]:
# Count rows that exactly duplicate an earlier row (original note: no duplicates)
train_data.duplicated().sum()

In [None]:
# Missing values per column (original note: one NaN value, located in the target)
train_data.isna().sum()

In [None]:
# Discard every row that contains at least one missing value, then show the new size
train_data = train_data.dropna()
train_data.shape

## Feature distributions

In [None]:
# Separate features and target (Id, groupId and matchId are left out of the features)
feature_columns = [
    'assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals',
    'killPlace', 'killPoints', 'kills', 'killStreaks', 'longestKill',
    'matchDuration', 'matchType', 'maxPlace', 'numGroups', 'rankPoints',
    'revives', 'rideDistance', 'roadKills', 'swimDistance', 'teamKills',
    'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints',
]
X = train_data[feature_columns]

y = train_data['winPlacePerc']

In [None]:
# Box plots of the first twelve features (the columns listed before matchType), on a 4x3 grid
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(15, 20))
for ax, column in zip(axes.ravel(), X.columns[:12]):
    ax.boxplot(X[column])
    ax.set_title(f'{column} distribution')

In [None]:
# Box plots of the features listed after matchType (column 13 onwards), on a 4x3 grid
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(15, 20))
for ax, column in zip(axes.ravel(), X.columns[13:]):
    ax.boxplot(X[column])
    ax.set_title(f'{column} distribution')

In [None]:
# Distribution plot of matchType.
# value_counts() is sorted by frequency whereas unique() follows order of appearance,
# so labels and heights are both taken from the same Series to keep each bar correctly labelled.
match_type_counts = X.matchType.value_counts()
plt.figure(figsize=(10,7))
plt.bar(x=match_type_counts.index, height=match_type_counts.values)
plt.title('Number of rows per matchType')
plt.ylabel('count')
plt.xticks(rotation=45);

In [None]:
# Correlation coefficients between the numeric features.
# matchType holds strings; restricting to numeric dtypes explicitly keeps corr() working on
# pandas versions that no longer drop non-numeric columns silently.
plt.figure(figsize=(12,12))
corr = X.select_dtypes(include='number').corr()
sns.heatmap(corr);

* maxPlace and numGroups are ~100% positively correlated -> consider dropping one of them(?)
* killPoints and winPoints also highly positively correlated (corr > 80%)
* rankPoints ~100% negatively correlated with killPoints and winPoints

## Simple model

### Basic preprocessing

In [None]:
# Train test split: 30% of the rows are held out; random_state fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Column selectors: int64/float64 columns vs. string (object) columns such as matchType
numeric_columns = make_column_selector(dtype_include=['int64', 'float64'])
categorical_columns = make_column_selector(dtype_include=['object'])

# Numeric branch: a one-step pipeline that applies RobustScaler
num_transformer = Pipeline(steps=[('scaler', RobustScaler())])

# Categorical branch: one-hot encoding that ignores categories unseen during fit
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Run both branches side by side; columns matched by neither selector are passed through
simple_preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, numeric_columns),
        ('cat_transformer', cat_transformer, categorical_columns),
    ],
    remainder='passthrough',
)

In [None]:
# fit_transform on X_train ONLY: the scaler statistics and one-hot categories are learned from the training split
X_train_preprocessed = simple_preprocessor.fit_transform(X_train)
# Then only transform X_test with the already-fitted preprocessor (never fit on X_test)
X_test_preprocessed = simple_preprocessor.transform(X_test)

### First model: linear regression

In [None]:
# First model: linear regression with default settings
lin_reg = LinearRegression()

In [None]:
# 5-fold cross-validation scored with (negated) Mean Absolute Error;
# abs() turns the averaged negative score back into a positive MAE
cv_results = cross_validate(
    estimator=lin_reg,
    X=X_train_preprocessed,
    y=y_train,
    scoring='neg_mean_absolute_error',
    cv=5,
)
fold_scores = cv_results['test_score']
avg_mae = abs(fold_scores.mean())
avg_mae

In [None]:
# Train on the preprocessed training split, then report the hold-out score.
# For a regressor, .score() is the R2 coefficient, not the MAE.
lin_reg.fit(X_train_preprocessed, y_train).score(X_test_preprocessed, y_test)

In [None]:
# Hold-out predictions of the fitted linear model and their Mean Absolute Error
y_pred = lin_reg.predict(X_test_preprocessed)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

* A linear regression on all preprocessed features yields an MAE of ~0.089 on average
* R2 score isn't that bad... ~0.83
* Some features probably can be dropped while new features could be created

## Features importance

From my battle royale gaming experience, here is the "intuitive" list of features that should be most important (unsorted):

* assists         --> are you a team player?
* boosts          --> are you taking advantage of items?
* DBNOs           --> can you hurt enemies?
* headshotKills   --> how precise are you?
* heals           --> are you taking advantage of items?
* damageDealt     --> how much damage can you inflict? (can be double sided though...)
* kills           --> do you engage battle a lot and can you actually kill?
* killStreaks     --> how many players can you kill in a row? (especially in squads it's easy to kill and then immediately get killed by their teammate)
* matchDuration   --> Can you actually survive through time?
* revives         --> Are you able to support/heal your teammates?
* roadKills       --> Not that easy to achieve... intentionally
* weaponsAcquired --> Can you pick the right weapons (can be double sided...)
* winPoints       --> What's your external winning ranking?

Those features intuitively reflect a player's skills, and therefore their chances of winning... but again, this is only my assumption. Let's try a feature permutation importance on Linear Regression.

In [None]:
# Permutation importance is computed on the numeric columns only, so leave matchType out
X_train_permutation = X_train.drop(columns=['matchType'])

In [None]:
# Fit a linear regression on the numeric training features
lin_model = LinearRegression().fit(X_train_permutation, y_train)

# Shuffle each feature 10 times and record the drop in the model's score.
# random_state pins the shuffles so the importances are identical on every run.
permutation_score = permutation_importance(
    lin_model,
    X_train_permutation,
    y_train,
    n_repeats=10,
    random_state=42,
)

In [None]:
# Display permutation scores in a dataframe sorted by decreasing importance.
# Building the frame from a dict keeps 'score decrease' as a float column
# (stacking names and scores into one array would coerce everything to object dtype).
importance_df = pd.DataFrame({
    'feature': X_train_permutation.columns,
    'score decrease': permutation_score.importances_mean,
})

importance_df.sort_values(by='score decrease', ascending=False)

The last 7 features, from swimDistance to longestKill, don't have a significant impact on the R2 score.

However, **low-impact** features on a **bad model** could be **higher-impact** features on a **better model**...

Especially, if you're skilled enough to head shot or run over players (intentionally) then you most likely have a better chance to win...

-> This permutation score does not make much sense -> probably Linear Regression is a **bad model** here

## Feature engineering

In [None]:
def create_features(data):
    """Derive match-normalised and ratio features and drop the raw columns they replace.

    The input frame is left untouched: all work happens on a copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns read below: 'Id', 'groupId', 'matchId', 'assists',
        'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals', 'kills',
        'matchDuration', 'numGroups', 'rankPoints', 'revives', 'rideDistance',
        'roadKills', 'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance'
        and 'weaponsAcquired'. Any other column is carried over unchanged.

    Returns
    -------
    pandas.DataFrame
        New frame with the engineered columns added and the identifier, raw and
        discarded helper columns removed. Rows containing NaN are dropped, so the
        result can have fewer rows than ``data``.
    """
    # Work on a copy so the caller's frame does not silently gain ~20 columns.
    data = data.copy()

    # Number of rows sharing a matchId, used to normalise per-match counts.
    data['playersJoined'] = data.groupby('matchId')['matchId'].transform('count')
    match_size_factor = (100 - data['playersJoined']) / 100 + 1
    data['killsNorm'] = data['kills'] * match_size_factor
    data['damageDealtNorm'] = data['damageDealt'] * match_size_factor

    # Distances: use this frame's own walkDistance (not a global frame), so the
    # function is also correct for the test set.
    data['totalDistance'] = data['rideDistance'] + data['swimDistance'] + data['walkDistance']
    distance_denominator = data['totalDistance'] + 1  # +1 avoids division by zero
    data['walkingRatio'] = data['walkDistance'] / distance_denominator
    data['swimRatio'] = data['swimDistance'] / distance_denominator
    data['rideRatio'] = data['rideDistance'] / distance_denominator

    # NOTE(review): the "+ 1" inside healsAndBoosts is kept as in the original; together with
    # the "+ 1" in the ratios below it looks like a double offset -- confirm it is intended.
    data['healsAndBoosts'] = data['heals'] + (data['boosts'] + 1)
    data['healsRatio'] = data['heals'] / (data['healsAndBoosts'] + 1)
    data['boostsRatio'] = data['boosts'] / (data['healsAndBoosts'] + 1)
    data['healsAndBoostsPerDistance'] = data['healsAndBoosts'] / distance_denominator
    data['killsPerTotalDistance'] = data['kills'] / distance_denominator

    # Team-size bucket derived from numGroups: >50 -> 1, 26..50 -> 2, otherwise 4.
    # (Written with explicit comparisons; "i>25 & i<=50" parses as "i > (25 & i) <= 50".)
    num_groups = data['numGroups']
    data['team'] = np.where(num_groups > 50, 1, np.where(num_groups > 25, 2, 4))

    kills_denominator = data['kills'] + 1  # +1 avoids division by zero
    data['headShotsKillsRatio'] = data['headshotKills'] / kills_denominator
    data['weaponsPerTotalDistance'] = data['weaponsAcquired'] / distance_denominator
    data['healsPerDamageDealt'] = data['heals'] / (data['damageDealt'] + 1)
    data['boostsPerTotalDistance'] = data['boosts'] / distance_denominator
    data['roadKillsRatio'] = data['roadKills'] / kills_denominator
    data['killsPerSecond'] = data['kills'] / data['matchDuration']
    data['revivesPerKills'] = data['revives'] / kills_denominator
    data['assistsPerKills'] = data['assists'] / kills_denominator
    data['DBNOsPerKills'] = data['DBNOs'] / kills_denominator

    # Identifiers and the raw columns that the engineered features replace.
    replaced_columns = ['Id', 'groupId', 'matchId', 'assists', 'kills', 'damageDealt',
                        'rideDistance', 'swimDistance', 'walkDistance', 'heals', 'boosts',
                        'rankPoints', 'headshotKills', 'revives', 'roadKills', 'DBNOs',
                        'weaponsAcquired']
    # Columns that are computed above but not kept in the final feature set.
    discarded_columns = ['vehicleDestroys', 'healsPerDamageDealt', 'teamKills',
                         'boostsPerTotalDistance', 'healsAndBoostsPerDistance', 'team']

    data_engineered = data.drop(columns=replaced_columns)
    # NaN rows are removed before the second drop, exactly as before; callers that need a
    # one-to-one match with the input rows must check the length of the result.
    data_engineered = data_engineered.dropna()
    data_engineered = data_engineered.drop(columns=discarded_columns)

    return data_engineered

In [None]:
# Build the engineered training frame (derived features added, replaced raw columns dropped)
train_data_engineered = create_features(train_data)

In [None]:
# (rows, columns) of the engineered training frame
train_data_engineered.shape

## Preprocessing pipeline

In [None]:
def get_features_to_scale(data):
    """Return the names of the non-object columns whose maximum value exceeds 1.

    Columns whose values already stay at or below 1 (e.g. ratios) are left out.
    The maximum is computed with the vectorised ``Series.max()``, which skips NaN
    and avoids iterating over every value in Python.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; object-dtype columns are ignored.

    Returns
    -------
    list
        Column names, in their original order, with a maximum greater than 1.
    """
    numeric = data.select_dtypes(exclude='object')
    # An empty or all-NaN column has a NaN maximum, and NaN > 1 is False, so it is skipped.
    return [feature for feature in numeric.columns if numeric[feature].max() > 1]

In [None]:
def build_preprocessor(data):
    """Assemble the (unfitted) column-wise preprocessing step for ``data``.

    The columns returned by ``get_features_to_scale(data)`` go through a RobustScaler,
    object-dtype columns (matchType) are one-hot encoded with unknown categories
    ignored, and every remaining column is passed through unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame used only to decide which columns get scaled.

    Returns
    -------
    ColumnTransformer
        The configured transformer; nothing is fitted here.
    """
    columns_to_scale = get_features_to_scale(data)
    categorical_selector = make_column_selector(dtype_include=['object'])

    transformers = [
        # Numeric branch: single-step pipeline around the scaler
        ('num_transformer', Pipeline([('scaler', RobustScaler())]), columns_to_scale),
        # Categorical branch: one-hot encoding of the string columns
        ('cat_transformer', OneHotEncoder(handle_unknown='ignore'), categorical_selector),
    ]
    return ColumnTransformer(transformers, remainder='passthrough')

## Engineered features importance (with linear regression)

In [None]:
# Drop matchType to compute features importance
#X_train_permutation = X_engineered.drop('matchType', axis = 1, inplace = False)

In [None]:
# Perform feature importance on linear regression for computing issues
#lin_model = LinearRegression().fit(X_train_permutation, y) # Fit model

#permutation_score = permutation_importance(lin_model, X_train_permutation, y, n_repeats=10) # Perform Permutation

In [None]:
# Get decreasing features importance
#importance_df = pd.DataFrame(np.vstack((X_train_permutation.columns,
                                        #permutation_score.importances_mean)).T) # Unstack results

#importance_df.columns=['feature','score decrease']

#importance_df.sort_values(by="score decrease", ascending = False) # Order by importance

## LightGBM (gradient boosting) regression

In [None]:
# Split the engineered frame into target and features
y = train_data_engineered['winPlacePerc']
X_engineered = train_data_engineered.drop(columns=['winPlacePerc'])
# Hold out 30% of the rows, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X_engineered, y, test_size=0.3, random_state=42
)

In [None]:
# Configure the preprocessor from the full engineered feature frame
preprocessor = build_preprocessor(X_engineered)

# Fit on the training split only, then wrap the result in a DataFrame
train_matrix = preprocessor.fit_transform(X_train)
X_train_preprocessed = pd.DataFrame(train_matrix)

# The hold-out split is only transformed with the fitted preprocessor (never fitted on)
test_matrix = preprocessor.transform(X_test)
X_test_preprocessed = pd.DataFrame(test_matrix)

In [None]:
# LightGBM regression optimised directly for Mean Absolute Error
params = {
    'learning_rate': 0.05,
    'objective': 'mae',
    'metric': 'mae',
    'num_leaves': 128,
    'verbose': 1,
    'random_state': 42,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,  # row subsampling is only applied when bagging_freq is non-zero
    'feature_fraction': 0.7,
}

# n_estimators is an upper bound: early stopping ends training once the hold-out MAE
# has not improved for 100 rounds, and it sets reg.best_iteration_ for later predictions.
reg = lgb.LGBMRegressor(**params, n_estimators=10000)
reg.fit(
    X_train_preprocessed,
    y_train,
    eval_set=[(X_test_preprocessed, y_test)],
    eval_metric='mae',
    callbacks=[lgb.early_stopping(stopping_rounds=100)],
)

# Hold-out MAE at the best iteration. The same split also drove early stopping,
# so read this figure as slightly optimistic.
mean_absolute_error(y_test, reg.predict(X_test_preprocessed))

## Submission

In [None]:
# Load the test CSV from the same input folder as the training data
test_data = pd.read_csv(os.path.join(data_path, 'test_V2.csv'))

In [None]:
# Size and first rows of the test set
print(test_data.shape)
test_data.head()

In [None]:
# Apply the same feature engineering to the test set
test_data_engineered = create_features(test_data)

# create_features() drops rows containing NaN; the submission pairs predictions with
# test_data.Id by position, so a dropped row would silently misalign (or break) it.
assert len(test_data_engineered) == len(test_data), (
    'create_features() dropped rows from the test set; '
    'predictions would no longer line up with test_data.Id'
)

# Reuse the preprocessor fitted on the training split (never refit on test data).
# Note: this overwrites the hold-out X_test_preprocessed defined earlier.
X_test_preprocessed = pd.DataFrame(preprocessor.transform(test_data_engineered))

In [None]:
# Predict the target for every preprocessed test row.
# NOTE(review): best_iteration_ is presumably only set when training used early stopping;
# otherwise it may be None and all trees are used -- confirm against the fit call.
y_pred = reg.predict(X_test_preprocessed, num_iteration=reg.best_iteration_)

In [None]:
# Two-column submission frame: the test Ids next to their predicted winPlacePerc
submission = test_data[['Id']].assign(winPlacePerc=y_pred)
submission

In [None]:
# Write the submission file to the current working directory, without the index column
submission.to_csv('submission.csv', index = False)

In [None]:
# NOTE(review): identical to the previous cell -- it rewrites the same file with the same
# content; looks redundant and can probably be removed.
submission.to_csv('submission.csv', index = False)