In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Special Thanks

https://www.kaggle.com/rejasupotaro/effective-feature-engineering

# Workflow
1. Import datasets
1. EDA
1. Feature Engineering
1. Modeling
1. Submit

# Import datasets

In [None]:
# Load the training CSV from the Kaggle input directory into a DataFrame.
train = pd.read_csv('/kaggle/input/pubg-finish-placement-prediction/train_V2.csv')

In [None]:
# Display all columns: None removes pandas' limit on how many columns are rendered
pd.options.display.max_columns = None

In [None]:
# Preview the first rows of the training data.
train.head()

# EDA

## Correlation

In [None]:
# Import libraries
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Correlation heatmap of the training columns.
# Only numeric columns are correlated: the frame also carries non-numeric
# columns (matchType is handled as a string elsewhere in this notebook), and
# DataFrame.corr() in pandas >= 2.0 raises on them instead of dropping them.
plt.figure(figsize=(11, 11))
sns.heatmap(
    train.select_dtypes(include='number').corr(),
    linewidths=0.1,
    cmap='RdBu',
)

# Feature Engineering

In [None]:
# Import libraries
import gc
import time
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

In [None]:
# Ideas to experiment

def original(df):
    """Baseline experiment: hand the frame back untouched (no feature added)."""
    baseline = df
    return baseline

def items(df):
    """Add an ``items`` column holding ``heals`` plus ``boosts`` for each row.

    The column is written onto ``df`` itself, and that same frame is returned.
    """
    consumables = df['heals'].add(df['boosts'])
    df['items'] = consumables
    return df

def players_in_team(df):
    """Attach ``players_in_team``: the number of rows sharing each ``groupId``.

    The counts are joined back with ``merge`` (default inner join), so a new
    frame is returned and the frame passed in is not modified.
    """
    team_sizes = df.groupby('groupId').size().rename('players_in_team')
    return df.merge(team_sizes.to_frame(), on='groupId')

def total_distance(df):
    """Add ``total_distance``: walk + ride + swim distance for each row.

    The column is written onto ``df`` itself, and that same frame is returned.
    """
    walked, ridden, swum = df['walkDistance'], df['rideDistance'], df['swimDistance']
    df['total_distance'] = walked + ridden + swum
    return df

def headshots_over_kills(df):
    """Add ``headshots_over_kills``: ``headshotKills`` divided by ``kills``.

    Missing ratios (e.g. 0 / 0) are stored as 0. The column is written onto
    ``df`` itself, and that same frame is returned.
    """
    ratio = df['headshotKills'].div(df['kills'])
    df['headshots_over_kills'] = ratio.fillna(0)
    return df

def killPlace_over_maxPlace(df):
    """Add ``killPlace_over_maxPlace``: ``killPlace`` divided by ``maxPlace``.

    The column is written onto ``df`` itself, and that same frame is returned.
    """
    relative_kill_place = df['killPlace'].div(df['maxPlace'])
    df['killPlace_over_maxPlace'] = relative_kill_place
    return df

def walkDistance_over_heals(df):
    """Add ``walkDistance_over_heals``: ``walkDistance`` divided by ``heals``.

    Rows with ``heals == 0`` have no meaningful ratio and are stored as 0:
    x / 0 yields ``inf`` and 0 / 0 yields ``NaN``, and both are zeroed, the same
    way ``walkDistance_over_kills`` treats its ratio.

    The column is written onto ``df`` itself, and that same frame is returned.
    """
    ratio = df['walkDistance'] / df['heals']
    # fillna covers 0 / 0, replace covers x / 0.
    df['walkDistance_over_heals'] = ratio.fillna(0).replace(np.inf, 0)
    return df

def walkDistance_over_kills(df):
    """Add ``walkDistance_over_kills``: ``walkDistance`` divided by ``kills``.

    Rows with ``kills == 0`` have no meaningful ratio and are stored as 0
    (0 / 0 yields ``NaN`` and x / 0 yields ``inf``; both are zeroed).

    The column is named after the function, like every other feature in this
    notebook (it was previously written under the misspelt name
    ``workDistance_ove_kills``). The column is written onto ``df`` itself, and
    that same frame is returned.
    """
    ratio = df['walkDistance'] / df['kills']
    df['walkDistance_over_kills'] = ratio.fillna(0).replace(np.inf, 0)
    return df

def teamwork(df):
    """Add ``teamwork``: ``assists`` plus ``revives`` for each row.

    The column is written onto ``df`` itself, and that same frame is returned.
    """
    support_actions = df['assists'].add(df['revives'])
    df['teamwork'] = support_actions
    return df
    
def match_mode_classifier(mt):
    """Map a match-type string to a coarse mode label.

    Returns the first of ``'solo'``, ``'duo'``, ``'squad'`` (checked in that
    order) that occurs as a substring of ``mt``, or ``'others'`` when none does.
    """
    for mode in ('solo', 'duo', 'squad'):
        if mode in mt:
            return mode
    return 'others'

In [None]:
# My Idea to experiment
def match_mode(df):
    """One-hot encode the coarse match mode derived from ``matchType``.

    Each ``matchType`` value is classified with ``match_mode_classifier`` and
    the result is expanded into one indicator column per mode.

    The set of indicator columns is fixed (``duo``, ``others``, ``solo``,
    ``squad``) rather than taken from the values present in ``df``, so a frame
    that happens to lack one mode (for example a test split) still gets the
    same columns as the frame the model was fitted on.

    Returns a new frame; ``df`` itself is not modified.
    """
    # Alphabetical order matches what get_dummies produces when every mode is
    # present in the data.
    mode_dtype = pd.CategoricalDtype(categories=['duo', 'others', 'solo', 'squad'])
    modes = df['matchType'].map(match_mode_classifier).astype(mode_dtype)
    dummies = pd.get_dummies(modes)
    # Use plain string labels so the joined frame keeps an ordinary column index.
    dummies.columns = list(dummies.columns)
    return df.join(dummies)

In [None]:
# Function to run experiments
def run_experiments(functions, experiment_runner=None):
    """Score a list of feature-engineering functions and tabulate the results.

    Parameters
    ----------
    functions : iterable of callables
        Each callable is handed to the runner; its ``__name__`` labels the row.
    experiment_runner : callable, optional
        Called as ``experiment_runner(function)`` and expected to return a
        score. Defaults to the notebook's ``run_experiment``.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``score`` and ``execution time`` (seconds rounded to
        two decimals, formatted like ``'1.23s'``), sorted by ascending score.

    Each result is also printed as soon as it is available, and the garbage
    collector is run between experiments.
    """
    if experiment_runner is None:
        # Looked up at call time so the runner may be defined in a later cell.
        experiment_runner = run_experiment

    results = []
    for function in functions:
        start = time.time()
        score = experiment_runner(function)
        execution_time = time.time() - start
        result = {
            'name': function.__name__,
            'score': score,
            # The key must match the column requested below, otherwise the
            # column comes out as all-NaN.
            'execution time': f'{round(execution_time, 2)}s',
        }
        print(result)
        results.append(result)
        gc.collect()
    return pd.DataFrame(results, columns=['name', 'score', 'execution time']).sort_values(by='score')

In [None]:
# Function to run an experiment
def run_experiment(function):
    """Fit a LightGBM regressor on ``function(train.copy())`` and score it.

    ``function`` receives a copy of the global ``train`` frame and must return
    the frame to model. Every column except the identifiers, ``matchType`` and
    the target is used as a feature. Rows with a missing target are kept and
    their target is filled with the target mean.

    Returns the mean absolute error on the validation part of a
    ``train_test_split`` (``random_state=0``).
    """
    df = train.copy()
    df = function(df)

    target = 'winPlacePerc'
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', target]
    cols_to_fit = [col for col in df.columns if col not in cols_to_drop]

    X = df[cols_to_fit]
    y = df[target].fillna(df[target].mean())

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

    model = LGBMRegressor(random_state=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    # scikit-learn's signature is (y_true, y_pred).
    return mean_absolute_error(y_valid, y_pred)

In [None]:
# # Run Experiments!
# run_experiments([
#     teamwork,
#     match_mode,
#     original,
#     items,
#     players_in_team,
#     total_distance,
#     headshots_over_kills,
#     killPlace_over_maxPlace,
#     walkDistance_over_heals,
#     walkDistance_over_kills,
# ])

In [None]:
# Ideas to experiment
def _merge_team_aggregate(df, how, suffix, rank_within_match=False):
    """Aggregate every feature column per team and merge the result onto ``df``.

    A team is one (``matchId``, ``groupId``) pair. Feature columns are all
    columns except the identifiers, ``matchType`` and ``winPlacePerc``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per player; not modified.
    how : str
        Name of the groupby aggregation (``'min'``, ``'max'``, ``'sum'``,
        ``'median'`` or ``'mean'``).
    suffix : str
        Suffix given to the aggregated columns in the merged frame.
    rank_within_match : bool, default False
        When True, the per-team aggregate is replaced by its percentile rank
        among the teams of the same match.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-merged with one ``<feature><suffix>`` column per feature.
    """
    non_features = ['Id', 'groupId', 'matchId', 'matchType', 'winPlacePerc']
    team_keys = ['matchId', 'groupId']
    features = [col for col in df.columns if col not in non_features]

    agg = df.groupby(team_keys)[features].agg(how)
    if rank_within_match:
        agg = agg.groupby('matchId')[features].rank(pct=True)
    return df.merge(agg, suffixes=('', suffix), how='left', on=team_keys)


def min_by_team(df):
    """Add ``<feature>_min``: the per-team minimum of every feature."""
    return _merge_team_aggregate(df, 'min', '_min')


def max_by_team(df):
    """Add ``<feature>_max``: the per-team maximum of every feature."""
    return _merge_team_aggregate(df, 'max', '_max')


def sum_by_team(df):
    """Add ``<feature>_sum``: the per-team sum of every feature."""
    return _merge_team_aggregate(df, 'sum', '_sum')


def median_by_team(df):
    """Add ``<feature>_median``: the per-team median of every feature."""
    return _merge_team_aggregate(df, 'median', '_median')


def mean_by_team(df):
    """Add ``<feature>_mean``: the per-team mean of every feature."""
    return _merge_team_aggregate(df, 'mean', '_mean')


def rank_by_team(df):
    """Add ``<feature>_mean_rank``: percentile rank of the team mean within its match."""
    return _merge_team_aggregate(df, 'mean', '_mean_rank', rank_within_match=True)

In [None]:
# My Idea to experiment

In [None]:
# # Run Experiments!
# run_experiments([
#     original,
#     min_by_team,
#     max_by_team,
#     sum_by_team,
#     median_by_team,
#     mean_by_team,
#     rank_by_team
# ])

## Permutation Importance

In [None]:
# import eli5
# from eli5.sklearn import PermutationImportance

# target = 'winPlacePerc'
# cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', target]
# cols_to_fit = [col for col in train.columns if col not in cols_to_drop]

# X = train[cols_to_fit]
# y = train[target].fillna(train[target].mean())

# X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

# model = LGBMRegressor(random_state=0)
# model.fit(X_train, y_train)

# perm = PermutationImportance(model, random_state=42).fit(X_valid, y_valid)
# eli5.show_weights(perm, feature_names=list(cols_to_fit))

## Promising Features

**We couldn't run all of the promising aggregates because of memory usage errors.**

In [None]:
def run_promising_preprocesses(df):
    """Run the full preprocessing pipeline on ``df`` and return the result.

    Caution: the step order matters. The aggregate step ranks every feature
    column present when it runs, so it goes first, before the conversion and
    creation steps add their own columns.
    """
    pipeline = (
        run_promissing_aggregates,
        run_promissing_conversions,
        run_promissing_creations,
    )
    for step in pipeline:
        df = step(df)
    return df

def run_promissing_aggregates(df):
    """Add ``<feature>_rank`` columns: percentile rank of each team's mean within its match.

    A team is one (``matchId``, ``groupId``) pair; features are all columns
    except the identifiers, ``matchType`` and ``winPlacePerc``. The ranks are
    left-merged onto ``df`` and the merged frame is returned.
    """
    excluded = {'Id', 'groupId', 'matchId', 'matchType', 'winPlacePerc'}
    team_keys = ['matchId', 'groupId']
    features = [name for name in df.columns if name not in excluded]

    # Only the rank-of-team-mean aggregate is enabled. The per-team
    # min / max / sum / median / mean aggregates (merged with the suffixes
    # _min / _max / _sum / _median / _mean) are deliberately left out here.
    team_mean = df.groupby(team_keys)[features].mean()
    team_rank = team_mean.groupby('matchId')[features].rank(pct=True)

    return df.merge(team_rank, how='left', on=team_keys, suffixes=['', '_rank'])

def run_promissing_conversions(df):
    """Apply the column conversions; currently only the ``match_mode`` one-hot encoding."""
    converted = match_mode(df)
    return converted

def run_promissing_creations(df):
    """Add the derived feature columns, one step at a time, and return the frame."""
    creation_steps = (
        players_in_team,          # adds 'players_in_team'
        killPlace_over_maxPlace,  # adds 'killPlace_over_maxPlace'
        total_distance,           # adds 'total_distance'
        items,                    # adds 'items'
    )
    for add_feature in creation_steps:
        df = add_feature(df)
    return df

In [None]:
# Run Promising Preprocesses
# The pipeline is given a copy, so `train` itself stays as loaded.
train_preprocessed = run_promising_preprocesses(train.copy())
train_preprocessed

In [None]:
# Evaluate
def evaluate(df):
    """Fit a LightGBM regressor on ``df`` and return its validation MAE.

    Every column except the identifiers, ``matchType`` and the target
    (``winPlacePerc``) is used as a feature. Rows with a missing target are
    kept and their target is filled with the target mean. The frame is split
    with ``train_test_split`` (``random_state=0``); the model is fitted on the
    training part and scored on the validation part.
    """
    target = 'winPlacePerc'
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', target]
    cols_to_fit = [col for col in df.columns if col not in cols_to_drop]
    X = df[cols_to_fit]
    y = df[target].fillna(df[target].mean())

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

    model = LGBMRegressor(random_state=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    # scikit-learn's signature is (y_true, y_pred).
    return mean_absolute_error(y_valid, y_pred)

# Validation MAE of a LightGBM model fitted on the preprocessed training data.
evaluate(train_preprocessed)

In [None]:
# Output Memory Usage
import sys

# Print a table of every name in the notebook namespace that does not start
# with an underscore, together with the size reported by sys.getsizeof.
print("{}{: >25}{}{: >10}{}".format('|','Variable Name','|','Memory','|'))
print(" ------------------------------------ ")
for var_name in dir():
    if not var_name.startswith("_"):
        # Look the object up by name directly instead of eval()-ing the name.
        print("{}{: >25}{}{: >10}{}".format('|',var_name,'|',sys.getsizeof(globals()[var_name]),'|'))

# Modeling

In [None]:
# Fitting with promising data
# The final model is fitted on all preprocessed training rows (no validation split here).
# `cols_to_fit` and `model` are reused by the prediction cell further down.
target = 'winPlacePerc'
cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', target]
cols_to_fit = [col for col in train_preprocessed.columns if col not in cols_to_drop]

X = train_preprocessed[cols_to_fit]
# Rows with a missing target are kept; their target is filled with the target mean.
y = train_preprocessed[target].fillna(train_preprocessed[target].mean())

model = LGBMRegressor(random_state=0)
model.fit(X, y)

In [None]:
# Import Test Data (read from the same Kaggle input directory as the training file)
test = pd.read_csv('/kaggle/input/pubg-finish-placement-prediction/test_V2.csv')

In [None]:
# Run Promising Preprocesses in Test
# Same pipeline as for the training data; the prediction cell selects `cols_to_fit`
# from this frame, so every feature column built for training must exist here too.
test_preprocessed = run_promising_preprocesses(test.copy())
test_preprocessed

In [None]:
# Predict
# Select the same feature columns, in the same order, that the model was fitted on.
X_test = test_preprocessed[cols_to_fit]

y_pred = model.predict(X_test)
y_pred

# Submit

In [None]:
# Build the submission file: one predicted winPlacePerc per Id.
submission = pd.DataFrame({
    # Caution! Ids are taken from test_preprocessed (not the raw test frame),
    # because y_pred was computed from the rows of test_preprocessed.
    'Id': test_preprocessed['Id'], # Caution!
    'winPlacePerc': y_pred
})
# Written to the current working directory, without the index column.
submission.to_csv('submission.csv', index=False)