In [None]:
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

import os
# Show which files are present in the relative "../input" directory.
print(os.listdir("../input"))

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to shrink its memory footprint.

    Signed and unsigned integer columns are cast to the narrowest integer type
    of the same signedness whose range contains the column's min and max
    (bounds inclusive). Float columns are cast to float16 or float32 when
    their range fits, otherwise to float64. Object columns become
    ``category``. Every other dtype (bool, datetime, category, pandas
    extension dtypes, ...) is left untouched.

    Note: for floats only the value *range* is checked, so a float16/float32
    downcast can round the stored values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, to allow ``df = reduce_mem_usage(df)``.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target types, narrowest first, keyed by NumPy dtype kind.
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast. Bool, datetime
        # and extension dtypes would either be turned into floats or fail the
        # min/max comparisons below, so they are skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
            # If nothing fits (e.g. an empty column, where min/max are NaN)
            # the column is left as it is.
            for candidate in int_candidates[col_type.kind]:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized starting frame.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:
def import_data(file):
    """Read a CSV file into a DataFrame and shrink its memory footprint.

    Parameters
    ----------
    file : str, path-like or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed frame after ``reduce_mem_usage`` has processed it.
    """
    # ``keep_date_col`` was dropped: it only matters when ``parse_dates``
    # merges several columns into one, which is not requested here, and the
    # argument is deprecated in recent pandas releases.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)
# courtesy of Guillaume Martin https://www.kaggle.com/gemartin

In [None]:
# train_x = import_data('../input/train.csv')

In [None]:
# Load the training CSV with pandas' default dtype inference.
train_x = pd.read_csv('../input/train.csv')

In [None]:
# test_x = import_data('../input/test.csv')

In [None]:
# Load the test CSV with pandas' default dtype inference.
test_x = pd.read_csv('../input/test.csv')

**EDA**

In [None]:
# Number of missing values in each training column.
train_x.isnull().sum()

In [None]:
# Explore on a copy so columns added during EDA do not end up in train_x.
eda = train_x.copy()

In [None]:
# Lower-triangle heatmap of the pairwise correlations in the training data.
sns.set(style='whitegrid')
# Select numeric columns explicitly: newer pandas no longer drops
# non-numeric columns silently inside DataFrame.corr().
corr = train_x.select_dtypes(include=[np.number]).corr()
# Mask the upper triangle (diagonal included), which mirrors the lower one.
# The built-in ``bool`` replaces the ``np.bool`` alias removed from NumPy.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

fig, ax = plt.subplots(figsize=(16, 14))
cmap = sns.diverging_palette(145, 280, s=85, l=15, n=100)

sns.heatmap(corr, mask=mask, cmap=cmap, center=0, linewidths=.5,
            cbar_kws={'shrink': .6}, ax=ax)
plt.show()

In [None]:
# Correlation table of the numeric training columns (non-numeric columns
# are excluded explicitly, since newer pandas raises on them instead).
train_x.select_dtypes(include=[np.number]).corr()

In [None]:
'''
plot_data = train_x.sample(1000)
sns.set(style='ticks')
fig, ax = plt.subplots(figsize=(16,16))
sns.pairplot(plot_data, kind='reg')
'''

In [None]:
# Histogram of the numGroups column, 100 bins.
eda['numGroups'].hist(bins=100, figsize=(15, 13))

In [None]:
# Largest numGroups value in the data.
print('Maximum number of groups: {}.'.format(eda['numGroups'].max()))

In [None]:
# Bucket numGroups into a match-type code. Each later mask overrides the
# earlier ones, so the tightest threshold wins.
eda['matchType'] = (
    pd.Series(0, index=eda.index)    # solo (default)
    .mask(eda['numGroups'] < 51, 1)  # duo
    .mask(eda['numGroups'] < 34, 2)  # tri
    .mask(eda['numGroups'] < 26, 3)  # quad
)

In [None]:
# Mean winPlacePerc per matchType on a seeded 5000-row sample, so the figure
# (and the remark below) can be reproduced on every run.
fig, ax = plt.subplots(figsize=(15, 13))
sns.pointplot(x='matchType', y='winPlacePerc',
              data=eda.sample(5000, random_state=0), ax=ax)
#  queuing quad has a lower win percentage

In [None]:
# revives against winPlacePerc on a seeded 5000-row sample (reproducible).
fig, ax = plt.subplots(figsize=(15, 13))
sns.scatterplot(x='winPlacePerc', y='revives',
                data=eda.sample(5000, random_state=0), ax=ax)
# slight correlation in winrate and number of revives

In [None]:
# Mean and maximum of the revives column.
print('Mean number of revives: {}'.format(eda['revives'].mean()))
print('Maximum number of revives: {}'.format(eda['revives'].max()))
# quite ridiculous number of revives, unless in custom matches

In [None]:
# Distribution of the revives column, drawn on the axes created here.
# NOTE(review): distplot is deprecated in newer seaborn releases — consider
# histplot/displot when the environment is upgraded.
fig, ax = plt.subplots(figsize=(15, 13))
sns.distplot(eda['revives'], ax=ax)

In [None]:
# Distribution of the walkDistance column, drawn on the axes created here.
# NOTE(review): distplot is deprecated in newer seaborn releases — consider
# histplot/displot when the environment is upgraded.
fig, ax = plt.subplots(figsize=(15, 13))
sns.distplot(eda['walkDistance'], ax=ax)

In [None]:
# walkDistance against winPlacePerc on a seeded 5000-row sample (reproducible).
fig, ax = plt.subplots(figsize=(15, 13))
sns.scatterplot(x='winPlacePerc', y='walkDistance',
                data=eda.sample(5000, random_state=0), ax=ax)
# high correlation in walk distance to winrate, which makes sense

In [None]:
# Mean and maximum of the walkDistance column.
print("Mean distance walked: {}".format(eda.walkDistance.mean()))
print("Maximum distance walked: {}".format(eda['walkDistance'].max()))
# map size is 8x8 squares of 1km each, 17km of walking distance is very far

In [None]:
# swimDistance against winPlacePerc on a seeded 5000-row sample (reproducible).
fig, ax = plt.subplots(figsize=(15, 13))
sns.scatterplot(x='winPlacePerc', y='swimDistance',
                data=eda.sample(5000, random_state=0), ax=ax)
# some correlation in swim distance to winrate

In [None]:
# Mean and maximum of the swimDistance column.
print("Mean distance swimmed: {}".format(eda['swimDistance'].mean()))
print("Maximum distance swimmed: {}".format(eda['swimDistance'].max()))

In [None]:
# rideDistance against winPlacePerc on a seeded 5000-row sample (reproducible).
fig, ax = plt.subplots(figsize=(15, 13))
sns.scatterplot(x='winPlacePerc', y='rideDistance',
                data=eda.sample(5000, random_state=0), ax=ax)
# some correlation in ride distance to winrate

In [None]:
# Mean and maximum of the rideDistance column.
print("Mean distance rode: {}".format(eda['rideDistance'].mean()))
print("Maximum distance rode: {}".format(eda['rideDistance'].max()))
# riding 48km is riding across the map 6-7 times, which doesn't make much sense

In [None]:
# vehicleDestroys against winPlacePerc on a seeded 5000-row sample (reproducible).
fig, ax = plt.subplots(figsize=(15, 13))
sns.scatterplot(x='winPlacePerc', y='vehicleDestroys',
                data=eda.sample(5000, random_state=0), ax=ax)

In [None]:
# Mean and maximum of the vehicleDestroys column.
print("Mean vehicles destroyed: {}".format(eda['vehicleDestroys'].mean()))
print("Maximum vehicles destroyed: {}".format(eda['vehicleDestroys'].max()))

In [None]:
# damageDealt against kills on a seeded 5000-row sample (reproducible).
fig, ax = plt.subplots(figsize=(15, 13))
sns.scatterplot(x='kills', y='damageDealt',
                data=eda.sample(5000, random_state=0), ax=ax)

In [None]:
# Series.corr returns the pairwise coefficient directly. The previous
# ``.corr()['kills'][1]`` picked it by position from a label-indexed Series,
# which pandas has deprecated.
print('Correlation between kills & damageDealt: {}'.format(eda['kills'].corr(eda['damageDealt'])))
print('Highest kill: {}'.format(eda['kills'].max()))
# quite high correlation

In [None]:
# Mean winPlacePerc by number of heals (blue) and boosts (orange), overlaid
# on one axes. Both curves use the same seeded 5000-row sample so the figure
# is reproducible and the two series describe the same rows.
fig, ax = plt.subplots(figsize=(15, 13))
sns.set(style='whitegrid')
sns.pointplot(x='heals', y='winPlacePerc',
              data=eda.sample(5000, random_state=0), color='blue', ax=ax)
sns.pointplot(x='boosts', y='winPlacePerc',
              data=eda.sample(5000, random_state=0), color='orange', ax=ax)
ax.set_xlabel('Heals/Boosts')
# Hand-placed text labels stand in for a legend.
ax.text(0, 0.98, 'Heals', color='blue', fontsize=15)
ax.text(0, 1, 'Boosts', color='orange', fontsize=15)

**Feature Cleaning**

In [None]:
def featureClean(dataset):
    """Derive a categorical ``matchType`` and binarise ``teamKills`` in place.

    ``matchType`` is bucketed from ``numGroups``: 3 (quad) below 26 groups,
    2 (tri) below 34, 1 (duo) below 51 and 0 (solo) otherwise. Every positive
    ``teamKills`` value is collapsed to 1. Both columns end up with
    ``category`` dtype.

    The frame passed in is modified and also returned.
    """
    group_count = dataset['numGroups']
    # Each later mask overrides the earlier ones, so the tightest bound wins.
    dataset['matchType'] = (
        pd.Series(0, index=dataset.index)  # solo (default)
        .mask(group_count < 51, 1)         # duo
        .mask(group_count < 34, 2)         # tri
        .mask(group_count < 26, 3)         # quad
        .astype('category')
    )

    # biased data, single matchmake default 0
    team_kills = dataset['teamKills']
    dataset['teamKills'] = team_kills.mask(team_kills > 0, 1).astype('category')

    return dataset
# numGroups is replaced by the derived matchType feature
# kills and damageDealt are very highly correlated, so only damageDealt is kept as a model feature

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import TransformerMixin, BaseEstimator

In [None]:
class CustomSelector(TransformerMixin, BaseEstimator):
    """Transformer that keeps only a chosen subset of its input.

    ``features`` is stored unchanged and later used as the key in
    ``x[features]``; nothing is learned during ``fit``.
    """

    def __init__(self, features):
        self.features = features

    def fit(self, x, y=None):
        """No fitting is needed; return the transformer itself."""
        return self

    def transform(self, x, y=None):
        """Return ``x`` indexed by the configured ``features``."""
        wanted = self.features
        return x[wanted]

In [None]:
# Two-step preprocessing: select the numeric model inputs, then standardise them.
scaleData = Pipeline(steps=[
    (
        'selector',
        CustomSelector([
            'walkDistance', 'swimDistance', 'rideDistance', 'vehicleDestroys',
            'assists', 'boosts', 'damageDealt', 'DBNOs',
            'headshotKills', 'heals', 'killPlace', 'killPoints',
            'killStreaks', 'longestKill', 'revives', 'roadKills',
            'weaponsAcquired', 'winPoints',
        ]),
    ),
    ('std_scaler', StandardScaler()),
])

**Clean Data**

In [None]:
# Add matchType and binarise teamKills on both frames (featureClean also
# modifies the frame it receives).
train_x = featureClean(train_x)
test_x = featureClean(test_x)

In [None]:
# Split off the target. pop() also removes the column from train_x, so
# re-running this cell without reloading train_x raises a KeyError.
train_y = train_x.pop('winPlacePerc')

In [None]:
# Columns overwritten with their standardised values. The names and their
# order must match the selector step inside scaleData.
scaled_cols = [
    'walkDistance', 'swimDistance', 'rideDistance', 'vehicleDestroys',
    'assists', 'boosts', 'damageDealt', 'DBNOs',
    'headshotKills', 'heals', 'killPlace', 'killPoints',
    'killStreaks', 'longestKill', 'revives', 'roadKills',
    'weaponsAcquired', 'winPoints',
]

# Fit the scaler on the training frame only, then reuse it for the test frame.
train_x[scaled_cols] = scaleData.fit_transform(train_x)
test_x[scaled_cols] = scaleData.transform(test_x)

In [None]:
# Model inputs: the two categorical columns plus the standardised numeric ones.
x = train_x[[
    'matchType',
    'walkDistance', 'swimDistance', 'rideDistance', 'vehicleDestroys',
    'assists', 'boosts', 'damageDealt', 'DBNOs',
    'headshotKills', 'heals', 'killPlace', 'killPoints',
    'killStreaks', 'longestKill', 'revives', 'roadKills',
    'teamKills',
    'weaponsAcquired', 'winPoints',
]]  # features used in model building
y = train_y

In [None]:
# Take exactly the training feature columns, in the same order, instead of a
# second hand-written list that could drift out of sync with ``x``.
x_test = test_x[x.columns.tolist()]

**Model Building**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold, ShuffleSplit
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

In [None]:
'''
params = [
    {'n_estimators': [50, 100], 'max_features': [8, 12]},
    {'bootstrap': [False]}
]
forest_reg = RandomForestRegressor(random_state=0, min_samples_leaf=20)
grid_search = GridSearchCV(forest_reg, params, cv=5, scoring='neg_mean_absolute_error')
'''

In [None]:
'''
grid_search.fit(train_x, train_y)
cvres = grid_search.cv_results_
for mean_score, params in sorted(zip(cvres['mean_test_score'], cvres['params'])):
    print(-mean_score, params)
'''

In [None]:
'''
forest_reg = RandomForestRegressor(n_estimators=100, random_state=0, min_samples_leaf = 10, max_features=12, bootstrap=True)
forest_reg.fit(train_x, train_y)
prediction = forest_reg.predict(test_x)
'''

In [None]:
max_rounds = 5000          # upper bound on boosting rounds
early_stop_rounds = 50     # stop after this many rounds without improvement
log_period = 10            # print the validation metric every N rounds

params = {
    'objective': 'regression',
    'learning_rate': 0.05,
    'bagging_fraction': 0.8,
    # LightGBM only applies bagging_fraction when bagging_freq is non-zero;
    # without it the 0.8 above is silently ignored.
    'bagging_freq': 1,
    'min_data_in_leaf': 20,
    'metric': 'l2'
}

# A single seeded random hold-out split provides the early-stopping set.
ss = ShuffleSplit(n_splits=1, random_state=0)
for train_index, test_index in ss.split(x, y):
    x_train, y_train = x.iloc[train_index], y.iloc[train_index]
    x_val, y_val = x.iloc[test_index], y.iloc[test_index]

    train_dataset = lgb.Dataset(x_train, label=y_train)
    eval_dataset = lgb.Dataset(x_val, label=y_val)

    if hasattr(lgb, 'log_evaluation'):
        # Newer LightGBM configures early stopping and logging via callbacks
        # and rejects the legacy keyword arguments used below.
        lgb_reg = lgb.train(params, train_set=train_dataset, num_boost_round=max_rounds,
                            valid_sets=[eval_dataset],
                            callbacks=[lgb.early_stopping(early_stop_rounds),
                                       lgb.log_evaluation(log_period)])
    else:
        # Older LightGBM releases without the log_evaluation callback.
        lgb_reg = lgb.train(params, train_set=train_dataset, num_boost_round=max_rounds,
                            valid_sets=[eval_dataset],
                            early_stopping_rounds=early_stop_rounds,
                            verbose_eval=log_period)

In [None]:
# Predict with the best iteration recorded during training; fall back to all
# max_rounds when best_iteration is falsy (0 or None).
prediction = lgb_reg.predict(x_test, num_iteration=lgb_reg.best_iteration or max_rounds)

In [None]:
# Fill the sample submission with the predictions clipped to [0, 1] and save
# it without the index column.
submission = pd.read_csv('../input/sample_submission.csv').assign(
    winPlacePerc=np.clip(prediction, 0, 1)
)
submission.to_csv('submissionlgb.csv', index=False)