In [None]:
%matplotlib inline
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

import os
# Show which files are present in the ../input directory.
print(os.listdir(path="../input"))

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return ``df``.

    * Integer columns (signed or unsigned) are cast to the narrowest NumPy
      integer type of the same signedness whose range contains the column's
      minimum and maximum (bounds are inclusive).
    * Float columns are cast to float16 or float32 when the value range fits,
      otherwise to float64.  NOTE: the choice looks at the range only, not at
      precision; float16 keeps roughly three significant decimal digits.
    * ``object``/string columns are converted to ``category``.
    * Any other dtype (bool, datetime, category, ...) is left untouched.

    The memory usage before and after the conversion is printed.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    narrow_float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast; min()/max() or
        # the range comparison are not meaningful for the remaining dtypes.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            candidates, limits = narrow_float_types, np.finfo
        elif col_type.kind == 'u':
            candidates, limits = unsigned_types, np.iinfo
        else:
            candidates, limits = signed_types, np.iinfo

        for candidate in candidates:
            bounds = limits(candidate)
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break
        else:
            # Nothing narrower fits (including all-NaN or infinite floats,
            # whose comparisons above are False): floats fall back to float64,
            # integers keep their current dtype.
            if col_type.kind == 'f':
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
def import_data(file):
    """Read a CSV file into a dataframe and shrink its memory footprint.

    Parameters
    ----------
    file : str, path object or file-like
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed data after ``reduce_mem_usage`` has downcast its columns.
    """
    # ``keep_date_col`` is no longer passed: it only matters when
    # ``parse_dates`` merges several columns into one (not the case here) and
    # pandas deprecated the keyword in 2.2.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)
# courtesy of Guillaume Martin https://www.kaggle.com/gemartin

In [None]:
# Load the training set with downcast dtypes.
# NOTE(review): other cells read from ../input while this reads from ./ — confirm the path.
train_x = import_data(file='./train.csv')

In [None]:
# Load the test set with downcast dtypes.
# NOTE(review): other cells read from ../input while this reads from ./ — confirm the path.
test_x = import_data(file='./test.csv')

In [None]:
sns.set(style='whitegrid')
# Correlate the numeric columns only: reduce_mem_usage turns string columns
# into categoricals, and recent pandas versions raise when asked to correlate
# non-numeric data.
corr = train_x.select_dtypes(include='number').corr()
# Mask the upper triangle (diagonal included) because the matrix is symmetric.
# The `np.bool` alias was removed from NumPy; the builtin `bool` replaces it.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

fig, ax = plt.subplots(figsize=(16, 14))
cmap = sns.diverging_palette(145, 280, s=85, l=15, n=100)

# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(corr, mask=mask, cmap=cmap, center=0, linewidths=.5,
            cbar_kws={'shrink': .6}, ax=ax)
plt.show()

In [None]:
# Restrict to numeric columns: the categorical (former string) columns make
# DataFrame.corr() raise on recent pandas versions.
train_x.select_dtypes(include='number').corr()

In [None]:
# Histogram grid of the dataframe's columns, 100 bins each.
train_x.hist(figsize=(20, 20), bins=100)
plt.show()

In [None]:
# Revives against final placement, one point per row of the training set.
plt.gca().scatter(train_x['revives'], train_x['winPlacePerc'])
plt.show()

In [None]:
def _widen(series, minimum):
    """Upcast ``series`` to at least ``minimum`` so sums cannot overflow a narrow dtype."""
    return series.astype(np.result_type(series.dtype, minimum))


def featureEngineer_train(dataset):
    """Drop, combine and clean feature columns of ``dataset`` in place.

    Despite the name, the notebook applies this to both the training and the
    test frame.  The frame passed in is modified and the same object is
    returned.

    Steps:
    * drop the id columns, ``maxPlace``, ``vehicleDestroys`` and ``damageDealt``;
    * ``AllknockKillassist`` = DBNOs + assists + kills - roadKills;
    * ``distanceTravelled`` = swim + ride + walk distance;
    * ``heal`` = heals + boosts;
    * ``revives`` above 10 are replaced by the column median;
    * ``teamKills`` is reduced to a 0/1 flag stored as a category.

    The operands of every sum are widened first (integers to at least int32,
    floats to at least float32): after ``reduce_mem_usage`` they may be
    int8/float16, where the additions would silently wrap around or overflow
    to ``inf``.
    """
    dataset.drop(['Id', 'groupId', 'matchId'], inplace=True, axis=1)  # ids
    dataset.drop(['maxPlace'], inplace=True, axis=1)  # repeated variable from numGroups

    dataset.drop('vehicleDestroys', axis=1, inplace=True)  # not relevant to skills

    dataset['AllknockKillassist'] = (
        _widen(dataset['DBNOs'], np.int32)
        + _widen(dataset['assists'], np.int32)
        + _widen(dataset['kills'], np.int32)
        - _widen(dataset['roadKills'], np.int32)
    )
    dataset.drop(['DBNOs', 'assists', 'kills', 'killPlace', 'roadKills'], inplace=True, axis=1)  # essentially the same

    dataset.drop('damageDealt', axis=1, inplace=True)  # over 0.8 correlation with kills/assist/DBNOs

    dataset['distanceTravelled'] = (
        _widen(dataset['swimDistance'], np.float32)
        + _widen(dataset['rideDistance'], np.float32)
        + _widen(dataset['walkDistance'], np.float32)
    )
    dataset.drop(['swimDistance', 'rideDistance', 'walkDistance'], axis=1, inplace=True)  # combining distances travelled

    dataset['heal'] = _widen(dataset['heals'], np.int32) + _widen(dataset['boosts'], np.int32)
    dataset.drop(['heals', 'boosts'], axis=1, inplace=True)  # combining heals & boosts

    # anymore than that is just cheating/nonsense
    dataset.loc[dataset['revives'] > 10, 'revives'] = dataset.revives.median()

    # biased data, single matchmake default 0
    dataset.loc[dataset['teamKills'] > 0, 'teamKills'] = 1
    dataset['teamKills'] = dataset['teamKills'].astype('category')

    return dataset

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import TransformerMixin, BaseEstimator

In [None]:
class CustomSelector(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that keeps only the entries named in ``features``."""

    def __init__(self, features):
        # Stored unchanged under the parameter's own name.
        self.features = features

    def fit(self, x, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, x, y=None):
        """Return ``x`` indexed by the configured ``features``."""
        wanted = self.features
        selected = x[wanted]
        return selected

In [None]:
# Select the numeric features and standardise them.
scaleData = Pipeline(steps=[
    ('selector', CustomSelector(features=[
        'headshotKills',
        'killPoints',
        'killStreaks',
        'longestKill',
        'numGroups',
        'revives',
        'weaponsAcquired',
        'winPoints',
        'AllknockKillassist',
        'distanceTravelled',
        'heal',
    ])),
    ('std_scaler', StandardScaler()),
])

In [None]:
# Split the target off the features; pop() also removes the column from train_x.
train_y = train_x.pop(item='winPlacePerc')

In [None]:
# Same feature engineering for both frames, training frame first.
train_x, test_x = map(featureEngineer_train, (train_x, test_x))

In [None]:
# Overwrite the selected columns with their standardised values. The scaler is
# fitted on the training rows and then reused, unchanged, for the test rows.
# The column list is read from the pipeline's selector so it is written once.
scaled_columns = scaleData.named_steps['selector'].features
train_x[scaled_columns] = scaleData.fit_transform(train_x)
test_x[scaled_columns] = scaleData.transform(test_x)

**Model Building**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# Two small grids (running more params take a long time): one varies
# max_features, the other switches bootstrap off.
params = [
    dict(max_features=[8, 12]),
    dict(bootstrap=[False]),
]
# other hyperparameters will be default
forest_reg = RandomForestRegressor(n_estimators=100, random_state=0)
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=3,
)

In [None]:
grid_search.fit(train_x, train_y)
cvres = grid_search.cv_results_
# Sort on the score alone: without a key, tied scores would make Python compare
# the parameter dicts, which raises TypeError. The loop variable is also named
# differently from the global `params` grid so that it is not overwritten.
ranked = sorted(zip(cvres['mean_test_score'], cvres['params']),
                key=lambda pair: pair[0])
for mean_score, candidate in ranked:
    # Scores are negated MAE, so flip the sign to print the MAE itself.
    print(-mean_score, candidate)

In [None]:
# Predict the test rows with the fitted grid search.
prediction = grid_search.predict(X=test_x)

In [None]:
# Fill the sample submission with the predictions, clipped to [0, 1].
# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# test.csv — confirm.
submission = pd.read_csv('../input/sample_submission.csv').assign(
    winPlacePerc=np.clip(prediction, 0, 1)
)
submission.to_csv('submission.csv', index=False)