In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (click Run or press Shift+Enter) prints the names of the files in that directory.

import os
print(os.listdir("../input"))
# Render floats in fixed-point notation (six decimals) instead of scientific notation.
# The fully qualified option name is used because pandas resolves abbreviated names by
# pattern matching, which can become ambiguous if similarly named options are added.
pd.set_option('display.float_format', '{:f}'.format)
# Any results you write to the current directory are saved as output.

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Each column is handled according to its dtype:

    * ``object`` columns are converted to ``category``;
    * signed / unsigned integer columns are cast to the narrowest signed /
      unsigned integer type whose range contains the column's min and max
      (bounds are inclusive, so e.g. ``[-128, 127]`` fits ``int8``);
    * float columns are cast to the narrowest float type whose range contains
      the column's min and max;
    * every other dtype (bool, datetime, category, extension dtypes, ...) is
      left untouched, which also makes the function safe to call twice.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default True
        Whether ``float16`` may be chosen for float columns. ``float16`` keeps
        only about three significant decimal digits, so pass ``False`` when
        that loss of precision is not acceptable.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_candidates = (np.int8, np.int16, np.int32, np.int64)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_candidates = (np.float32, np.float64)
    if use_float16:
        float_candidates = (np.float16,) + float_candidates

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy dtypes expose a reliable ``kind``; extension dtypes
        # (category, nullable ints, ...) are skipped.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind == 'i':
            candidates, limits_of = signed_candidates, np.iinfo
        elif kind == 'u':
            candidates, limits_of = unsigned_candidates, np.iinfo
        elif kind == 'f':
            candidates, limits_of = float_candidates, np.finfo
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            limits = limits_of(candidate)
            # NaN (empty / all-missing column) or +-inf never satisfies the
            # comparison, so such a column keeps its current dtype.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


def import_data(file):
    """Read a CSV file into a DataFrame and optimise its memory usage.

    Parameters
    ----------
    file : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after being passed through ``reduce_mem_usage``.
    """
    # ``keep_date_col`` is intentionally not passed: it only matters when
    # ``parse_dates`` combines several columns into one (not the case here) and
    # the argument is deprecated as of pandas 2.2.
    df = pd.read_csv(file, parse_dates=True)
    df = reduce_mem_usage(df)
    return df

In [None]:
# Load both competition splits. The label is printed first so that the memory
# report emitted while loading can be attributed to the right file.
print('train')
train_df = import_data('../input/train.csv')
print('test')
test_df = import_data('../input/test.csv')

In [None]:
def featureEngineering(df):
    """Append group-level and match-level aggregate features to every row.

    For each numeric column ``c`` the result gains ``c_mean``, ``c_max`` and
    ``c_min`` (aggregated over rows sharing ``matchId`` and ``groupId``) and
    ``c_match_mean`` (aggregated over rows sharing ``matchId``). Two further
    columns are added: ``group_size`` (number of rows in the group) and
    ``max_players_in_group`` (largest non-null ``kills`` count of any group in
    the match).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``matchId``, ``groupId`` and ``kills`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows (left joins) and the extra columns.
    """
    group_keys = ['matchId', 'groupId']
    match_keys = ['matchId']

    # Aggregate numeric columns only: identifier columns such as ``Id`` have no
    # meaningful mean/min/max, and recent pandas raises instead of silently
    # dropping them.
    numeric_cols = df.select_dtypes(include=['number', 'bool']).columns
    group_value_cols = [c for c in numeric_cols if c not in group_keys]
    match_value_cols = [c for c in numeric_cols if c not in match_keys]

    # observed=True: when the keys are categorical, only key combinations that
    # actually occur are materialised instead of the full Cartesian product.
    by_group = df.groupby(group_keys, observed=True)
    by_match = df.groupby(match_keys, observed=True)

    df_size = by_group.size().reset_index(name='group_size')
    df_mean = by_group[group_value_cols].mean().reset_index()
    df_max = by_group[group_value_cols].max().reset_index()
    df_min = by_group[group_value_cols].min().reset_index()
    df_match_mean = by_match[match_value_cols].mean().reset_index()
    df_train_max_PG = (
        by_group['kills'].count().reset_index()
        .groupby('matchId', observed=True)['kills'].max().reset_index()
    )
    df_train_max_PG.columns = ['matchId', 'max_players_in_group']

    # Left joins keep the original row order and row count.
    df = pd.merge(df, df_mean, suffixes=("", "_mean"), how='left', on=group_keys)
    df = pd.merge(df, df_max, suffixes=("", "_max"), how='left', on=group_keys)
    df = pd.merge(df, df_min, suffixes=("", "_min"), how='left', on=group_keys)
    df = pd.merge(df, df_match_mean, suffixes=("", "_match_mean"), how='left', on=match_keys)
    df = pd.merge(df, df_size, how='left', on=group_keys)
    df = pd.merge(df, df_train_max_PG, how='left', on=match_keys)
    return df

#print('[{}] Start making some feature engineering...'.format(time.time() - start_time))

# Rebind each split to its feature-augmented version.
# NOTE: this cell is not idempotent - running it a second time would aggregate
# the already-augmented frames again.
train_df = featureEngineering(train_df)
test_df  = featureEngineering(test_df)

In [None]:
# Inclusive upper bound per column; a row is kept only if it respects every bound.
outlier_caps = [
    ('kills', 35), ('assists', 13), ('boosts', 16), ('damageDealt', 3500),
    ('DBNOs', 35), ('headshotKills', 22), ('killStreaks', 10),
    ('longestKill', 1000), ('revives', 20), ('rideDistance', 28000), ('roadKills', 15),
    ('swimDistance', 3500), ('teamKills', 10), ('vehicleDestroys', 3), ('walkDistance', 13000),
    ('weaponsAcquired', 60),
]

print('Old size: %d' % len(train_df))
keep_rows = np.ones(len(train_df), dtype=bool)
for column_name, upper_bound in outlier_caps:
    # Positional boolean mask; NaN compares as False and is therefore dropped.
    keep_rows &= (train_df[column_name] <= upper_bound).values
train_df = train_df[keep_rows]

print('New size: %d' % len(train_df))

def headshot_precent(data):
    """Add kill-ratio and distance-ratio features to ``data`` in place.

    New columns: ``headPerc`` (headshotKills / kills), ``roadPerc``
    (roadKills / kills), ``totalDistance`` (ride + swim + walk) and the share
    of each distance type in that total (``rideDistancePerc``,
    ``swimDistancePerc``, ``walkDistancePerc``). A ratio whose denominator is
    zero is set to 0. Afterwards every remaining NaN in a numeric column of
    ``data`` is replaced by 0; non-numeric columns are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``headshotKills``, ``kills``, ``roadKills``,
        ``rideDistance``, ``swimDistance`` and ``walkDistance``.

    Returns
    -------
    pandas.DataFrame
        ``data.head()`` - a preview only; the features live on ``data`` itself.
    """
    def _safe_ratio(numerator, denominator):
        # Mask zero denominators so the result is NaN (filled with 0 below)
        # rather than +-inf, which fillna would not clean up.
        return numerator / denominator.where(denominator != 0)

    data['headPerc'] = _safe_ratio(data['headshotKills'], data['kills'])
    data['roadPerc'] = _safe_ratio(data['roadKills'], data['kills'])

    # Sum in float64 so the total cannot overflow if the distance columns were
    # downcast to a narrow float type beforehand.
    ride = data['rideDistance'].astype('float64')
    swim = data['swimDistance'].astype('float64')
    walk = data['walkDistance'].astype('float64')
    data['totalDistance'] = ride + swim + walk
    data['rideDistancePerc'] = _safe_ratio(ride, data['totalDistance'])
    data['swimDistancePerc'] = _safe_ratio(swim, data['totalDistance'])
    data['walkDistancePerc'] = _safe_ratio(walk, data['totalDistance'])

    # Fill column by column (numeric columns only) to keep peak memory low and
    # to avoid writing 0 into categorical columns, where it is not a category.
    for col in data.select_dtypes(include='number').columns:
        if data[col].isnull().any():
            data[col] = data[col].fillna(0)
    return data.head()
# Called for the in-place side effect on each frame (new ratio columns and NaN
# fill); the value returned by each call is only a head() preview.
headshot_precent(train_df)
headshot_precent(test_df)

In [None]:
# Identifier columns are dropped from both splits. The training frame also
# loses the winPlacePerc_* aggregates, which featureEngineering derived from
# the target column itself.
features_not2use = ['Id', 'groupId', 'matchId','winPlacePerc_mean','winPlacePerc_max','winPlacePerc_min','winPlacePerc_match_mean']
train_df.drop(features_not2use, axis=1, inplace=True)

features_not2use = ['Id', 'groupId', 'matchId']
test_df.drop(features_not2use, axis=1, inplace=True)

# 80/20 train/validation split. The seed makes the split - and therefore the
# validation score - reproducible across kernel restarts.
X_train = train_df.sample(frac=0.8, random_state=123)
X_val = train_df.loc[~train_df.index.isin(X_train.index)]


In [None]:
#list(test_df)

In [None]:
# Separate the regression target from the feature matrix of each split.
target_column = 'winPlacePerc'
y_train, y_val = X_train[target_column], X_val[target_column]
for feature_frame in (X_train, X_val):
    feature_frame.drop(target_column, axis=1, inplace=True)

In [None]:
# Display the column labels remaining in test_df (iterating a DataFrame yields its column labels).
list(test_df)

In [None]:
import catboost
import time
from lightgbm import LGBMRegressor
import xgboost as lgb

start_time = time.time()
# Only parameters that LightGBM understands are passed. The previous call also
# carried CatBoost-style keywords (iterations, loss_function, depth,
# use_best_model, od_type, od_wait, thread_count), which LightGBM does not
# recognise; they are dropped. The number of boosting rounds comes from
# n_estimators and the training objective stays L2 regression.
model = LGBMRegressor(
    objective='regression_l2',
    learning_rate=0.05,
    n_estimators=800,
    num_leaves=144,
    max_depth=-1,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.9,
    random_state=123,
)
# eval_metric is a fit() argument in the LightGBM scikit-learn API; MAE is
# reported on the validation set in addition to the training objective.
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='mae')
end_time = time.time()
print('The training time = {}'.format(end_time - start_time))

In [None]:
# Predict the target for every row of the engineered test frame.
pred = model.predict(test_df)

In [None]:
# Reload the raw test file to recover the identifier columns that were dropped
# from test_df, then attach the per-row predictions (same row order).
test_new = import_data('../input/test.csv')
test_new['winPlacePercPred'] = pred

# Average the prediction within each group, then convert the group means to a
# percentile rank inside their match. observed=True restricts categorical keys
# to combinations that actually occur.
aux = (
    test_new.groupby(['matchId', 'groupId'], observed=True)['winPlacePercPred'].agg('mean')
    .groupby('matchId', observed=True).rank(pct=True).reset_index()
)
aux.columns = ['matchId','groupId','winPlacePerc']
test_new = test_new.merge(aux, how='left', on=['matchId','groupId'])

# The sample submission file is not read: only the Id and winPlacePerc columns
# of test_new are written out.
submission = test_new[['Id','winPlacePerc']]
submission.to_csv('sample_submission.csv', index=False)