In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold ,KFold, GridSearchCV
from xgboost import XGBRFRegressor,XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor, VotingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RANSACRegressor, LogisticRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import gc

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column (strings, booleans, int8, unsigned ints, ...)
    is left untouched. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    The range checks are inclusive, so a column whose extreme value sits exactly
    on a dtype limit (for example 127 or -128 for int8) still gets that dtype.
    float16 keeps only about three significant decimal digits, so values stored
    in a float16 column are rounded even though their range fits.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one whose range fits is used.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached when min/max are NaN (empty or all-missing column).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

def plot_learning_curves(model, X, y, random_state=None):
    """Plot training and validation RMSE as the training subset grows one row at a time.

    ``X``/``y`` are split 80/20 into training and validation parts. For every
    size ``m`` from 1 up to the full training part, ``model`` is refitted on the
    first ``m`` training rows and scored (mean squared error) on those rows and
    on the whole validation part. The square roots of both error series are
    drawn on the current matplotlib axes.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted in place.
    X, y : sliceable
        Features and targets; must support ``[:m]`` row slicing after the split.
    random_state : int or None, default None
        Forwarded to ``train_test_split``; pass an int for a reproducible split.

    Returns
    -------
    None
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    # The upper bound is inclusive so the last point uses the whole training part.
    sizes = range(1, len(X_train) + 1)
    train_errors, val_errors = [], []
    for m in sizes:
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    # The series labels are Korean for "training set" and "validation set".
    plt.plot(sizes, np.sqrt(train_errors), "r-+", linewidth=2, label='훈련 세트')
    plt.plot(sizes, np.sqrt(val_errors), "b-", linewidth=3, label='검증세트')
    # Without a legend the labels above are never displayed.
    plt.legend()
    plt.xlabel('Training set size')
    plt.ylabel('RMSE')

In [None]:
# Read the competition CSVs and downcast each frame as soon as it is loaded.
train = reduce_mem_usage(
    pd.read_csv('/kaggle/input/pubg-finish-placement-prediction/train_V2.csv')
)
test = reduce_mem_usage(
    pd.read_csv('/kaggle/input/pubg-finish-placement-prediction/test_V2.csv')
)

In [None]:
# `test` was already read and downcast in the previous cell. Parsing the same
# CSV a second time only repeated that work, so this cell now just shows the
# shape of the frame that is already in memory.
test.shape

In [None]:
# Show "rows,columns" before and after discarding every row that has a missing value.
print("shape of before train drop null data : " + ",".join(map(str, train.shape)))
train = train.dropna()
print("shape of after train drop null data : " + ",".join(map(str, train.shape)))

In [None]:
# Cross-validation splitter with 3 folds, reused by the grid search further down.
kfolds = KFold(n_splits=3)

# --- Linear models, all at library defaults ---
linr = LinearRegression()
rid = Ridge()
lasso = Lasso()
ela = ElasticNet()
ranc = RANSACRegressor()
# NOTE(review): LogisticRegression is a classifier, not a regressor - confirm it
# is really meant to sit next to the regressors before fitting it on this target.
logr = LogisticRegression(random_state=42)

# --- Kernel method ---
svr = SVR()

# --- Tree-based models, seeded with 42 ---
dtr = DecisionTreeRegressor(random_state=42)
rfr = RandomForestRegressor(random_state=42)
extr = ExtraTreesRegressor(random_state=42)
adar = AdaBoostRegressor(random_state=42)
grdr = GradientBoostingRegressor(random_state=42)
xgr = XGBRegressor(random_state=42)
xgrf = XGBRFRegressor(random_state=42)

def fillInf(df, val):
    """Replace infinities and missing values in the numeric feature columns of ``df``.

    Every numeric column except ``'winPlacePerc'`` has +inf, -inf and NaN
    replaced by ``val``. Non-numeric columns and ``'winPlacePerc'`` are left
    untouched. The frame is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; its numeric columns are reassigned in place.
    val : scalar
        Replacement for every non-finite or missing entry.

    Returns
    -------
    None
    """
    numcols = df.select_dtypes(include='number').columns
    cols = numcols[numcols != 'winPlacePerc']
    for c in cols:
        # Assign the column back instead of using a chained in-place fillna,
        # which is not guaranteed to write through to ``df``.
        df[c] = df[c].replace([np.inf, -np.inf], np.nan).fillna(val)

def pipline(data):
    """Turn a raw per-player frame into the numeric feature table used for modelling.

    Processing steps, in order:

    1. Aggregate the per-player statistics in ``col`` (sum / max / mean, plus a
       row count) per match and per (match, group, matchType).
    2. Add a few per-player derived columns (distance and kill combinations).
    3. Attach the aggregates to every player row. The join keeps the rows in
       the same order as ``data`` so that outputs (for example predictions) can
       be paired positionally with the input rows.
    4. Replace each aggregate by its ratio to the player's own value, then drop
       the raw statistics in ``col`` and the columns in ``cols``.
    5. Replace inf/NaN by 0, downcast via ``reduce_mem_usage``, one-hot encode
       ``matchType`` and drop the identifier columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw player rows. Must contain ``Id``, ``groupId``, ``matchId``,
        ``matchType``, ``rideDistance``, ``swimDistance`` and every column named
        in ``col`` and ``cols`` below. The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, without ``Id``/``groupId``/
        ``matchId``. ``winPlacePerc`` is carried through when present in ``data``.
    """

    def fillInf(df, val):
        """Replace +/-inf and NaN by ``val`` in every numeric column except the target."""
        numcols = df.select_dtypes(include='number').columns
        feature_cols = numcols[numcols != 'winPlacePerc']
        for c in feature_cols:
            # Assign back rather than relying on a chained in-place fillna.
            df[c] = df[c].replace([np.inf, -np.inf], np.nan).fillna(val)

    # Per-player statistics that are aggregated and then turned into ratios.
    col = ['assists', 'boosts', 'damageDealt', 'DBNOs',
           'headshotKills', 'heals', 'killPlace', 'kills', 'killStreaks', 'longestKill',
           'walkDistance']

    # Raw columns removed from the final table once the derived features exist.
    cols = ['killPoints', 'maxPlace', 'numGroups', 'rankPoints', 'roadKills', 'teamKills', 'winPoints',
            'matchDuration']

    keys = ['matchId', 'groupId', 'matchType']
    # Prefixes of the aggregate columns: m* = per match, g* = per group.
    prefixes = ['msum', 'mmax', 'mmean', 'gsum', 'gmax', 'gmean']

    group = data.groupby(keys)
    match = data.groupby('matchId')

    match_data = pd.concat([
        match.size().to_frame('mplayers'),
        match[col].sum().rename(columns=lambda s: 'msum' + s),
        match[col].max().rename(columns=lambda s: 'mmax' + s),
        match[col].mean().rename(columns=lambda s: 'mmean' + s)
    ], axis=1).reset_index()

    group_data = pd.concat([
        group.size().to_frame('gplayer'),
        group[col].sum().rename(columns=lambda x: 'gsum' + x),
        group[col].max().rename(columns=lambda x: 'gmax' + x),
        group[col].mean().rename(columns=lambda x: 'gmean' + x)
    ], axis=1).reset_index()

    # One row per (match, group, matchType) carrying both levels of aggregates.
    data_one = pd.merge(match_data, group_data)
    data_one = reduce_mem_usage(data_one)

    # ``assign`` returns a new frame, so the caller's ``data`` is not altered.
    data = data.assign(
        DBNOsAndMP=data['DBNOs'] / data['maxPlace'],
        DBNOsAndNG=data['DBNOs'] / data['numGroups'],
        RDAndMD=data['rideDistance'] / data['matchDuration'],
        voidKills=data['teamKills'] + data['roadKills'],
        totalDistance=data['rideDistance'] + data['swimDistance'] + data['walkDistance'],
    )

    # A left join with the player rows on the left keeps the input row order.
    # Joining the other way round would order rows by the aggregate keys, which
    # breaks any later positional pairing with the original ``Id`` column.
    test_data = pd.merge(data, data_one, how='left', on=keys)

    test_data['PlayerTime'] = test_data['mplayers'] / test_data['matchDuration']
    test_data['enemyPlayer'] = test_data['mplayers'] - test_data['gplayer']
    test_data['SavePlayer'] = test_data['enemyPlayer'] - test_data['kills']

    fillInf(test_data, 0)

    # Ratio of each aggregate to the player's own value; every ratio is built
    # from the aggregate with the matching prefix.
    for i in col:
        for prefix in prefixes:
            test_data[prefix + i + 'avg'] = test_data[prefix + i] / test_data[i]

    aggregate_cols = [prefix + i for i in col for prefix in prefixes]
    test_data = test_data.drop(columns=aggregate_cols + col + cols)

    # The divisions above produce inf/NaN wherever the player's own value is 0.
    fillInf(test_data, 0)

    test_data = reduce_mem_usage(test_data)

    test_data = pd.get_dummies(test_data, columns=['matchType'])
    test_data = test_data.drop(columns=['groupId', 'matchId', 'Id'])

    return test_data

# Build the test feature table with the same pipeline used for the training rows.
# The features are deliberately left unscaled: the model in this notebook is
# fitted on the unscaled training features, so standardising the test rows with
# a scaler fitted on the test set would feed it inputs on a different scale.
test = pipline(test)

In [None]:
# Run the training rows through the feature pipeline, then separate the target
# column from the predictors.
train = pipline(train)

Y = train['winPlacePerc']
X = reduce_mem_usage(train.drop(columns=['winPlacePerc']))

# Standardised copy of the predictors, produced by a freshly fitted scaler.
scaler = StandardScaler()
X_scale = scaler.fit_transform(X)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from lightgbm import LGBMRegressor

# Grid-search the tree depth of a LightGBM regressor using the shared CV splitter.
gbm = LGBMRegressor()

gbmreg = GridSearchCV(estimator=gbm,
                      scoring='neg_mean_squared_error',
                      param_grid={'max_depth': [2, 4, 6]},
                      cv=kfolds)
gbmreg.fit(X_train, y_train)
y_predict_gbm = gbmreg.predict(X_test)
# With 'neg_mean_squared_error' scoring, best_score_ is the NEGATED mean CV MSE;
# flip the sign so the value printed as "mse" really is a (positive) MSE.
print('best mse : ', -gbmreg.best_score_, ' r2_score : ', r2_score(y_test, y_predict_gbm))


In [None]:
# `test` now holds engineered features without the Id column, so the raw CSV is
# read again to recover the identifiers needed for the submission file.
test_raw = pd.read_csv(
    '/kaggle/input/pubg-finish-placement-prediction/test_V2.csv'
)

# Score the engineered test features with the tuned model.
pred_test_y = gbmreg.predict(test)

In [None]:
# Pair each raw Id with its prediction (positionally) and write the submission file.
test = pd.DataFrame({
    'Id': test_raw['Id'],
    'winPlacePerc': pred_test_y,
})

test.to_csv('submission.csv', index=False)

In [None]:
"""file = './submission.csv'

if os.path.isfile(file):
  os.remove(file)
