In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_files = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_files:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the competition's training and test tables from the read-only input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/pubg-finish-placement-prediction/train_V2.csv',
        '../input/pubg-finish-placement-prediction/test_V2.csv',
    )
)

In [None]:
# True when the training frame contains at least one missing value.
train.isna().to_numpy().any()

In [None]:
# A training row whose target is missing cannot be used for supervised
# learning; zero-filling it would fabricate a label of 0, so such rows are
# dropped before the remaining gaps are filled with 0.
# Plain assignment (instead of inplace=True) keeps this cell safe to re-run.
train = train.dropna(subset=['winPlacePerc']).fillna(0)
test = test.fillna(0)

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# Preview the first five rows of the training frame (five is the default).
train.head()

# Категориальные признаки

In [None]:
# Encode the string-valued identifier / category columns as integers.
# Each encoder is fitted on the union of the train and test values so that a
# given string maps to the same integer code in both frames. Fitting the
# encoder separately on each frame produced two unrelated code spaces.
ENCODED_COLUMNS = ['Id', 'groupId', 'matchId', 'matchType']
encoders = {}  # column name -> fitted LabelEncoder, kept so codes can be inverted later
for column in ENCODED_COLUMNS:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([train[column], test[column]], ignore_index=True))
    train[column] = encoder.transform(train[column])
    test[column] = encoder.transform(test[column])
    encoders[column] = encoder

In [None]:
# Descriptive statistics for the training columns after encoding.
train.describe()

In [None]:
# Descriptive statistics for the test columns after encoding.
test.describe()

# Разбиваем данные на тренировочные и валидационные

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for validation. A fixed random_state makes the split
# (and every metric computed from it) reproducible across kernel restarts.
# NOTE(review): rows are split individually, so rows sharing a matchId/groupId
# can land on both sides of the split -- consider a group-aware split; confirm.
df_train, df_val = train_test_split(train, test_size=0.3, random_state=42)

In [None]:
# List the available column names (used to build the feature list below).
df_train.columns

In [None]:
# Column roles for the model: the target, the model inputs, and the subset of
# inputs that LightGBM should treat as categorical.
# NOTE(review): Id/groupId/matchId are identifier columns; treating them as
# categorical features may just memorise training rows -- confirm intent.
label = 'winPlacePerc'
features = [
    'Id', 'groupId', 'matchId',
    'assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals',
    'killPlace', 'killPoints', 'kills', 'killStreaks', 'longestKill',
    'matchDuration', 'matchType', 'maxPlace', 'numGroups', 'rankPoints',
    'revives', 'rideDistance', 'roadKills', 'swimDistance', 'teamKills',
    'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints',
]
categorical_feature = [
    column for column in features
    if column in ('Id', 'groupId', 'matchId', 'matchType')
]

In [None]:
# Wrap the training and validation splits as LightGBM datasets that share the
# same feature names, categorical columns and raw-data retention setting.
dataset_options = dict(
    feature_name=features,
    categorical_feature=categorical_feature,
    free_raw_data=False,
)

dtrain = lgb.Dataset(df_train[features], label=df_train[label], **dataset_options)

dval = lgb.Dataset(df_val[features], label=df_val[label], **dataset_options)

# Обучаем модель

In [None]:
# LightGBM settings for the baseline model: gradient-boosted trees optimised
# and evaluated on mean absolute error.
params = dict(
    boosting_type='gbdt',
    objective='mae',
    metric='mae',
    num_leaves=64,
    max_depth=None,
    learning_rate=0.1,
    min_data_in_leaf=10,
    feature_fraction=0.6,
    bagging_fraction=0.6,
    bagging_freq=1,
    num_threads=6,
    verbosity=-1,
)

In [None]:
# Train the booster with a generous round budget and let early stopping decide
# how many rounds are actually kept.
train_options = dict(
    # Evaluate on both the training and the validation set (we want to see the
    # quality on the training data too); per the original note, the training
    # set is ignored by the early-stopping mechanism.
    valid_sets=(dtrain, dval),
    # Deliberately very large number of boosting iterations ...
    num_boost_round=5000,
    # ... cut short by early stopping on the validation-set quality.
    early_stopping_rounds=25,
    # Print intermediate results every 25 iterations.
    verbose_eval=25,
)
model = lgb.train(params, dtrain, **train_options)

In [None]:
# Display the validation feature matrix that is passed to the model below.
df_val[features]

In [None]:
# Predict the target for every validation row with the trained booster.
y_pred = model.predict(df_val[features])

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [None]:
# Score the validation predictions with both error metrics, then report them.
val_mae = mean_absolute_error(df_val[label], y_pred)
val_mse = mean_squared_error(df_val[label], y_pred)
print('The mae of prediction is:', val_mae)
print('The mse of prediction is:', val_mse)

# Перебор гиперпараметров

### Использовать lgb.cv (интерфейс практически совпадает с lgb.train), чтобы с помощью кросс-валидации и ранней остановки получать качество работы алгоритма на указанных params

### Реализовать функцию eval_lgb, которая на вход принимает указанный список гиперпараметров, а на выходе возвращает значение метрики качества. ВНИМАНИЕ, некоторые параметры (objective, metric и т.д.) должны быть каждый раз одинаковы. Их передавать не нужно.

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
def eval_lgb(params):
    """Cross-validate LightGBM on ``dtrain`` and return the best validation MAE.

    Args:
        params: dict of LightGBM hyperparameters to evaluate. It is merged on
            top of the fixed settings defined below (boosting type, objective,
            metric, thread count), so callers do not need to pass those.

    Returns:
        The lowest fold-averaged validation MAE reached over the boosting
        rounds reported by ``lgb.cv``.

    Raises:
        KeyError: if ``lgb.cv`` reports no mean validation metric.
    """
    base_params = {
        'boosting_type': 'gbdt',
        'objective': 'mae',
        'metric': 'mae',
        'num_threads': 6
    }
    base_params.update(params)
    # Pass the merged dict: passing ``params`` here would silently discard the
    # fixed settings above.
    # NOTE(review): newer LightGBM releases replace verbose_eval /
    # early_stopping_rounds with callbacks -- confirm against the installed version.
    results = lgb.cv(base_params, dtrain, num_boost_round=5000, nfold=3,
                     eval_train_metric=True,  # boolean flag, not a metric name
                     stratified=False,
                     verbose_eval=25, early_stopping_rounds=25)
    for key, val in results.items():
        print("\n" + key, " : ", val)
    # Mean-score histories end in '-mean'; skip the training-set ones, whose
    # keys start with 'train' (key naming presumably varies by LightGBM version).
    valid_mean_keys = [key for key in results
                       if key.endswith('-mean') and not key.startswith('train')]
    if not valid_mean_keys:
        raise KeyError('lgb.cv returned no mean validation metric: %r' % list(results))
    # MAE is lower-is-better, so the best round is the minimum of the history.
    return min(results[valid_mean_keys[0]])

In [None]:
# Run the cross-validated evaluation with the hyperparameters defined above.
res = eval_lgb(params)

In [None]:
# Show the value returned by eval_lgb. The name `results` only exists inside
# that function and is not defined at notebook scope.
res

In [None]:
# `results` is not defined at notebook scope, and lgb.cv yields metric
# histories rather than a model that can predict. Use the booster trained
# above, limited to its early-stopped best iteration.
y_pred = model.predict(df_val[features], num_iteration=model.best_iteration)

In [None]:
# True when any cell of the training frame is +inf or -inf.
train.isin([np.inf, -np.inf]).values.any()

In [None]:
# Ask scikit-learn how it classifies the target column's type.
from sklearn.utils.multiclass import type_of_target
type_of_target(df_train['winPlacePerc'])