In [None]:
# !git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
# %cd LightGBM

In [None]:
# !mkdir build

In [None]:
# !cmake -DUSE_GPU=1

In [None]:
# !make -j$(nproc)

In [None]:
# !sudo apt-get -y install python-pip

In [None]:
# !sudo -H pip install setuptools pandas numpy scipy scikit-learn -U

In [None]:
# %cd /content/LightGBM/python-package/

In [None]:
# !sudo python setup.py install --precompile

In [None]:
# !pip install optuna

In [None]:
# !pip install pickle5

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import os
import time
import gc
import pickle5 as pickle
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Load the pickled training data from the mounted Drive path.
# NOTE: unpickling can execute arbitrary code stored in the file, so only load
# pickles that you created yourself or otherwise trust.
with open('/content/drive/MyDrive/Colab Notebooks/kaggle/train.pkl', 'rb') as fh:
    train = pickle.load(fh)

In [None]:
# Load the pickled test data from the mounted Drive path.
# NOTE: unpickling can execute arbitrary code stored in the file, so only load
# pickles that you created yourself or otherwise trust.
with open('/content/drive/MyDrive/Colab Notebooks/kaggle/test.pkl', 'rb') as gh:
    test = pickle.load(gh)

In [None]:
# Display the distinct column labels of the training frame.
train.columns.unique()

In [None]:

# Engineered ratio / sum features.  A zero denominator yields +/-inf (x / 0)
# or NaN (0 / 0); both are replaced by 0 at the end of this cell.
# NOTE(review): 'headshotrate' is kills / headshotKills, i.e. the inverse of
# what the name suggests ('headshotKills_over_kills' holds the other ratio)
# -- confirm this is intended.
train['headshotrate'] = train['kills'] / train['headshotKills']
train['killStreakrate'] = train['killStreaks'] / train['kills']
train['healthitems'] = train['heals'] + train['boosts']
train['totalDistance'] = train['rideDistance'] + train['walkDistance'] + train['swimDistance']
train['headshotKills_over_kills'] = train['headshotKills'] / train['kills']
train['distance_over_weapons'] = train['totalDistance'] / train['weaponsAcquired']
train['walkDistance_over_heals'] = train['walkDistance'] / train['heals']
train['walkDistance_over_kills'] = train['walkDistance'] / train['kills']
train['killsPerWalkDistance'] = train['kills'] / train['walkDistance']
train['skill'] = train['headshotKills'] + train['roadKills']

# Identifier columns carry no signal for the model; drop them before the
# clean-up pass so it does not have to scan them.
train = train.drop(columns=['Id', 'groupId', 'matchId'])

# np.Inf / np.NINF / np.NaN were removed in NumPy 2.0, so use the lowercase
# names.  Reassigning (instead of inplace=True / boolean-mask assignment)
# keeps every step an ordinary expression that returns a new frame.
train = train.replace([np.inf, -np.inf], np.nan)
train = train.fillna(0)

In [None]:
# Integer-encode matchType.  Codes follow the order of first appearance in the
# training frame.
matchType = train.matchType.unique()
match_dict = {each: i for i, each in enumerate(matchType)}
train['matchType'] = train['matchType'].map(match_dict)

# The test frame must use the SAME codes as the training frame; building a
# second dictionary from the test frame's own ordering would give one match
# type different integers in the two frames.  Start from the training mapping
# and only append fresh codes for match types that never occur in training.
matchtype_test = test.matchType.unique()
match_dict_test = dict(match_dict)
for each in matchtype_test:
    match_dict_test.setdefault(each, len(match_dict_test))
test['matchType'] = test['matchType'].map(match_dict_test)


In [None]:
# Print the frame summary after feature engineering and matchType encoding.
train.info()

In [None]:
# Pull out the regression target, then keep every remaining column as a feature.
y = train['winPlacePerc']
X = train.drop(columns='winPlacePerc')

In [None]:
# Hold out 30% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` to the smallest dtype that holds their values.

    The frame is modified column by column and also returned.

    * Signed / unsigned integer columns are narrowed within their own family
      (int8..int64, uint8..uint64).
    * Float columns are narrowed to float16 / float32 / float64.  float16
      keeps only about three significant decimal digits; pass
      ``allow_float16=False`` to stop at float32.
    * ``object`` and pandas string columns become ``category``.
    * Every other dtype (bool, datetime, category, nullable extension types,
      ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    allow_float16 : bool, default True
        Whether float columns may be stored as float16.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    bytes_per_mb = 1024 ** 2
    int_ladder = (np.int8, np.int16, np.int32, np.int64)
    uint_ladder = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_ladder = (np.float16, np.float32, np.float64) if allow_float16 \
        else (np.float32, np.float64)

    # memory_usage() reports bytes; convert so the "MB" label is truthful.
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are narrowed.  Anything else
        # would either crash on min()/comparison or be silently turned into a
        # float by a "not int => float" fallback.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        if col_type.kind == 'f':
            ladder, limits = float_ladder, np.finfo
        elif col_type.kind == 'i':
            ladder, limits = int_ladder, np.iinfo
        else:
            ladder, limits = uint_ladder, np.iinfo

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in ladder:
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            # NaN / inf extremes fail every comparison, leaving the column as is.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a frame that occupies no memory at all.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

# Downcast both feature splits; reduce_mem_usage modifies the frame it receives
# and returns it, so the names are simply rebound to the same objects.
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

In [None]:
# print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
mm_sc = MinMaxScaler().fit(X_train)

X_train_scaled = mm_sc.transform(X_train)
X_test_scaled = mm_sc.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# Baseline: ordinary least squares on the min-max scaled features.
linear = LinearRegression()
linear.fit(X_train_scaled, y_train)

In [None]:
# Predict on the scaled held-out split; the bare name displays the array.
y_pred = linear.predict(X_test_scaled)
y_pred

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Held-out metrics for the linear baseline.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test,y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [None]:
# Report the linear-baseline metrics, one per line.
for label, value in (('MSE:', mse), ('R Squared:', r2), ('MAE:', mae)):
    print(label, value)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from lightgbm.sklearn import LGBMRegressor

# KFold (cross-validation) and partial: needed for the Optuna objective
from sklearn.model_selection import KFold
from functools import partial


In [None]:
import time
# Baseline random forest on the unscaled features, timed so the cost of this
# cell is visible in its output.
start = time.time()
model = RandomForestRegressor(n_estimators=10, min_samples_leaf=5,
                              max_features=0.5, n_jobs=-1, verbose=2,
                              # Fixed seed: bootstrap samples and feature
                              # subsets are random, so without it every run
                              # reports different scores.
                              random_state=42)
model.fit(X_train, y_train)
end = time.time()
print(f"{end - start:.5f} sec")

In [None]:
from sklearn.metrics import r2_score
# Module-level metric; the random-forest Optuna objective reads this name too.
evaluation_metric = r2_score

print("Prediction")

# Score the fitted forest on the rows it was trained on ...
pred_train = model.predict(X_train)
train_score = evaluation_metric(y_train, pred_train)

# ... and on the held-out rows.
pred_test = model.predict(X_test)
test_score = evaluation_metric(y_test, pred_test)

print("Train Score : %.4f" % train_score)
print("Test Score : %.4f" % test_score)

In [None]:
# For Regression

def optimizer(trial, X, y, K):
    """Optuna objective: mean K-fold validation score of a random forest.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample ``n_estimators``, ``max_depth`` and ``max_features``.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target aligned with ``X``.
    K : int
        Number of folds.

    Returns
    -------
    float
        Mean over the K folds of ``evaluation_metric(y_val, preds)``, where
        ``evaluation_metric`` is the module-level metric callable.
    """
    # Hyper-parameter combination to tune.
    n_estimators = trial.suggest_int("n_estimators", 50, 1000)
    max_depth = trial.suggest_int("max_depth", 8, 30)
    # 'auto' was removed from RandomForestRegressor in scikit-learn 1.3; for
    # regressors it meant "use all features", which is spelled 1.0.
    max_features = trial.suggest_categorical("max_features", [1.0, 'sqrt', 'log2'])

    # Model under test.  Optuna runs are slow, so the author tries a random
    # forest first and moves on to LightGBM afterwards.
    model = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  max_features=max_features,
                                  n_jobs=-1,
                                  random_state=0xC0FFEE)

    # K-fold cross-validation.
    folds = KFold(n_splits=K)
    scores = []

    for train_idx, val_idx in folds.split(X, y):
        X_fold_train = X.iloc[train_idx, :]
        y_fold_train = y.iloc[train_idx]

        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]

        model.fit(X_fold_train, y_fold_train)
        # Predict on this fold's validation rows.  Predicting on the global
        # X_test would produce predictions that do not line up with y_val.
        preds = model.predict(X_val)
        scores.append(evaluation_metric(y_val, preds))

    # Mean validation score across the folds.
    return np.mean(scores)

In [None]:
# Sanity check: features and target of the training split have matching row counts.
X_train.shape, y_train.shape

In [None]:
K = 5  # number of K-fold splits
# Bind the data and fold count so Optuna can call the objective with only `trial`.
opt_func = partial(optimizer, X=X_train, y=y_train, K=K)

# The objective returns R^2 for this regression task, so the study maximizes it.
# A seeded sampler makes the sequence of suggested hyper-parameters repeatable,
# matching the seeded sampler used for the LightGBM study.
rf_study = optuna.create_study(study_name="RF", direction="maximize",
                               sampler=TPESampler(seed=10))
rf_study.optimize(opt_func, n_trials=30)

In [None]:
# Display the random-forest study's trials as a table.
rf_study.trials_dataframe()

In [None]:
# TPE sampler with a fixed seed; it is passed to the LightGBM study created below.
sampler = TPESampler(seed=10)

# define function
def objective(trial):
    """Optuna objective: hold-out MAE of a LightGBM regressor.

    Samples LightGBM hyper-parameters from ``trial``, fits on the module-level
    ``X_train`` / ``y_train`` with early stopping monitored on
    ``X_test`` / ``y_test``, and returns the mean absolute error on
    ``X_test`` / ``y_test`` (lower is better).

    NOTE(review): the same split drives early stopping and the returned score,
    so the tuned parameters are selected on the data later used for reporting
    -- consider a separate validation split.
    """
    # fit(verbose=..., early_stopping_rounds=...) was removed in LightGBM 4.0;
    # the callback API below replaces both keywords (needs LightGBM >= 3.3).
    from lightgbm import early_stopping, log_evaluation

    lgbm_param = {
        'objective': 'regression',
        'verbose': -1,
        'metric': 'mae',
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=1, log=True),
        # suggest_float replaces the deprecated suggest_uniform /
        # suggest_loguniform; names and ranges are unchanged.
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        # NOTE(review): 1e-8..1e-3 is a very small learning-rate range for at
        # most 3000 trees -- confirm it is intended.
        'learning_rate': trial.suggest_float('learning_rate', 1e-8, 1e-3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # NOTE(review): no subsample_freq is set alongside subsample -- verify
        # that row subsampling is actually active with these parameters.
        'subsample': trial.suggest_float('subsample', 0.4, 1, log=True),
    }

    # Build and fit the model, stopping once the eval metric has not improved
    # for 200 rounds; log_evaluation(period=0) keeps per-iteration output off.
    model_lgbm = LGBMRegressor(**lgbm_param)
    model_lgbm = model_lgbm.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        callbacks=[early_stopping(stopping_rounds=200, verbose=False),
                   log_evaluation(period=0)],
    )

    # Evaluation metric returned to Optuna; swap in whichever metric you need.
    MAE = mean_absolute_error(y_test, model_lgbm.predict(X_test))
    return MAE

# The objective returns MAE, so the study minimizes it.
optuna_lgbm = optuna.create_study(direction='minimize', sampler=sampler)

# n_trials sets how many times Optuna runs the objective while searching for
# hyper-parameters.  According to the author, 50 trials already give useful values.
optuna_lgbm.optimize(objective, n_trials=50)

In [None]:
# Report the best trial's objective value and the parameters it used.
lgbm_trial = optuna_lgbm.best_trial
lgbm_trial_params = lgbm_trial.params
print('Best Trial: score {},\nparams {}'.format(lgbm_trial.value, lgbm_trial_params))


In [None]:
# Refit a LightGBM regressor on the training split with the best trial's parameters.
# NOTE(review): lgbm_trial_params presumably holds only the values suggested by
# the trial, so the fixed 'objective' / 'metric' / 'verbose' settings used during
# tuning (and its early stopping) are not re-applied here -- confirm this is intended.
lgbm = LGBMRegressor(**lgbm_trial_params)
lgbm_study = lgbm.fit(X_train, y_train)
