# Libraries
---

In [None]:
import pandas as pd
import numpy as np
import random
import time
import os
import gc

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

# Parameters
---

In [None]:
# --- Reproducibility ---------------------------------------------------
SEED = 2021                  # single random state shared by every stochastic step

# --- Cross-validation / boosting budget --------------------------------
N_SPLITS = 5                 # number of K-fold splits
N_ESTIMATORS = 50_000        # upper bound on boosting rounds per fold
EARLY_STOPPING_ROUNDS = 200  # early-stopping patience used in the LightGBM training cell
VERBOSE = 1_000              # logging setting used in the LightGBM training cell

# --- Modelling target --------------------------------------------------
TARGET = 'f1'                # column treated as the regression target

In [None]:
def seed_everything(seed=42):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG, and
    exports ``PYTHONHASHSEED`` into the process environment.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    # Exported as a string because environment values must be str.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Same seed for both global generators.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Seed the global RNGs once, up front, with the notebook-wide SEED.
seed_everything(SEED)

# Datasets
---

In [None]:
# Competition input directory and a local directory for saved predictions.
INPUT = "../input/tabular-playground-series-sep-2021/"
OUTPUT = "./output/"
os.makedirs(OUTPUT, exist_ok=True)

# Raw competition files, indexed by row id.
train = pd.read_csv(INPUT + "train.csv").set_index('id')
test = pd.read_csv(INPUT + "test.csv").set_index('id')

# Predictors: every test column whose name contains 'f', except the column
# that is being modelled.
features = [name for name in test.columns if 'f' in name and name != TARGET]

# Pool both files, then split the rows by whether TARGET is observed:
# rows with a value become the training frame, rows without one are the
# frame to predict.
all_df = pd.concat([train, test])
train_df = all_df.loc[all_df[TARGET].notna()].copy()
test_df = all_df.loc[all_df[TARGET].isna()].copy()

target = train_df[TARGET]

# The pooled and raw frames are no longer needed; release their memory.
del all_df, train, test
gc.collect()

In [None]:
# Quick look at the distribution of the observed target values.
target.hist()

In [None]:
# Row-level summary features created by this cell.
engineered = ['n_missing', 'std', 'min']

# Work from the raw predictors only. Filtering out the engineered names keeps
# the feature list free of duplicates if this cell is executed more than once
# (the original `features += [...]` appended the names again on every run).
base_features = [col for col in features if col not in engineered]

# Count the missing predictors per row BEFORE imputing, so the information
# that a value was absent is kept as a feature.
train_df['n_missing'] = train_df[base_features].isna().sum(axis=1)
test_df['n_missing'] = test_df[base_features].isna().sum(axis=1)

# Impute both frames with the SAME statistics (column means of train_df), so
# the rows being predicted are preprocessed exactly like the rows the model is
# fitted on. This matches the scaling step, which is also fitted on train_df
# only. Previously test_df was filled with its own column means.
fill_values = train_df[base_features].mean()
train_df[base_features] = train_df[base_features].fillna(fill_values)
test_df[base_features] = test_df[base_features].fillna(fill_values)

# Per-row spread and minimum across the (imputed) raw predictors.
train_df['std'] = train_df[base_features].std(axis=1)
test_df['std'] = test_df[base_features].std(axis=1)

train_df['min'] = train_df[base_features].min(axis=1)
test_df['min'] = test_df[base_features].min(axis=1)

# Final predictor list: raw predictors followed by the engineered columns.
features = base_features + engineered

# Keep an unscaled copy of the training missing-value counts.
n_missing = train_df['n_missing'].copy()

In [None]:
# Learn the per-column mean/variance from the training rows only ...
scaler = StandardScaler().fit(train_df[features])

# ... then apply that same transform to both frames.
train_df[features] = scaler.transform(train_df[features])
test_df[features] = scaler.transform(test_df[features])

In [None]:
# Sanity check: shapes of the training matrix, the target and the test matrix.
tuple(obj.shape for obj in (train_df[features], target, test_df[features]))

# LGBMRegressor
---

In [None]:
# Keyword arguments forwarded to lgb.LGBMRegressor in the training cell.
lgb_params = dict(
    # Task and reproducibility
    objective='regression',
    random_state=SEED,
    # Boosting budget: many rounds at a small step size
    n_estimators=N_ESTIMATORS,
    learning_rate=0.005,
    # Row subsampling (re-drawn every iteration) and column subsampling
    subsample=0.4,
    subsample_freq=1,
    colsample_bytree=0.2,
    # L1 / L2 regularisation
    reg_alpha=10.0,
    reg_lambda=0.1,
    # Leaf-size constraints
    min_child_weight=256,
    min_child_samples=20,
    # Report feature importances as gain
    importance_type='gain',
)

In [None]:
# K-fold cross-validated LightGBM regression.
#
# Produces:
#   lgb_oof         - out-of-fold prediction for every training row
#   lgb_pred        - test prediction averaged over the N_SPLITS fold models
#   lgb_importances - one row per (feature, fold) with the fold's importance
lgb_oof = np.zeros(train_df.shape[0])
lgb_pred = np.zeros(test_df.shape[0])
# Per-fold importance frames are collected here and concatenated once after
# the loop (DataFrame.append was removed in pandas 2.0, and growing a frame
# inside a loop copies it on every iteration).
fold_importances = []

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# The test matrix is identical for every fold, so slice it once.
X_test = test_df[features]

for fold, (trn_idx, val_idx) in enumerate(kf.split(X=train_df)):
    print(f"===== fold {fold} =====")
    X_train, y_train = train_df[features].iloc[trn_idx], target.iloc[trn_idx]
    X_valid, y_valid = train_df[features].iloc[val_idx], target.iloc[val_idx]

    start = time.time()
    model = lgb.LGBMRegressor(**lgb_params)
    # Early stopping and periodic logging are configured through callbacks:
    # the `early_stopping_rounds=` / `verbose=` keyword arguments of fit()
    # were removed in LightGBM 4.0. (`lgb.log_evaluation` needs LightGBM >= 3.3.)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='rmse',
        callbacks=[
            lgb.early_stopping(EARLY_STOPPING_ROUNDS),
            lgb.log_evaluation(VERBOSE),
        ],
    )

    fold_importances.append(pd.DataFrame({
        'feature': model.feature_name_,
        'importance': model.feature_importances_,
        'fold': fold,
        'seed': SEED,
    }))

    lgb_oof[val_idx] = model.predict(X_valid)
    # Each fold contributes an equal share to the averaged test prediction.
    lgb_pred += model.predict(X_test) / N_SPLITS

    elapsed = time.time() - start
    # RMSE as sqrt(MSE): works on every scikit-learn version, unlike the
    # deprecated `squared=False` argument.
    rmse = np.sqrt(mean_squared_error(y_valid, lgb_oof[val_idx]))
    print(f"fold {fold} - lgb rmse: {rmse:.6f}, elapsed time: {elapsed:.2f}sec\n")

lgb_importances = pd.concat(fold_importances)

print(f"oof lgb rmse = {np.sqrt(mean_squared_error(target, lgb_oof))}")

# Persist the out-of-fold and test predictions for later use.
np.save(OUTPUT + f"{TARGET}_oof.npy", lgb_oof)
np.save(OUTPUT + f"{TARGET}_pred.npy", lgb_pred)

# Feature importance
---

In [None]:
# Average every feature's importance over the folds (computed once and reused
# for both the ordering and the bars).
mean_importances = lgb_importances.groupby('feature').mean()

# Features from most to least important; drives the bar order.
order = mean_importances.sort_values('importance', ascending=False).index.tolist()

fig, ax = plt.subplots(figsize=(16, 16), tight_layout=True)
sns.barplot(
    x="importance",
    y="feature",
    data=mean_importances.reset_index(),
    order=order,
    ax=ax,
)
ax.set_title("LightGBM feature importances")