In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use("ggplot")
import seaborn as sns

In [None]:
# Load the two pre-computed feature sets (variable names suggest STFT- and
# FFT-based features; the files themselves are not visible from this notebook).
train_stft = pd.read_csv("../input/volcano-20201231-feature-engineering/train_features.csv")
test_stft = pd.read_csv("../input/volcano-20201231-feature-engineering/test_features.csv")

train_fft = pd.read_csv("../input/volcano-2020-10-19/train_features.csv")
test_fft = pd.read_csv("../input/volcano-2020-10-19/test_features.csv")

# Put both feature sets side by side; rows are matched on the index.
# NOTE(review): assumes both files list the segments in the same row order and
# share no column names -- confirm against the feature-engineering notebooks.
train_features = pd.concat([train_stft, train_fft], axis=1)
test_features = pd.concat([test_stft, test_fft], axis=1)

# Stack train on top of test so later transforms see both at once.
# ignore_index=True gives the stacked frame a unique 0..N-1 index; without it
# the train and test row labels repeat, which makes any later index-aligned
# operation ambiguous. Missing values become 0 without mutating in place, so
# re-running the cell always starts from the same inputs.
features = pd.concat(
    [train_features, test_features], axis=0, ignore_index=True
).fillna(0)
features

In [None]:
# Draw 40 feature columns at random (fixed seed) for a visual spot check.
tmp = features.sample(axis=1, n=40, random_state=91)
tmp

In [None]:
# Histogram of every sampled column before any transform, on an 8 x 5 grid.
fig, axes = plt.subplots(nrows=8, ncols=5, figsize=(25, 20))
flat_axes = axes.ravel()

for idx, col in enumerate(tmp.columns):
    flat_axes[idx].hist(tmp[col], bins=50, color="teal")

plt.show()

In [None]:
from scipy.special import erfinv

def rank_gauss(x):
    """Apply a rank-Gauss transform to a 1-D array or Series.

    Each value is replaced by its rank (via a double ``argsort``), the ranks
    are scaled by the sample count, centred and doubled so they lie strictly
    inside (-1, 1), and then passed through the inverse error function.
    The result is finally shifted to have zero mean.

    Parameters
    ----------
    x : array-like with ``shape`` and ``argsort``
        Input values. The input itself is not modified.

    Returns
    -------
    Same container type as the intermediate rank object, holding the
    transformed values.
    """
    n_samples = x.shape[0]

    # argsort of the argsort yields the 0-based rank of every element.
    ranks = x.argsort().argsort()

    scaled = ranks / n_samples
    centred = (scaled - scaled.mean()) * 2

    gauss = erfinv(centred)
    return gauss - gauss.mean()

In [None]:
# Same 8 x 5 grid of histograms, this time after the rank-Gauss transform.
fig, axes = plt.subplots(nrows=8, ncols=5, figsize=(25, 20))
flat_axes = axes.ravel()

for idx, col in enumerate(tmp.columns):
    transformed = rank_gauss(tmp[col])
    flat_axes[idx].hist(transformed, bins=50, color="teal")

plt.show()

In [None]:
# Rank-Gauss transform every feature column.
# The transformed columns are collected in a list and concatenated once:
# concatenating inside the loop re-copies the growing frame on every
# iteration, which is quadratic in the number of columns.
rg_columns = [rank_gauss(features[col]) for col in features.columns]
X_rg = pd.concat(rg_columns, axis=1)

X_rg

In [None]:
# The first len(train_features) rows of the stacked frame are the training
# rows and the remainder are the test rows; both parts become NumPy arrays.
n_train = len(train_features)

X = np.array(X_rg.iloc[:n_train, :])
X_test = np.array(X_rg.iloc[n_train:, :])

In [None]:
# Regression target for the training segments.
# NOTE(review): assumes train.csv rows are in the same order as the rows of
# the train feature files used to build X -- confirm.
train = pd.read_csv("../input/predict-volcanic-eruptions-ingv-oe/train.csv")
y = train.loc[:, "time_to_eruption"]
y

## Prediction

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error as mae
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [None]:
# Hold out 20% of the training rows (fixed seed) for the hyper-parameter search.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=34, test_size=0.2
)

for part in (X_train, X_val):
    print(part.shape)

In [None]:
# Two shuffled 10-fold splitters that differ only in their random seed.
kf1, kf2 = (
    KFold(n_splits=10, shuffle=True, random_state=seed) for seed in (95, 43)
)

## LightGBM

In [None]:
# LightGBM datasets for the hold-out split used by the hyper-parameter search.
# The validation set is created with the training set as its reference.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

In [None]:
def lgb_score(params):
    """Hyperopt objective: train LightGBM with the sampled parameters and score it.

    Parameters
    ----------
    params : dict
        Hyper-parameters sampled by hyperopt for this trial. ``num_leaves``
        and ``min_data_in_leaf`` are cast to ``int`` in place before use.

    Returns
    -------
    dict
        ``{"loss": <MAE on the hold-out set>, "status": STATUS_OK}``.

    Side effects
    ------------
    Appends ``(params, score)`` to the module-level ``history`` list.
    Reads the module-level ``lgb_params``, ``lgb_train``, ``lgb_val``,
    ``X_val`` and ``y_val``.
    """
    params["num_leaves"] = int(params["num_leaves"])
    params["min_data_in_leaf"] = int(params["min_data_in_leaf"])

    # Overlay the sampled values on the base configuration. Passing
    # ``lgb_params`` alone would make every trial train the exact same model,
    # so the search would never actually explore the parameter space.
    trial_params = {**lgb_params, **params}

    model = lgb.train(params=trial_params,
                      train_set=lgb_train,
                      valid_sets=(lgb_train, lgb_val),
                      num_boost_round=10000,
                      early_stopping_rounds=20,
                      verbose_eval=0)

    pred = model.predict(X_val, num_iteration=model.best_iteration)
    # scikit-learn metric signature is (y_true, y_pred).
    score = mae(y_val, pred)

    history.append((params, score))

    return {"loss": score, "status": STATUS_OK}

In [None]:
# Base LightGBM configuration used during the hyper-parameter search.
lgb_params = {
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "regression",
    # The native parameter name is "metric". "eval_metric" is an argument of
    # the scikit-learn wrapper's fit() and is not recognised by lgb.train, so
    # early stopping would monitor the objective's default metric instead.
    "metric": "mae",
    "num_leaves": 31,
    "learning_rate": 0.08,
    "bagging_fraction": 0.8,
    # bagging_fraction only takes effect when bagging_freq is non-zero.
    "bagging_freq": 1,
    "feature_fraction": 0.8,
    "min_data_in_leaf": 20,
    "random_state": 93
}

# Parameter space to search.
params_space = {
    "num_leaves": hp.quniform("num_leaves", 25, 50, 1),
    "min_data_in_leaf": hp.quniform("min_data_in_leaf", 10, 70, 1),
    "bagging_fraction": hp.quniform("bagging_fraction", 0.6, 0.95, 0.025),
    "feature_fraction": hp.quniform("feature_fraction", 0.6, 0.95, 0.025)
}

trials = Trials()
# Filled by lgb_score with one (params, score) tuple per trial.
history = []

fmin(lgb_score, params_space, algo=tpe.suggest, trials=trials, max_evals=100)

# Lowest MAE first; the head of the list is the best trial.
history = sorted(history, key=lambda tpl: tpl[1])
best = history[0]

print(f"best_params: {best[0]}, mae: {best[1]:.10f}")

In [None]:
# Final LightGBM configuration: the best values found by the search, with a
# lower learning rate than the one used while searching.
lgb_params = {
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "regression",
    # The native parameter name is "metric". "eval_metric" is an argument of
    # the scikit-learn wrapper's fit() and is not recognised by lgb.train, so
    # early stopping would monitor the objective's default metric instead.
    "metric": "mae",
    "num_leaves": best[0]["num_leaves"],
    "learning_rate": 0.05,
    "bagging_fraction": best[0]["bagging_fraction"],
    # bagging_fraction only takes effect when bagging_freq is non-zero.
    "bagging_freq": 1,
    "feature_fraction": best[0]["feature_fraction"],
    "min_data_in_leaf": best[0]["min_data_in_leaf"],
    "random_state": 93
}

In [None]:
# 10-fold CV with the first splitter: print each fold's validation MAE and
# keep that fold model's predictions on the test set.
fold_preds = []

for k, (tr_id, vl_id) in enumerate(kf1.split(X, y)):

    X_train, X_val = X[tr_id, :], X[vl_id, :]
    # KFold yields positional indices, so select positionally. Plain y[...]
    # on a Series is label-based and is only correct when the index happens
    # to be exactly 0..n-1.
    y_train, y_val = y.iloc[tr_id], y.iloc[vl_id]

    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

    model = lgb.train(params=lgb_params,
                      train_set=lgb_train,
                      valid_sets=(lgb_train, lgb_val),
                      num_boost_round=20000,
                      early_stopping_rounds=100,
                      verbose_eval=0)

    pred = model.predict(X_val, num_iteration=model.best_iteration)
    # scikit-learn metric signature is (y_true, y_pred).
    fold_mae = mae(y_val, pred)
    print(f"k={k+1}, mae: {fold_mae}")

    pred = model.predict(X_test, num_iteration=model.best_iteration)
    fold_preds.append(pd.Series(pred, name=f"kf1_fold{k + 1}"))

# Concatenate once at the end: one uniquely named column per fold, instead of
# growing a frame inside the loop where every column ends up labelled 0.
pred_lgb1 = pd.concat(fold_preds, axis=1)

In [None]:
# Test-set predictions from the kf1 run, one column per fold model.
pred_lgb1

In [None]:
# 10-fold CV with the second splitter (different shuffle seed): print each
# fold's validation MAE and keep that fold model's predictions on the test set.
fold_preds = []

for k, (tr_id, vl_id) in enumerate(kf2.split(X, y)):

    X_train, X_val = X[tr_id, :], X[vl_id, :]
    # KFold yields positional indices, so select positionally. Plain y[...]
    # on a Series is label-based and is only correct when the index happens
    # to be exactly 0..n-1.
    y_train, y_val = y.iloc[tr_id], y.iloc[vl_id]

    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

    model = lgb.train(params=lgb_params,
                      train_set=lgb_train,
                      valid_sets=(lgb_train, lgb_val),
                      num_boost_round=20000,
                      early_stopping_rounds=100,
                      verbose_eval=0)

    pred = model.predict(X_val, num_iteration=model.best_iteration)
    # scikit-learn metric signature is (y_true, y_pred).
    fold_mae = mae(y_val, pred)
    print(f"k={k+1}, mae: {fold_mae}")

    pred = model.predict(X_test, num_iteration=model.best_iteration)
    fold_preds.append(pd.Series(pred, name=f"kf2_fold{k + 1}"))

# Concatenate once at the end: one uniquely named column per fold, instead of
# growing a frame inside the loop where every column ends up labelled 0.
pred_lgb2 = pd.concat(fold_preds, axis=1)

In [None]:
# Test-set predictions from the kf2 run, one column per fold model.
pred_lgb2

# Submission

In [None]:
# Ensemble: row-wise mean over the fold-model predictions of both CV runs.
all_fold_preds = [pred_lgb1, pred_lgb2]
pred = pd.concat(all_fold_preds, axis=1).mean(axis=1)
pred

In [None]:
# Fill the sample submission's target column with the averaged predictions
# (the assignment is aligned on the row index).
# NOTE(review): assumes the test feature rows follow the row order of
# sample_submission.csv -- confirm.
sample_sub = pd.read_csv("../input/predict-volcanic-eruptions-ingv-oe/sample_submission.csv")
sub = sample_sub.copy().assign(time_to_eruption=pred)

sub

In [None]:
# Write the submission file; index=False leaves the row index out of the CSV.
sub.to_csv("submission.csv", index=False)