In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
import random

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import KFold

In [None]:
from xgboost import XGBRegressor
import tensorflow as tf

In [None]:
def seed_everything(seed=42):
    """Seed every random number source this notebook uses.

    Seeds Python's ``random`` module, NumPy's global generator and
    TensorFlow, and writes the seed to the ``PYTHONHASHSEED`` environment
    variable.

    Parameters
    ----------
    seed : int, default 42
        Value handed to each seeding call.
    """
    # NOTE(review): PYTHONHASHSEED is presumably only read at interpreter
    # start-up, so this likely affects child processes only — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)


seed_everything(0)

The data is a preprocessed feature set that I uploaded to Kaggle; I will share the notebook used to produce these features. The features are classic statistics (e.g. mean, RMS) computed on each sensor signal, on its first and second derivatives, on the cumulative sum of the sensor values, and on its wavelet transform.

In [None]:
# Load the precomputed feature tables for the train and test sets.
train = pd.read_csv('../input/ingv-data/Train.csv')
test = pd.read_csv('../input/ingv-data/Test.csv')

## 1- Data Processing

In [None]:
# Competition targets; joined onto the feature table by segment_id in the next cell.
targets_df = pd.read_csv('../input/predict-volcanic-eruptions-ingv-oe/train.csv')

In [None]:
# Attach the targets to the features (feature 'id' == target 'segment_id'),
# then discard the now-redundant join key.
train = (
    train
    .merge(targets_df, left_on='id', right_on='segment_id')
    .drop(columns=['segment_id'])
)

In [None]:
# Pull the segment ids out of the test features; pop() returns the column
# and removes it from the frame in one step.
test_idx = test.pop('id')

In [None]:
# Keep the ids and the regression target aside, leaving only feature columns in `train`.
train_idx = train['id']
targets = train['time_to_eruption']
train = train.drop(columns=['time_to_eruption', 'id'])

In [None]:
# Columns with at least 40 distinct non-null values.
c_columns = [col for col in train.columns if train[col].nunique() >= 40]

In [None]:
# Number of columns with at least 40 distinct values.
len(c_columns)

In [None]:
# Columns holding exactly one distinct non-null value (constant features).
uni_val_cols = [col for col in train.columns if train[col].nunique() == 1]

In [None]:
# Number of constant columns (exactly one distinct value).
len(uni_val_cols)

In [None]:
# Columns with fewer than 40 distinct non-null values that are not in uni_val_cols.
cat_cols = [
    col
    for col in train.columns
    if col not in uni_val_cols and train[col].nunique() < 40
]

In [None]:
# Number of columns with fewer than 40 distinct values, excluding uni_val_cols.
len(cat_cols)

In [None]:
# Remove the constant columns (as measured on train) from both feature tables.
train_features = train.drop(columns=uni_val_cols)
test_features = test.drop(columns=uni_val_cols)

In [None]:
# Total number of missing values across all training columns (before filling).
train.isna().sum().sum()

In [None]:
# Replace every missing feature value with 0 in both tables.
train_features = train_features.fillna(0)
test_features = test_features.fillna(0)

In [None]:
# Give both tables a fresh 0..n-1 index; later cells index rows by position.
train_features = train_features.reset_index(drop=True)
test_features = test_features.reset_index(drop=True)

In [None]:
# Sanity check: expected to be 0 now that missing values were filled with 0.
train_features.isna().sum().sum()

In [None]:
# Histogram of one example feature (the 11th entry of c_columns) using 100 bins.
fig, ax = plt.subplots()
train_features[c_columns[10]].hist(bins=100, ax=ax)
plt.show()

In [None]:
# Turn the target Series into a DataFrame with columns ['index', 'time_to_eruption'].
# Selecting only 'time_to_eruption' before reset_index() keeps this cell
# idempotent: the original fed the already-reset frame back through
# reset_index(inplace=True) on every re-run, which accumulated extra index
# columns and eventually raised.
targets = pd.DataFrame(targets)[['time_to_eruption']].reset_index()
targets.head()

In [None]:
# Regression target as a Series; its index is the frame's 0..n-1 index.
y_train = targets.loc[:, 'time_to_eruption']
y_train.head()

## 2- Modeling

In [None]:
def build_xgb_model(seed_):
    """Create an unfitted XGBRegressor with the notebook's fixed hyper-parameters.

    Parameters
    ----------
    seed_ : int
        Passed to the regressor as its ``seed`` argument.

    Returns
    -------
    XGBRegressor
        A new, untrained estimator.
    """
    # NOTE(review): 'gpu_hist' presumably needs a CUDA-enabled XGBoost build — confirm.
    params = dict(
        tree_method='gpu_hist',
        colsample_bytree=0.4,
        gamma=0,
        learning_rate=0.07,
        max_depth=3,
        min_child_weight=1.5,
        n_estimators=1000,
        reg_alpha=0.75,
        reg_lambda=0.45,
        subsample=0.6,
        seed=seed_,
    )
    return XGBRegressor(**params)

In [None]:
def run_xgb(X, y, X_test, fold, seed):
    """Fit one XGBoost model with `fold` held out and predict on validation and test rows.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features plus a 'kfold' column naming each row's validation fold.
    y : pandas.Series
        Targets; indexed with the same labels as `X` (selected with X's boolean mask).
    X_test : pandas.DataFrame
        Test features; must contain every training feature column.
    fold : int
        Fold id to hold out for validation.
    seed : int
        Seed used for the global RNGs and for the model.

    Returns
    -------
    oof : numpy.ndarray, shape (len(X), 1)
        Validation predictions written at the held-out rows, zeros elsewhere.
    predictions : numpy.ndarray, shape (len(X_test), 1)
        Test-set predictions of this fold's model.
    """
    seed_everything(seed)

    train_mask = X['kfold'] != fold
    # Positional boolean mask for writing into the NumPy OOF buffer. The
    # original indexed the buffer with pandas index *labels*, which is only
    # correct when X carries a default 0..n-1 index.
    valid_rows = (~train_mask).to_numpy()

    X_train = X.loc[train_mask].drop(columns=['kfold']).reset_index(drop=True)
    y_train = y.loc[train_mask].reset_index(drop=True)

    X_val = X.loc[~train_mask].drop(columns=['kfold']).reset_index(drop=True)
    y_val = y.loc[~train_mask].reset_index(drop=True)

    oof = np.zeros((X.shape[0], 1))

    model = build_xgb_model(seed)

    print(f'============={seed}========={fold}==================')

    model.fit(X_train, y_train)
    train_loss = mean_absolute_error(y_train, model.predict(X_train))
    print(f"Seed: {seed}, FOLD: {fold}, train_loss: {train_loss}")

    valid_preds = model.predict(X_val)
    oof[valid_rows] = valid_preds.reshape(-1, 1)
    valid_loss = mean_absolute_error(y_val, valid_preds)
    print(f"Seed: {seed}, FOLD: {fold}, val_loss: {valid_loss}")

    # ------------------ Predictions -------------------
    # Select the test columns in the training column order.
    predictions = model.predict(X_test[X_train.columns]).reshape(-1, 1)

    return oof, predictions

In [None]:
def run_k_fold_xgb(X, y, X_test, seed):
    """Run `run_xgb` once per fold and combine the results.

    Uses the module-level ``N_FOLDS`` constant, which must be defined before
    this function is called.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features including the 'kfold' column expected by `run_xgb`.
    y : pandas.Series
        Training targets.
    X_test : pandas.DataFrame
        Test features.
    seed : int
        Seed forwarded to every fold.

    Returns
    -------
    oof : numpy.ndarray, shape (len(X), 1)
        Sum of the per-fold out-of-fold arrays.
    predictions : numpy.ndarray, shape (len(X_test), 1)
        Mean of the per-fold test predictions.
    """
    # Size the buffers from the arguments themselves; the original read the
    # global train_features / test_features frames, which silently coupled
    # the function to notebook state.
    oof = np.zeros((X.shape[0], 1))
    predictions = np.zeros((X_test.shape[0], 1))

    for fold in range(N_FOLDS):
        fold_oof, fold_pred = run_xgb(X, y, X_test, fold, seed)

        predictions += fold_pred / N_FOLDS
        oof += fold_oof

    return oof, predictions

In [None]:
# Number of cross-validation folds; read as a global by run_k_fold_xgb and by the seed-averaging loop.
N_FOLDS = 10

In [None]:
# Averaging on multiple SEEDS: each seed reshuffles the K-fold split and
# reseeds the model; OOF and test predictions are averaged over all seeds.

seeds = [0, 1, 2, 3, 4, 5, 6]
oof = np.zeros((train_features.shape[0], 1))
predictions = np.zeros((test_features.shape[0], 1))

for seed in seeds:
    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)

    # KFold.split yields positional indices, so record the fold ids in a
    # NumPy array (positional by construction) instead of writing them
    # through label-based .loc. The array is integer from the start, and the
    # unused 'idx' column the original added and then dropped is gone.
    fold_ids = np.zeros(len(train_features), dtype=int)
    for f, (_, v_idx) in enumerate(kf.split(train_features)):
        fold_ids[v_idx] = f

    folds = train_features.assign(kfold=fold_ids)
    oof_, predictions_ = run_k_fold_xgb(folds, y_train, test_features, seed)
    oof += oof_ / len(seeds)
    predictions += predictions_ / len(seeds)

In [None]:
# Cross-validated scores of the seed-averaged out-of-fold predictions.
cv_mae = mean_absolute_error(y_train, oof)
print(f"CV MAE = {cv_mae}")
cv_r2 = r2_score(y_train, oof)
print(f"CV R2 = {cv_r2}")

## 3- Predictions

In [None]:
# Load the sample submission file; a later cell replaces its time_to_eruption column.
submission = pd.read_csv('../input/predict-volcanic-eruptions-ingv-oe/sample_submission.csv')
submission.head()

In [None]:
# Pair each test segment id with its averaged prediction. `predictions` has
# shape (n, 1), so it is flattened to one value per row.
preds = pd.DataFrame({
    'segment_id': test_idx,
    'time_to_eruption': predictions.ravel(),
})
preds.head(2)

In [None]:
# Swap the placeholder target column for the predictions, matched on segment_id.
submission = (
    submission
    .drop(columns=['time_to_eruption'])
    .merge(preds, on='segment_id', how='inner')
)

In [None]:
# Write the submission file with a header row and without the index column.
submission.to_csv('submission.csv', index=False)