In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import seaborn as sns
import os

from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# ACKNOWLEDGEMENTS: IMPORTANT
    
I have used the great features created here:
https://www.kaggle.com/amanooo/ingv-volcanic-basic-solution-stft/

The model-tuning ideas and code are adapted from:
https://www.kaggle.com/isaienkov/top-3-efficient-ensembling-in-few-lines-of-code

My only aim here is to get the most out of those features by adding cross-validation folds and tuning the model's hyperparameters.

In [None]:
# Competition sample submission file; previewed here to show its layout.
SAMPLE_SUBMISSION_CSV = '/kaggle/input/predict-volcanic-eruptions-ingv-oe/sample_submission.csv'
sample_submission = pd.read_csv(SAMPLE_SUBMISSION_CSV)
sample_submission.head(10)

In [None]:
# Pre-computed training features. The CSV's first column is read as the index
# and then discarded in favour of a fresh 0..n-1 index.
TRAIN_FEATURES_CSV = '/kaggle/input/volcano-stft-data/output_train_set.csv'
train_set = pd.read_csv(TRAIN_FEATURES_CSV, index_col=0)
train_set = train_set.reset_index(drop=True)
train_set.head(10)

In [None]:
# Pre-computed test features. The CSV's first column is read as the index
# and then discarded in favour of a fresh 0..n-1 index.
TEST_FEATURES_CSV = '/kaggle/input/volcano-stft-data/output_test_set.csv'
test_set = pd.read_csv(TEST_FEATURES_CSV, index_col=0)
test_set = test_set.reset_index(drop=True)
test_set.head(10)

In [None]:
# Check that sample_submission and test_set list their segments in the same order.
# Test predictions are later written into sample_submission purely by position,
# so a mismatch would silently attach predictions to the wrong segment_id.
# The check is therefore enforced instead of only being displayed.
assert len(sample_submission) == len(test_set), (
    f"sample_submission has {len(sample_submission)} rows but test_set has {len(test_set)}"
)

# Compare the raw values so the result depends on row order only, not on index labels.
aligned_fraction = (
    sample_submission['segment_id'].values == test_set['segment_id'].values
).mean()
assert aligned_fraction == 1.0, (
    f"only {aligned_fraction:.2%} of segment_id values line up between "
    "sample_submission and test_set"
)
aligned_fraction

In [None]:
# Assign every training row to one of NFOLDS cross-validation folds, stratified
# on a coarsely binned version of the regression target.
from sklearn.model_selection import StratifiedKFold

NFOLDS = 10

# Target divided by its maximum, then rounded onto a grid of 1/20 steps to get
# discrete labels that StratifiedKFold can balance across folds.
max_time_to_eruption = train_set['time_to_eruption'].max()
train_set['time_to_eruption_pc'] = train_set['time_to_eruption'] / max_time_to_eruption
train_set['label_strat'] = np.round(train_set['time_to_eruption_pc'] * 20, 0)

skf = StratifiedKFold(n_splits=NFOLDS)
train_set['fold'] = 0

# Only the validation indices are needed: each row's fold id is the split in
# which it is held out. The positions returned by split() are used as .loc
# labels, which relies on train_set having a default 0..n-1 index.
fold_splits = skf.split(train_set[['segment_id']], train_set['label_strat'])
for f, (trn_idx, val_idx) in enumerate(fold_splits):
    train_set.loc[val_idx, 'fold'] = f

train_set['fold'].value_counts()

In [None]:
# Name of the regression target column.
LABEL = 'time_to_eruption'

# Columns that must not be fed to the model: identifiers, the target itself and
# the helper columns created for fold assignment.
DROP_FTS = [
    'segment_id',
    'time_to_eruption',
    'h:m:s',
    'label_strat',
    'fold',
    'time_to_eruption_pc',
]

# Every remaining train_set column is used as a model feature.
SEL_FTS = [col for col in train_set.columns if col not in DROP_FTS]

In [None]:
# Display the regression target column (LABEL) of the training set as a quick visual check.
train_set[LABEL]

In [None]:
# Overlay the train (green) and test (red) distribution of every model feature,
# one panel per feature, on a grid that is `nc` panels wide.
nc = 5
# Ceiling division: exactly enough rows for all features, with no spare empty
# row when the feature count is a multiple of nc, and at least one row so that
# plt.subplots always gets a valid grid.
nr = max(1, -(-len(SEL_FTS) // nc))

# squeeze=False keeps `axes` two-dimensional even when a single row is created,
# so the [row, col] indexing below always works.
fig, axes = plt.subplots(nrows=nr, ncols=nc, figsize=(25, nr * 4), squeeze=False)

for count, sf in enumerate(SEL_FTS):
    ax = axes[count // nc, count % nc]
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is kept
    # because the installed seaborn version cannot be determined from this notebook.
    sns.distplot(train_set[sf], ax=ax, color='Green')
    sns.distplot(test_set[sf], ax=ax, color='Red')

    ax.set_title(sf)
    sns.despine(ax=ax)

# Hide the panels in the last row that have no feature assigned to them.
for ax in axes.flat[len(SEL_FTS):]:
    ax.set_visible(False)

plt.tight_layout()

In [None]:
# Hyperparameter-search setup: Optuna with its TPE sampler.
import optuna
from optuna.samplers import TPESampler
# Only show Optuna log messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)
# NOTE(review): `xgb` is not referenced in any cell visible here; confirm before removing.
import xgboost as xgb

# Default number of Optuna trials per Optimizer run.
OPTUNA_TRIALS = 30

class Optimizer:
    """Optuna hyperparameter search scored on a single validation split.

    The objective does not receive data as arguments: it reads the module-level
    names ``X_train``, ``y_train``, ``X_val`` and ``y_val`` and builds each
    candidate with ``create_model(trial)``. All of these must be defined before
    ``optimize()`` is called.
    """

    def __init__(self, metric, trials=OPTUNA_TRIALS):
        """Store the search settings.

        Args:
            metric: ``'mae'`` scores candidates by mean absolute error; any
                other value scores them by root mean squared error.
            trials: number of Optuna trials to run.
        """
        self.metric = metric
        self.trials = trials
        # Fixed seed so the sampler proposes the same sequence on every run.
        self.sampler = TPESampler(seed=42)

    def objective(self, trial):
        """Fit one candidate model and return its negated validation error.

        The error is negated because the study maximizes its objective.
        """
        candidate = create_model(trial)
        candidate.fit(X_train, y_train)
        val_preds = candidate.predict(X_val)
        if self.metric != 'mae':
            return -np.sqrt(mse(y_val, val_preds))
        return -mae(y_val, val_preds)

    def optimize(self):
        """Run the search and return the best trial's parameter dict."""
        search = optuna.create_study(sampler=self.sampler, direction="maximize")
        search.optimize(self.objective, n_trials=self.trials)
        return search.best_params

def create_model(trial):
    """Build an LGBMRegressor whose hyperparameters are sampled from an Optuna trial.

    Args:
        trial: an Optuna trial (anything exposing ``suggest_float`` and
            ``suggest_int``).

    Returns:
        An unfitted ``LGBMRegressor`` with ``random_state=42`` and
        ``learning_rate=0.05`` plus the six sampled hyperparameters.
    """
    # The suggestion order is kept stable so the seeded sampler proposes the
    # same sequence of values from run to run.
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.2, 0.95)
    subsample = trial.suggest_float("subsample", 0.2, 0.9)
    reg_alpha = trial.suggest_float("reg_alpha", 0.001, 100)
    max_depth = trial.suggest_int("max_depth", 1, 15)
    min_child_samples = trial.suggest_int('min_child_samples', 1, 15)
    n_estimators = trial.suggest_int('n_estimators', 100, 250)

    # NOTE(review): per the LightGBM parameter docs, bagging (`subsample`) only
    # takes effect when `subsample_freq` is non-zero, which is not set here.
    # Confirm whether `subsample` is meant to be active.
    model = LGBMRegressor(
        random_state=42,
        max_depth=max_depth,
        n_estimators=n_estimators,
        colsample_bytree=colsample_bytree,
        subsample=subsample,
        reg_alpha=reg_alpha,
        # Previously sampled but never forwarded, so the search could not see
        # any effect from this parameter.
        min_child_samples=min_child_samples,
        learning_rate=0.05,
    )

    return model

In [None]:
# Run a separate Optuna search on every fold, refit the best configuration to
# report its validation MAE, then save the per-fold parameters and average them.
param_outputs = []

FOLD_VALUES = list(train_set['fold'].unique())

for fold in FOLD_VALUES:

    trn_idx = train_set['fold'] != fold
    val_idx = train_set['fold'] == fold

    # Optimizer.objective reads X_train / y_train / X_val / y_val as module-level
    # names, so these four variables must keep exactly these names.
    X_train = train_set.loc[trn_idx, SEL_FTS].values
    y_train = train_set.loc[trn_idx, LABEL].values

    X_val = train_set.loc[val_idx, SEL_FTS].values
    y_val = train_set.loc[val_idx, LABEL].values

    # NOTE(review): any metric other than 'mae' makes the search minimise RMSE,
    # while the figure printed below is MAE. Confirm this is intended.
    optimizer = Optimizer('mse')

    lgb_params = optimizer.optimize()
    lgb_params['random_state'] = 42
    param_outputs.append(lgb_params)

    print(fold, lgb_params)

    # Refit with the learning rate the search itself used (create_model fixes it
    # at 0.05); otherwise the library default would be applied to parameters
    # that were tuned for a different rate. Listed first so that an explicit
    # 'learning_rate' in lgb_params would still win.
    model_output = LGBMRegressor(**{'learning_rate': 0.05, **lgb_params})
    model_output.fit(X_train, y_train)

    preds = model_output.predict(X_val)

    print(mae(y_val, preds))


param_headings = ['colsample_bytree', 'subsample', 'reg_alpha',
                  'max_depth', 'min_child_samples', 'n_estimators', 'random_state']

# param_outputs is in loop order, i.e. the same order as FOLD_VALUES, so rows are
# paired with fold labels by position. Using the fold label itself as a list
# index is only correct when unique() happens to return 0..N-1 in order.
# Cast to float so integer parameters are stored the same way as the others.
params_output_df = pd.DataFrame(param_outputs, index=FOLD_VALUES)[param_headings].astype(float)

print('saving CSV')
params_output_df.to_csv('optuna_lgbm_params.csv', index=True)

param_means = params_output_df.mean(axis=0)
param_means

In [None]:
# Train the final ensemble: one LightGBM model per (random seed, fold) pair using
# the fold-averaged tuned parameters. Collects out-of-fold predictions, averaged
# test predictions and averaged feature importances.
FOLD_VALUES = list(train_set['fold'].unique())
print('FOLDS', FOLD_VALUES)

oof = np.zeros((len(train_set),))
test_predictions = np.zeros((len(sample_submission),))

ft_importances = pd.Series(index=SEL_FTS, data=0.0)

RANDOM_SEEDS = [42, 0, 1, 2, 3, 4, 5, 6]

# The test feature matrix is identical for every model, so it is built once.
# Plain values are used to match the array format the models are fitted on.
X_test = test_set[SEL_FTS].values

for RS in RANDOM_SEEDS:

    print('running random seed', RS)

    for fold in FOLD_VALUES:

        trn_idx = train_set['fold'] != fold
        val_idx = train_set['fold'] == fold

        X_train = train_set.loc[trn_idx, SEL_FTS].values
        y_train = train_set.loc[trn_idx, LABEL].values

        X_val = train_set.loc[val_idx, SEL_FTS].values
        y_val = train_set.loc[val_idx, LABEL].values

        lgb = LGBMRegressor(random_state=RS,
                            n_estimators=int(param_means['n_estimators']),
                            subsample=param_means['subsample'],
                            reg_alpha=param_means['reg_alpha'],
                            max_depth=int(param_means['max_depth']),
                            min_child_samples=int(param_means['min_child_samples']),
                            colsample_bytree=param_means['colsample_bytree'],
                            # Same learning rate the parameter search used
                            # (create_model fixes it at 0.05); the tuned tree
                            # count is only meaningful together with that rate.
                            learning_rate=0.05,
                            )

        lgb.fit(X_train, y_train)
        preds = lgb.predict(X_val)
        # Each row is in the validation set exactly once per seed, so oof
        # accumulates one prediction per seed for every row.
        oof[val_idx] += preds

        test_predictions += lgb.predict(X_test)

        ft_importances[:] += lgb.feature_importances_

        print('Random Seed', RS, 'Fold ', fold, 'Error', np.sqrt(mse(y_val, preds)))

# Average the per-seed out-of-fold predictions.
oof = oof / len(RANDOM_SEEDS)

# The value is the square root of the MSE, so it is labelled as RMSE.
print('final OOF RMSE', np.sqrt(mse(train_set[LABEL], oof)))

# Test predictions and importances were summed over every (seed, fold) model.
n_models = len(FOLD_VALUES) * len(RANDOM_SEEDS)
test_predictions = test_predictions / n_models

ft_importances[:] = ft_importances / n_models

sns.kdeplot(test_predictions, color='Red')

In [None]:
# Feature importances - kernel density estimate of the values held in ft_importances,
# showing how importance is spread across the features.

sns.kdeplot(ft_importances, color='Blue')

In [None]:
# Feature importances - horizontal bar chart of the 20 largest values.

# The sorted Series is stored back under the same name, so ft_importances is
# ordered from most to least important from here on.
ft_importances = ft_importances.sort_values(ascending=False)
top_importances = ft_importances.iloc[:20]

fig, axes = plt.subplots(figsize=(8, 20))
axes.barh(y=top_importances.index, width=top_importances)

In [None]:
# Review the train (green) vs test (red) distributions of the highest-importance features.

# First 20 entries of ft_importances.
# NOTE(review): these are the *top* 20 only if ft_importances has already been
# sorted in descending order by an earlier cell; confirm run order.
TOP_FTS = list(ft_importances.index[:20])

nc = 5
# Ceiling division so a feature count that is not a multiple of nc still gets
# enough rows (floor division would raise IndexError on the last features);
# at least one row so plt.subplots always gets a valid grid.
nr = max(1, -(-len(TOP_FTS) // nc))

# squeeze=False keeps `axes` two-dimensional even when a single row is created.
fig, axes = plt.subplots(nrows=nr, ncols=nc, figsize=(25, nr * 4), squeeze=False)

for count, sf in enumerate(TOP_FTS):
    ax = axes[count // nc, count % nc]
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is kept
    # because the installed seaborn version cannot be determined from this notebook.
    sns.distplot(train_set[sf], ax=ax, color='Green')
    sns.distplot(test_set[sf], ax=ax, color='Red')

    ax.set_title(sf)
    sns.despine(ax=ax)

# Hide any panels in the last row that have no feature assigned to them.
for ax in axes.flat[len(TOP_FTS):]:
    ax.set_visible(False)

plt.tight_layout()

In [None]:
# Total summed feature importance for each measure, e.g. 'BH_pow', across all sensors.

# Measure names are taken from one sensor's columns by cutting off its prefix.
# str.lstrip('s9_') is not used because it strips any leading 's', '9' or '_'
# characters rather than the prefix, which mangles measure names that start
# with one of those characters.
SENSOR_PREFIX = 's9_'
measure_names = [x[len(SENSOR_PREFIX):] for x in SEL_FTS if x.startswith(SENSOR_PREFIX)]

measure_list = pd.Series(index=measure_names, data=0.0)

for sc in measure_list.index:
    # A column belongs to a measure when everything after its first underscore
    # equals the measure name. A substring test would also pick up any longer
    # measure whose name merely contains this one.
    # NOTE(review): assumes columns are named '<sensor>_<measure>' - confirm.
    cols_ = [x for x in ft_importances.index if x.partition('_')[2] == sc]

    measure_list[sc] = ft_importances[cols_].sum()

fig, axes = plt.subplots(figsize=(8, 6))
axes.barh(width=measure_list, y=measure_list.index)

axes.set_title('Sum of Feature Importance across Sensors')

In [None]:
# Standard deviation in feature importance for each measure, e.g. 'BH_pow', across all sensors.

# Measure names are taken from one sensor's columns by cutting off its prefix.
# str.lstrip('s9_') is not used because it strips any leading 's', '9' or '_'
# characters rather than the prefix, which mangles measure names that start
# with one of those characters.
SENSOR_PREFIX = 's9_'
measure_names = [x[len(SENSOR_PREFIX):] for x in SEL_FTS if x.startswith(SENSOR_PREFIX)]

measure_list = pd.Series(index=measure_names, data=0.0)

for sc in measure_list.index:
    # A column belongs to a measure when everything after its first underscore
    # equals the measure name. A substring test would also pick up any longer
    # measure whose name merely contains this one.
    # NOTE(review): assumes columns are named '<sensor>_<measure>' - confirm.
    cols_ = [x for x in ft_importances.index if x.partition('_')[2] == sc]

    measure_list[sc] = ft_importances[cols_].std()

fig, axes = plt.subplots(figsize=(8, 6))
axes.barh(width=measure_list, y=measure_list.index)

axes.set_title('Standard Deviation of Feature Importance across Sensors')

In [None]:
# Write the averaged test predictions into the submission frame and save it.
# The assignment is positional: row i of sample_submission receives test_predictions[i].
sample_submission['time_to_eruption'] = test_predictions
# index=False keeps the DataFrame index out of the written file.
sample_submission.to_csv('submission.csv', index=False)