# INGV - Volcanic Eruption Prediction

In this notebook I describe a solution built on a simple workflow. The notebook has the following sections:

1. Feature engineering
2. Experimentation
3. Feature selection
4. Hyperparameter tuning
5. Feature importances
6. Submission

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import time
import os
import datetime
from tqdm import tqdm
from pathlib import Path

from tsfresh import extract_features
from tsfresh.feature_extraction.settings import MinimalFCParameters

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

import lightgbm as lgb
from catboost import Pool, CatBoostRegressor

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import eli5

## Feature engineering

For feature engineering I used the library [tsfresh](https://tsfresh.readthedocs.io/en/latest/). I customized the set of calculated features to keep the extraction as efficient as possible. Note that adding more features could improve the results.

In [None]:
def feature_engineering(path, params, df=None, is_train=True):
    """Extract tsfresh features for every segment and stack them in one frame.

    Parameters
    ----------
    path : pathlib.Path
        Directory holding one CSV file per segment. It is joined with the
        segment name through the ``/`` operator.
    params : dict
        tsfresh calculator settings, forwarded to ``extract_features`` as
        ``default_fc_parameters``.
    df : pandas.DataFrame, optional
        Used only when ``is_train`` is true. Every row must unpack into
        ``(segment, time_to_eruption)``.
    is_train : bool, default True
        When true, segments come from ``df`` and the target column is
        attached. When false, every entry returned by ``os.listdir(path)``
        is processed and no target column is added.

    Returns
    -------
    pandas.DataFrame or None
        One row of features per segment plus a ``segment`` column (and
        ``time_to_eruption`` in train mode). ``None`` when there was no
        segment to process.
    """
    if is_train:
        segments = df.iterrows()
    else:
        segments = enumerate(os.listdir(path))

    # Collect the per-segment frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies the accumulated frame every time.
    per_segment_features = []

    for idx, row in segments:

        if is_train:
            segment, time_to_eruption = row
            segment_timeseries_path = path/f'{segment}.csv'
        else:
            # In test mode the directory entry (file name) is used as-is,
            # both as the path component and as the segment label.
            segment = row
            segment_timeseries_path = path/f'{segment}'

        segment_timeseries = pd.read_csv(segment_timeseries_path)
        # Replace missing readings with 0; reset_index() adds an "index"
        # column that is used below as the sort (time) column.
        segment_timeseries = segment_timeseries.fillna(0).reset_index()
        # All rows of one file belong to the same series, so they share one id.
        segment_timeseries['id'] = idx
        extracted_features = extract_features(
            segment_timeseries,
            column_id="id",
            column_sort="index",
            default_fc_parameters=params,
        )
        extracted_features['segment'] = segment

        if is_train:
            extracted_features['time_to_eruption'] = time_to_eruption  # Set the target

        per_segment_features.append(extracted_features)

    if not per_segment_features:
        return None
    if len(per_segment_features) == 1:
        # A single segment is returned untouched, exactly as before.
        return per_segment_features[0]
    return pd.concat(per_segment_features, axis=0, ignore_index=True, sort=True)

In [None]:
# Locations of the competition input data.
volcanic_path = Path('/kaggle/input/predict-volcanic-eruptions-ingv-oe')
train_path = volcanic_path/'train.csv'
submission_path = volcanic_path/'sample_submission.csv'
# Load the training table and the sample submission file.
train_df = pd.read_csv(train_path)
submission = pd.read_csv(submission_path)

In [None]:
# Start from tsfresh's minimal settings, remove 'length', then register the
# extra calculators below. Keys are added in the same order as before.
params_tsfresh = MinimalFCParameters()
del params_tsfresh['length']

# Calculators registered with None (no parameter sets supplied).
params_tsfresh.update(dict.fromkeys([
    'skewness',
    'kurtosis',
    'last_location_of_maximum',
    'first_location_of_maximum',
    'last_location_of_minimum',
    'first_location_of_minimum',
    'benford_correlation',
]))

# Calculators registered with explicit parameter sets.
params_tsfresh['number_peaks'] = [{'n': n} for n in (1, 3, 5, 10, 50)]
params_tsfresh['binned_entropy'] = [{'max_bins': 10}]
params_tsfresh['fft_aggregated'] = [
    {'aggtype': aggtype}
    for aggtype in ('centroid', 'variance', 'skew', 'kurtosis')
]

params_tsfresh.update(dict.fromkeys([
    'percentage_of_reoccurring_values_to_all_values',
    'percentage_of_reoccurring_datapoints_to_all_datapoints',
]))

params_tsfresh['autocorrelation'] = [{'lag': lag} for lag in range(10)]
params_tsfresh['agg_autocorrelation'] = [
    {'f_agg': f_agg, 'maxlag': 40} for f_agg in ('mean', 'median', 'var')
]
params_tsfresh['friedrich_coefficients'] = [
    {'coeff': coeff, 'm': 3, 'r': 30} for coeff in range(4)
]
params_tsfresh['count_above'] = [{'t': 0}]
params_tsfresh['count_below'] = [{'t': 0}]
params_tsfresh

The calculation takes more than 2 hours, so I have commented it out. The results are in the *features-volcanic* folder.

In [None]:
# features_train = feature_engineering(volcanic_path/'train', params_tsfresh, train_df)
# features_test = feature_engineering(volcanic_path/'test', params_tsfresh, is_train=False) 

## Experimentation

In this section I experiment with several models and select the best one. The tested models are:

* KNN
* AdaBoost
* Gradient Boosting
* Decision tree
* Random Forest
* LGBM
* CatBoost

First, let's define our metrics.

In [None]:
# Small constant added to the denominator in _percentage_error.
EPSILON = 1e-10

def _error(actual: np.ndarray, predicted: np.ndarray):
    """ Simple error """
    return actual - predicted


def _percentage_error(actual: np.ndarray, predicted: np.ndarray):
    """
    Relative error: the residual divided by ``actual + EPSILON``.

    Note: the result is a fraction, it is NOT multiplied by 100.
    """
    residual = _error(actual, predicted)
    denominator = actual + EPSILON
    return residual / denominator

def mse(actual: np.ndarray, predicted: np.ndarray):
    """Mean Squared Error: the mean of the squared residuals."""
    squared_residuals = np.square(_error(actual, predicted))
    return np.mean(squared_residuals)

def mae(actual: np.ndarray, predicted: np.ndarray):
    """Mean Absolute Error: the mean of the absolute residuals."""
    absolute_residuals = np.abs(_error(actual, predicted))
    return np.mean(absolute_residuals)

def mape(actual: np.ndarray, predicted: np.ndarray):
    """
    Mean Absolute Percentage Error: the mean of the absolute relative errors.

    Properties:
        + Easy to interpret
        + Scale independent
        - Biased, not symmetric
        - Undefined when actual[t] == 0

    Note: the result is a fraction, it is NOT multiplied by 100.
    """
    absolute_relative_errors = np.abs(_percentage_error(actual, predicted))
    return np.mean(absolute_relative_errors)

def rmse(actual: np.ndarray, predicted: np.ndarray):
    """Root Mean Squared Error: the square root of ``mse``."""
    mean_squared = mse(actual, predicted)
    return np.sqrt(mean_squared)

def evaluate(actual: np.ndarray, predicted: np.ndarray):
    """Compute MAE, MAPE and RMSE and return them in a dict keyed by name.

    A metric that raises is reported on stdout and stored as NaN, so one
    failing metric does not prevent the others from being computed.
    """
    scorers = {'MAE': mae, 'MAPE': mape, 'RMSE': rmse}
    results = {}
    for label, scorer in scorers.items():
        try:
            score = scorer(actual, predicted)
        except Exception as err:
            score = np.nan
            print('Unable to compute metric {0}: {1}'.format(label, err))
        results[label] = score
    return results

def calculate_metrics(y_test, y_pred, target, model_name):
    """Return a one-row DataFrame of metrics, indexed by ``model_name``.

    ``target`` is accepted but not used by this function.
    """
    scores = evaluate(y_test, y_pred)
    return pd.DataFrame(data=scores, index=[model_name])

In [None]:
def split_scale_generator(df, target_name, n_splits=10, selector=None):
    """Yield standardized train/test folds from a shuffled K-fold split.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table that also contains the target column.
    target_name : str
        Name of the target column; every other column is used as a feature.
    n_splits : int, default 10
        Number of folds.
    selector : fitted transformer, optional
        When given, its ``transform`` is applied to the feature matrix
        before splitting.

    Yields
    ------
    tuple
        ``(X_train, y_train, X_test, y_test)`` for each fold. The scaler is
        fitted on the training part only and then applied to the test part.
    """
    kf = KFold(n_splits=n_splits, random_state=123, shuffle=True)
    y = df.loc[:, target_name]
    X = df.drop(target_name, axis=1).values

    if selector is not None:
        X = selector.transform(X)

    for train_index, test_index in kf.split(X):
        scaler = StandardScaler()

        X_train, X_test = X[train_index], X[test_index]
        # KFold returns row positions, so the target must be indexed by
        # position too; label-based y[...] only works with a default index.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        yield X_train, y_train, X_test, y_test

Train function for the model

In [None]:
def train(name, model, target, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training fold and score it on the test fold.

    Prints progress and the fit duration in minutes, then returns the
    one-row metrics frame produced by ``calculate_metrics`` for ``name``.
    """
    started_at = time.time()
    print("Training "+name+"....")

    model.fit(X_train, y_train)

    elapsed_minutes = round((time.time()-started_at)/60, 3)
    print(f"Trained on {elapsed_minutes} minutes....")

    return calculate_metrics(y_test, model.predict(X_test), target, name)

This function trains and evaluates all the models with cross-validation.

In [None]:
def execute_baseline(df, models, target,n_splits=10, selector=None):
    """Train and score every model on every cross-validation fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table including the target column.
    models : dict
        Mapping of model name to estimator. The same estimator object is
        re-fitted on each fold.
    target : str
        Name of the target column.
    n_splits : int, default 10
        Number of folds.
    selector : fitted transformer, optional
        Forwarded to ``split_scale_generator``.

    Returns
    -------
    pandas.DataFrame or None
        One row per (fold, model), indexed by model name, or ``None`` when
        nothing was trained.
    """
    # Gather the per-run frames and concatenate once. DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    collected_metrics = []

    folds = split_scale_generator(df, target, n_splits, selector)
    for idx, (X_train, y_train, X_test, y_test) in enumerate(folds):
        print(f'******************** iteration {idx} ********************')
        for name, model in models.items():
            collected_metrics.append(
                train(name, model, target, X_train, y_train, X_test, y_test)
            )

    if not collected_metrics:
        return None
    return pd.concat(collected_metrics)
            

In [None]:
# Baseline estimators compared in the experimentation section.

# KNN
knn_regr = KNeighborsRegressor(n_neighbors=3, weights='distance', n_jobs=-1)

# AdaBoost with Decision tree
#distributions_ada = {'estimator__base_estimator__max_depth':list(range(2,16))}
ada_regrt_model = AdaBoostRegressor(DecisionTreeRegressor(),
                          n_estimators=300,  random_state=123)


# Gradient boosting
# The loss is left at its default (squared error). The explicit 'ls' spelling
# is not accepted by newer scikit-learn releases, while the default keeps the
# same loss on both old and new versions.
grboost_model = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1, random_state=123)

#distributions_dt = dict(max_depth=list(range(2,16)))
regrt_model = DecisionTreeRegressor(random_state=123)

# Random forest
rand_forest_model = RandomForestRegressor(n_estimators=300, n_jobs=-1, random_state=123)


# Light GBM
lgb_params = {
    
    'min_data_in_leaf': 2, 
    'objective':'regression',
    'max_depth': -1,
    'learning_rate': 0.001,
    'max_bins': 2048,
    "boosting": "gbdt",
    "feature_fraction": 0.91,
    "bagging_freq": 1,
    "bagging_fraction": 0.91,
    "bagging_seed": 42,
    "metric": 'mae',
    "lambda_l1": 0.1,
    "verbosity": -1,
    "nthread": -1,
    "random_state": 123
}
lgbm_regr = lgb.LGBMRegressor(**lgb_params, n_estimators = 300, n_jobs = -1)

# CatBoost                    
catboost_regr = CatBoostRegressor(learning_rate=1, random_seed=123, n_estimators=600, logging_level='Silent')


# Name -> estimator mapping consumed by execute_baseline; the names become
# the index labels of the metrics table.
models = {
    "knn_regr": knn_regr,
    "adaboost_tree_regressor": ada_regrt_model,
    "gradient_boost": grboost_model,
    "decision_tree_regressor": regrt_model,
    "random_forest": rand_forest_model,
    "lgbm_regressor": lgbm_regr,
    "catboost_regressor": catboost_regr,
    
}

In [None]:
%%time
# Load the precomputed training features and run every baseline model
# through 10-fold cross-validation.
features_path = Path('/kaggle/input/features-volcanic')

train_features = pd.read_csv(features_path/'features_train.csv')
metrics = execute_baseline(train_features, models, 'time_to_eruption', n_splits=10)

Let's look at the metrics.

In [None]:
# Average each model's scores over the folds (rows are indexed by model name)
# and list the models by ascending mean MAE.
metrics.groupby(metrics.index).mean().sort_values('MAE')

The AdaBoost model is the best one; submitting it gives an MAE of 4749935.
Let's improve it with feature selection and hyperparameter tuning.

## Feature selection

In [None]:
# Load the precomputed feature tables for the train and test segments.
# NOTE(review): `train` rebinds the name of the train() function defined
# earlier, so execute_baseline cannot be re-run after this cell has executed.
train = pd.read_csv(features_path/'features_train.csv')
test = pd.read_csv(features_path/'features_test.csv')

# Feature matrix for the test segments, without the identifier and target columns.
X_test = test.drop(['segment', 'time_to_eruption'], axis=1).values

In [None]:
%%time
# Fit an AdaBoost model on the standardized training features and let
# SelectFromModel keep a subset of the columns based on that fitted model.
# NOTE(review): no random_state is passed, so the selection may differ between runs.
model = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=300)

scaler = StandardScaler()
y = train.loc[:, 'time_to_eruption']
# NOTE(review): X keeps every column except the target, while X_test was built
# by also dropping 'segment' — confirm both matrices have the same columns.
X = scaler.fit_transform(train.drop('time_to_eruption', axis=1).values)
# Reuse the scaler fitted on the training data for the test matrix.
X_test = scaler.transform(X_test)

selector = SelectFromModel(estimator=model).fit(X, y)
X_test_selected = selector.transform(X_test)
X_train_selected = selector.transform(X)

In [None]:
# Names of the columns kept by the selector, in the column order used to build X.
selected_features = train.drop('time_to_eruption', axis=1).columns[selector.get_support()].tolist()
selected_features

With feature selection the MAE improved to 4596505.

## Hyperparameter tuning

For hyperparameter tuning I used Bayesian Optimization from [skopt](https://scikit-optimize.github.io/stable/).

In [None]:
# Search space for the Bayesian optimization of the AdaBoost model.
# NOTE(review): the "base_estimator__" prefix depends on the scikit-learn
# version; newer releases name this AdaBoost parameter "estimator" — confirm.
search_space = {"loss": Categorical(['linear', 'square', 'exponential']), # candidate values for the AdaBoostRegressor "loss" parameter
        "base_estimator__max_depth": Integer(3, 200), # max_depth of the base tree, integers from 3 to 200
        "base_estimator__splitter": Categorical(['best', 'random']), 
        "n_estimators": Integer(200, 5000),
        "learning_rate": Real(1e-12, 1)
    }

In [None]:
# Base model whose hyperparameters are searched.
model = AdaBoostRegressor(DecisionTreeRegressor())

# Bayesian search over search_space: 15 iterations, 10-fold CV, scored by negative MAE.
clf = BayesSearchCV(model, search_space, cv=10, n_iter=15, n_jobs=-1, random_state=123, scoring='neg_mean_absolute_error')

This takes a long time to run, so the cell below is commented out.

In [None]:
#%%time
# search = clf.fit(X_train_selected, y)

# search.best_params_, search.best_score_

The best params I found are:
* base_estimator__max_depth: 43
* base_estimator__splitter: random
* learning_rate: 0.09210493994507518
* loss: linear
* n_estimators: 943

## Feature importances

In [None]:
# Permutation importance of the selected features, computed with cv=10.
# NOTE(review): these hyperparameters differ from the "best params" listed in
# the text above (max_depth 43, learning_rate ~0.092, loss linear,
# n_estimators 943) — confirm which set is intended.
model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=68, splitter='random'), learning_rate=.162163403819552, n_estimators=971, loss='exponential')

perm = eli5.sklearn.PermutationImportance(model, cv=10).fit(X_train_selected, y)
eli5.show_weights(perm, feature_names=selected_features)

## Submission

Finally, let's see the MAE with hyperparameter tuning and feature selection.

In [None]:
# Final model. The commented line builds it from the search results; the
# active line hard-codes the hyperparameters because the search cell is disabled.
# NOTE(review): no random_state is set, so refitting may give different predictions.
#model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=search.best_params_['base_estimator__max_depth'], splitter=search.best_params_['base_estimator__splitter']), learning_rate=search.best_params_['learning_rate'], n_estimators=search.best_params_['n_estimators'], loss=search.best_params_['loss'])
model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=68, splitter='random'), learning_rate=.162163403819552, n_estimators=971, loss='exponential')

# Fit on the full training set, restricted to the selected features.
model.fit(X_train_selected, y)

In [None]:
# Predict the test segments and write the submission file.
predictions = model.predict(X_test_selected)
submission = pd.read_csv(submission_path)
# NOTE(review): predictions are assigned by row position — this assumes the rows
# of features_test.csv are in the same order as sample_submission.csv; verify.
submission['time_to_eruption'] = predictions
submission.to_csv('submission.csv', index=None)

The final MAE in private leaderboard was: 3928087.