# INGV Volcanic : Basic_solution (STFT)
**Volcanic earthquakes** have different characteristics depending on the volcano, and several classification schemes exist, but a typical classification is as follows.
 
**Volcanic earthquake**
 * type A (10Hz or higher): Earthquake caused by destruction of rocks surrounding magma chambers and conduits
 * type BH (5-8Hz) : Earthquake caused by magma intruding into the conduit and destroying the conduit and rocks around the conduit
 * type BL (1.5-2.5Hz) : Earthquake around the conduit caused by gas etc. being ejected from the crater prior to an explosive eruption, which reduces the pressure inside the conduit

**Volcanic tremor**
 * type C (0.5-1.2Hz) : Vibration due to increase in gas pressure in the cavity along with BH
 * type D (2-4Hz) : Vibration due to gas ejection along with BL
 
> I am a complete amateur when it comes to volcanoes. The terminology may be wrong, but please forgive me.

**Version 3**
 * Added a brief description of the mechanisms of earthquakes and tremors at the beginning
 * Corrected the unit of `time_to_eruption` from millisecond to centisecond (Thanks Alex V B)
 * Changed LightGBM parameters (Thanks [Dave E](https://www.kaggle.com/davidedwards1/volcano-stft-data-optimisation))

In [None]:
import os
import datetime
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
import scipy
import scipy.signal

import matplotlib
import matplotlib.pyplot as plt
pd.options.display.max_columns = None    # disp all columns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

# from lightgbm import LGBMRegressor
import lightgbm as lgb

In [None]:
### Kaggle or Local-PC ###
KAGGLE = True       # <==== SET ============

# Root folder of the competition data: the Kaggle input path when KAGGLE is
# set, otherwise a folder next to this notebook on a local PC.
DIR = ('../input/predict-volcanic-eruptions-ingv-oe' if KAGGLE
       else './predict-volcanic-eruptions-ingv-oe/')

# Data

In [None]:
# Load the two metadata tables from DIR: `train.csv` becomes `train`, and
# `sample_submission.csv` becomes `test` (it supplies the test segment ids).
train, test = (pd.read_csv(os.path.join(DIR, csv_name))
               for csv_name in ('train.csv', 'sample_submission.csv'))

train

In [None]:
# Add a human-readable copy of 'time_to_eruption' (just for reference).
def _to_timedelta(raw_value):
    # The raw value is divided by 100 to obtain seconds.
    return datetime.timedelta(seconds = raw_value/100)

train['h:m:s'] = train['time_to_eruption'].apply(_to_timedelta)
train

# Observe sample data

In [None]:
# plot utility function
def plot(ax, x, y, xlabel=None, ylabel=None, legend=None):
    """Draw one line on ``ax`` with optional axis labels and legend, plus a grid.

    Args:
        ax: Axes-like object providing ``plot``, ``set_xlabel``,
            ``set_ylabel``, ``legend`` and ``grid``.
        x: X values, passed straight to ``ax.plot``.
        y: Y values, passed straight to ``ax.plot``.
        xlabel: X-axis label; the label is left untouched when ``None``.
        ylabel: Y-axis label; the label is left untouched when ``None``.
        legend: Label of the plotted line; the legend box is only drawn
            when this is not ``None``.
    """
    ax.plot(x, y, label=legend)
    # Identity checks against None: an empty string is still a valid label,
    # and `is not None` never triggers element-wise comparison.
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    if legend is not None:
        ax.legend()
    ax.grid(True)

In [None]:
# Select sample segments: rank the training rows by 'time_to_eruption' and
# keep those whose rank leaves remainder 5 modulo one fifth of the training
# set size. The original row position is preserved as 'train_id'.
step = len(train) // 5
ranked = train.sort_values('time_to_eruption').reset_index()
ranked = ranked.rename(columns={'index': 'train_id'})
picked = ranked.index % step == 5
sample_df = ranked[picked].reset_index(drop = True)
sample_ids = sample_df['segment_id'].values
sample_df

### Time Domain

In [None]:
sensor = 4      # sensor number to plot (1 to 10)

# One panel per sample segment, showing the raw trace of the chosen sensor
# (missing samples are replaced by 0 before plotting).
n_rows = len(sample_ids)
fig, ax = plt.subplots(n_rows, 1, figsize = (12, n_rows*2))
for panel, segment_id in zip(ax, sample_ids):
    csv_path = os.path.join(DIR, f'train/{segment_id}.csv')
    segment_df = pd.read_csv(csv_path).fillna(0)
    trace = segment_df[f'sensor_{sensor}']
    panel.plot(range(len(trace)), trace)
    panel.set_title(f'segment_id : {segment_id},  sensor : {sensor}')

fig.tight_layout()

### Time-Frequency Domain (STFT)
STFT : Short Time Fourier Transform

In [None]:
fs = 100                # sampling frequency [Hz]
N = len(segment_df)     # data size (taken from the segment loaded last in an earlier cell)
n = 256                 # FFT segment size

# One spectrogram panel per sample segment for the sensor chosen above.
fig, ax = plt.subplots(len(sample_ids), 1, figsize = (12, len(sample_ids)*2))
for i, segment_id in enumerate(sample_ids):
    segment_df = pd.read_csv(os.path.join(DIR, f'train/{segment_id}.csv')).fillna(0)

    x = segment_df[f'sensor_{sensor}'][:N]
    f, t, Z = scipy.signal.stft(x, fs = fs, window = 'hann', nperseg = n)
    Z = np.abs(Z)       # magnitude spectrogram

    # Colour scale is clipped at 10x the mean magnitude of this segment.
    ax[i].pcolormesh(t, f, Z, vmin = 0, vmax = Z.mean()*10)
    ax[i].set_ylim(0, 20)
    ax[i].set_ylabel('Frequency [Hz]')
    # Label this panel explicitly; plt.xlabel() would only label pyplot's
    # current axes instead of ax[i].
    ax[i].set_xlabel('Time [s]')
    ax[i].set_title(f'segment_id : {segment_id},  sensor : {sensor}')
fig.tight_layout()

# Features

In [None]:
# STFT (Short Time Fourier Transform) specifications shared by the feature
# extraction below.
# NOTE: N is read from `segment_df`, so a segment must already have been
# loaded into that variable by an earlier cell before this one runs.
fs = 100                # sampling frequency [Hz]
N = len(segment_df)     # data size: number of rows of the last loaded segment
n = 256                 # FFT segment size (passed to stft as nperseg)
max_f = 20              # keep frequency bins up to 20 Hz

delta_f = fs / n        # frequency-bin width: 0.39 Hz
delta_t = n / fs / 2    # 1.28 s, i.e. half a window; presumably the STFT hop at scipy's default overlap (confirm)

In [None]:
def make_features(tgt):
    """Build per-sensor STFT band features for every segment of one data set.

    For each ``segment_id`` in the module-level ``train`` frame (when ``tgt``
    is ``'train'``) or ``test`` frame (any other value), the file
    ``{DIR}/{tgt}/{segment_id}.csv`` is read. Every column of that file is
    treated as one sensor: its first ``N`` samples are transformed with
    ``scipy.signal.stft`` (``fs``, Hann window, ``nperseg=n``), the magnitude
    is cut at ``max_f`` and thresholded at its own mean. For each frequency
    band two values are produced:

    * ``*_pow``: sum of the magnitudes at or above the threshold,
    * ``*_num``: number of time-frequency cells at or above the threshold.

    A sensor with more than 1000 missing samples gets NaN for all ten
    features; otherwise missing samples are replaced by 0 before the STFT.

    Args:
        tgt: ``'train'`` or ``'test'``; also used as the sub-folder name.

    Returns:
        DataFrame with an integer ``segment_id`` column followed by
        ``s{1..10}_{A,BH,BL,C,D}_{pow,num}`` columns. The column list assumes
        each segment file has exactly 10 sensor columns.
    """
    tgt_df = train if tgt == 'train' else test

    band_names = ('A', 'BH', 'BL', 'C', 'D')
    n_sensors = 10
    n_feats = 2 * len(band_names)      # pow + num per band

    def to_bin(freq):
        # Index of the STFT frequency bin closest to `freq`.
        return round(freq / delta_f)

    # Loop-invariant bin ranges, in the same order as `band_names`.
    n_bins = round(max_f/delta_f)+1    # bins covering 0 .. max_f
    band_slices = (
        slice(to_bin(10), None),           # A  : 10 Hz and above (up to max_f)
        slice(to_bin(5), to_bin(8)),       # BH
        slice(to_bin(1.5), to_bin(2.5)),   # BL
        # NOTE(review): the notebook intro lists type C as 0.5-1.2Hz but the
        # band starts at 0.6 Hz here; kept as-is, confirm which is intended.
        slice(to_bin(0.6), to_bin(1.2)),   # C
        slice(to_bin(2), to_bin(4)),       # D
    )

    feature_set = []
    for segment_id in tqdm(tgt_df['segment_id']):
        segment_df = pd.read_csv(os.path.join(DIR,f'{tgt}/{segment_id}.csv'))
        segment = [segment_id]
        for sensor in segment_df.columns:
            x = segment_df[sensor][:N]
            if x.isna().sum() > 1000:     # too many gaps: mark the sensor as missing
                # np.nan rather than np.NaN: the alias was removed in NumPy 2.0.
                segment += [np.nan] * n_feats
                continue
            _, _, Z = scipy.signal.stft(x.fillna(0), fs = fs, window = 'hann', nperseg = n)
            Z = np.abs(Z[:n_bins]).T    # up to max_f; row: time, col: freq

            th = Z.mean() * 1     # threshold = mean magnitude (multiplier is tunable)
            Z_pow = np.where(Z < th, 0, Z)          # magnitudes, weak cells zeroed
            Z_num = np.where(Z >= th, 1.0, Z_pow)   # 1 for strong cells, 0 for weak

            Z_pow_sum = Z_pow.sum(axis = 0)         # per-frequency totals over time
            Z_num_sum = Z_num.sum(axis = 0)

            for band in band_slices:
                segment += [Z_pow_sum[band].sum(), Z_num_sum[band].sum()]

        feature_set.append(segment)

    cols = ['segment_id'] + [f's{i+1}_{band}_{kind}'
                             for i in range(n_sensors)
                             for band in band_names
                             for kind in ('pow', 'num')]
    feature_df = pd.DataFrame(feature_set, columns = cols)
    feature_df['segment_id'] = feature_df['segment_id'].astype('int')
    return feature_df

In [None]:
# Extract the STFT features of every training segment and join them to the
# training metadata on the segment id.
feature_df = make_features('train')
train_set = train.merge(feature_df, on = 'segment_id')
train_set

In [None]:
# For each feature kind, plot its average over the 10 sensors against
# 'time_to_eruption', together with a 2nd-degree polynomial fit.
feature_kinds = ['A_pow','A_num','BH_pow','BH_num','BL_pow','BL_num','C_pow','C_num','D_pow','D_num']

fig, ax = plt.subplots(2, 5, figsize = (12, 6))
x = train_set['time_to_eruption']
# `feat` rather than `type`: at notebook top level a loop variable named
# `type` would overwrite the builtin for the rest of the session.
for i, feat in enumerate(feature_kinds):
    # Accumulate pandas Series directly so `y` is a Series (needed for
    # .fillna below) instead of relying on what `ndarray += Series` returns.
    # NaN in any sensor propagates to the average, as before.
    y = sum(train_set[f's{j+1}_{feat}'] for j in range(10)) / 10
    # Missing averages are replaced by the mean only for the fit.
    coefs = np.polyfit(x, y.fillna(y.mean()), 2)
    fitted = np.poly1d(coefs)(x)
    panel = ax[i%2, i//2]
    panel.plot(x, y,'.')
    panel.plot(x, fitted,'.')
    panel.set_ylim(0,)
    panel.set_title(feat)
fig.tight_layout()

# Modeling and Predicting

In [None]:
# Target and feature matrix: the features are everything except the id, the
# target itself and its human-readable 'h:m:s' copy.
y = train_set['time_to_eruption']
df = train_set.drop(columns=['segment_id', 'time_to_eruption', 'h:m:s'])

# 80/20 shuffled hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    df, y,
    test_size = 0.2,
    shuffle = True,
    random_state = 42)

# Names read later by the LightGBM training function.
cat_features = {}
features = list(X_train.columns)

In [None]:
# lgb = LGBMRegressor(random_state = 42,
#                     max_depth = 7,
#                     n_estimators = 250,       ######### 
#                     learning_rate = 0.05)
# lgb.fit(X_train, y_train)
# preds = lgb.predict(X_val)

# print('RMSE: ', np.sqrt(mse(y_val, preds)))

In [None]:
def do_lgb(X_train, y_train, X_val, y_val):
    """Train a LightGBM RMSE regressor with early stopping on a validation set.

    Both the training and the validation data are evaluated during training;
    the metric is logged every 100 iterations and training stops when the
    validation metric has not improved for 200 rounds (at most 10000
    iterations). Feature names and categorical features are read from the
    module-level ``features`` and ``cat_features``.

    Args:
        X_train: Training feature matrix.
        y_train: Training target.
        X_val: Validation feature matrix.
        y_val: Validation target.

    Returns:
        The trained ``lightgbm.Booster``.
    """
    params = {'objective': 'rmse',
              'metric': 'rmse',
              'max_depth':14,
              'min_data_in_leaf':5,         # = min_child_samples
              'num_leaves': 2**7 - 1,
              'learning_rate': 0.05,
              'feature_fraction': 0.7,      # = colsample_bytree
              'bagging_fraction': 0.5,      # = subsample
              'bagging_freq': 5,
              'lambda_l1':80,               # = reg_alpha
              'num_iterations': 10000,      # = n_estimators
              'seed': 42,
              'verbose': 1
             }

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    # `early_stopping_rounds`, `verbose_eval` and `evals_result` are no longer
    # accepted by lgb.train() in LightGBM >= 4.0; the callbacks below are the
    # replacement. `log_evaluation` was called `print_evaluation` in older
    # releases, so fall back to that name when needed.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
    callbacks = [
        lgb.early_stopping(stopping_rounds=200),
        log_evaluation(period=100),
    ]

    model = lgb.train(
        params,
        lgb_train,
        valid_sets = [lgb_train, lgb_eval],
        feature_name = features,
        categorical_feature = cat_features,
        callbacks = callbacks)

    return model

In [None]:
# Fit the model on the training split, using the validation split for evaluation.
lgb_model = do_lgb(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val)

# Predict test data

In [None]:
# Extract the STFT features of every test segment and join them to the
# submission template on the segment id.
feature_df = make_features('test')
test_set = test.merge(feature_df, on = 'segment_id')
test_set

In [None]:
# Predict the test segments from their feature columns only (id and the
# placeholder target are removed), then store the result in the submission frame.
X_test = test_set.drop(columns=['segment_id', 'time_to_eruption'])
preds = lgb_model.predict(X_test)
test['time_to_eruption'] = preds
test[['segment_id','time_to_eruption']]

In [None]:
# Write the two submission columns to 'submission.csv' without the row index.
test.to_csv('submission.csv', columns=['segment_id', 'time_to_eruption'], index=False)