# Volcanic experiments

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import random
import matplotlib.pyplot as plt
from colorama import Fore, Style

Let's point to the folders that hold the train and test files.

In [None]:
# Folders with the train and test files, given relative to the notebook's
# working directory.
TRAIN_FOLDER = '../input/predict-volcanic-eruptions-ingv-oe/train/'
TEST_FOLDER = '../input/predict-volcanic-eruptions-ingv-oe/test/'

What does the train table (`train.csv`) look like:

In [None]:
# Load the table that holds a `segment_id` and a `time_to_eruption` column.
train = pd.read_csv('../input/predict-volcanic-eruptions-ingv-oe/train.csv')
# Targets as a column vector: a trailing axis of length 1 is appended.
train_labels = np.expand_dims(np.array(train['time_to_eruption']), axis=-1)
# Last expression of the cell: show the first rows.
train.head()

Let's have a look at what the sensor data (from a csv file) looks like. We peek at a random train file.

In [None]:
# Pick one file from the train folder at random and load it as a DataFrame.
random_sensor_file = pd.read_csv(os.path.join(TRAIN_FOLDER, random.choice(os.listdir(TRAIN_FOLDER))))
# Last expression of the cell: show the first rows.
random_sensor_file.head()

In [None]:
# Draw every column of the sampled file in its own panel (2 rows x 5 columns).
fig, axis = plt.subplots(2, 5, figsize=(25, 10))

# Walk the panels in row-major order so the columns appear left-to-right,
# top-to-bottom in file order. zip() also stops cleanly when the file has a
# different number of columns than there are panels.
for panel, sensor_name in zip(axis.flat, random_sensor_file):
    # NaN readings are replaced by 0 so the whole trace can be drawn.
    sensor_data = np.nan_to_num(np.array(random_sensor_file[sensor_name], dtype='float'))
    panel.plot(range(len(sensor_data)), sensor_data, '.')
    panel.grid(True)
    panel.legend([sensor_name])

To increase the speed of convergence, we need to scale our data. Pandas provides useful methods that return many summary statistics describing the data you have.

### Extracting features
Here we need to build a feature extractor for our noisy dataset: the 60000 raw signal values take a lot of memory without being proportionally useful. Instead we can extract mathematical features such as mean, std, variance, max, min, skew, kurtosis, etc. These give a model much more knowledge for far less memory.

In [None]:
def build_features(signal, ts, sensor_id):
    """Summarise one sensor trace as a single row of statistical features.

    Args:
        signal: pandas Series with the readings of one sensor.
        ts: label used as the index of the returned row.
        sensor_id: prefix put in front of every column name,
            e.g. ``'sensor_1'`` gives ``'sensor_1_mean'``.

    Returns:
        A one-row ``pd.DataFrame`` indexed by ``ts`` with 23 float columns:
        nine moment/range statistics, ten quantiles and four statistics of
        the real part of the FFT, in that order.
    """
    spectrum_real = np.real(np.fft.fft(signal))

    features = {
        'sum': signal.sum(),
        'mean': signal.mean(),
        'std': signal.std(),
        'var': signal.var(),
        'max': signal.max(),
        'min': signal.min(),
        'skew': signal.skew(),
        # Series.mad() is gone in pandas 2.x; this is the same quantity,
        # the mean absolute deviation around the mean.
        'mad': (signal - signal.mean()).abs().mean(),
        'kurtosis': signal.kurtosis(),
    }
    # Quantile columns are named by their two-digit percentage (e.g. 'quantile05').
    for pct in (99, 95, 85, 75, 55, 45, 25, 15, 5, 1):
        features[f'quantile{pct:02d}'] = np.quantile(signal, pct / 100)
    features['fft_real_mean'] = spectrum_real.mean()
    features['fft_real_std'] = spectrum_real.std()
    features['fft_real_max'] = spectrum_real.max()
    features['fft_real_min'] = spectrum_real.min()

    # Build the row in one go: growing a DataFrame cell by cell with .loc is
    # slow and this function is called once per sensor per segment.
    row = {f'{sensor_id}_{name}': float(value) for name, value in features.items()}
    return pd.DataFrame(row, index=[ts])

In [None]:
def get_subtrain_items(all_items, part, shuffle=False):
    """Return the leading fraction ``part`` of ``all_items`` as a new list.

    Args:
        all_items: any iterable of items (list, tuple, pandas Series, ...).
        part: fraction of the items to keep; the count is
            ``int(len(all_items) * part)``.
        shuffle: when true, the items are randomly reordered (using the
            ``random`` module) before the fraction is taken.

    Returns:
        A list with the selected items. The input is never modified.
    """
    # Work on a copy: random.shuffle() reorders its argument in place, and
    # shuffling a DataFrame column handed in by the caller can scramble the
    # ids relative to the other columns of that frame.
    items = list(all_items)
    if shuffle:
        random.shuffle(items)
    split_idx = int(len(items) * part)
    return items[:split_idx]

### Split train set
We need a smaller set because our train dataset is huge. A large dataset is useful for training a model on many different examples, but here we are comparing the performance of different approaches, so we train (really, test) on a smaller dataset (we're taking a subset).

In [None]:
# Keep a shuffled 35% of the segment ids for the experiments.
sub_train_ids = get_subtrain_items(train.segment_id, 0.35, True)
# Reset the colour at the end of the line as well, so the blue does not bleed
# into whatever is printed next.
print(f'sub train items {Fore.CYAN}{len(sub_train_ids)}{Style.RESET_ALL} of {Fore.BLUE}{len(train.segment_id)}{Style.RESET_ALL}')

In [None]:
# Build one feature row per selected segment by concatenating the
# build_features() output of sensor_1 ... sensor_10 side by side.
train_set = []
for j, seg in enumerate(sub_train_ids):
    # Read through TRAIN_FOLDER like the rest of the notebook instead of a
    # second, absolute hard-coded path to the same data.
    signals = pd.read_csv(os.path.join(TRAIN_FOLDER, f'{seg}.csv'))
    if j % 500 == 0:
        print(j)  # coarse progress indicator
    # Missing readings are replaced by 0 before the statistics are computed.
    sensor_frames = [
        build_features(signals[f'sensor_{i}'].fillna(0), seg, f'sensor_{i}')
        for i in range(1, 11)
    ]
    train_set.append(pd.concat(sensor_frames, axis=1))
train_set = pd.concat(train_set)
# The row index carries the segment id; turn it into a column to merge on.
train_set = train_set.reset_index()
train_set = train_set.rename(columns={'index': 'segment_id'})
train_set = pd.merge(train_set, train, on='segment_id')
# NOTE: from here on `train` is the feature table, no longer the table read
# from train.csv; cells that need `train.segment_id` must reload it first.
train = train_set.drop(['segment_id', 'time_to_eruption'], axis=1)
# Targets as a column vector aligned with the rows of `train`.
y = np.expand_dims(train_set['time_to_eruption'], axis=-1)

In [None]:
# Per-example input shape used for the Keras models (23 * 10 * 1 = 230 values).
# Presumably 23 features x 10 sensors plus a channel axis — TODO confirm.
VULCAN_INPUT_SIZE = (23,10, 1)
# Number of rows in `train`.
NUM_EXAMPLES = train.shape[0]

In [None]:
y = np.array(y)
# Use reshape rather than np.resize: np.resize silently repeats or truncates
# values when the element counts differ, whereas reshape raises, so a change
# in the number of features cannot go unnoticed.
# NOTE(review): the flat columns are laid out sensor by sensor, so a plain
# row-major reshape to VULCAN_INPUT_SIZE does not put one sensor per column —
# confirm whether the 2-D layout matters for the convolutional model.
dl_train_dict = {'X': np.array(train).reshape((NUM_EXAMPLES,) + VULCAN_INPUT_SIZE), 'y': np.expand_dims(y, axis=-1)}
# The classic ML models take the flat (examples, features) table as is.
ml_train_dict = {'X': np.array(train), 'y': y}
print(f'trained data shape (already squared for deep learning)={Fore.CYAN}',dl_train_dict['X'].shape,f'{Style.RESET_ALL}')
print(f'trained data shape (for machine learning)={Fore.BLUE}',ml_train_dict['X'].shape,f'{Style.RESET_ALL}')

We need to remove train files that have faulty sensors which didn't record any data.

Let's make a specific DataGenerator for the volcanic data, since our RAM is smaller than the train set and the whole train dataset cannot be loaded into memory at once. Our DataGenerator must contain all the default methods of a classic DataGenerator.

In [None]:
import tensorflow as tf
import sklearn
import xgboost

**Validation** is very important for checking your model on unseen data, so I'd like to check my model on the following sets.

In [None]:
def get_train_test_sets(full_data, split_part):
    """Split a ``{'X': ..., 'y': ...}`` dataset into a train and a test part.

    The first ``int(split_part * n)`` examples become the test part and the
    remaining ones the train part; no shuffling takes place.

    Args:
        full_data: dict with array entries ``'X'`` and ``'y'`` of equal length.
        split_part: fraction of the examples that goes to the test part.

    Returns:
        A ``(train, test)`` tuple of dicts with the same ``'X'``/``'y'`` keys.
    """
    features, labels = full_data['X'], full_data['y']
    assert len(features) == len(labels)
    cut = int(split_part * features.shape[0])
    # Leading slice -> held-out part, trailing slice -> training part.
    held_out = {'X': features[:cut], 'y': labels[:cut]}
    remaining = {'X': features[cut:], 'y': labels[cut:]}
    return remaining, held_out

In [None]:
# Hold out the leading 10% of every dataset for validation.
val_part = 0.1
# Placeholders, filled per algorithm family in the loop below.
split_algo_train_dict = {family: {'X': None, 'y': None} for family in ('ML', 'DL')}
split_algo_valid_dict = {family: {'X': None, 'y': None} for family in ('ML', 'DL')}
for name, algo_dict in (('DL', dl_train_dict), ('ML', ml_train_dict)):
    train_part, valid_part = get_train_test_sets(algo_dict, val_part)
    split_algo_train_dict[name] = train_part
    split_algo_valid_dict[name] = valid_part
    n_train = len(train_part['X'])
    n_valid = len(valid_part['X'])
    print(f'{name}: there are{Fore.BLUE}', n_train,
          f'{Style.RESET_ALL}train examples and{Fore.GREEN}', n_valid,
          f'{Style.RESET_ALL}valid examples')
    # Shape of a single training example.
    print('of shape', train_part['X'][0].shape)
    print()

Models to experiment with:
- small DNN (~10 000 hidden neuron values) with ReLU in hidden layers. **253925043208192.0** 
- medium (~200 000 hidden neuron values) with ReLU in hidden layers. **307476037632000.0**
- medium DNN with convolution layers (~450 000 hidden neuron values) with mixed ReLU, LeakyReLU activations **200124135374848.0**
- Random forest regressor **None SCORE**
- XGBoost **None SCORE**

### Deep Learning experiments

In [None]:
def get_small_dnn_model(name):
    """Build the small fully connected regression network.

    Args:
        name: name given to the Keras model.

    Returns:
        An uncompiled ``tf.keras`` Sequential model that takes inputs of shape
        ``VULCAN_INPUT_SIZE`` and produces one value per example.
    """
    model = tf.keras.models.Sequential([
        tf.keras.layers.Flatten(input_shape=VULCAN_INPUT_SIZE),
        # The hidden layers need a non-linearity: Dense layers without an
        # activation are linear, and a stack of them is just a linear model.
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        # Linear output for regression.
        tf.keras.layers.Dense(1)
    ], name=name)
    return model

In [None]:
def get_medium_dnn_model(name):
    """Build the medium-sized fully connected regression network.

    Args:
        name: name given to the Keras model.

    Returns:
        An uncompiled ``tf.keras`` Sequential model that takes inputs of shape
        ``VULCAN_INPUT_SIZE`` and produces one value per example.
    """
    # Every hidden layer gets a ReLU: Dense layers without an activation are
    # linear, and a stack of them is just a linear model.
    hidden_layers = [
        tf.keras.layers.Dense(units, activation='relu')
        for units in (64, 128, 128, 256, 256)
    ]
    model = tf.keras.models.Sequential(
        [tf.keras.layers.Flatten(input_shape=VULCAN_INPUT_SIZE)]
        + hidden_layers
        # Linear output for regression.
        + [tf.keras.layers.Dense(1)],
        name=name)
    return model

In [None]:
def get_medium_cnn_model(name):
    """Build the convolutional regression network.

    Three 3x3 'same' convolutions, each followed by a 2x2 pooling step, feed
    two dense layers and a single linear output.

    Args:
        name: name given to the Keras model.

    Returns:
        An uncompiled ``tf.keras`` Sequential model that takes inputs of shape
        ``VULCAN_INPUT_SIZE`` and produces one value per example.
    """
    model = tf.keras.models.Sequential([
        # Only the first layer declares the input shape; the hidden layers
        # use ReLU (convolutions) and LeakyReLU (dense part) so the network
        # is not a purely linear map.
        tf.keras.layers.Conv2D(64, (3, 3), padding='same', activation='relu',
                               input_shape=VULCAN_INPUT_SIZE),
        tf.keras.layers.MaxPooling2D(2, 2),
        tf.keras.layers.Conv2D(128, (3, 3), padding='same', activation='relu'),
        tf.keras.layers.AveragePooling2D(2, 2),
        tf.keras.layers.Conv2D(128, (3, 3), padding='same', activation='relu'),
        tf.keras.layers.MaxPooling2D(2, 2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(256),
        tf.keras.layers.LeakyReLU(),
        tf.keras.layers.Dense(512),
        tf.keras.layers.LeakyReLU(),
        # Linear output for regression.
        tf.keras.layers.Dense(1)
    ], name=name)
    return model

In [None]:
# Train every candidate network with Adam on an MSE loss for 40 epochs and
# report its loss on the validation split.
dl_train_part = split_algo_train_dict['DL']
dl_valid_part = split_algo_valid_dict['DL']

models_to_try = [
    get_small_dnn_model('small_dnn_model'),
    get_medium_dnn_model('medium_dnn_model'),
    get_medium_cnn_model('medium_cnn_model'),
]
# Collected as (model name, Keras History) pairs.
models_history = []
for cur_model in models_to_try:
    cur_model.compile(optimizer=tf.keras.optimizers.Adam(), loss='mse')
    cur_model.summary()
    cur_model_history = cur_model.fit(
        x=dl_train_part['X'],
        y=dl_train_part['y'],
        epochs=40,
        verbose=0,
        validation_data=(dl_valid_part['X'], dl_valid_part['y']),
    )
    models_history.append((cur_model.name, cur_model_history))
    final_loss = cur_model.evaluate(dl_valid_part['X'], dl_valid_part['y'], verbose=0)
    print(f'final loss={Fore.RED}', final_loss, f'{Style.RESET_ALL}')

In [None]:
# Plot the training-loss curve of every trained network in one figure.
colors = ['blue', 'red', 'orange']
legend = []
plt.figure(figsize=(10, 10))
for i, (hist_name, hist) in enumerate(models_history):
    train_loss = hist.history['loss']
    epochs = range(len(train_loss))
    # Cycle through the palette so that more models than colours do not
    # raise an IndexError.
    plt.plot(epochs, train_loss, color=colors[i % len(colors)])
    # Separate the model name from the word "loss" in the legend entry.
    legend.append(hist_name + " loss")
plt.legend(legend)
plt.grid()
plt.show()

As we can see, the neural networks **don't** reach a good solution at all. This suggests that using classical ML algorithms for this (tabular) problem is a lot better.

### Machine learning experiments

In [None]:
from sklearn.ensemble import RandomForestRegressor
def get_RFR_model(args):
    """Create an unfitted RandomForestRegressor.

    Args:
        args: dict of keyword arguments forwarded to the constructor.

    Returns:
        The new ``RandomForestRegressor`` instance.
    """
    return RandomForestRegressor(**args)

In [None]:
def get_XGBoost_model(args):
    """Create an unfitted XGBoost regressor.

    Args:
        args: dict of keyword arguments forwarded to the constructor.

    Returns:
        The new ``xgboost.XGBRegressor`` instance.
    """
    return xgboost.XGBRegressor(**args)

Let's train our ML models with default values, and then choose what we can use for more accurate modeling.

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Hand-picked XGBoost hyper-parameters, compared against the defaults below.
good_args = {
    'max_depth': 10,
    'n_estimators': 349,
    'learning_rate': 0.03463499307472963,
    'gamma': 0.2584804713489563,
    'random_state': 666
}
# Each candidate gets a distinct name so the collected metrics can be told apart.
ml_models_to_try = [('random forest regressor', get_RFR_model({'n_estimators': 200})),
                    ('XGBoost regressor', get_XGBoost_model({'n_estimators': 200})),
                    ('XGBoost regressor (tuned)', get_XGBoost_model(good_args))]
# Collected as (model name, validation MSE) pairs.
ml_models_metrics = []
for model_name, ml_model in ml_models_to_try:
    print('current model:', model_name)
    # The targets are stored as a column vector; flatten them to the 1-D
    # shape the regressors expect for a single-output problem.
    ml_model.fit(split_algo_train_dict['ML']['X'], np.ravel(split_algo_train_dict['ML']['y']))
    val_prediction = ml_model.predict(split_algo_valid_dict['ML']['X'])
    # mean_squared_error takes (y_true, y_pred) in that order.
    mse = mean_squared_error(np.ravel(split_algo_valid_dict['ML']['y']), val_prediction)
    ml_models_metrics.append((model_name, mse))
    print(f'final loss={Fore.RED}', mse,f'{Style.RESET_ALL}')
    print()

In [None]:
# Sweep the number of trees of the random forest and record the validation
# MSE for every setting.
test_estimators = range(10, 150, 10)
history = []
for n_estims in test_estimators:
    ml_model = get_RFR_model({'n_estimators': n_estims})
    # Flatten the column-vector targets to the 1-D shape the regressor expects.
    ml_model.fit(split_algo_train_dict['ML']['X'], np.ravel(split_algo_train_dict['ML']['y']))
    val_prediction = ml_model.predict(split_algo_valid_dict['ML']['X'])
    # mean_squared_error takes (y_true, y_pred) in that order.
    mse = mean_squared_error(np.ravel(split_algo_valid_dict['ML']['y']), val_prediction)
    # Label the entry with the model trained here; the loop variable left
    # over from an earlier cell would name a different model.
    ml_models_metrics.append((f'random forest regressor (n_estimators={n_estims})', mse))
    print(f'final loss={Fore.RED}', mse,f'{Style.RESET_ALL}')
    history.append(mse)

In [None]:
# Validation MSE (y axis) against the number of trees (x axis).
plt.plot(test_estimators, history)
plt.grid()

In [None]:
from xgboost import XGBRegressor

In [None]:
# XGBoost regressor configured with the hand-picked hyper-parameters.
optimized_xgb = XGBRegressor(
    max_depth=10,
    n_estimators=349,
    learning_rate=0.03463499307472963,
    gamma=0.2584804713489563,
    random_state=666,
)

In [None]:
# Fit on the ML training split, then score the predictions on the validation split.
optimized_xgb.fit(split_algo_train_dict['ML']['X'], split_algo_train_dict['ML']['y'])
val_prediction = optimized_xgb.predict(split_algo_valid_dict['ML']['X'])
# NOTE: arguments are passed as (prediction, target); MSE is symmetric, so
# the value is the same as with the documented (y_true, y_pred) order.
mse = mean_squared_error(val_prediction, split_algo_valid_dict['ML']['y'])

In [None]:
# Report the validation MSE stored in `mse`.
print('mse for xgboost regressor=', mse)

In [None]:
def predict_test_data(model, test_folder, test_csv, example_shape, x_rescale, y_rescale, verbose=1):
    """Predict a value for every segment listed in ``test_csv``.

    Args:
        model: object with a ``predict`` method.
        test_folder: folder handed to ``get_test_data`` for loading a segment.
        test_csv: file name (inside the competition input folder) of a CSV
            with a ``segment_id`` column listing the segments to predict.
        example_shape: shape handed to ``get_test_data``.
        x_rescale: divisor applied to the loaded input before prediction.
        y_rescale: factor applied to the raw model output.
        verbose: when truthy, print the number of segments and a progress
            line every 20 segments.

    Returns:
        Dict mapping each segment id to ``model.predict(...) * y_rescale``.
    """
    print(test_folder, test_csv)
    test_indexes = pd.read_csv(os.path.join('../input/predict-volcanic-eruptions-ingv-oe/',test_csv))['segment_id']
    num_test_files = len(test_indexes)
    if verbose:
        # Report the number of segments read from the CSV just above.
        print(f'there are{Fore.GREEN}', num_test_files, f'{Style.RESET_ALL}test files')
    predictions = {}
    for i, ID in enumerate(test_indexes):
        # NOTE(review): get_test_data is not defined in the visible part of
        # this file — confirm that it exists before calling this function.
        X = get_test_data(test_folder, ID, example_shape) / x_rescale
        predictions[ID] = model.predict(X) * y_rescale
        if verbose and i % 20 == 0:
            print(i,'/', num_test_files)
    return predictions

**Predict test data**

In [None]:
# Show the first rows of the sample submission file.
pd.read_csv('../input/predict-volcanic-eruptions-ingv-oe/sample_submission.csv').head()

In [None]:
# Predict every segment listed in the sample submission.
# NOTE(review): `current_model`, `sensor_abs_max_value` and
# `label_abs_max_value` are not defined in the visible part of this file —
# confirm where they come from before running this cell.
predictions = predict_test_data(model=current_model, test_folder=TEST_FOLDER,
                                test_csv='sample_submission.csv',
                                # The shape constant in this file is VULCAN_INPUT_SIZE.
                                example_shape=VULCAN_INPUT_SIZE,
                                x_rescale=sensor_abs_max_value,
                                y_rescale=label_abs_max_value,
                                verbose=1)

In [None]:
# One row per predicted segment; every prediction is converted with int().
predict_frame = pd.DataFrame({'segment_id': list(predictions.keys()), 'time_to_eruption': [int(val) for val in predictions.values()]})
# Write the submission file without the row index.
predict_frame.to_csv(path_or_buf='../working/submission.csv', index=False)

In [None]:
# Show the first rows of the submission frame.
predict_frame.head()