 # Load libraries and simple functions

# ACKNOWLEDGEMENTS

The LGBM parameters were taken from https://www.kaggle.com/amanjain1008/erupting-volcano-all-in-one-different-eda

In [None]:
# Notebook version number; it is embedded in the name of the submission file written at the end
ver = 35

In [None]:
import pandas as pd
import numpy as np
import zipfile as zf
import os
import datetime as dt
from tqdm import tqdm, tqdm_notebook
import re
import string
import warnings
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Draw every matplotlib figure of this notebook with the 'ggplot' style
plt.style.use('ggplot')


def get_df_name(df):
    """Return the name of the first module-level variable bound to ``df``.

    The lookup compares by identity (``is``) against every entry of this
    module's ``globals()``.

    Args:
        df: Any object, typically a DataFrame held in a notebook variable.

    Returns:
        The variable name, or ``'<unnamed>'`` when no global refers to ``df``
        (for example when a temporary such as ``df.head()`` is passed in).
    """
    # Iterate over a snapshot and stop at the first hit; a missing name is
    # reported with a placeholder instead of raising IndexError.
    for name, value in list(globals().items()):
        if value is df:
            return name
    return '<unnamed>'
def tstats(t, doplot = False):
    """Print a per-column summary of the DataFrame ``t``.

    A header line shows the variable name (via ``get_df_name``) and the shape.
    Each column is then listed with its number of unique values; columns that
    contain nulls also show the null count and the rounded null percentage.

    Args:
        t: DataFrame to summarise.
        doplot: When true, additionally print a random sample of 10 rows.
    """
    n_rows = t.shape[0]
    print('#' * 20, get_df_name(t), t.shape)

    for column in t:
        series = t[column]
        missing = series.isna().sum()
        fields = [column, '=', series.nunique()]
        if missing > 0:
            fields += [',NULLS = ', missing, ',% of nulls = ', round(100 * missing / n_rows)]
        print(*fields)

    if not doplot:
        return
    print()
    print(t.sample(10))
    print()
        
# Suppress all Python warnings from here on
warnings.filterwarnings('ignore')

 # Code to load files. Some logic is displayed and explained later

In [None]:
# '/Users/user/Downloads/'

import os
from tqdm import tqdm_notebook
import scipy.fftpack
from scipy.signal import chirp, find_peaks, peak_widths, peak_prominences

# Root directory of the competition data; the loader functions below append a sub-directory name to it
homedir = '/kaggle/input/predict-volcanic-eruptions-ingv-oe/'

def read_n_files(dirname, nfiles):
    """Load at most ``nfiles`` CSV files found under ``homedir + dirname``.

    Each file is read with ``pd.read_csv`` and gets a ``segment_id`` column
    holding the file name without its last four characters.

    Args:
        dirname: Sub-directory of ``homedir`` to walk, e.g. ``'train'``.
        nfiles: Maximum number of files to read.

    Returns:
        A single DataFrame with the rows of all files that were read, on a
        fresh integer index; an empty DataFrame when nothing was read.
    """
    frames = []
    for dirpath, _, filenames in os.walk(homedir + dirname):
        for filename in filenames:
            if len(frames) >= nfiles:
                break
            frame = pd.read_csv(os.path.join(dirpath, filename))
            frame['segment_id'] = filename[:-4]
            frames.append(frame)
        if len(frames) >= nfiles:
            # Limit reached: stop walking the remaining directories as well.
            break

    if not frames:
        return pd.DataFrame()
    # One concat at the end avoids re-copying the accumulated rows per file.
    return pd.concat(frames, ignore_index=True)

# Sample of 10 training files (nulls replaced by 0), used below to find the float64 columns
t10 = read_n_files('train', 10).fillna(0)
float_cols = [c for c in t10 if t10[c].dtype == "float64"]
# dtype mapping handed to pd.read_csv in read_n_files32 so that these columns are parsed as float32
float32_cols = {c: np.float32 for c in float_cols}


# Number of intervals the spectrum is split into.
# Interval features = number of sensors * 4 statistics per interval * nparts,
# e.g. nparts == 10 => 10 intervals * 10 sensors * 4 = 400 features.
nparts = 6
# Interval boundaries: nparts + 1 evenly spaced integer positions from 0 to 30000
r = [int(j*30000/nparts) for j in range(nparts+1)]

def get_peaks(tab, col, distance=500):
    """Return the log-amplitude spectrum of one column and its peak positions.

    The column is transformed with ``scipy.fftpack.fft``; the first ``N // 2``
    coefficients (``N`` = number of rows of ``tab``) are converted to
    ``log(2 / N * |coefficient|)`` and searched with
    ``scipy.signal.find_peaks``.

    Args:
        tab: DataFrame holding the signal.
        col: Name of the column to analyse.
        distance: Minimal number of samples between neighbouring peaks,
            forwarded to ``find_peaks``. Defaults to 500.

    Returns:
        A tuple ``(x, peaks)``: ``x`` is the log-amplitude array of length
        ``N // 2`` and ``peaks`` the integer indices of the peaks found in it.
    """
    N = tab.shape[0]  # number of sample points
    yf = scipy.fftpack.fft(tab[col].values)

    # Bins with zero amplitude become -inf here (np.log(0)).
    x = np.log(2.0 / N * np.abs(yf[:N // 2]))
    peaks, _ = find_peaks(x, distance=distance)

    return x, peaks

def _interval_peak_stats(sensor, x, peaks):
    """Summarise the spectral peaks of one sensor inside each interval of ``r``.

    Returns a list of ``[feature_name, value]`` pairs: per interval the median
    and standard deviation of the peak positions (``_mx``, ``_sx``) and of the
    peak heights (``_my``, ``_sy``).
    """
    rows = []
    for r0, r1 in zip(r[:-1], r[1:]):
        xpeaks = peaks[(peaks > r0) & (peaks <= r1)]
        ypeaks = x[xpeaks]
        prefix = f's{sensor:02d}_r{r0:05d}_{r1}'
        # np.round lets NaN through (an interval without peaks gives a NaN
        # median/std), whereas the builtin round() can raise on NaN; the NaNs
        # are replaced by 0 later on.
        rows.append([prefix + '_mx', np.round(np.median(xpeaks))])
        rows.append([prefix + '_sx', np.round(np.std(xpeaks))])
        rows.append([prefix + '_my', np.round(np.median(ypeaks), 2)])
        rows.append([prefix + '_sy', np.round(np.std(ypeaks), 2)])
    return rows


def _top_peak_features(sensor, x, peaks, top=5):
    """Describe the ``top`` most prominent peaks of the interpolated peak line.

    The spectral peaks are joined into one line by linear interpolation over
    positions 0..30000; the peaks of that line are ranked by prominence and
    the position, height, full width and prominence of the first ``top`` are
    returned as ``[feature_name, value]`` pairs.
    """
    df1 = pd.DataFrame({'ind': peaks, 'x': x[peaks]})
    df2 = pd.DataFrame({'ind': np.arange(30001)})
    # The left join leaves NaN at every non-peak position so that they can be
    # filled by interpolation.
    df = df2.merge(df1, how='left', on='ind')
    df = df.interpolate(method='linear', limit_direction='both', axis=0)

    # Work on a private copy because the first sample is overwritten below.
    data = df['x'].to_numpy(copy=True)
    # The first value is often the largest, which would hide the first real
    # peak, so it is lowered to the minimum of the first 5000 samples.
    data[0] = data[:5000].min()

    line_peaks, _ = find_peaks(data)
    prominences = peak_prominences(data, line_peaks)[0]
    widths = peak_widths(data, line_peaks, rel_height=1)[0]

    peak_data = pd.DataFrame({
        'peak_x': line_peaks,
        'peak_y': data[line_peaks],
        'width': widths,
        'height': prominences
    }).sort_values(by='height', ascending=False)

    best = peak_data.head(top)
    rows = []
    for field in ('peak_x', 'peak_y', 'width', 'height'):
        for rank, value in enumerate(best[field].to_numpy()):
            rows.append([f's{sensor:02d}p_{field}{rank:02d}', value])
    return rows


def read_n_files32(dirname, nfiles):
    """Build one row of spectral-peak features for each of up to ``nfiles`` files.

    Every CSV under ``homedir + dirname`` is read with the ``float32_cols``
    dtype mapping. For the columns ``sensor_1`` .. ``sensor_10`` the log
    spectrum and its peaks are obtained from ``get_peaks`` and turned into
    interval statistics and top-peak descriptors.

    Args:
        dirname: Sub-directory of ``homedir`` to walk, e.g. ``'train'``.
        nfiles: Maximum number of files to process.

    Returns:
        A DataFrame with one row per file: ``segment_id`` (file name without
        its last four characters) followed by the feature columns; missing
        values are replaced by 0. Empty when no file was processed.
    """
    frames = []
    cnt = 0
    for dirpath, _, filenames in os.walk(homedir + dirname):
        for filename in tqdm_notebook(filenames):
            cnt += 1
            if cnt > nfiles:
                break
            t = pd.read_csv(os.path.join(dirpath, filename), engine='c', dtype=float32_cols)

            res = []
            for i in range(1, 11):
                x, peaks = get_peaks(t, 'sensor_' + str(i))
                res.extend(_interval_peak_stats(i, x, peaks))
                res.extend(_top_peak_features(i, x, peaks))

            feats = pd.DataFrame(res, columns=['col', 'val'])
            feats['segment_id'] = filename[:-4]
            feats['val'] = feats['val'].astype(float)
            feats = feats.fillna(0)
            # Long -> wide: one row per segment, one column per feature name.
            frames.append(feats.pivot_table(
                index='segment_id',
                columns='col',
                values='val'
            ).reset_index())
        if cnt > nfiles:
            # Limit reached: do not walk any further directories.
            break

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the growing result for each file.
    tmp_res = pd.concat(frames, ignore_index=True)
    tmp_res.columns.name = None
    return tmp_res.fillna(0)

def read_all_files(dirname):
    """Reduce every CSV file under ``homedir + dirname`` to one row of medians.

    Each file is read with nulls replaced by 0, tagged with a ``segment_id``
    (file name without its last four characters) and collapsed with
    ``groupby('segment_id').median()``.

    Args:
        dirname: Sub-directory of ``homedir`` to walk, e.g. ``'train'``.

    Returns:
        A DataFrame with one row per file; empty when no file was found.
    """
    frames = []
    for dirpath, _, filenames in os.walk(homedir + dirname):
        for filename in filenames:
            t = pd.read_csv(os.path.join(dirpath, filename)).fillna(0)
            t['segment_id'] = filename[:-4]
            frames.append(t.groupby('segment_id', as_index=False).median())

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the growing result for each file.
    return pd.concat(frames, ignore_index=True)

 # Load first 10 files and explore some of them

In [None]:
# Load 10 training files (nulls replaced by 0) and show which segment ids they contain
t10 = read_n_files('train', 10).fillna(0)
t10['segment_id'].unique()

 # Plot sensors info

In [None]:
# Example segment used by the following cells; `t` keeps the row labels it had in `t10`.
# NOTE(review): the segment id is hard-coded — confirm it is among the files loaded into `t10`.
t = t10.loc[t10['segment_id'] == '800654756'].copy()
plt.figure(figsize=(15,15))

# One subplot per sensor column (sensor_1 .. sensor_10) on a 3x4 grid
for i in [i for i in range(1,11)]:
    
    # NOTE(review): `k` is assigned here but not used anywhere in this cell
    if i > 4: k = i - 4 
    else: k = i
    plt.subplot(3,4,i)

    # Raw signal of the sensor against the row index
    plt.plot(t.index, t['sensor_'+str(i)])

plt.show()

 # Plot the fast Fourier transform of each sensor signal and mark the peaks on it

In [None]:
# Amplitude spectrum of every sensor of the example segment `t`, with detected peaks marked
plt.figure(figsize=(15,15))

for i in [i for i in range(1,11)]:
    
    plt.subplot(3,4,i)

    # Number of samplepoints
    N = t.shape[0]
    # sample spacing
    T = 1.0 / 800.0
    x = np.linspace(0.0, N*T, N)  # overwritten below before it is used
    y = t['sensor_'+str(i)].values
    yf = scipy.fftpack.fft(y)
    xf = np.linspace(0.0, int(1.0/(2.0*T)), int(N/2))  # not used further in this cell
    
    # Amplitude of the first N//2 FFT coefficients, scaled by 2/N
    x = 2.0/N * np.abs(yf[:N//2])
    # Peaks at least 500 samples apart and at least 0.5 high
    peaks, _ = find_peaks(x, distance=500, height=0.5)
    
    plt.plot(x)
    plt.plot(peaks, x[peaks], "x")
    plt.plot(np.zeros_like(x), "--", color="gray")  # dashed zero line
    # Subplot title = median height of the detected peaks
    plt.title(str(np.median(x[peaks])))

plt.show()

 # Transform the FFT to log scale and find its peaks.
 Split the peaks into intervals and summarise them within each interval

In [None]:
# Log-amplitude spectrum of every sensor of `t`, with peaks marked and per-interval peak statistics printed
plt.figure(figsize=(15,15))

for i in [i for i in range(1,11)]:
    
    plt.subplot(3,4,i)

    # Number of samplepoints
    N = t.shape[0]
    # sample spacing
    T = 1.0 / 800.0
    x = np.linspace(0.0, N*T, N)  # overwritten below before it is used
    y = t['sensor_'+str(i)].values
    yf = scipy.fftpack.fft(y)
    xf = np.linspace(0.0, int(1.0/(2.0*T)), int(N/2))  # not used further in this cell
    
    # Same computation as get_peaks(): log of the 2/N-scaled amplitude, peaks at least 500 samples apart
    x = np.log(2.0/N * np.abs(yf[:N//2]))
    peaks, _ = find_peaks(x, distance=500)
    
    plt.plot(x)
    plt.plot(peaks, x[peaks], "x")
    plt.plot(np.zeros_like(x), "--", color="gray")  # dashed zero line

#     r = [j*5000 for j in range(7)]
    # Walk over the intervals delimited by consecutive entries of the global `r`
    for j in range(len(r)-1):

        # Peaks whose position lies in (r[j], r[j+1]] and their heights
        xpeaks = peaks[(peaks > r[j])&(peaks <= r[j+1])]
        ypeaks = x[xpeaks]

        # Median / standard deviation of the peak positions
        # NOTE(review): an interval without peaks yields NaN here — confirm round() accepts it on the installed NumPy
        mx = round(np.median(xpeaks))
        sx = round(np.std(xpeaks))

        # Median / standard deviation of the peak heights, two decimals
        my = round(np.median(ypeaks), 2)
        sy = round(np.std(ypeaks), 2)
        print('plot', i, 'range =', r[j], r[j+1], f': x med = {mx}, x sd = {sx}, y med = {my}, y sd = {sy}')
    
    print()
#     plt.title(str(np.median(x[peaks])))

plt.show()

# Explore single sensor

In [None]:
# Sensor number explored in this cell
i=2

plt.figure(figsize=(15,5))

# Log spectrum of the sensor and its peaks
x, peaks = get_peaks(t, 'sensor_'+str(i))

plt.plot(x)
plt.plot(peaks, x[peaks], "x")
plt.show()

# Peak positions/heights, and a frame with every position 0..30000
df1 = pd.DataFrame({'ind':peaks, 'x':x[peaks]})
df2 = pd.DataFrame({'ind':np.linspace(0,30000,30001).astype(int)})
df = df2.merge(df1, how='left', on='ind') # we need to get nans in order to interpolate them

# Join the peaks into one line by linear interpolation, filling both ends as well
df = df.interpolate(method='linear', limit_direction='both', axis=0)

data = df['x'].values
data[0] = data[:5000].min()# first value is often the largest, so it isn't a peak. we need it to be not the largest if we want a first peak

# Peaks of the interpolated line (from here on `peaks` refers to these)
peaks, _ = find_peaks(data)

prominences = peak_prominences(data, peaks)[0]
# Level of the base of each peak: peak height minus its prominence
contour_heights = data[peaks] - prominences

# Widths at half prominence
results_half = peak_widths(data, peaks, rel_height=0.5)
results_half[0]  # widths

# Widths at the base of each peak (rel_height=1)
results_full = peak_widths(data, peaks, rel_height=1)
results_full[0]  # widths

plt.figure(figsize=(15,5))

plt.plot(data)
plt.plot(peaks, data[peaks], "x")

# Horizontal bars for the half and full widths, vertical bars for the prominences
plt.hlines(*results_half[1:], color="C2")
plt.hlines(*results_full[1:], color="C3")

plt.vlines(x=peaks, ymin=contour_heights, ymax=data[peaks])

plt.show()

# One row per peak, most prominent first ('height' holds the prominence)
peak_data = pd.DataFrame({
    'peak_x':peaks,
    'peak_y':data[peaks],
    'width':results_full[0],
    'height':prominences
}).sort_values(by='height', ascending=False)

# peak_data

# Keep the 5 most prominent peaks; 'index' becomes their zero-padded rank ('00'..'04')
tmp = peak_data.reset_index(drop=True).reset_index().head(5)
tmp['index'] = tmp['index'].astype(str).apply(lambda x: x.zfill(2))

# Long format: one row per (rank, measure) pair
tmp = tmp.melt(id_vars=['index'], value_vars=['peak_x', 'peak_y', 'width', 'height'])

# Feature name = measure name followed by the rank, e.g. 'peak_x00'
tmp['variable'] = tmp['variable'] + tmp['index']

# tmp = tmp.drop(columns='index').set_index('variable').T

# Display the resulting table
tmp

# Join the peaks into a line and find the peaks of that line,
then record x, y, width and height for each of the 5 tallest peaks

In [None]:
from scipy.signal import find_peaks_cwt

# Build the complete feature row of the example segment `t` and plot, for each
# sensor, the interpolated peak line with its peaks, widths and prominences.
plt.figure(figsize=(15,15))

# Collected [feature_name, value] pairs of all sensors
res = []

for i in range(1, 11):

    plt.subplot(3,4,i)

    # Log spectrum of the sensor and its peaks
    x, peaks = get_peaks(t, 'sensor_'+str(i))

    # Interval features: statistics of the peaks inside each interval of `r`
    for j in range(len(r)-1):

        xpeaks = peaks[(peaks > r[j])&(peaks <= r[j+1])]
        ypeaks = x[xpeaks]

        mx = round(np.median(xpeaks))
        sx = round(np.std(xpeaks))

        my = round(np.median(ypeaks), 2)
        sy = round(np.std(ypeaks), 2)

        r0, r1 = r[j], r[j+1]

        res.append([f's{i:02d}_r{r0:05d}_{r1}_mx', mx])
        res.append([f's{i:02d}_r{r0:05d}_{r1}_sx', sx])
        res.append([f's{i:02d}_r{r0:05d}_{r1}_my', my])
        res.append([f's{i:02d}_r{r0:05d}_{r1}_sy', sy])

    df1 = pd.DataFrame({'ind':peaks, 'x':x[peaks]})
    df2 = pd.DataFrame({'ind':np.linspace(0,30000,30001).astype(int)})
    df = df2.merge(df1, how='left', on='ind') # we need to get nans in order to interpolate them

    # Join the peaks into one line by linear interpolation, filling both ends as well
    df = df.interpolate(method='linear', limit_direction='both', axis=0)

    data = df['x'].values
    data[0] = data[:5000].min()# first value is often the largest, so it isn't a peak. we need it to be not the largest if we want a first peak

    # Peaks of the interpolated line (from here on `peaks` refers to these)
    peaks, _ = find_peaks(data)

    prominences = peak_prominences(data, peaks)[0]
    # Level of the base of each peak: peak height minus its prominence
    contour_heights = data[peaks] - prominences

    # Widths at half prominence and at the base of each peak
    results_half = peak_widths(data, peaks, rel_height=0.5)
    results_full = peak_widths(data, peaks, rel_height=1)

    plt.plot(data)
    plt.plot(peaks, data[peaks], "x")

    plt.hlines(*results_half[1:], color="C2")
    plt.hlines(*results_full[1:], color="C3")

    plt.vlines(x=peaks, ymin=contour_heights, ymax=data[peaks])

    # One row per peak, most prominent first ('height' holds the prominence)
    peak_data = pd.DataFrame({
        'peak_x':peaks,
        'peak_y':data[peaks],
        'width':results_full[0],
        'height':prominences
    }).sort_values(by='height', ascending=False)

    tmp = peak_data.reset_index(drop=True).reset_index().head(5) # top5. some sensors go flat, some show 100 peaks. let's count most visible ones
    tmp['index'] = tmp['index'].astype(str).apply(lambda x: x.zfill(2)) # just for fancy grouping of vars

    tmp = tmp.melt(id_vars=['index'], value_vars=['peak_x', 'peak_y', 'width', 'height'])

    tmp['variable'] = tmp['variable'] + tmp['index']

    # NOTE(review): read_n_files32 names these features with an 's{i:02d}p_' prefix;
    # this demo uses 's{i:02d}_' — confirm whether the two are meant to match.
    tmp = np.array(tmp[['variable', 'value']])
    for titem in tmp:
        res.append([f's{i:02d}_' + titem[0], titem[1]])


plt.show()

tmp = pd.DataFrame(np.array(res), columns=['col', 'val'])
# `t` is a filtered copy that keeps the row labels of `t10`, so a label lookup
# with [0] only works when the segment was the first file loaded; take the
# first row by position instead.
tmp['segment_id'] = t['segment_id'].iloc[0]
tmp['val'] = tmp['val'].astype(float)
tmp = tmp.fillna(0)
# Long -> wide: one row for the segment, one column per feature name
tmp = tmp.pivot_table(
    index='segment_id',
    columns='col',
    values='val'
).reset_index()
# Display the resulting one-row table
tmp


 # Read all data using transformations mentioned earlier
 
 I've uploaded the results of these two cells as a dataset, because the files take a long time to create

In [None]:
# t4m = read_n_files32('train', 4431)

In [None]:
# t4p = read_n_files32('test', 4520)

 # Load training labels

In [None]:
# label = pd.read_csv(homedir+'train.csv')
# tstats(label)

 # Add label to dataset

In [None]:
# label['segment_id'] = label['segment_id'].astype(str)
# t4m = t4m.merge(label, how='left', on='segment_id')
# tstats(t4m)

# Import ML stuff

In [None]:
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import f1_score,recall_score,precision_score,roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
import gc

def plot_history(history, metricname):
    """Plot training and validation curves side by side.

    The left panel shows the metric ``metricname``, the right panel the loss.

    Args:
        history: Object whose ``history`` attribute maps the names
            ``metricname``, ``'val_' + metricname``, ``'loss'`` and
            ``'val_loss'`` to per-epoch value sequences.
        metricname: Name of the metric drawn in the left panel.
    """
    log = history.history
    # Fetch all four series first so a missing key fails before any drawing.
    train_metric = log[metricname]
    val_metric = log['val_' + metricname]
    train_loss = log['loss']
    val_loss = log['val_loss']
    epochs = range(1, len(train_metric) + 1)

    panels = (
        (train_metric, 'Training', val_metric, 'Validation',
         'Training and validation'),
        (train_loss, 'Training loss', val_loss, 'Validation loss',
         'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, (train, train_label, val, val_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, train, 'b', label=train_label)
        plt.plot(epochs, val, 'r', label=val_label)
        plt.title(title)
        plt.legend()
    plt.show()
    
from keras import regularizers as kreg
from keras.layers import Dense
from keras.models import Sequential

# Define x and y

Also compute the correlation matrix and build extra features as the sum and the product of the features
that are correlated with the target.

Then apply a quantile transformation to the target so that it follows a Gaussian distribution.

In [None]:
def _add_interaction_features(frame, good_features):
    """Add ``sum_feat``, ``mul_feat`` and ``s_m_feat`` columns to ``frame`` in place.

    The ``good_features`` columns are standardised with a scaler fitted on
    ``frame`` itself; the new columns are the row-wise sum, the row-wise
    product, and the product of those two.
    """
    scaled = pd.DataFrame(
        StandardScaler().fit_transform(frame[good_features]),
        columns=good_features,
    )
    sum_feat = scaled.sum(axis=1)
    mul_feat = scaled.prod(axis=1)

    frame['sum_feat'] = sum_feat.values
    frame['mul_feat'] = mul_feat.values
    frame['s_m_feat'] = (sum_feat * mul_feat).values


def prepare_data(scale=False):
    """Load the precomputed feature tables and prepare model inputs.

    Steps: read the train/test feature CSVs, select the features whose
    absolute correlation with the target lies strictly between 0.05 and 1,
    add sum/product interaction features built from them, optionally
    standardise all features, and fit a quantile transformer on the target.

    Args:
        scale: When true, standardise every feature column with a scaler
            fitted on the training table and applied to both tables.

    Returns:
        A tuple ``(t4m, t4p, X, y, z, feat, target, qt)``:
        the train and test tables, the training feature matrix and target,
        the quantile-transformed target, the list of feature names, the
        target column name and the fitted ``QuantileTransformer``.
    """
    from sklearn.preprocessing import QuantileTransformer

    t4m = pd.read_csv('/kaggle/input/volcanic-eruptions/t4m_10.csv')
    t4p = pd.read_csv('/kaggle/input/volcanic-eruptions/t4p_10.csv')
    # assumes the first column is the segment id and the last one the target — TODO confirm
    features = t4m.columns[1:-1]
    target = 'time_to_eruption'

    # Absolute correlation of every column with the target
    df = t4m[t4m.columns[1:]].copy()
    t = df.corr()[target].reset_index()
    t[target] = abs(t[target])

    # "< 1" drops the target itself; the rest is ordered by strength
    t = t.loc[(t[target] > 0.05)&(t[target] < 1)].sort_values(by=target, ascending=False).reset_index()

    good_features = t['index'].values

    # NOTE(review): each table is standardised with its own statistics before
    # the interaction features are built (the test table is not transformed
    # with the training scaler) — confirm this is intended.
    _add_interaction_features(t4m, good_features)
    _add_interaction_features(t4p, good_features)

    feat = list(features) + ['sum_feat', 'mul_feat', 's_m_feat']

    if scale:
        scaler = StandardScaler()
        t4m[feat] = scaler.fit_transform(t4m[feat])
        t4p[feat] = scaler.transform(t4p[feat])

    X, y = t4m[feat], t4m[target]

    # Map the target to a normal distribution; `qt` is returned so that the
    # transformation can be inverted later.
    rng = np.random.RandomState(304)
    qt = QuantileTransformer(n_quantiles=1000, output_distribution='normal',
                             random_state=rng)
    z = qt.fit_transform(y.values.reshape(-1, 1))

    return t4m, t4p, X, y, z, feat, target, qt

# Build the unscaled datasets and keep the results, including the fitted target transformer `qt`, as notebook globals
t4m, t4p, X, y, z, feat, target, qt = prepare_data()

def de_qt(val):
    """Undo the target transformation of the global ``qt``.

    Args:
        val: Array of transformed values; it is reshaped to a single column
            before being passed to ``qt.inverse_transform``.

    Returns:
        The inverse-transformed values as a flat array.
    """
    column = val.reshape(-1, 1)
    restored = qt.inverse_transform(column)
    return restored.ravel()

In [None]:
def show_model_performance(y_test_plot, y_pred_plot, y_train_plot=None):
    """Print the MAE of a prediction and plot label/prediction diagnostics.

    Draws three histograms (training labels, test labels, predictions) and a
    line plot of true values and predictions ordered by predicted value.

    Args:
        y_test_plot: True target values (array-like).
        y_pred_plot: Predicted target values (array-like, same length).
        y_train_plot: Optional training labels for the first histogram. When
            omitted, a module-level ``y_train`` is used if one exists;
            otherwise that panel is left empty.
    """
    y_true = np.asarray(y_test_plot).ravel()
    y_hat = np.asarray(y_pred_plot).ravel()
    if y_train_plot is None:
        # Earlier versions read the training labels from a global variable.
        y_train_plot = globals().get('y_train')

    # NOTE(review): presumably converts milliseconds to days — confirm the unit of the target
    converter2days = 1/60/60/24/1000
    res = pd.DataFrame({
        'y': y_true*converter2days,
        'preds': y_hat*converter2days})
    # Order by prediction so that both curves can be compared point by point
    res = res.sort_values(by='preds').reset_index(drop=True)

    lmetric = int(mean_absolute_error(y_true, y_hat))
    print('The MAE of prediction is:', lmetric)

    fig = plt.figure(figsize=(19, 4))
    ax1 = fig.add_subplot(1, 3, 1)
    ax2 = fig.add_subplot(1, 3, 2)
    ax3 = fig.add_subplot(1, 3, 3)
    if y_train_plot is not None:
        ax1.hist(y_train_plot, bins=25)
    ax1.set_title('train labels')

    # Use the arguments rather than same-named globals
    ax2.hist(y_true, bins=25)
    ax2.set_title('test labels')

    ax3.hist(y_hat, bins=25)
    ax3.set_title('Predictions')

    plt.show()


    plt.figure(figsize = (15,5))
    plt.plot(res.index, res['y'], 'o-b', label = 'true labels')
    plt.plot(res.index, res['preds'], 'o-g', label = 'predictions')

    plt.title('Prediction Power')
    plt.xlabel('Measurement')
    plt.ylabel('Delta Magnitude')
    plt.legend(['true','preds'])

    plt.show()

# CONCLUSIONS:

    LGBM is the winner
    k-fold is a must-have
    optimal number of folds = 5
    LGBM works better with scaling
    
    Accuracy on hold-out:
    
    keras - 4.8 (4.8 w/o k-fold)
    catboost - 3.3 (3.9 w/o k-fold)
    lgbm - 2.91

    The sum feature (sum_feat) works well!
    Both peak and interval features are important

In [None]:
%%time

# Train LightGBM with K-fold cross-validation on the scaled features, average
# the fold predictions for the test table and write the submission file.
t4m, t4p, X, y, z, feat, target, qt = prepare_data(scale=True)

n_fold = 7
folds = KFold(n_splits=n_fold, shuffle=True, random_state=101)

params = {
    "n_estimators": 5000,
    "boosting_type": "gbdt",
    "metric": "mae",
    "num_leaves": 66,
    "learning_rate": 0.005,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    # Was misspelled "agging_freq"; the intended key is bagging_freq, which is
    # what makes bagging_fraction take effect.
    "bagging_freq": 3,
    "max_bins": 2048,
    "verbose": 0,
    "random_state": 101,
    "nthread": -1,
    "device": "cpu",
}

# Running average of the test predictions over all folds
y_pred = np.zeros(t4p.shape[0])

t_train = t4m.copy()

# A separate loop variable keeps `n_fold` (the number of folds) intact
for fold_idx, (trn_idx, val_idx) in enumerate(folds.split(t_train)):
    print(f"Fold {fold_idx}:")
    trn_x, trn_y = t_train[feat].iloc[trn_idx], t_train[target].iloc[trn_idx]
    val_x, val_y = t_train[feat].iloc[val_idx], t_train[target].iloc[val_idx]

    model = lgb.LGBMRegressor(**params)

    # NOTE(review): the `verbose` / `early_stopping_rounds` keywords of fit()
    # depend on the installed LightGBM version — confirm they are accepted.
    model.fit(trn_x, trn_y,
            eval_set= [(trn_x, trn_y), (val_x, val_y)],
            eval_metric="mae", verbose=200, early_stopping_rounds=50
           )

    # Each fold contributes 1/n_splits of the final prediction
    y_pred += model.predict(t4p[feat], num_iteration=model.best_iteration_) / folds.n_splits

plt.figure()
plt.hist(y_pred, bins=50)
plt.title('y pred distribution')
plt.show()

my_submission = pd.DataFrame({
    'segment_id':t4p['segment_id'].values,
    'time_to_eruption':y_pred
})

# my_submission.loc[my_submission[target] < 60000, target] = 60000 # in case of below zero predictions, correct them according to training data
# my_submission.loc[my_submission[target] > 4.8e+07, target] = 4.8e+07 # correct preds according to train limits

# Negative predictions are replaced by their absolute value
my_submission[target] = abs(my_submission[target])

my_submission.to_csv(f'submission_{ver}.csv', index=False)