In [None]:
# imports
import re
import pandas as pd
import numpy as np
from math import sqrt, ceil
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler



In [None]:
def get_train_dataset(train_features_df, labels_df):
    """Join the feature table with the labels and split it into X and y.

    The labels are left-joined onto the features by ``segment_id``, so every
    feature row is kept; rows without a matching label get a NaN target.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the merged frame without the
        ``segment_id`` and ``time_to_eruption`` columns and ``y`` is the
        ``time_to_eruption`` column.
    """
    merged = train_features_df.merge(labels_df, how='left', on='segment_id')
    target = merged['time_to_eruption']
    predictors = merged.drop(columns=['segment_id', 'time_to_eruption'])
    return predictors, target

In [None]:
def get_dataset_by_range(features_df, labels_df, lower_limit, upper_limit):
    """Return X and y restricted to rows whose label lies inside a range.

    The labels are left-joined onto the features by ``segment_id``. Only rows
    with a known ``time_to_eruption`` strictly between ``lower_limit`` and
    ``upper_limit`` (both bounds exclusive) are kept.

    Args:
        features_df: feature table containing a ``segment_id`` column.
        labels_df: label table with ``segment_id`` and ``time_to_eruption``.
        lower_limit: exclusive lower bound on ``time_to_eruption``.
        upper_limit: exclusive upper bound on ``time_to_eruption``.

    Returns:
        tuple: ``(X, y)``. ``X`` holds the feature columns only (no
        ``segment_id`` / ``time_to_eruption``), with every run of characters
        outside ``[A-Za-z0-9_]`` in a column name replaced by ``_``; ``y`` is
        the matching ``time_to_eruption`` column.
    """
    _df = features_df.merge(labels_df, on='segment_id', how='left')
    _label = _df['time_to_eruption']
    # NaN labels (segments without a match in labels_df) are dropped explicitly,
    # then the strict range filter is applied.
    _df = _df[_label.notna() & (_label > lower_limit) & (_label < upper_limit)]
    _y = _df['time_to_eruption']
    _x = _df.drop(['segment_id', 'time_to_eruption'], axis=1)
    # Rename the *dropped* frame: renaming `_df` here would bring the id and the
    # label back into the returned feature matrix.
    _x = _x.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '_', x))
    return (_x, _y)

In [None]:


# Read the train features, the test features and the labels from the Kaggle input parquet files.
train_df, test_df, labels_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '/kaggle/input/ingv-parquet/train_features.parquet',
        '/kaggle/input/ingv-parquet/test_features.parquet',
        '/kaggle/input/ingv-parquet/labels.parquet',
    )
)



In [None]:


# LightGBM doesn't like column names with special characters such as '-', so every run of
# characters outside [A-Za-z0-9_] is replaced by a single '_' in both feature tables.
train_df, test_df = (
    frame.rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '_', name))
    for frame in (train_df, test_df)
)



In [None]:
# top features selected for the models (originally described as the top 501; the list below holds 342 names)
features = [

 'sensor_9__fft_coefficient__attr_"abs"__coeff_21',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_22',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_23',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_24',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_25',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_26',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_27',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_28',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_29',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_30',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_31',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_32',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_33',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_34',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_35',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_36',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_37',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_38',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_39',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_40',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_41',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_42',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_43',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_44',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_45',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_47',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_50',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_52',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_54',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_55',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_56',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_58',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_59',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_60',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_61',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_62',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_63',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_64',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_69',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_71',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_79',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_80',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_81',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_82',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_83',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_87',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_90',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_92',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_93',
 'sensor_9__fft_coefficient__attr_"abs"__coeff_94',
 'sensor_9__fft_coefficient__attr_"angle"__coeff_23',
 'sensor_9__fft_coefficient__attr_"angle"__coeff_32',
 'sensor_9__fft_coefficient__attr_"angle"__coeff_44',
 'sensor_9__fft_coefficient__attr_"angle"__coeff_75',
 'sensor_9__fft_coefficient__attr_"angle"__coeff_77',
 'sensor_9__approximate_entropy__m_2__r_0.1',
 'sensor_9__approximate_entropy__m_2__r_0.3',
 'sensor_9__agg_linear_trend__attr_"stderr"__chunk_len_10__f_agg_"var"',
 'sensor_9__agg_linear_trend__attr_"stderr"__chunk_len_50__f_agg_"max"',
 'sensor_9__agg_linear_trend__attr_"stderr"__chunk_len_50__f_agg_"mean"',
 'sensor_9__agg_linear_trend__attr_"stderr"__chunk_len_50__f_agg_"var"',
 'sensor_9__number_crossing_m__m_0',
 'sensor_9__ratio_beyond_r_sigma__r_0.5',
 'sensor_9__permutation_entropy__dimension_3__tau_1',
 'sensor_9__permutation_entropy__dimension_4__tau_1',
 'sensor_8__abs_energy',
 'sensor_8__mean_abs_change',
 'sensor_8__standard_deviation',
 'sensor_8__kurtosis',
 'sensor_8__longest_strike_below_mean',
 'sensor_8__longest_strike_above_mean',
 'sensor_8__maximum',
 'sensor_8__minimum',
 'sensor_8__cid_ce__normalize_True',
 'sensor_8__quantile__q_0.2',
 'sensor_8__quantile__q_0.3',
 'sensor_8__quantile__q_0.4',
 'sensor_8__quantile__q_0.6',
 'sensor_8__quantile__q_0.7',
 'sensor_8__quantile__q_0.8',
 'sensor_8__agg_autocorrelation__f_agg_"mean"__maxlag_40',
 'sensor_8__number_cwt_peaks__n_1',
 'sensor_8__number_cwt_peaks__n_5',
 'sensor_8__number_peaks__n_1',
 'sensor_8__number_peaks__n_3',
 'sensor_8__number_peaks__n_5',
 'sensor_8__number_peaks__n_10',
 'sensor_8__number_peaks__n_50',
 'sensor_8__binned_entropy__max_bins_10',
 'sensor_8__spkt_welch_density__coeff_2',
 'sensor_8__spkt_welch_density__coeff_5',
 'sensor_8__spkt_welch_density__coeff_8',
 'sensor_8__ar_coefficient__coeff_1__k_10',
 'sensor_8__ar_coefficient__coeff_2__k_10',
 'sensor_8__ar_coefficient__coeff_3__k_10',
 'sensor_8__ar_coefficient__coeff_4__k_10',
 'sensor_8__ar_coefficient__coeff_5__k_10',
 'sensor_8__ar_coefficient__coeff_6__k_10',
 'sensor_8__ar_coefficient__coeff_7__k_10',
 'sensor_8__ar_coefficient__coeff_8__k_10',
 'sensor_8__ar_coefficient__coeff_9__k_10',
 'sensor_8__ar_coefficient__coeff_10__k_10',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_15',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_16',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_17',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_18',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_19',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_20',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_21',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_22',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_23',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_24',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_25',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_26',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_27',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_28',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_29',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_30',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_31',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_32',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_33',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_34',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_35',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_36',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_37',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_38',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_39',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_40',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_41',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_42',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_43',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_44',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_45',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_46',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_47',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_48',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_49',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_50',
 'sensor_8__fft_coefficient__attr_"abs"__coeff_83',
 'sensor_8__value_count__value_1',
 'sensor_8__value_count__value_-1',
 'sensor_8__approximate_entropy__m_2__r_0.1',
 'sensor_8__approximate_entropy__m_2__r_0.3',
 'sensor_8__approximate_entropy__m_2__r_0.5',
 'sensor_8__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"max"',
 'sensor_8__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"min"',
 'sensor_8__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"var"',
 'sensor_8__agg_linear_trend__attr_"stderr"__chunk_len_5__f_agg_"var"',
 'sensor_8__agg_linear_trend__attr_"stderr"__chunk_len_10__f_agg_"var"',
 'sensor_8__agg_linear_trend__attr_"stderr"__chunk_len_50__f_agg_"max"',
 'sensor_8__agg_linear_trend__attr_"stderr"__chunk_len_50__f_agg_"mean"',
 'sensor_8__agg_linear_trend__attr_"stderr"__chunk_len_50__f_agg_"var"',
 'sensor_8__number_crossing_m__m_0',
 'sensor_8__ratio_beyond_r_sigma__r_0.5',
 'sensor_8__permutation_entropy__dimension_3__tau_1',
 'sensor_3__abs_energy',
 'sensor_3__standard_deviation',
 'sensor_3__kurtosis',
 'sensor_3__longest_strike_below_mean',
 'sensor_3__longest_strike_above_mean',
 'sensor_3__maximum',
 'sensor_3__cid_ce__normalize_True',
 'sensor_3__quantile__q_0.2',
 'sensor_3__quantile__q_0.4',
 'sensor_3__quantile__q_0.6',
 'sensor_3__quantile__q_0.7',
 'sensor_3__agg_autocorrelation__f_agg_"mean"__maxlag_40',
 'sensor_3__agg_autocorrelation__f_agg_"median"__maxlag_40',
 'sensor_3__number_cwt_peaks__n_1',
 'sensor_3__number_cwt_peaks__n_5',
 'sensor_3__number_peaks__n_1',
 'sensor_3__number_peaks__n_3',
 'sensor_3__number_peaks__n_5',
 'sensor_3__number_peaks__n_10',
 'sensor_3__number_peaks__n_50',
 'sensor_3__binned_entropy__max_bins_10',
 'sensor_3__spkt_welch_density__coeff_2',
 'sensor_3__spkt_welch_density__coeff_5',
 'sensor_3__spkt_welch_density__coeff_8',
 'sensor_3__ar_coefficient__coeff_1__k_10',
 'sensor_3__ar_coefficient__coeff_2__k_10',
 'sensor_3__ar_coefficient__coeff_3__k_10',
 'sensor_3__ar_coefficient__coeff_4__k_10',
 'sensor_3__ar_coefficient__coeff_5__k_10',
 'sensor_3__ar_coefficient__coeff_6__k_10',
 'sensor_3__ar_coefficient__coeff_7__k_10',
 'sensor_3__ar_coefficient__coeff_8__k_10',
 'sensor_3__ar_coefficient__coeff_9__k_10',
 'sensor_3__change_quantiles__f_agg_"var"__isabs_False__qh_0.6__ql_0.4',
 'sensor_3__fft_coefficient__attr_"real"__coeff_11',
 'sensor_3__fft_coefficient__attr_"real"__coeff_12',
 'sensor_3__fft_coefficient__attr_"real"__coeff_13',
 'sensor_3__fft_coefficient__attr_"real"__coeff_14',
 'sensor_3__fft_coefficient__attr_"imag"__coeff_1',
 'sensor_3__fft_coefficient__attr_"imag"__coeff_11',
 'sensor_3__fft_coefficient__attr_"imag"__coeff_12',
 'sensor_3__fft_coefficient__attr_"imag"__coeff_13',
 'sensor_3__fft_coefficient__attr_"imag"__coeff_14',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_0',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_1',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_2',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_3',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_4',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_5',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_6',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_7',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_8',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_9',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_10',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_11',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_12',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_13',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_14',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_15',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_16',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_17',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_18',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_19',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_20',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_21',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_22',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_23',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_24',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_25',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_26',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_27',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_28',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_29',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_30',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_31',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_32',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_33',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_34',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_35',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_36',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_37',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_38',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_39',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_40',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_41',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_42',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_43',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_44',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_45',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_46',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_47',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_48',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_49',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_53',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_72',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_77',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_84',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_86',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_88',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_89',
 'sensor_3__fft_coefficient__attr_"abs"__coeff_94',
 'sensor_3__value_count__value_1',
 'sensor_3__value_count__value_-1',
 'sensor_3__approximate_entropy__m_2__r_0.1',
 'sensor_3__approximate_entropy__m_2__r_0.3',
 'sensor_3__approximate_entropy__m_2__r_0.5',
 'sensor_3__agg_linear_trend__attr_"slope"__chunk_len_5__f_agg_"mean"',
 'sensor_3__agg_linear_trend__attr_"stderr"__chunk_len_5__f_agg_"var"',
 'sensor_3__agg_linear_trend__attr_"stderr"__chunk_len_10__f_agg_"var"',
 'sensor_3__agg_linear_trend__attr_"stderr"__chunk_len_50__f_agg_"max"',
 'sensor_3__agg_linear_trend__attr_"stderr"__chunk_len_50__f_agg_"mean"',
 'sensor_3__agg_linear_trend__attr_"stderr"__chunk_len_50__f_agg_"var"',
 'sensor_3__number_crossing_m__m_0',
 'sensor_3__ratio_beyond_r_sigma__r_0.5',
 'sensor_3__permutation_entropy__dimension_3__tau_1',
 'sensor_3__permutation_entropy__dimension_4__tau_1',
 'sensor_5__mean_abs_change',
 'sensor_5__longest_strike_below_mean',
 'sensor_5__longest_strike_above_mean',
 'sensor_5__cid_ce__normalize_True',
 'sensor_5__cid_ce__normalize_False',
 'sensor_5__number_cwt_peaks__n_1',
 'sensor_5__number_cwt_peaks__n_5',
 'sensor_5__number_peaks__n_1',
 'sensor_5__number_peaks__n_3',
 'sensor_5__number_peaks__n_5',
 'sensor_5__number_peaks__n_10',
 'sensor_5__spkt_welch_density__coeff_2',
 'sensor_5__spkt_welch_density__coeff_5',
 'sensor_5__spkt_welch_density__coeff_8',
 'sensor_5__ar_coefficient__coeff_1__k_10',
 'sensor_5__ar_coefficient__coeff_2__k_10',
 'sensor_5__ar_coefficient__coeff_3__k_10',
 'sensor_5__ar_coefficient__coeff_4__k_10',
 'sensor_5__ar_coefficient__coeff_5__k_10',
 'sensor_5__ar_coefficient__coeff_6__k_10',
 'sensor_5__ar_coefficient__coeff_7__k_10',
 'sensor_5__ar_coefficient__coeff_8__k_10',
 'sensor_5__ar_coefficient__coeff_9__k_10',
 'sensor_5__ar_coefficient__coeff_10__k_10',
 'sensor_5__change_quantiles__f_agg_"mean"__isabs_True__qh_0.6__ql_0.0',
 'sensor_5__change_quantiles__f_agg_"var"__isabs_False__qh_1.0__ql_0.0',
 'sensor_5__change_quantiles__f_agg_"var"__isabs_False__qh_0.8__ql_0.2',
 'sensor_5__change_quantiles__f_agg_"mean"__isabs_True__qh_0.8__ql_0.2',
 'sensor_5__fft_coefficient__attr_"real"__coeff_11',
 'sensor_5__fft_coefficient__attr_"real"__coeff_12',
 'sensor_5__fft_coefficient__attr_"real"__coeff_13',
 'sensor_5__fft_coefficient__attr_"imag"__coeff_11',
 'sensor_5__fft_coefficient__attr_"imag"__coeff_12',
 'sensor_5__fft_coefficient__attr_"imag"__coeff_13',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_4',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_5',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_6',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_7',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_8',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_9',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_10',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_11',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_12',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_13',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_14',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_15',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_16',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_17',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_18',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_19',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_20',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_21',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_22',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_23',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_24',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_25',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_26',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_27',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_28',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_29',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_30',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_31',
 'sensor_5__fft_coefficient__attr_"abs"__coeff_32',
 'sensor_5__value_count__value_1',
 'sensor_5__value_count__value_-1',
 'sensor_5__approximate_entropy__m_2__r_0.1',
 'sensor_5__approximate_entropy__m_2__r_0.3',
 'sensor_5__approximate_entropy__m_2__r_0.5',
 'sensor_5__agg_linear_trend__attr_"stderr"__chunk_len_50__f_agg_"mean"',
 'sensor_5__number_crossing_m__m_0',
 'sensor_5__permutation_entropy__dimension_3__tau_1'
]

In [None]:


# Apply the same character clean-up to the feature names so they match the renamed dataset columns.
features = list(map(lambda name: re.sub('[^A-Za-z0-9_]+', '_', name), features))



In [None]:


# Build the test matrix from the selected features only, and keep the test
# segment ids aside so predictions can be mapped back to segments later.
x_test = test_df.drop(columns=['segment_id'])[features]
seg_id_test = test_df['segment_id']



In [None]:


# Build the training matrix and target, then restrict both tables to the selected features
# (test_df additionally keeps segment_id so predictions can be joined back onto it).
train, label = get_train_dataset(train_df, labels_df)
train = train[features]
test_df = test_df[features + ['segment_id']]

# A single MinMaxScaler is fitted on the train and test rows together and then applied to each.
scaler = MinMaxScaler()
scaler.fit(pd.concat([train, x_test]))

# The scaled values are wrapped back into DataFrames that carry the feature names.
x_test = pd.DataFrame(scaler.transform(x_test), columns=features)
train = pd.DataFrame(scaler.transform(train), columns=features)



In [None]:
# Hold out a validation set (20% of the rows) that is used for early stopping.
# shuffle=False keeps the row order, so the validation rows are the trailing ones;
# random_state only controls shuffling and therefore has no effect on this split.
x_train, x_val, y_train, y_val = train_test_split(
    train,
    label,
    test_size=0.2,
    shuffle=False,
    random_state=786,
)

In [None]:


# hyper params for our LightGBM (first-round model trained on the whole label range)
# 'application' is a documented LightGBM alias of 'objective'.
params = {'application':'regression',
         # NOTE(review): this key is 'boosting ' WITH a trailing space, which is not the
         # documented parameter name ('boosting'). Presumably LightGBM does not recognise it
         # and falls back to its default boosting type instead of DART -- TODO confirm.
         # Do not simply fix the key: LightGBM documents that early stopping is not available
         # in dart mode, so enabling DART would change how training terminates.
         'boosting ': 'dart',
         # NOTE(review): 'num_iterations' and 'n_estimators' (further down) are aliases of the
         # same LightGBM setting but carry different values; which one takes effect depends on
         # the installed LightGBM version -- TODO confirm and keep only one.
         'num_iterations':10000, 
         'learning_rate':0.03, 
         'num_leaves': 45,
         'extra_trees': True,
         'feature_fraction':0.8, 
         # NOTE(review): no 'bagging_freq' is set; per the LightGBM docs bagging is only
         # enabled when bagging_freq is non-zero, so this value may be inert -- verify.
         'bagging_fraction':0.9,
         'lambda_l1':0.1, 
         'lambda_l2':0.1, 
         'min_split_gain':0.01, 
         # early stopping is evaluated on the eval_set passed to fit()
         'early_stopping_round':100, 
         'max_depth':6,
         'min_child_weight':40, 
         'n_estimators': 400,
         'metric':'mse',
         'verbosity': -1}



In [None]:


# Fit the first-round model with the validation split as eval_set,
# then predict on that same validation split.
lgb_first = LGBMRegressor(**params).fit(x_train, y_train, eval_set=(x_val, y_val))
val_preds = lgb_first.predict(x_val)



In [None]:
# Pair every validation label with the prediction made for it.
output = pd.DataFrame({'val': y_val.tolist(), 'pred': val_preds})
# There is no segment id here, but the rows of one segment share the same label
# (time to eruption), so predictions are averaged per distinct label value.
output = output.groupby('val', as_index=False).mean()

actual = output['val'].to_numpy()
predicted = output['pred'].to_numpy()
print('Simple LGB model rmse: ', sqrt(mse(actual, predicted)))
print('Simple LGB model mae: ', mae(actual, predicted))

In [None]:


# Label ranges (lower, upper) used to zoom into the first-round validation error.
steps = [(0, 2500000), (2500000, 15000000), (10000000, 25000000),
         (20000000, 35000000), (30000000, 44000000), (44000000, 50000000)]

# One (range label, MAE) pair per range; both bounds are exclusive.
_mae_list = []
for lower, upper in steps:
    labels_col = output['val']
    bucket = output[labels_col.gt(lower) & labels_col.lt(upper)]
    _mae_list.append(
        ('%s-%s' % (lower, upper),
         mae(bucket['val'].to_numpy(), bucket['pred'].to_numpy()))
    )

mae_df = pd.DataFrame(_mae_list, columns=['range', 'mae'])



In [None]:


# Residual of the first-round model on the validation set (prediction minus label).
output = output.assign(diff=output['pred'] - output['val'])

# Three panels: label vs prediction histograms, residual scatter, MAE per label range.
fig, axes = plt.subplots(1, 3, figsize=(20, 5))
hist_ax, scatter_ax, bar_ax = axes
fig.suptitle('Figure 1 - Zooming Into 1st Round Model Performance')
hist_ax.hist([output['val'], output['pred']])
scatter_ax.scatter(output['val'], output['diff'], marker='x')
bar_ax.tick_params(labelrotation=45)
bar_ax.bar(mae_df['range'], mae_df['mae'])



In [None]:
# First-round predictions for the test rows, averaged per segment_id.
test_preds = lgb_first.predict(x_test)
test_first_preds_df = (
    pd.DataFrame({'segment_id': seg_id_test.tolist(), 'time_to_eruption': test_preds})
    .groupby('segment_id', as_index=False)
    .mean()
)

In [None]:
# Label ranges (lower, upper) for which smaller, specialised models are trained.
# Note that this rebinds `steps`, which earlier held the evaluation ranges.
steps = [
    (0, 2500000), (0, 15000000), (10000000, 25000000),
    (20000000, 35000000), (30000000, 50000000), (44000000, 50000000),
]
# Empty accumulators for the second-round results.
output_list = []
model_list = []
y_val_list = []
val_preds_list = []

In [None]:
# hyper-params for our specialized models. notice how we make these models a bit more complex by increasing number of leaves and max depth
# 'application' is a documented LightGBM alias of 'objective'.
params = {'application':'regression',
         # NOTE(review): this key is 'boosting ' WITH a trailing space, which is not the
         # documented parameter name ('boosting'). Presumably LightGBM does not recognise it
         # and falls back to its default boosting type instead of DART -- TODO confirm.
         # Do not simply fix the key: LightGBM documents that early stopping is not available
         # in dart mode, so enabling DART would change how training terminates.
         'boosting ': 'dart',
         # NOTE(review): 'num_iterations' and 'n_estimators' (further down) are aliases of the
         # same LightGBM setting but carry different values; which one takes effect depends on
         # the installed LightGBM version -- TODO confirm and keep only one.
         'num_iterations':8000, 
         'learning_rate':0.05, 
         'num_leaves': 95,
         'extra_trees': True,
         'feature_fraction':0.8, 
         # NOTE(review): no 'bagging_freq' is set; per the LightGBM docs bagging is only
         # enabled when bagging_freq is non-zero, so this value may be inert -- verify.
         'bagging_fraction':0.9,
         'lambda_l1':0.1, 
         'lambda_l2':0.1, 
         'min_split_gain':0.01, 
         # early stopping is evaluated on the eval_set passed to fit()
         'early_stopping_round':100, 
         'max_depth': 7,
         'min_child_weight':40, 
         'n_estimators': 400,
         # unlike the first-round params, the evaluation metric here is MAE
         'metric':'mae',
         'verbosity': -1}

In [None]:


# Iterate over the label ranges in `steps` and train one specialised model per range.
# Results are appended to `model_list` / `output_list` in the same order as `steps`;
# re-running this cell without resetting those lists appends a second set of entries.
for l, u in steps:
    # training rows whose label lies strictly between l and u, restricted to the selected
    # features and scaled with the scaler that was already fitted for the first-round model
    _x, _y = get_dataset_by_range(train_df, labels_df, l, u)
    _x = _x[features]
    _x = pd.DataFrame(scaler.transform(_x), columns=features)
    # unshuffled split of this range's rows; the validation part drives early stopping
    _x_train, _x_val, _y_train, _y_val = train_test_split(_x, _y, random_state=786, test_size=0.2, shuffle=False)
    # train the model for this range and predict on its validation part
    _lgb = LGBMRegressor(**params)
    _lgb.fit(_x_train, _y_train, eval_set=(_x_val, _y_val))
    _val_preds = _lgb.predict(_x_val)
    # actual value vs prediction, one row per distinct label (median over rows sharing a label)
    _output = pd.DataFrame(list(zip(_y_val, _val_preds)))
    _output.columns = ['val', 'pred']
    _output = _output.groupby('val').median().reset_index()
    # residual as prediction minus label -- the same sign convention as the first-round
    # `output['diff']`, so the residual plots of both rounds can be compared directly
    _output['diff'] = _output['pred'] - _output['val']
    # keep the model and its validation comparison frame
    model_list.append(_lgb)
    output_list.append(_output)



In [None]:


# One (range label, validation MAE) pair per specialised model; entry i of
# output_list belongs to the range stored at steps[i].
_mae_list = [
    ('%s-%s' % steps[position],
     mae(frame['val'].to_numpy(), frame['pred'].to_numpy()))
    for position, frame in enumerate(output_list)
]

new_mae_df = pd.DataFrame(_mae_list, columns=['range', 'mae'])



In [None]:
# Side-by-side MAE per label range: first-round model (left) vs specialised models (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
fig.suptitle('Figure 2 - Model Performance Comparision For Round One and Two')
for axis in axes:
    axis.tick_params(labelrotation=45)
axes[0].bar(mae_df['range'], mae_df['mae'])
axes[1].bar(new_mae_df['range'], new_mae_df['mae'])

In [None]:


# One row of panels per specialised model: label vs prediction histograms on the left,
# residual scatter on the right.
fig, ax = plt.subplots(len(steps), 2, figsize=(15, 15))
fig.suptitle('Figure 3 - Individual Model Performance Round Two')
for row, frame in enumerate(output_list):
    hist_ax, scatter_ax = ax[row]
    hist_ax.hist([frame['val'], frame['pred']])
    scatter_ax.scatter(frame['val'], frame['diff'], marker='x')



In [None]:
# Attach each segment's first-round prediction (time_to_eruption) to its test feature rows.
test_pred_df = test_df.merge(test_first_preds_df, on='segment_id')

In [None]:


# Parallel accumulators for the second-round test predictions:
# entry i of time_to_eruption is the prediction for entry i of segment_id.
segment_id = []
time_to_eruption = []



In [None]:
# Loop over each label range and let its specialised model re-predict the test rows
# whose FIRST-ROUND prediction falls inside that range (bounds inclusive here).
# Ranges overlap, so a segment can collect several predictions; they are combined later.
for idx, value in enumerate(steps):
    l, h = value
    # The outermost bounds are widened so that first-round predictions below 0 or above
    # 50000000 are still routed to the lowest / highest ranges.
    if l == 0:
        l = -50000000
    if h == 50000000:
        h = 100000000
    # get the test rows that fall within the range
    _test_df = test_pred_df[(test_pred_df['time_to_eruption'] >= l) &
                            (test_pred_df['time_to_eruption'] <= h)]
    # Nothing was routed to this range: skip it, because scaling / predicting a
    # 0-row frame raises instead of returning an empty result.
    if _test_df.empty:
        continue
    _test_id = _test_df['segment_id']
    # test_pred_df holds unscaled features, so apply the fitted scaler before predicting
    _x_test = _test_df.drop(['segment_id', 'time_to_eruption'], axis=1)
    _x_test = pd.DataFrame(scaler.transform(_x_test))
    _x_test.columns = features
    _preds = model_list[idx].predict(_x_test)
    segment_id += list(_test_id)
    time_to_eruption += list(_preds)

In [None]:


# Collapse all collected predictions to one row per segment by taking the median.
kaggle_submit = (
    pd.DataFrame({'segment_id': segment_id, 'time_to_eruption': time_to_eruption})
    .groupby('segment_id', as_index=False)
    .median()
)



In [None]:
# Sanity-check the submission: expected number of rows (4520) and column dtypes.
assert kaggle_submit.shape[0] == 4520
assert kaggle_submit['segment_id'].dtype == 'int64'
assert kaggle_submit['time_to_eruption'].dtype == 'float64'

In [None]:
# Write the submission frame to ./submission.csv, omitting the DataFrame index column.
kaggle_submit.to_csv('./submission.csv', index=False)