In [None]:
import os

from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt

from sklearn.base import TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

import optuna

from xgboost import XGBRegressor

Задача состоит в том, чтобы по поведению временного ряда в 10-минутном интервале предсказать величину `time_to_eruption`.
При этом не требуется декомпозировать ряд или прогнозировать его изменение.
В таком случае важно вычислять такие характеристики, как наибольший выброс, коэффициенты эксцесса, асимметрии и так далее.

In [None]:
# Root folder of the competition data (relative `../input/...` path).
data_folder = Path('../input/predict-volcanic-eruptions-ingv-oe/')

# Training index; later cells read its `segment_id` and `time_to_eruption` columns.
df = pd.read_csv(data_folder / 'train.csv')

# Transposed preview of the first rows.
df.head().T

Рассмотрим, как выглядит временной ряд в 10-минутном промежутке.
Выберем наблюдения для промежутков с наименьшим и наибольшим доступными показателями `time_to_eruption`.

In [None]:
# Order the index by the target so both extremes sit at the ends of the frame.
df = df.sort_values('time_to_eruption')

nearest_row = df.iloc[0]     # smallest time_to_eruption
farthest_row = df.iloc[-1]   # largest time_to_eruption

# NOTE: despite their names, time_min / time_max hold segment ids, not times.
time_min = nearest_row['segment_id']
time_max = farthest_row['segment_id']

# Load the raw sensor readings of both segments.
train_folder = data_folder / 'train'
df_min = pd.read_csv(train_folder / f'{time_min}.csv')
df_max = pd.read_csv(train_folder / f'{time_max}.csv')

# Attach each segment's target value to every one of its rows.
df_min['time_to_eruption'] = nearest_row['time_to_eruption']
df_max['time_to_eruption'] = farthest_row['time_to_eruption']

In [None]:
# Summary statistics of the segment with the smallest time_to_eruption.
df_min.describe()

In [None]:
# Summary statistics of the segment with the largest time_to_eruption.
df_max.describe()

Заметим, что у ряда с минимальным `time_to_eruption` не работают 3 датчика, а у ряда с максимальным значением — всего 2.
Можем использовать число отсутствующих датчиков как признак при предсказании.

In [None]:
# Keep only the sensors that delivered data in BOTH example segments, so the two
# frames can be plotted side by side column by column.
# The sensor list is derived from the columns that are still present, which makes
# the cell safe to re-run (the original `del` raised KeyError on a second run).
shared_sensors = [
    column for column in df_min.columns
    if column.startswith('sensor_') and column in df_max.columns
]
dead_sensors = [
    sensor for sensor in shared_sensors
    if df_min[sensor].isnull().all() or df_max[sensor].isnull().all()
]

df_min = df_min.drop(columns=dead_sensors)
df_max = df_max.drop(columns=dead_sensors)

In [None]:
def plot_sensors_data(ld, rd):
    """Plot the first five columns of two segments side by side.

    Parameters
    ----------
    ld : DataFrame drawn in the left column of the grid
        (titled "Minimal time to eruption").
    rd : DataFrame drawn in the right column of the grid
        (titled "Maximal time to eruption"). It must contain the same
        column names as ``ld``; a missing column raises ``KeyError``.

    The figure is a fixed 5 x 2 grid; if ``ld`` has fewer than five columns
    the remaining axes stay empty.
    """
    n_rows = 5
    figure, axs = plt.subplots(n_rows, 2, figsize=(16, 20))

    # The columns are taken from the argument itself rather than from the
    # notebook-level `df_min`, so the function works for any pair of frames.
    for left_ax, right_ax, column in zip(axs[:, 0], axs[:, 1], ld.columns):
        left_ax.plot(ld[column])
        left_ax.set_title(f'Minimal time to eruption, {column}')

        right_ax.plot(rd[column])
        right_ax.set_title(f'Maximal time to eruption, {column}')

In [None]:
# Full-length view of both example segments.
plot_sensors_data(df_min, df_max)

Видно, что при малом `time_to_eruption` наблюдается всплеск около `28000` на всех датчиках, особенно массивный на последних двух.
На `sensor_7` также есть заметный всплеск около `48000`.

Уменьшим масштаб и посмотрим, как ведет себя ряд на малых промежутках времени.

In [None]:
# Zoom in: only the first 100 rows of each segment.
plot_sensors_data(df_min[:100], df_max[:100])

Нетрудно заметить, что частоты отличаются.
Для извлечения этой и других характеристик используем библиотеку `tsfresh`. 

In [None]:
# !pip install tsfresh

from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters

In [None]:
# Start from tsfresh's MinimalFCParameters; the next cell customises this mapping.
tsfresh_parameters = MinimalFCParameters()

Так как данных много, вручную задаём набор признаков, которые хотим извлечь.

In [None]:
# `pop` with a default (instead of `del`) keeps this cell re-runnable: a second
# execution no longer fails with KeyError once 'length' has been removed.
tsfresh_parameters.pop('length', None)

# Calculators registered without arguments (value None).
for calculator in (
    'skewness',
    'kurtosis',
    'last_location_of_maximum',
    'first_location_of_maximum',
    'last_location_of_minimum',
    'first_location_of_minimum',
    'benford_correlation',
    'percentage_of_reoccurring_values_to_all_values',
    'percentage_of_reoccurring_datapoints_to_all_datapoints',
):
    tsfresh_parameters[calculator] = None

# Calculators registered with one argument dict per requested variant.
tsfresh_parameters['number_peaks'] = [{'n': n} for n in (1, 3, 5, 10, 50)]
tsfresh_parameters['binned_entropy'] = [{'max_bins': 10}]
tsfresh_parameters['fft_aggregated'] = [
    {'aggtype': aggtype} for aggtype in ('centroid', 'variance', 'skew', 'kurtosis')
]
# Lags 0..9, exactly as in the original hand-written list.
tsfresh_parameters['autocorrelation'] = [{'lag': lag} for lag in range(10)]
tsfresh_parameters['agg_autocorrelation'] = [
    {'f_agg': f_agg, 'maxlag': 40} for f_agg in ('mean', 'median', 'var')
]
tsfresh_parameters['friedrich_coefficients'] = [
    {'coeff': coeff, 'm': 3, 'r': 30} for coeff in range(4)
]
tsfresh_parameters['count_above'] = [{'t': 0}]
tsfresh_parameters['count_below'] = [{'t': 0}]

Читаем данные всех рядов.

In [None]:
def preprocess_timeseries(data, parameters, is_train=True):
    """Extract tsfresh features for every segment file, one output row per segment.

    Parameters
    ----------
    data : DataFrame with `segment_id` and `time_to_eruption` columns.
        Only read when ``is_train`` is true; may be ``None`` otherwise.
    parameters : feature-calculator settings passed to ``extract_features`` as
        ``default_fc_parameters``.
    is_train : when true, segments are taken from ``data`` and read from
        ``data_folder / 'train'``; otherwise every file found in
        ``data_folder / 'test'`` is processed.

    Returns
    -------
    DataFrame with the extracted features plus a `segment` column (segment id
    for training data, file name for test data) and, for training data, a
    `time_to_eruption` column. ``None`` when there were no segments.
    """
    if is_train:
        # Iterate over the frame that was passed in, not over a notebook global.
        segments = zip(data.index, data['segment_id'], data['time_to_eruption'])
    else:
        # Sorted so that the row order does not depend on the file system.
        test_files = sorted(os.listdir(data_folder / 'test'))
        segments = ((idx, name, None) for idx, name in enumerate(test_files))

    feature_frames = []
    for idx, segment, time_to_eruption in segments:
        if is_train:
            # The f-prefix matters: without it the literal '{segment}.csv' is read.
            segment_timeseries_path = data_folder / 'train' / f'{segment}.csv'
        else:
            segment_timeseries_path = data_folder / 'test' / segment

        segment_timeseries = pd.read_csv(segment_timeseries_path)
        # Missing readings become 0; the former row index becomes the sort column.
        segment_timeseries = segment_timeseries.fillna(0).reset_index()
        segment_timeseries['id'] = idx

        # Extract the features from the time series.
        extracted_features = extract_features(segment_timeseries,
                                              column_id='id',
                                              column_sort='index',
                                              disable_progressbar=True,
                                              default_fc_parameters=parameters)
        extracted_features['segment'] = segment

        if is_train:
            extracted_features['time_to_eruption'] = time_to_eruption

        feature_frames.append(extracted_features)

        print(f'Processed segment #{idx}')

    if not feature_frames:
        return None

    # One concat at the end instead of re-copying the growing frame on every step.
    return pd.concat(feature_frames, axis=0, ignore_index=True, sort=True)

Сохраняем полученные датафреймы, чтобы не пересчитывать вновь. 

In [None]:
def save(parameters):
    """Compute the tsfresh features for both data splits and write them to CSV.

    The training features go to 'train.csv' and the test features to 'test.csv',
    both in the current working directory and without the index column.
    """
    train_index = pd.read_csv(data_folder / 'train.csv')
    train_features = preprocess_timeseries(train_index, parameters)
    train_features.to_csv('train.csv', index=False)

    test_features = preprocess_timeseries(None, parameters, is_train=False)
    test_features.to_csv('test.csv', index=False)

Мы загрузили посчитанные значения в отдельный `kaggle`-датасет. 

In [None]:
# Pre-computed training features; every column that contains a missing value is dropped.
df_train = (
    pd.read_csv('../input/volcanoes-ts-processed-with-tsfresh/train.csv')
    .dropna(axis='columns')
)

# Everything except the target and the segment identifier is a model input.
non_feature_columns = ('time_to_eruption', 'segment')
features = [column for column in df_train.columns if column not in non_feature_columns]

X, y = df_train[features], df_train['time_to_eruption']

df_train.head()

Используем случайный лес, чтобы отсеять признаки с низкой информативностью.

In [None]:
class LowImportanceSelector(TransformerMixin):
    """Keep only the features whose random-forest importance exceeds a threshold.

    Parameters
    ----------
    threshold : features with ``feature_importances_ > threshold`` are kept.
    n_estimators : number of trees of the ``RandomForestRegressor`` used for ranking.
    random_state : seed forwarded to the forest. The default ``None`` keeps the
        previous (unseeded) behaviour; pass an int for a reproducible selection.

    After ``fit`` the attribute ``features`` holds the kept column names.
    """

    def __init__(self, threshold, n_estimators=100, random_state=None):
        self.features = None
        self.threshold = threshold
        self.n_estimators = n_estimators
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the ranking forest on DataFrame ``X`` and remember the important columns."""
        estimator = RandomForestRegressor(n_estimators=self.n_estimators,
                                          random_state=self.random_state)
        estimator.fit(X, y)

        important = estimator.feature_importances_ > self.threshold
        self.features = list(X.columns[important])

        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns selected during ``fit``."""
        if self.features is None:
            # Local import: the notebook's import cell does not provide this name.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError('LowImportanceSelector must be fitted before transform().')

        return X[self.features]

Удаляем признаки с высоким коэффициентом корреляции.

In [None]:
class CorrelationSelector(TransformerMixin):
    """Drop features that are highly correlated with an earlier, retained feature.

    Parameters
    ----------
    threshold : a column is dropped when its correlation (as returned by
        ``DataFrame.corr``) with some earlier column that has not itself been
        dropped is ``>= threshold``.

    Only the signed coefficient is compared, so strongly *negative*
    correlations never cause a drop.

    After ``fit`` the attribute ``columns`` holds the set of column names to drop.
    """

    def __init__(self, threshold):
        self.columns = None
        self.threshold = threshold

    def fit(self, X, y=None):
        """Determine the columns of DataFrame ``X`` to drop; ``y`` is ignored."""
        correlations = X.corr()
        names = correlations.columns
        # Plain array access is much cheaper than repeated `.iloc[i, j]` look-ups.
        values = correlations.to_numpy()

        dropped = set()
        for i in range(1, len(names)):
            for j in range(i):
                if values[i, j] >= self.threshold and names[j] not in dropped:
                    dropped.add(names[i])
                    break  # column i is already marked; further matches change nothing

        self.columns = dropped

        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without the columns marked during ``fit``."""
        if self.columns is None:
            # Local import: the notebook's import cell does not provide this name.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError('CorrelationSelector must be fitted before transform().')

        return X.drop(columns=list(self.columns))

In [None]:
def objective(trial, data=X, target=y):
    """Optuna objective: hold-out mean absolute error of an XGBoost regressor.

    Parameters
    ----------
    trial : Optuna trial used to sample the hyperparameters.
    data, target : feature matrix and target values. The defaults are the
        notebook-level ``X`` and ``y`` as they exist when this cell is executed.

    Returns
    -------
    Mean absolute error on a 20 % hold-out split (``random_state=1337``).
    The same split serves as the early-stopping evaluation set, so the
    returned value is an optimistic estimate.
    """
    parameters = {
        'tree_method': 'gpu_hist',
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008, 0.009, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02]),
        'n_estimators': 1000,
        'max_depth': trial.suggest_categorical('max_depth', [5, 7, 9, 11, 13, 15, 17, 20]),
        'random_state': trial.suggest_categorical('random_state', [24, 48, 2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }

    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=1337)

    model = XGBRegressor(**parameters)
    # NOTE(review): newer xgboost releases expect `early_stopping_rounds` on the
    # constructor rather than on fit() - confirm against the installed version.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=100, verbose=False)

    return mean_absolute_error(y_test, model.predict(X_test))

In [None]:
# The search runs 100 trials, so it only executes when explicitly enabled.
# A flag replaces the former string literal, which kept the code out of reach of
# syntax checks and was echoed as cell output.
RUN_HYPERPARAMETER_SEARCH = False

if RUN_HYPERPARAMETER_SEARCH:
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=100)

    print(f'Number of finished trials: {len(study.trials)}')
    print(f'Best trial: {study.best_trial.params}')

In [None]:
# Fixed XGBoost hyperparameters (presumably the best trial of the Optuna search
# defined above - the keys match its search space).
parameters = {
    # regularisation
    'alpha': 0.11298627316540845,
    'lambda': 0.0020555245431348778,
    # row / column sampling
    'subsample': 1.0,
    'colsample_bytree': 0.6,
    # tree shape
    'max_depth': 20,
    'min_child_weight': 18,
    # optimisation
    'learning_rate': 0.01,
    'random_state': 48,
}

In [None]:
# Final regressor: the fixed hyperparameters plus objective and tree count.
xgboost_step = XGBRegressor(objective='reg:squarederror', n_estimators=1000, **parameters)

pipeline_steps = [
    ('correlation', CorrelationSelector(threshold=0.85)),
    # The importance-based selector is currently switched off:
    # ('importance', LowImportanceSelector(threshold=1e-4)),
    ('scaler', MinMaxScaler()),
    ('xgboost', xgboost_step),
]

pipe = Pipeline(pipeline_steps)

In [None]:
# Same hold-out split (test_size, random_state) as the one used inside `objective`.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)

In [None]:
# Fit on the training part only, so the hold-out rows stay unseen for scoring.
pipe.fit(X_train, y_train)

In [None]:
# Pipeline.score reports the final regressor's R^2; the MAE is shown next to it
# because that is the quantity the Optuna objective minimises.
holdout_predictions = pipe.predict(X_test)
pd.Series({
    'r2': pipe.score(X_test, y_test),
    'mae': mean_absolute_error(y_test, holdout_predictions),
})

In [None]:
# Refit on all labelled rows before predicting.
pipe.fit(X, y)

In [None]:
# Pre-computed features of the segments to predict; columns containing missing
# values are dropped, and `segment` keeps only the text before the first dot
# (presumably file names such as '<id>.csv' - verify against the dataset).
df_predict = (
    pd.read_csv('../input/volcanoes-ts-processed-with-tsfresh/predict.csv')
    .dropna(axis='columns')
    .assign(segment=lambda frame: frame['segment'].map(lambda file_name: file_name.split('.')[0]))
)

In [None]:
# Align the prediction matrix with the training feature list (same columns, same
# order). Columns present only in `df_predict` are ignored; a training feature
# that is missing from it becomes an all-NaN column instead of silently shifting
# the feature matrix.
X_predict = df_predict.reindex(columns=features)
predicted_time = pipe.predict(X_predict)

# `target` is the submission frame: one row per segment.
target = pd.DataFrame({
    'segment_id': df_predict['segment'],
    'time_to_eruption': predicted_time,
})
target.to_csv('submission.csv', index=False)