# 1. LightGBM on time-domain and STFT features

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
import os
from tqdm import tqdm
import gc
import pickle
import scipy
import scipy.signal

In [None]:
# Read the training table and the submission template from the competition input folder.
train, sample_submission = map(
    pd.read_csv,
    (
        "../input/predict-volcanic-eruptions-ingv-oe/train.csv",
        "../input/predict-volcanic-eruptions-ingv-oe/sample_submission.csv",
    ),
)

In [None]:
def generate_feature_timedomain(data_dir='../input/predict-volcanic-eruptions-ingv-oe'):
    """Build per-segment time-domain features for the train and test folders.

    Every ``.csv`` file found in ``<data_dir>/train`` and ``<data_dir>/test``
    yields one flat feature row laid out as:

    * the segment id (file name without the ``.csv`` suffix, as an int),
    * per-column mean, std, min and max,
    * per-column 5 / 10 / 20 / 40 / 60 / 80 % quantiles,
    * for each of the first 10 columns, the correlation of the column with
      itself shifted by 5000, 10000, 20000 and 30000 rows.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the ``train`` and ``test`` sub-folders. The
        default is the path this notebook previously hard-coded.

    Returns
    -------
    tuple
        ``(train_rows, test_rows)``, each a list of feature rows (lists).
    """
    quantile_levels = (0.05, 0.1, 0.2, 0.4, 0.6, 0.8)
    lags = (5000, 10000, 20000, 30000)

    def helper(path):
        data = []
        for file in tqdm(os.listdir(path)):
            # Skip anything that is not a segment file (e.g. checkpoint folders).
            if not file.endswith('.csv'):
                continue
            d = pd.read_csv(os.path.join(path, file))

            # The file stem is the numeric segment id; int() parses it without
            # executing the file name as code the way eval() did.
            tmp = [int(file[:-4])]

            # mean, std, min, max -- one value per column, in that order
            for stat in (d.mean, d.std, d.min, d.max):
                tmp += stat(axis=0).values.astype('float32').tolist()

            # 5 / 10 / 20 / 40 / 60 / 80 percentiles
            for q in quantile_levels:
                tmp += d.quantile(q, axis=0).values.astype('float32').tolist()

            # Lagged self-correlation. Series.autocorr(lag) correlates the
            # series with its shifted copy over the rows where both are
            # present, so no helper columns have to be added to the frame.
            for col in d.columns[:10]:
                tmp += [d[col].autocorr(lag=lag) for lag in lags]

            data.append(tmp)
        return data

    print('train_part: ')
    train_part_fea = helper(os.path.join(data_dir, 'train'))
    print('test_part: ')
    test_part_fea = helper(os.path.join(data_dir, 'test'))

    return train_part_fea, test_part_fea

In [None]:
def generate_feature_freq_domain(data_dir='../input/predict-volcanic-eruptions-ingv-oe'):
    """Build per-segment STFT features for the train and test folders.

    Every ``.csv`` file found in ``<data_dir>/train`` and ``<data_dir>/test``
    yields one flat feature row: the segment id (file name without ``.csv``,
    as an int) followed, for each column of the file, by

    * min, max, std, mean and the 25 / 50 / 75 % quantiles over time of the
      STFT magnitude, for the lower half of the frequency bins (all bins of
      one statistic first, then all bins of the next statistic),
    * ten band features ``A, BH, BL, C, D`` x ``(pow, num)``: the summed
      above-threshold magnitude and the count of above-threshold cells in
      each band, where the threshold is the mean magnitude below ``max_f``.

    A column with more than 1000 missing values contributes NaN for all of
    its features; otherwise missing values are replaced by 0 before the STFT.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the ``train`` and ``test`` sub-folders. The
        default is the path this notebook previously hard-coded.

    Returns
    -------
    tuple
        ``(train_rows, test_rows)``, each a list of feature rows (lists).
    """
    fs = 100            # sampling rate passed to scipy.signal.stft
    n = 256             # STFT segment length (nperseg)
    max_f = 20          # upper frequency limit for the band features
    delta_f = fs / n    # spacing between STFT frequency bins
    max_bin = round(max_f / delta_f)

    # A one-sided STFT has n // 2 + 1 bins; the statistics keep the lower
    # half of them plus one (65 bins for n = 256).
    n_half_bins = (n // 2 + 1) // 2 + 1
    stat_names = ('min', 'max', 'std', 'mean')
    quantile_levels = (0.25, 0.5, 0.75)

    # (low, high) band edges in the same units as max_f; None = up to max_f.
    # Order defines the output order: A, BH, BL, C, D.
    bands = [(10, None), (5, 8), (1.5, 2.5), (0.6, 1.2), (2, 4)]
    band_slices = [
        slice(round(lo / delta_f), None if hi is None else round(hi / delta_f))
        for lo, hi in bands
    ]
    n_features = (len(stat_names) + len(quantile_levels)) * n_half_bins + 2 * len(bands)

    def helper(path):
        data = []
        for file in tqdm(os.listdir(path)):
            # Skip anything that is not a segment file.
            if not file.endswith('.csv'):
                continue
            d = pd.read_csv(os.path.join(path, file))

            # int() parses the numeric file stem without eval()'s code execution.
            tmp = [int(file[:-4])]

            for i in range(d.shape[1]):
                series = d.iloc[:, i]
                if series.isna().sum() > 1000:
                    # Too many gaps: emit NaN placeholders to keep the row width.
                    tmp += [np.nan] * n_features
                    continue

                _, _, Z = scipy.signal.stft(series.fillna(0).values, fs=fs,
                                            window='hann', nperseg=n)

                # Per-bin statistics over time (rows: time, columns: frequency).
                Z_half = np.abs(Z[:Z.shape[0] // 2 + 1]).T
                for stat in stat_names:
                    tmp += getattr(Z_half, stat)(axis=0).astype('float32').tolist()
                for q in quantile_levels:
                    tmp += np.quantile(Z_half, q, axis=0).astype('float32').tolist()

                # Band features on the bins up to max_f.
                Z = np.abs(Z[:max_bin + 1]).T
                th = Z.mean()
                Z_pow = Z.copy()
                Z_pow[Z < th] = 0           # keep only above-threshold magnitude
                Z_num = Z_pow.copy()
                Z_num[Z >= th] = 1          # 1 where above threshold, 0 elsewhere

                Z_pow_sum = Z_pow.sum(axis=0)
                Z_num_sum = Z_num.sum(axis=0)
                for band in band_slices:
                    tmp += [Z_pow_sum[band].sum(), Z_num_sum[band].sum()]
            data.append(tmp)
        return data

    print('train_part: ')
    train_part_fea = helper(os.path.join(data_dir, 'train'))
    print('test_part: ')
    test_part_fea = helper(os.path.join(data_dir, 'test'))

    return train_part_fea, test_part_fea

In [None]:
# Compute the time-domain feature rows for the train and test segments
# (reads every segment CSV from disk).
train_part_fea, test_part_fea = generate_feature_timedomain()

In [None]:
# Compute the STFT-based feature rows for the train and test segments
# (reads every segment CSV from disk a second time).
train_part_fea_freq, test_part_fea_freq = generate_feature_freq_domain()

In [None]:
# Column labels for the time-domain rows: the id, then one group of ten
# sensor columns per statistic, then four lagged self-correlations per sensor.
base_colname = ['sensor_' + num for num in map(str, range(1, 11))]
fea_colname = (
    ['segment_id']
    + [
        sensor_name + stat_suffix
        for stat_suffix in ('_mean', '_std', '_min', '_max',
                            '_5_quant', '_10_quant', '_20_quant',
                            '_40_quant', '_60_quant', '_80_quant')
        for sensor_name in base_colname
    ]
    + [
        sensor_name + lag_suffix
        for sensor_name in base_colname
        for lag_suffix in ('_5000_self_corr', '_10000_self_corr',
                           '_20000_self_corr', '_30000_self_corr')
    ]
)

# Left-join the time-domain features onto both tables by segment id.
train = train.merge(
    pd.DataFrame(train_part_fea, columns=fea_colname),
    how='left',
    on='segment_id',
)
sample_submission = sample_submission.merge(
    pd.DataFrame(test_part_fea, columns=fea_colname),
    how='left',
    on='segment_id',
)

In [None]:
# Column labels for the frequency-domain rows. The order has to mirror how
# generate_feature_freq_domain appends values: for each sensor, all 65 bins
# of one statistic (min, max, std, mean, 25/50/75 % quantile) before moving
# on to the next statistic, followed by the ten band features. Iterating
# bins in the outer loop would attach the wrong statistic/bin to every value.
fea_freq_colname = ['segment_id']
for i in base_colname:
    for s in ['min','max', 'std', 'mean', '25_quant', '50_quant', '75_quant']:
        for j in range(65):
            fea_freq_colname.append(i+'_freq'+str(j)+'_'+s)
    fea_freq_colname.extend([i + ss for ss in ['_A_pow', '_A_num', '_BH_pow', '_BH_num', '_BL_pow', 
                                               '_BL_num', '_C_pow', '_C_num', '_D_pow', '_D_num']])

# Left-join the frequency-domain features onto both tables by segment id.
train = train.merge(
    pd.DataFrame(train_part_fea_freq, columns=fea_freq_colname),
    how='left',
    on='segment_id',
)
sample_submission = sample_submission.merge(
    pd.DataFrame(test_part_fea_freq, columns=fea_freq_colname),
    how='left',
    on='segment_id',
)

In [None]:
# Hold out 25 % of the training rows for validation; features are every
# column except the id and the target.
X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns=['segment_id', 'time_to_eruption']).to_numpy(),
    train['time_to_eruption'].to_numpy(),
    test_size=0.25,
    random_state=28,
)

In [None]:
import lightgbm as lgb

# Wrap the split in LightGBM datasets; the validation set shares the
# training set's binning through `reference`.
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

params = {
    'num_leaves': 85,
    'n_estimators': 1000,
    'min_data_in_leaf': 10,
    'objective': 'mae',
    'max_depth': -1,
    'learning_rate': 0.01,
    'max_bins': 2048,
    "boosting": "gbdt",
    "feature_fraction": 0.91,
    "bagging_freq": 1,
    "bagging_fraction": 0.91,
    "bagging_seed": 42,
    "metric": 'mae',
    "lambda_l1": 0.1,
    "verbosity": -1,
    "nthread": -1,
    "random_state": 42,
}

# Early stopping and logging go through callbacks: lgb.train() no longer
# accepts the `early_stopping_rounds` keyword as of LightGBM 4.0.
# Training stops once the validation MAE has not improved for 50 rounds;
# metrics are logged every 100 rounds.
model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=[train_data, val_data],
    valid_names=['train', 'val'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100),
    ],
)

In [None]:
# Predict on the test features (everything after the id and target columns)
# and replace any prediction that is not >= 0 with 0.
y_pred_1 = [
    pred if pred >= 0 else 0
    for pred in model.predict(sample_submission.iloc[:, 2:].to_numpy())
]

# 2. CatBoost with 5-fold cross-validation

In [None]:
import os
import gc
import numpy as np
import pandas as pd

from time import time
from time import ctime

import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm_notebook
from tqdm import tqdm

import joblib
from joblib import Parallel, delayed
import multiprocessing
# All CPU cores but one.
# NOTE(review): num_cores is not referenced in the visible cells -- confirm it is still needed.
num_cores = multiprocessing.cpu_count()-1

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

In [None]:
%time

from sklearn.model_selection import KFold

X = train.drop(['segment_id', 'time_to_eruption'], axis=1)
Y = train['time_to_eruption']
X_test = sample_submission.iloc[:, 2:]

n_fold = 5
cv = KFold(n_splits=n_fold, shuffle=True, random_state=14)

oof = np.zeros(len(X))
cat_prediction = np.zeros(len(X_test))
mae, r2 = [], []

PARAMS = {
    
             'random_seed': 42,
             'eval_metric': 'MAE'

        }

for fold_n, (train_index, valid_index) in enumerate(cv.split(X)):

    X_train = X.iloc[train_index,:]
    X_valid = X.iloc[valid_index,:]
    
    Y_train = Y.iloc[train_index]
    Y_valid = Y.iloc[valid_index]
          
    best_model = CatBoostRegressor(**PARAMS, thread_count = -1)  
    
    train_dataset = Pool(data=X_train,
                     label=Y_train,
                     )
    
    eval_dataset = Pool(data=X_valid,
                    label=Y_valid,
                    )
    
    best_model.fit(train_dataset,
              use_best_model=True,
              verbose = False,
              plot = True,
              eval_set=eval_dataset,
              early_stopping_rounds=100)

   
    y_pred = best_model.predict(Pool(data=X_valid))

    mae.append(mean_absolute_error(Y_valid, y_pred))
    r2.append(r2_score(Y_valid, y_pred))

    print('MAE: ', mean_absolute_error(Y_valid, y_pred))
    print('R2: ', r2_score(Y_valid, y_pred))

    cat_prediction += best_model.predict(Pool(data=X_test))
        
cat_prediction /= n_fold

In [None]:
# Fold-averaged CatBoost predictions, with anything that is not >= 0 set to 0.
y_pred_2 = [pred if pred >= 0 else 0 for pred in cat_prediction]

# 3. LightGBM on STFT band features

In [None]:
import datetime
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
import scipy
import scipy.signal

import matplotlib
import matplotlib.pyplot as plt
pd.options.display.max_columns = None    # disp all columns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

# from lightgbm import LGBMRegressor
import lightgbm as lgb

In [None]:
### Kaggle or Local-PC ###
KAGGLE = True       # set to False when running on a local PC

# Data root: Kaggle input mount, or a folder next to the notebook locally.
DIR = (
    '../input/predict-volcanic-eruptions-ingv-oe'
    if KAGGLE
    else './predict-volcanic-eruptions-ingv-oe/'
)

In [None]:
# Reload the raw tables for this section: `train` is replaced by the
# unmerged training table and `test` is the submission template.
train, test = (
    pd.read_csv(os.path.join(DIR, csv_name))
    for csv_name in ('train.csv', 'sample_submission.csv')
)

In [None]:
# Human-readable form of the target: the value divided by 100, as a timedelta in seconds.
train['h:m:s'] = train['time_to_eruption'].map(
    lambda ticks: datetime.timedelta(seconds = ticks/100)
)

In [None]:
# Sort by target, keep the pre-sort row number as `train_id`, then pick
# evenly spaced rows (position 5 within each fifth of the table) as examples.
sample_df = train.sort_values('time_to_eruption').reset_index()
sample_df = sample_df.rename(columns={'index': 'train_id'})
sample_df = sample_df.loc[sample_df.index % (len(train) // 5) == 5]
sample_df = sample_df.reset_index(drop = True)
sample_ids = sample_df['segment_id'].values
sample_df

In [None]:
sensor = 4      # which sensor column to plot (1 to 10)

# One stacked panel per sampled segment showing the raw signal (gaps filled with 0).
fig, ax = plt.subplots(
    nrows = len(sample_ids),
    ncols = 1,
    figsize = (12, 2 * len(sample_ids)),
)
for i, segment_id in enumerate(sample_ids):
    segment_df = pd.read_csv(os.path.join(DIR, f'train/{segment_id}.csv'))
    segment_df = segment_df.fillna(0)
    ax[i].plot(np.arange(len(segment_df)), segment_df[f'sensor_{sensor}'])
    ax[i].set_title(f'segment_id : {segment_id},  sensor : {sensor}')

fig.tight_layout()

In [None]:
fs = 100                # sampling frequency
N = len(segment_df)     # data size; relies on segment_df left over from the previous plotting cell
n = 256                 # FFT segment size

# One spectrogram panel per sampled segment (STFT magnitude, 0-20 on the frequency axis).
fig, ax = plt.subplots(len(sample_ids), 1, figsize = (12, len(sample_ids)*2))
for i, segment_id in enumerate(sample_ids):
    segment_df = pd.read_csv(os.path.join(DIR, f'train/{segment_id}.csv')).fillna(0)

    x = segment_df[f'sensor_{sensor}'][:N]
    f, t, Z = scipy.signal.stft(x, fs = fs, window = 'hann', nperseg = n)
    Z = np.abs(Z)

    # Cap the colour scale at 10x the mean magnitude so a few peaks do not wash out the rest.
    ax[i].pcolormesh(t, f, Z, vmin = 0, vmax = Z.mean()*10)
    ax[i].set_ylim(0, 20)
    ax[i].set_ylabel('Frequency [Hz]')
    # Label this panel's own axis; plt.xlabel() only labels the current
    # (last-created) axes, leaving the other panels unlabelled.
    ax[i].set_xlabel('Time [s]')
    ax[i].set_title(f'segment_id : {segment_id},  sensor : {sensor}')
fig.tight_layout()

In [None]:
# STFT(Short Time Fourier Transform) Specifications
# sampling frequency, FFT segment size, upper frequency limit (20Hz)
fs, n, max_f = 100, 256, 20
# data size, taken from the segment frame left over from the plotting cells
N = len(segment_df)

# resolution of the STFT grid
delta_f = fs / n        # 0.39Hz
delta_t = n / fs / 2    # 1.28s

In [None]:
def make_features(tgt):
    """Build the STFT band-feature table for the train or test segments.

    For every ``segment_id`` in the chosen table the segment CSV
    ``<DIR>/<tgt>/<segment_id>.csv`` is read. For each of its columns the
    STFT magnitude up to ``max_f`` is thresholded at its own mean, and for
    each band (A, BH, BL, C, D) two values are emitted: the summed
    above-threshold magnitude (``*_pow``) and the number of above-threshold
    cells (``*_num``). A column with more than 1000 missing values
    contributes NaN for all ten features; otherwise gaps are filled with 0.

    Reads the notebook globals ``train``, ``test``, ``DIR``, ``N``, ``fs``,
    ``n``, ``max_f`` and ``delta_f``.

    Parameters
    ----------
    tgt : str
        ``'train'`` selects the ``train`` table; any other value selects
        ``test``. The value is also used as the sub-folder name.

    Returns
    -------
    pandas.DataFrame
        One row per segment with ``segment_id`` (int) and the columns
        ``s<k>_<band>_<pow|num>`` for k = 1..10.
    """
    tgt_df = train if tgt == 'train' else test

    max_bin = round(max_f / delta_f)
    # (name, low, high) band edges in the same units as max_f; None = up to max_f.
    bands = [('A', 10, None), ('BH', 5, 8), ('BL', 1.5, 2.5),
             ('C', 0.6, 1.2), ('D', 2, 4)]
    band_slices = [
        slice(round(lo / delta_f), None if hi is None else round(hi / delta_f))
        for _, lo, hi in bands
    ]

    feature_set = []
    for segment_id in tqdm(tgt_df['segment_id']):
        segment_df = pd.read_csv(os.path.join(DIR,f'{tgt}/{segment_id}.csv'))
        segment = [segment_id]
        for sensor in segment_df.columns:
            x = segment_df[sensor][:N]
            if x.isna().sum() > 1000:
                # Too many gaps: NaN placeholders keep the row width.
                # np.nan rather than np.NaN, which NumPy 2.0 removed.
                segment += [np.nan] * (2 * len(bands))
                continue
            _, _, Z = scipy.signal.stft(x.fillna(0), fs = fs, window = 'hann', nperseg = n)
            Z = np.abs(Z[:max_bin + 1]).T    # up to max_f; rows: time, cols: freq

            th = Z.mean()
            Z_pow = Z.copy()
            Z_pow[Z < th] = 0                # keep only above-threshold magnitude
            Z_num = Z_pow.copy()
            Z_num[Z >= th] = 1               # 1 where above threshold, 0 elsewhere

            Z_pow_sum = Z_pow.sum(axis = 0)
            Z_num_sum = Z_num.sum(axis = 0)
            for band in band_slices:
                segment += [Z_pow_sum[band].sum(), Z_num_sum[band].sum()]

        feature_set.append(segment)

    cols = ['segment_id'] + [
        f's{i+1}_{name}_{kind}'
        for i in range(10)
        for name, _, _ in bands
        for kind in ('pow', 'num')
    ]
    feature_df = pd.DataFrame(feature_set, columns = cols)
    feature_df['segment_id'] = feature_df['segment_id'].astype('int')
    return feature_df

In [None]:
# Band features for the training segments, joined to the targets by segment id.
feature_df = make_features('train')
train_set = train.merge(feature_df, on = 'segment_id')

In [None]:
# Target, and the feature frame without id, target and the display-only timedelta column.
y = train_set['time_to_eruption']
df = train_set.drop(columns=['segment_id', 'time_to_eruption', 'h:m:s'])

# 80 / 20 shuffled split.
X_train, X_val, y_train, y_val = train_test_split(
    df, y,
    test_size = 0.2,
    shuffle = True,
    random_state = 42,
)

# Names handed to LightGBM by do_lgb; no categorical features.
features = list(X_train.columns)
cat_features = dict()

In [None]:
def do_lgb(X_train, y_train, X_val, y_val):
    """Train a LightGBM RMSE regressor with early stopping.

    Reads the notebook globals ``features`` (feature names) and
    ``cat_features`` (categorical feature names).

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_val, y_val
        Validation features and target used for early stopping.

    Returns
    -------
    lightgbm.Booster
        The trained model.
    """
    params = {'objective': 'rmse',
              'metric': 'rmse',
              'max_depth':14,
              'min_data_in_leaf':5,         # = min_child_samples
              'num_leaves': 2**7 - 1,
              'learning_rate': 0.05,
              'feature_fraction': 0.7,      # = colsample_bytree
              'bagging_fraction': 0.5,      # = subsample
              'bagging_freq': 5,
              'lambda_l1':80,               # = reg_alpha
              'num_iterations': 10000,      # = n_estimators
              'seed': 42,
              'verbose': 1
             }

    # Feature names / categorical features are set on the Dataset, which is
    # where LightGBM expects them; the validation set inherits them through
    # `reference`.
    lgb_train = lgb.Dataset(X_train, y_train,
                            feature_name = features,
                            categorical_feature = list(cat_features))
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    # lgb.train() dropped the `verbose_eval`, `evals_result` and
    # `early_stopping_rounds` keywords in LightGBM 4.0; the callbacks below
    # are the replacement. The recorded eval history was never used, so it
    # is no longer collected.
    model = lgb.train(
        params,
        lgb_train,
        valid_sets = (lgb_train, lgb_eval),
        callbacks = [
            lgb.early_stopping(stopping_rounds = 200),
            lgb.log_evaluation(period = 100),
        ])

    return model

In [None]:
# Train the band-feature LightGBM model on the split created above.
lgb_model = do_lgb(X_train, y_train, X_val, y_val)

In [None]:
# Band features for the test segments, joined to the submission template by segment id.
feature_df = make_features('test')
test_set = test.merge(feature_df, on = 'segment_id')
test_set

In [None]:
# Predict on the test band features and replace anything that is not >= 0 with 0.
preds = lgb_model.predict(test_set.drop(['segment_id', 'time_to_eruption'], axis=1))
y_pred_3 = [x if x>=0 else 0 for x in preds]
# Show only the first few values; echoing the whole list floods the notebook output.
y_pred_3[:10]

# FINAL: average the three models' predictions

In [None]:
# The three prediction vectors are combined position by position, so they
# must all line up with the id column; fail loudly instead of averaging
# misaligned or truncated vectors.
if not (len(y_pred_1) == len(y_pred_2) == len(y_pred_3) == len(test)):
    raise ValueError(
        'prediction lengths differ: '
        f'{len(y_pred_1)}, {len(y_pred_2)}, {len(y_pred_3)} vs {len(test)} test rows'
    )

# Final prediction = plain average of the three models.
submission = pd.DataFrame({
    'segment_id': test['segment_id'],
    'time_to_eruption': [
        (p1 + p2 + p3) / 3
        for p1, p2, p3 in zip(y_pred_1, y_pred_2, y_pred_3)
    ],
})
submission.to_csv('submission_recent.csv', header=True, index=False)

In [None]:
# Display the submission table that was just written to disk.
submission