  
 **STFT code copied from** [https://www.kaggle.com/amanooo/ingv-volcanic-basic-solution-stft](https://www.kaggle.com/amanooo/ingv-volcanic-basic-solution-stft) 

In [None]:
import numpy as np 
import pandas as pd 
from scipy.stats import skew, kurtosis
from scipy.signal import stft
import os
from tqdm.notebook import tqdm
import scipy
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
import matplotlib.pyplot as plt
from xgboost import plot_importance
import xgboost as xgb


In [None]:
# Collect the file names of the training segments.
# Only the first (top-level) entry produced by os.walk is used: looping over
# every entry lets a nested sub-directory overwrite the list, and a missing
# directory would leave `filenames_train` unbound until a NameError much later.
dirname, _, filenames_train = next(
    os.walk('/kaggle/input/predict-volcanic-eruptions-ingv-oe/train'),
    ('/kaggle/input/predict-volcanic-eruptions-ingv-oe/train', [], []),
)

In [None]:
# Collect the file names of the test segments.
# Only the first (top-level) entry produced by os.walk is used: looping over
# every entry lets a nested sub-directory overwrite the list, and a missing
# directory would leave `filenames_test` unbound until a NameError much later.
dirname, _, filenames_test = next(
    os.walk('/kaggle/input/predict-volcanic-eruptions-ingv-oe/test'),
    ('/kaggle/input/predict-volcanic-eruptions-ingv-oe/test', [], []),
)

In [None]:
train = pd.read_csv('../input/predict-volcanic-eruptions-ingv-oe/train.csv')


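**COPIED** — the STFT settings and the `make_features` function in the next two cells are taken from the notebook linked at the top.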

In [None]:
# STFT (Short-Time Fourier Transform) settings shared by make_features().
fs = 100                # sampling frequency handed to stft (Hz, consistent with the 0.39 Hz / 1.28 s values below)
N = 60001     # number of leading samples taken from each sensor column
n = 256                 # samples per STFT segment (passed as nperseg)
max_f = 20              # highest frequency kept from the spectrogram (~20 Hz)

delta_f = fs / n        # width of one frequency bin: 100 / 256 ≈ 0.39 Hz
delta_t = n / fs / 2    # half a segment length in seconds: 256 / 100 / 2 = 1.28 s

DIR = '../input/predict-volcanic-eruptions-ingv-oe'  # folder containing the train/ and test/ segment CSVs

In [None]:
def make_features(tgt):
    """Build STFT frequency-band features for every train or test segment.

    For each ``segment_id`` in the module-level ``train`` frame (when
    ``tgt == 'train'``) or ``test`` frame (any other value), the file
    ``DIR/<tgt>/<segment_id>.csv`` is read. Every sensor column is cut to the
    first ``N`` samples, NaNs are replaced by 0 and a Hann-window STFT
    magnitude is computed and truncated to bins up to ``max_f``. Magnitudes
    below the spectrogram mean are discarded; for each of five frequency
    bands two numbers are produced:

    * ``<band>_pow`` - sum of the remaining magnitudes in the band,
    * ``<band>_num`` - count of time/frequency cells at or above the mean.

    A sensor with more than 1000 missing samples gets NaN for all ten values.

    Args:
        tgt: ``'train'`` or ``'test'``; selects both the id table and the
            sub-folder of ``DIR`` the CSVs are read from.

    Returns:
        pandas.DataFrame with an integer ``segment_id`` column followed by
        ``s<k>_<band>_<pow|num>`` columns for sensors ``k = 1..10``.
    """
    tgt_df = train if tgt == 'train' else test

    # (name, lower bound in Hz, upper bound in Hz or None for "up to max_f").
    bands = (
        ('A', 10, None),
        ('BH', 5, 8),
        ('BL', 1.5, 2.5),
        ('C', 0.6, 1.2),
        ('D', 2, 4),
    )
    # Bin ranges depend only on the constants, so compute them once.
    band_slices = [
        slice(round(lo / delta_f), None if hi is None else round(hi / delta_f))
        for _, lo, hi in bands
    ]
    n_bins = round(max_f / delta_f) + 1
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    missing_features = [np.nan] * (2 * len(bands))

    feature_set = []
    for segment_id in tqdm(tgt_df['segment_id']):
        segment_df = pd.read_csv(os.path.join(DIR, f'{tgt}/{segment_id}.csv'))
        segment = [segment_id]
        for sensor in segment_df.columns:
            x = segment_df[sensor][:N]
            if x.isna().sum() > 1000:
                # Too many gaps for a meaningful spectrum.
                segment += missing_features
                continue
            _, _, Z = scipy.signal.stft(x.fillna(0), fs=fs, window='hann', nperseg=n)
            Z = np.abs(Z[:n_bins]).T    # rows: time frames, columns: frequency bins

            th = Z.mean()
            # Keep magnitudes at/above the mean; flag the same cells with 1.
            Z_pow_sum = np.where(Z < th, 0, Z).sum(axis=0)
            Z_num_sum = (Z >= th).astype(Z.dtype).sum(axis=0)

            for band_slice in band_slices:
                segment += [Z_pow_sum[band_slice].sum(), Z_num_sum[band_slice].sum()]

        feature_set.append(segment)

    cols = ['segment_id']
    for i in range(10):
        for band, _, _ in bands:
            cols += [f's{i+1}_{band}_pow', f's{i+1}_{band}_num']
    feature_df = pd.DataFrame(feature_set, columns=cols)
    feature_df['segment_id'] = feature_df['segment_id'].astype('int')
    return feature_df

In [None]:
feature_df = make_features('train')

In [None]:
train_set = pd.merge(train, feature_df, on = 'segment_id')

In [None]:
def create_columns(df, result, row_index=None):
    """Write per-column summary statistics of ``df`` into one row of ``result``.

    For every column ``c`` of ``df`` the following cells of ``result`` are
    set (columns are created on first use): ``sum_c``, ``med_c``,
    ``permiss_c`` (fraction of missing values), ``skew_c``, ``kurtosisc``,
    ``max_c``, ``min_c``, ``std_c``, ``var_c`` and the quantiles
    ``quan_<q>c`` for q = 0.05, 0.10, ..., 0.95. All statistics except
    ``permiss_c`` are computed after NaNs have been replaced by 0.

    Args:
        df: DataFrame of raw readings, one column per signal.
        result: DataFrame that receives the statistics; modified in place.
        row_index: label of the row of ``result`` to fill. When omitted, the
            module-level variable ``index`` is used, which is how existing
            callers pass the row.

    Returns:
        The same ``result`` object.
    """
    if row_index is None:
        # Backward compatibility: callers set a global `index` before calling.
        row_index = index

    filled = df.fillna(0)

    # Labels are spelled out so column names stay exact ('0.15', not
    # '0.15000000000000002'). The 0.25 quantile was previously mislabelled
    # 'quan_0.025'.
    quantile_labels = (
        '0.05', '0.1', '0.15', '0.2', '0.25', '0.3', '0.35', '0.4', '0.45',
        '0.5', '0.55', '0.6', '0.65', '0.7', '0.75', '0.8', '0.85', '0.9',
        '0.95',
    )
    quantile_levels = [float(label) for label in quantile_labels]

    for column in filled:
        values = filled[column].to_numpy()
        stats = {
            'sum_' + column: values.sum(),
            'med_' + column: np.median(values),
            # Measured on the raw data: after fillna(0) it would always be 0.
            'permiss_' + column: df[column].isnull().sum() / df[column].size,
            'skew_' + column: skew(values),
            'kurtosis' + column: kurtosis(values),
            'max_' + column: values.max(),
            'min_' + column: values.min(),
            'std_' + column: np.std(values),
            'var_' + column: np.var(values),
        }
        # One vectorised call instead of 19 separate passes over the data.
        for label, value in zip(quantile_labels, np.quantile(values, quantile_levels)):
            stats['quan_' + label + column] = value

        for name, value in stats.items():
            result.at[row_index, name] = value

    return result
    

In [None]:
# Add per-sensor summary statistics to every row of train_set.
available_train_files = set(filenames_train)  # set: O(1) membership test per row
for row in train_set.itertuples():
    # create_columns() reads the module-level `index` to find the row to fill.
    index = row[0]
    segmentid = row[1]
    file_name = str(segmentid) + ".csv"
    if file_name in available_train_files:
        df_segement = pd.read_csv('/kaggle/input/predict-volcanic-eruptions-ingv-oe/train/' + file_name)
        # Raw readings plus their absolute values as extra "<sensor>_abs" columns.
        # The combined frame (not the raw one) must be passed on, otherwise the
        # "_abs" statistics are never computed.
        df_segement = pd.concat([df_segement, df_segement.abs().add_suffix("_abs")], axis=1, join="inner")
        train_set = create_columns(df_segement, train_set)
    if index % 100 == 0:
        # Progress indicator.
        print(index)
           
            
        


In [None]:
# One row per test segment. The id is the CSV file name without its extension,
# kept as a string. Non-CSV entries are skipped so a stray file cannot produce
# an empty or malformed id.
test = pd.DataFrame(
    {'segment_id': [os.path.splitext(name)[0] for name in filenames_test if name.endswith('.csv')]},
    columns=['segment_id'],
)

In [None]:
feature_df_test = make_features('test')


In [None]:
feature_df_test.drop('segment_id', axis = 1, inplace = True)

In [None]:
# Attach the STFT features to the id table. DataFrame.join matches rows on the
# index, and 'segment_id' has already been dropped from feature_df_test, so the
# pairing is purely positional: it relies on make_features() having produced
# its rows in the same order as test['segment_id'].
test_set = test.join(feature_df_test)

In [None]:
test_set

In [None]:
# Add per-sensor summary statistics to every row of test_set.
available_test_files = set(filenames_test)  # set: O(1) membership test per row
for row in test_set.itertuples():
    # create_columns() reads the module-level `index` to find the row to fill.
    index = row[0]
    segementId = row[1]
    file_name = str(segementId) + ".csv"
    if file_name in available_test_files:
        df_segement = pd.read_csv('/kaggle/input/predict-volcanic-eruptions-ingv-oe/test/' + file_name)
        # Raw readings plus their absolute values as extra "<sensor>_abs" columns.
        # The combined frame (not the raw one) must be passed on, otherwise the
        # "_abs" statistics are never computed.
        df_segement = pd.concat([df_segement, df_segement.abs().add_suffix("_abs")], axis=1, join="inner")
        test_set = create_columns(df_segement, test_set)
    if index % 100 == 0:
        # Progress indicator.
        print(index)
           

In [None]:
test_set

In [None]:
train_set

In [None]:
test_set.to_csv('test_work.csv')

In [None]:
train_set.to_csv('train_work.csv')

In [None]:
# Split the training table: every column except the target and the id becomes
# a model input, and 'time_to_eruption' is the regression target.
train_df = train_set.drop(columns=['time_to_eruption', 'segment_id'])
y_train = train_set['time_to_eruption']

In [None]:
def report(results, n_top=3):
    """Print the top-ranked candidates of a hyper-parameter search.

    Args:
        results: mapping with the entries 'rank_test_score',
            'mean_test_score', 'std_test_score' and 'params'
            (the caller passes ``search.cv_results_``).
        n_top: number of ranks to print, starting from rank 1. Candidates
            sharing a rank are all printed.
    """
    for rank in range(1, n_top + 1):
        tied_positions = np.flatnonzero(results['rank_test_score'] == rank)
        for pos in tied_positions:
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][pos]
            std_score = results['std_test_score'][pos]
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(mean_score, std_score))
            chosen_params = results['params'][pos]
            print("Parameters: {0}".format(chosen_params))
            print("")

In [None]:

from sklearn.model_selection import RandomizedSearchCV
import random

# Candidate values sampled by the randomized search.
# Keys must match the XGBRegressor parameter names exactly: with a trailing
# space ('subsample ') the value is not recognised as the row-subsampling
# ratio, so that parameter would never be tuned.
param = {
    'eta': [0.05, 0.1, 0.2, 0.3],
    'max_depth': [4, 5, 6, 7, 8, 9, 10],
    'subsample': [0.5, 0.75, 1],
    'gamma': [0.05, 0.075, 0.09, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
}
# 30 random parameter combinations, each scored with 5-fold CV on negative MSE.
search = RandomizedSearchCV(xgb.XGBRegressor(), param, n_iter=30, cv=5, scoring='neg_mean_squared_error')
search.fit(train_df, y_train)


In [None]:
report(search.cv_results_)

In [None]:
params = {'subsample': 0.75, 'max_depth': 9, 'gamma': 0.1, 'eta': 0.1}

In [None]:
xgb_reg = xgb.XGBRegressor(**params)
xgb_reg.fit(train_df, y_train)

In [None]:
# Feature labels, their importance in the fitted model, and the positions that
# order the features from lowest to highest importance.
col = train_df.columns
imp = xgb_reg.feature_importances_
sorted_idx = np.argsort(xgb_reg.feature_importances_)

In [None]:
# sorted_idx is in ascending order of importance, so its first 50 entries are
# the 50 lowest-importance features; draw them as a horizontal bar chart.
lowest_50 = sorted_idx[:50]
plt.barh(col[lowest_50], imp[lowest_50])
plt.show()

In [None]:
todrop = col[sorted_idx[:50]]

In [None]:
# Remove the columns listed in `todrop` from both feature tables, in place, so
# train and test keep the same set of features.
for frame in (train_df, test_set):
    frame.drop(columns=todrop, inplace=True)

In [None]:
xgb_reg = xgb.XGBRegressor(**params)
xgb_reg.fit(train_df, y_train)

In [None]:
test_df = test_set.drop('segment_id', axis = 1)

In [None]:
submission = pd.DataFrame(test_set['segment_id'], columns=['segment_id'])

In [None]:
submission['time_to_eruption'] = xgb_reg.predict(test_df)

In [None]:
submission.set_index('segment_id', inplace = True)

In [None]:

submission.to_csv('out.csv')