In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# The Kaggle template's input-directory listing is dropped on purpose: with its
# print statement commented out, the nested loop only walked every file under
# /kaggle/input and discarded the names, costing time without any effect.

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Imports
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('seaborn-white')
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
import lightgbm as lgbm
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

* **Version 1:** Usage of entire data with no special feature selection, model is LGBM with ~default parameters.
* **Version 2:** Version 1 plus Grid Search of best LGBM parameters.

# Data

In [None]:
PATH = '/kaggle/input/predict-volcanic-eruptions-ingv-oe/'

!echo 'Files: train and test'
!ls -l /kaggle/input/predict-volcanic-eruptions-ingv-oe/train/ | wc -l
!ls -l /kaggle/input/predict-volcanic-eruptions-ingv-oe/test/  | wc -l

train_files = []
test_files  = []

for file in os.listdir(PATH+'/train/'):
    train_files.append(file)
    
for file in os.listdir(PATH+'/test/'):
    test_files.append(file)
    
print('Number of train files: {}'.format(len(train_files)))
print('Number of test  files: {}'.format(len(test_files )))

In [None]:
# Targets for the training segments (read from train.csv).
train = pd.read_csv(PATH+'train.csv')

# Template of the expected submission format.
sample_submission = pd.read_csv(PATH+'sample_submission.csv')

# Every file found under the test folder becomes one segment id:
# the last four characters (the ".csv" extension) are cut off the file name.
test_files = [
    csv_name[:-4]
    for _folder, _subfolders, csv_names in os.walk(PATH+'/test/')
    for csv_name in csv_names
]

test = pd.DataFrame(test_files, columns=["segment_id"])

In [None]:
train

In [None]:
test

## EDA
### Time to eruption distribution  
* Looks almost uniform

In [None]:
# `sns.distplot` is deprecated in seaborn; `histplot` is its documented
# replacement. stat='density' keeps the normalised y-axis distplot produced
# when a KDE curve is drawn on top of the histogram.
sns.histplot(train['time_to_eruption'],
             bins=100,
             kde=True,
             stat='density',
             color='blue',
             edgecolor='black');

In [None]:
# Show the two rows with the smallest and the largest time_to_eruption.
ordered_by_time = train.sort_values('time_to_eruption')
display(ordered_by_time.iloc[[0, -1]])

# Segment ids hard-coded for the two extremes
# (presumably copied from the table displayed above - verify if train.csv changes).
segment_id_min =  601524801
segment_id_max = 1923243961

# Raw sensor recordings of those two segments.
df_segment_id_min = pd.read_csv(f'{PATH}/train/{segment_id_min}.csv')
df_segment_id_max = pd.read_csv(f'{PATH}/train/{segment_id_max}.csv')

In [None]:
df_segment_id_min.head(3)

### Sensors

In [None]:
# One stacked panel per column of the "min" segment, thin lines.
min_plot_options = dict(
    figsize=(20, 20),
    subplots=True,
    layout=(10, 1),
    rot=0,
    lw=1,
    title='segment_id #601524801 (min)',
)
df_segment_id_min.plot(**min_plot_options)

plt.show()

In [None]:
# One stacked panel per column of the "max" segment, thicker lines.
max_plot_options = dict(
    figsize=(20, 20),
    subplots=True,
    layout=(10, 1),
    rot=0,
    lw=2,
    title='segment_id #1923243961 (max)',
)
df_segment_id_max.plot(**max_plot_options)

plt.show()

So,
  * sensors 2, 3, 8 are out (empty) in `id_min`; some spikes suggest something abnormal
  * sensors 1 and 5   are out (empty) in `id_max`; no spikes, smoothness suggests something normal

## Features
* So, as we can see, each {segment_id}.csv gives us the readings of sensors 1-10.  
* Let's build features on that. With a lot of notebooks available we can select some findings there (as below).  
* Please upvote [this](https://www.kaggle.com/isaienkov/ingv-volcanic-eruption-prediction-eda-modeling) amazing notebook from [Kostiantyn Isaienkov](https://www.kaggle.com/isaienkov) where this clear function below is taken from. 

In [None]:
def build_features(signal, ts, sensor_id):
    """Summarise one sensor trace as a single row of statistical features.

    Parameters
    ----------
    signal : pandas.Series
        Readings of one sensor. The callers in this notebook pass it with
        missing values already replaced by 0.
    ts : hashable
        Label of the returned row (the callers pass the segment id).
    sensor_id : str
        Prefix put in front of every column name, e.g. ``'sensor_1'``.

    Returns
    -------
    pandas.DataFrame
        A one-row frame indexed by ``ts`` with 23 float columns:
        sum, mean, std, var, max, min, skew, mad, kurtosis, ten quantiles
        (99, 95, 85, 75, 55, 45, 25, 15, 05, 01) and mean/std/max/min of the
        real part of the FFT, in that order.
    """
    # Only the real part of the spectrum is summarised.
    f_real = np.real(np.fft.fft(signal))

    # Collect everything in a dict and build the frame once; enlarging an
    # empty DataFrame cell by cell is slow when repeated for every
    # sensor of every segment.
    features = {
        f'{sensor_id}_sum':      signal.sum(),
        f'{sensor_id}_mean':     signal.mean(),
        f'{sensor_id}_std':      signal.std(),
        f'{sensor_id}_var':      signal.var(),
        f'{sensor_id}_max':      signal.max(),
        f'{sensor_id}_min':      signal.min(),
        f'{sensor_id}_skew':     signal.skew(),
        # Series.mad() was removed in pandas 2.0; this is the same quantity
        # (mean absolute deviation around the mean).
        f'{sensor_id}_mad':      (signal - signal.mean()).abs().mean(),
        f'{sensor_id}_kurtosis': signal.kurtosis(),
    }

    # Column names keep the two-digit suffix (quantile05, quantile01).
    for pct in (99, 95, 85, 75, 55, 45, 25, 15, 5, 1):
        features[f'{sensor_id}_quantile{pct:02d}'] = np.quantile(signal, pct / 100)

    features[f'{sensor_id}_fft_real_mean'] = f_real.mean()
    features[f'{sensor_id}_fft_real_std']  = f_real.std()
    features[f'{sensor_id}_fft_real_max']  = f_real.max()
    features[f'{sensor_id}_fft_real_min']  = f_real.min()

    # Force float so integer-valued sensors yield the same dtypes as float ones.
    return pd.DataFrame(features, index=[ts], dtype='float64')

We will perform some statistics on each column (sensors 1-10) obtained from {segment_id}.csv files.

## Dataset

### Train

In [None]:
# Build one feature row per training segment.
train_set = list()

# enumerate() supplies the running position; no separate counter is needed.
for seg, segment_id in enumerate(train.segment_id):
    # Progress message every 200 segments. `seg` is the loop position, so it
    # is reported as such, next to the real segment id.
    if seg%200 == 0:
        print('Processing segment #{} (segment_id={})'.format(seg, segment_id))

    signals = pd.read_csv(PATH+'/train/'+str(segment_id)+'.csv')

    # One single-row feature frame per sensor (sensor_1 ... sensor_10);
    # missing readings are replaced by 0 before the statistics are computed.
    train_row = [
        build_features(signals[sensor_id].fillna(0), segment_id, sensor_id)
        for sensor_id in (f'sensor_{number}' for number in range(1, 11))
    ]

    # Glue the ten per-sensor frames side by side into one row for the segment.
    train_set.append(pd.concat(train_row, axis=1))

# Stack all segment rows into the final feature table (indexed by segment id).
train_set = pd.concat(train_set)

In [None]:
# Turn the index into a 'segment_id' column, then attach the columns of
# `train` by matching on that id.
train_set = (
    train_set
    .reset_index()
    .rename(columns={'index': 'segment_id'})
    .merge(train, on='segment_id')
)

In [None]:
train_set.head(3)

### Test

In [None]:
# Build one feature row per test segment.
test_set = list()

# enumerate() supplies the running position; no separate counter is needed.
for seg, segment_id in enumerate(test.segment_id):
    # Progress message every 200 segments. `seg` is the loop position, so it
    # is reported as such, next to the real segment id.
    if seg%200 == 0:
        print('Processing segment #{} (segment_id={})'.format(seg, segment_id))

    signals = pd.read_csv(PATH+'/test/'+str(segment_id)+'.csv')

    # One single-row feature frame per sensor (sensor_1 ... sensor_10);
    # missing readings are replaced by 0 before the statistics are computed.
    test_row = [
        build_features(signals[sensor_id].fillna(0), segment_id, sensor_id)
        for sensor_id in (f'sensor_{number}' for number in range(1, 11))
    ]

    # Glue the ten per-sensor frames side by side into one row for the segment.
    test_set.append(pd.concat(test_row, axis=1))

# Stack all segment rows into the final feature table (indexed by segment id).
test_set = pd.concat(test_set)

In [None]:
# Turn the index into a 'segment_id' column, then join with the `test` frame
# (the list of test segment ids) by matching on that id.
test_set = (
    test_set
    .reset_index()
    .rename(columns={'index': 'segment_id'})
    .merge(test, on='segment_id')
)

In [None]:
test_set.head(3)

## Train/Validation split

In [None]:
# Target: the time left until the eruption.
y = train_set['time_to_eruption']
# Predictors: every engineered feature, without the id and the target itself.
X = train_set.drop(columns=['segment_id', 'time_to_eruption'])

# Hold out 20% of the training segments for validation (fixed seed).
split_options = dict(test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

# Models

We will use:
* LGBM
* XGBoost

## LGBM

LightGBM is a gradient boosting framework that uses tree-based learning algorithms.

Some parameters:  
* **boosting_type** – ‘gbdt’ (default), ‘dart’, ‘goss’, ‘rf’. Default gbdt is gradient boosting decision tree.
* **num_leaves** – Maximum tree leaves (default=31) for base learners.
* **max_depth** – Maximum tree depth (default=-1) for base learners, <=0 means no limit.
* **learning_rate**  – Boosting learning rate (default=0.1). 
* **n_estimators** – Number of boosted trees (default=100) to fit.
* **subsample_for_bin** – Number of samples for constructing bins (default=200000).
* **objective** – Specify the learning task and the corresponding learning objective, e.g. ‘regression’ (the `LGBMRegressor` default), ‘binary’ or ‘multiclass’.
* **min_split_gain** – Minimum loss reduction (default=0.) required to make a further partition on a leaf node of tree.
* **min_child_weight** – Minimum sum of instance weight (hessian) needed in a child/leaf (default=1e-3).
* **min_child_samples** – Minimum number of data needed in a child/leaf (default=20).
* **subsample** – Subsample ratio of the training instance (default=1.).
* **subsample_freq** – Frequency of subsampling (default=0); <=0 disables it.
* **colsample_bytree** – Subsample ratio (default=1.) of columns when constructing each tree.
* **reg_alpha** – L1 regularization term on weights (default=0.).
* **reg_lambda** – L2 regularization term on weights (default=0.).
* **random_state** – Random number seed. 
* **n_jobs** – Number of parallel threads (default=-1).
* **silent** – Whether to print messages while running boosting (default=True).  


* **min_data_in_leaf** – Minimal number of data in one leaf (default=20). Can be used to deal with over-fitting
* **feature_fraction** – LightGBM will randomly select part of features (default=1.0) on each iteration (tree) if feature_fraction smaller than 1.0. For example, if you set it to 0.8, LightGBM will select 80% of features before training each tree. Can be used to speed up training and to deal with over-fitting.
* **bagging_fraction** – LightGBM will randomly select part of data (default=1.0) without resampling. Can be used to speed up training and to deal with over-fitting

        https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html
        https://lightgbm.readthedocs.io/en/latest/Parameters.html

### Parameters

In [None]:
# Baseline parameters: LightGBM defaults except where a comment says otherwise.
# NOTE(review): min_data_in_leaf, feature_fraction and bagging_fraction are
# LightGBM aliases of min_child_samples, colsample_bytree and subsample; both
# spellings are kept because later cells read every one of these keys.
params = {
    'boosting_type': 'gbdt', 
    'num_leaves': 31,
    'max_depth': -1,
    'learning_rate': 0.1,
    'n_estimators': 100, 
    'subsample_for_bin': 200, # 200000 is default 
    #'objective': 'binary'
    'min_split_gain': 0.5,    # 0.0 is default 
    'min_child_weight': 1e-3, 
    'min_child_samples': 20,
    'subsample': 1,
    'colsample_bytree': 1.0,
    'min_data_in_leaf': 20,
    'feature_fraction': 1.0,
    'bagging_fraction': 1.0,
    'random_state': 42
    #'device': 'cpu', # you can use GPU to achieve faster learning
}

# Initiate the regressor to use. The dict is unpacked directly so the estimator
# and `params` cannot drift apart. `verbose=-1` silences training output; it
# replaces the `silent=True` argument, which newer LightGBM releases removed.
model_lgbm_regr = LGBMRegressor(**params, verbose=-1)

# To view the names of all parameters of the model:
model_lgbm_regr.get_params().keys()

### Grid Search

In [None]:
# Search space for the LightGBM regressor.
# NOTE(review): the full Cartesian product below is 139,968 combinations,
# i.e. 699,840 fits with cv=5 - consider shrinking it or switching to a
# randomized search before running this cell.
# NOTE(review): min_data_in_leaf / feature_fraction / bagging_fraction are
# LightGBM aliases of min_child_samples / colsample_bytree / subsample, so
# those pairs search the same knob twice; they are kept because the next cell
# reads every key from grid.best_params_.
gridParams = {
    'boosting_type' : ['gbdt', 'dart'], # for better accuracy might try dart, check both
    'num_leaves': [10,31],        # large num_leaves helps improve accuracy but might lead to over-fitting
    'max_depth': [30,10,-1],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 200],
    'subsample_for_bin': [200,500],
    # time_to_eruption is a continuous target: 'binary' expects 0/1 labels and
    # makes every fit fail, so the regression objective is used instead.
    'objective' : ['regression'],
    'min_split_gain': [0.5,0.8],    # 0.0 is default 
    'min_child_weight': [1e-3,1e-1,1e-2], 
    'min_child_samples': [20],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_data_in_leaf': [5, 10, 20],
    'feature_fraction': [0.8, 1.0],
    'bagging_fraction': [0.8, 1.0],
    'random_state' : [42],
    }

# 5-fold cross-validated exhaustive search on all available cores.
# No `scoring` is passed, so candidates are ranked by the estimator's own
# score method (R^2 for regressors).
grid = GridSearchCV(model_lgbm_regr, 
                    gridParams, 
                    cv=5, 
                    verbose=1, 
                    n_jobs=-1)

# Run the grid
grid.fit(X_train, y_train)

# Print the best parameters found and their mean cross-validated score
print(grid.best_params_)
print(grid.best_score_)

### Best model selection

In [None]:
# Overwrite the baseline settings with the winning grid-search combination,
# one tuned key at a time.
best_found = grid.best_params_
for tuned_key in (
    'boosting_type',
    'num_leaves',
    'max_depth',
    'learning_rate',
    'n_estimators',
    'subsample_for_bin',
    'objective',
    'min_split_gain',
    'min_child_weight',
    'min_child_samples',
    'subsample',
    'colsample_bytree',
    'min_data_in_leaf',
    'feature_fraction',
    'bagging_fraction',
    'random_state',
):
    params[tuned_key] = best_found[tuned_key]

In [None]:
#model_lgbm_regr.fit(X_train, y_train, 
#                    eval_set= [(X_train, y_train), (X_valid, y_valid)], 
#                    eval_metric="mae", 
#                    verbose=200, 
#                    early_stopping_rounds=50
#                   )

In [None]:
# Wrap the training and validation splits in LightGBM Dataset objects.
dtrain, dvalid = (
    lgbm.Dataset(features, label=target)
    for features, target in ((X_train, y_train), (X_valid, y_valid))
)

In [None]:
# Train a booster with the tuned parameters.
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments were removed from
# lgbm.train() in LightGBM 4 (the log_evaluation callback needs LightGBM >= 3.3).
model_lgbm = lgbm.train(params, 
                        train_set=dtrain, 
                        num_boost_round=100, 
                        valid_sets=[dvalid, dtrain], 
                        callbacks=[
                            lgbm.early_stopping(stopping_rounds=20),  # stop after 20 rounds without improvement
                            lgbm.log_evaluation(period=4),            # print the metrics every 4 rounds
                        ]
                       )

In [None]:
model_lgbm

### Plot importance

In [None]:
#lgbm.plot_importance(model_lgbm)
#plt.show()

### Predict

In [None]:
# Predict with the trained booster on every feature column of the test table
# (the id column is not a feature).
test_features = test_set.drop(columns=['segment_id'])
predictions = model_lgbm.predict(test_features)

### Submission

In [None]:
sample_submission

In [None]:
# Assemble the two submission columns in one go and write them to disk
# with a header row and without the index.
submission = pd.DataFrame({
    'segment_id': test_set["segment_id"],
    'time_to_eruption': predictions,
})
submission.to_csv('submission.csv', header=True, index=False)

In [None]:
submission