In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Core libraries for the analysis. numpy and pandas are already imported in the
# first cell; repeating them is harmless and keeps this cell runnable by itself.
import numpy as np
import pandas as pd
from glob import glob  # enumerates the per-segment CSV files
from tqdm import tqdm  # progress bar for the feature-extraction loops

# Status marker so the cell produces visible output when it finishes.
print('Import done')

In [None]:
# Root of the competition data. A single constant keeps every path in this
# cell consistent instead of mixing absolute and relative prefixes.
DATA_DIR = '/kaggle/input/predict-volcanic-eruptions-ingv-oe'

# train.csv is indexed by `segment_id` when the target is joined onto the
# per-segment features further down.
train = pd.read_csv(DATA_DIR + '/train.csv')
# The sample submission is loaded as `test`.
test = pd.read_csv(DATA_DIR + '/sample_submission.csv')

# glob() returns matches in arbitrary, filesystem-dependent order. Sorting
# fixes the row order of the feature frames, which makes the seeded KFold
# splits and the submission row order reproducible across runs.
train_set = sorted(glob(DATA_DIR + '/train/*'))
test_set = sorted(glob(DATA_DIR + '/test/*'))

print('Reading done')

In [None]:
# Build one row of per-column means for every training segment file.
# Rows are collected in a list and concatenated once at the end: calling
# pd.concat inside the loop re-copies the growing frame on every iteration,
# which is quadratic in the number of segments.
train_rows = []

for segment_path in tqdm(train_set):
    # Column-wise mean of the segment, reshaped into a single-row frame.
    segment_means = pd.read_csv(segment_path).mean().to_frame().T
    # Segment id = file name without its directory or extension (kept as text).
    segment_means['id'] = segment_path.split('/')[-1].split('.')[0]
    train_rows.append(segment_means)

# pd.concat rejects an empty list, so fall back to an empty frame when no
# segment files were found.
signals_mean = pd.concat(train_rows, ignore_index=True) if train_rows else pd.DataFrame()

signals_mean.head()

In [None]:
# Inspect column dtypes and non-null counts of the training features.
# 'id' is still text at this point because it was parsed from the file name.
signals_mean.info()

In [None]:
# Convert the segment id (parsed from file names as text) to int64 ahead of
# the join against train's `segment_id`.
signals_mean = signals_mean.astype({'id': 'int64'})

In [None]:
# Attach the columns of `train` to each feature row by matching the feature
# frame's 'id' against train's `segment_id`.
labels_by_segment = train.set_index('segment_id')
signals_mean = signals_mean.join(labels_by_segment, on='id')
signals_mean.head()

In [None]:
# Build one row of per-column means for every test segment file.
# Rows are collected in a list and concatenated once at the end: calling
# pd.concat inside the loop re-copies the growing frame on every iteration,
# which is quadratic in the number of segments.
test_rows = []

for segment_path in tqdm(test_set):
    # Column-wise mean of the segment, reshaped into a single-row frame.
    segment_means = pd.read_csv(segment_path).mean().to_frame().T
    # Segment id = file name without its directory or extension (kept as text).
    segment_means['id'] = segment_path.split('/')[-1].split('.')[0]
    test_rows.append(segment_means)

# pd.concat rejects an empty list, so fall back to an empty frame when no
# segment files were found.
signals_mean_test = pd.concat(test_rows, ignore_index=True) if test_rows else pd.DataFrame()

signals_mean_test.head()

In [None]:
# Fill missing test features with the column average over all test segments.
# numeric_only=True keeps the text 'id' column out of the mean; without it,
# pandas >= 2.0 raises TypeError when it tries to average the id strings.
test_feature_means = signals_mean_test.mean(numeric_only=True)
signals_mean_test = signals_mean_test.fillna(test_feature_means)

Build the LightGBM regression model, using MAE as the evaluation metric

In [None]:
# Modelling dependencies. In the cells visible in this notebook only lightgbm,
# StandardScaler and KFold are actually referenced; the other names are
# imported but not used in the code shown here.
import lightgbm as lgbm
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold,StratifiedKFold, RepeatedKFold

# Status marker so the cell produces visible output when it finishes.
print('Import done')

Preprocessing data using StandardScaler

In [None]:
# Split the training frame into target and features, then standardise both
# feature matrices with statistics learned from the training features only.
y_train = signals_mean['time_to_eruption']
feature_train_df = signals_mean.drop(columns=['id', 'time_to_eruption'])
feature_test_df = signals_mean_test.drop(columns='id')

# fit() returns the fitted scaler, so construction and fitting can be chained.
scaler = StandardScaler().fit(feature_train_df)

# transform() returns a plain array; wrap it to restore the column labels.
scaled_feature_train_df = pd.DataFrame(
    scaler.transform(feature_train_df),
    columns=feature_train_df.columns,
)
scaled_test_df = pd.DataFrame(
    scaler.transform(feature_test_df),
    columns=feature_test_df.columns,
)

In [None]:
# Sanity check: print the (rows, columns) shape of the scaled train and test
# feature matrices. The scaled training frame is named
# `scaled_feature_train_df`; there is no `scaled_feature_df` in this notebook.
print(scaled_feature_train_df.shape)
print(scaled_test_df.shape)

In [None]:
# K-fold cross-validated LightGBM regression.
# Each fold trains on the remaining folds, writes out-of-fold predictions
# into `oof`, and adds its test prediction (divided by the fold count) to
# `predictions`, which therefore ends up as the average over folds.
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)
scaled_feature_df_columns = scaled_feature_train_df.columns.values

params = {
    'num_leaves': 85,
    'min_data_in_leaf': 10, 
    'objective':'regression',
    'max_depth': -1,
    'learning_rate': 0.001,
    'max_bins': 2048,
    "boosting": "gbdt",
    "feature_fraction": 0.91,
    "bagging_freq": 1,
    "bagging_fraction": 0.91,
    "bagging_seed": 42,
    "metric": 'mae',
    "lambda_l1": 0.1,
    "verbosity": -1,
    "nthread": -1,
    "random_state": 42
}


oof = np.zeros(len(scaled_feature_train_df))
predictions = np.zeros(len(scaled_test_df))
# Per-fold importance frames, concatenated once after the loop.
fold_importances = []

# The target series is `y_train`; the splitter and the fold slices below must
# use that name (no variable called `y` exists in this notebook).
for fold_, (trn_idx, val_idx) in enumerate(folds.split(scaled_feature_train_df, y_train.values)):

    print("fold {}".format(fold_))

    X_tr, X_val = scaled_feature_train_df.iloc[trn_idx], scaled_feature_train_df.iloc[val_idx]
    y_tr, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx]

    model = lgbm.LGBMRegressor(**params, n_estimators = 20000, n_jobs = -1)
    # NOTE(review): `verbose` and `early_stopping_rounds` as fit() keywords were
    # removed in LightGBM 4.0. On newer versions pass
    # callbacks=[lgbm.early_stopping(400), lgbm.log_evaluation(1000)] instead.
    model.fit(X_tr, y_tr, 
              eval_set=[(X_tr, y_tr), (X_val, y_val)], eval_metric='mae',
              verbose=1000, early_stopping_rounds=400)

    # Predict with the best iteration found by early stopping.
    oof[val_idx] = model.predict(X_val, num_iteration=model.best_iteration_)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = scaled_feature_df_columns
    fold_importance_df["importance"] = model.feature_importances_
    fold_importance_df["fold"] = fold_ + 1
    fold_importances.append(fold_importance_df)

    # Accumulate this fold's share of the averaged test prediction.
    predictions += model.predict(scaled_test_df, num_iteration=model.best_iteration_) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

In [None]:
# Pair each test segment id with its fold-averaged prediction and write the
# submission file (header row included, no index column).
submission = pd.DataFrame({
    'segment_id': signals_mean_test['id'],
    'time_to_eruption': predictions,
})
submission.to_csv('submission_recent.csv', header=True, index=False)