In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import dask.dataframe as dd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


#import lightgbm as lgb
from optuna.integration import lightgbm as lgb

import glob

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read Data

In [None]:
# Training metadata read from train.csv
# (the segment_id and time_to_eruption columns are used further down)
train = pd.read_csv(
    "/kaggle/input/predict-volcanic-eruptions-ingv-oe/train.csv"
)
train

In [None]:
# Order the metadata from the largest to the smallest time_to_eruption
sorted_train = train.sort_values(by="time_to_eruption", ascending=False)
sorted_train

In [None]:
# Metadata for the test set: one row per CSV file in the test directory.
# The segment id is the file name without its extension. The ids are sorted
# because glob returns files in arbitrary, filesystem-dependent order, which
# would otherwise make the row order differ between runs.
test_ = pd.DataFrame(
    sorted(
        os.path.splitext(os.path.basename(path))[0]
        for path in glob.glob('/kaggle/input/predict-volcanic-eruptions-ingv-oe/test/*.csv')
    ),
    columns=["segment_id"],
)
test_

# EDA

In [None]:
# Histogram of time_to_eruption over the training segments
# -> the distribution is roughly uniform

sorted_train[["time_to_eruption"]].plot.hist(bins=100, figsize=(10, 7))

In [None]:
# Segments with the largest and the smallest time_to_eruption:
# the first and the last row of the descending sort.
display(sorted_train.iloc[[0,-1],:])

# Read the ids from the frame itself instead of copying them by hand from the
# displayed output, so they cannot drift from the data.
max_id = int(sorted_train["segment_id"].iloc[0])
min_id = int(sorted_train["segment_id"].iloc[-1])

In [None]:
# Plot the raw signals of the segment with the largest time_to_eruption
# (probably normal) and of the one with the smallest (probably abnormal)
# -> they look different

for _seg, _title in ((max_id, "max time_to_eruption (probably normal)"),
                     (min_id, "min time_to_eruption (probably abnormal)")):
    _signal = pd.read_csv(f"/kaggle/input/predict-volcanic-eruptions-ingv-oe/train/{_seg}.csv")
    # One subplot per column, all sharing the same fixed y-range
    _signal.plot(figsize=(15,10), title=_title, subplots=True, ylim=(-10000,10000))
    plt.show()

In [None]:
# Count the missing values per column in both segments
# -> some sensors are completely empty

for _seg in (max_id, min_id):
    _nan_counts = pd.read_csv(f"/kaggle/input/predict-volcanic-eruptions-ingv-oe/train/{_seg}.csv").isnull().sum()
    display(_nan_counts)

# Feature Extraction

In [None]:
# Define feature extraction function

# Feature-name suffixes; a feature column is named "sensor_<i>" + suffix.
fs = [
    # Statistics of the raw signal
    "_mean", "_std", "_max", "_min",
    "_mad", "_skew", "_kurt", "_nunique",
    *[f"_quantile_{q}" for q in ("05", "10", "30", "70", "90", "95")],
    # Statistics of the FFT magnitude
    "_fft_power_mean", "_fft_power_std", "_fft_power_min", "_fft_power_max",
    "_fft_power_sum_low", "_fft_power_sum_middle", "_fft_power_sum_high",
    "_fft_power_mad", "_fft_power_skew", "_fft_power_kurt", "_fft_power_nunique",
    *[f"_fft_power_quantile_{q}" for q in ("05", "10", "30", "70", "90", "95")],
    # Sign changes between consecutive samples
    "_cross_0_count",
    # Rolling-window extremes
    "_roll_mean_min", "_roll_mean_max",
    "_roll_dist_min", "_roll_dist_max",
    "_roll_dist_diff_min", "_roll_dist_diff_max",
    # Suffixes that are currently switched off:
    #"_first_005","_last_005","_first_010","_last_010","_first_030","_last_030","_first_070","_last_070","_first_090","_last_090","_first_095","_last_095",
    #"_abs_0250_min","_abs_0250_max","_abs_0500_min","_abs_0500_max","_abs_0750_min","_abs_0750_max","_abs_1000_min","_abs_1000_max","_abs_1250_min","_abs_1250_max","_abs_1500_min","_abs_1500_max",
]

def _mean_abs_dev(frame):
    """Per-column mean absolute deviation around the column mean (NaNs skipped)."""
    # Written out explicitly because DataFrame.mad() no longer exists in pandas >= 2.0.
    return (frame - frame.mean(axis=0)).abs().mean(axis=0)


def _extreme_index(frame, how):
    """Per-column row label of the min ("idxmin") or max ("idxmax") value.

    Columns without a single valid value yield NaN instead of being passed to
    pandas, whose idxmin/idxmax warn or raise on all-NaN input in newer versions.
    """
    result = pd.Series(np.nan, index=frame.columns, dtype="float64")
    usable = frame.columns[frame.notna().any(axis=0).to_numpy()]
    if len(usable) > 0:
        result.loc[usable] = getattr(frame[usable], how)().astype("float64")
    return result


def extract(segment_id,dir_="train"):
    """
    Extract per-column (per-sensor) features from one segment CSV.

    Reads ``/kaggle/input/predict-volcanic-eruptions-ingv-oe/{dir_}/{segment_id}.csv``,
    interpolates missing samples along the row axis and computes, for every column:

    - Mean, standard deviation, maximum, minimum
    - Mean absolute deviation, skewness, kurtosis
    - Number of unique values
    - 5 / 10 / 30 / 70 / 90 / 95 % quantiles
    - The same statistics on the FFT magnitude, plus the summed magnitude
      in three consecutive blocks of bins (low / middle / high)
    - Number of sign changes between consecutive samples
    - Min / max of the 500-row rolling mean, of the rolling range (max - min)
      and of the row-to-row difference of that range
    - Row labels of extreme values among the samples below / above each
      quantile and among the samples whose magnitude exceeds fixed thresholds

    Parameters
    ----------
    segment_id : value formatted into the CSV file name (without ".csv")
    dir_ : sub-directory of the competition data to read from ("train" by default)

    Returns
    -------
    pandas.Series whose labels are "<column name><feature suffix>".
    """
    f = pd.read_csv(f"/kaggle/input/predict-volcanic-eruptions-ingv-oe/{dir_}/{segment_id}.csv")

    # Fill NaN gaps by interpolation; leading NaNs and fully empty columns stay NaN.
    f = f.interpolate(axis=0)

    # Quantiles (suffix tag -> level); levels below 0.5 are treated as the lower tail.
    quantile_levels = (("05", 0.05), ("10", 0.1), ("30", 0.3),
                       ("70", 0.7), ("90", 0.9), ("95", 0.95))
    quantiles = {tag: f.quantile(level, axis=0) for tag, level in quantile_levels}

    # Rolling statistics over a 500-row window
    roll = f.rolling(500)
    roll_mean = roll.mean()
    roll_dist = roll.max() - roll.min()
    roll_dist_diff = roll_dist.diff()

    # FFT magnitude along the time axis (axis=0). np.fft.fft defaults to the
    # LAST axis, which for a (samples x sensors) frame would transform across
    # the sensors of each sample instead of along time.
    spectrum = np.fft.fft(f.fillna(0).to_numpy(), axis=0)
    # Drop bin 0 and keep the next 30000 bins; the rest is the mirrored half.
    # NOTE(review): the fixed 30000 assumes segments of about 60000 rows - confirm.
    magnitude = np.abs(spectrum)[1:30001]
    # A column with no data at all was zero-filled above; its spectrum is
    # meaningless, so blank it out rather than report features of zeros.
    magnitude[:, f.isnull().all(axis=0).to_numpy()] = np.nan
    fft = pd.DataFrame(magnitude, columns=f.columns,
                       index=np.arange(1, magnitude.shape[0] + 1))

    # Samples in the lower tails (below q05/q10/q30) and in the upper tails
    # (above q70/q90/q95); everything else becomes NaN.
    tail_masks = [("0" + tag,
                   f.where(f < quantiles[tag]) if level < 0.5 else f.where(f > quantiles[tag]))
                  for tag, level in quantile_levels]

    # Samples whose absolute value exceeds fixed thresholds
    f_abs = f.abs()
    abs_masks = [(f"{threshold:04d}", f_abs.where(f_abs > threshold))
                 for threshold in (250, 500, 750, 1000, 1250, 1500)]

    parts = [
        f.mean(axis=0).add_suffix("_mean"),
        f.std(axis=0).add_suffix("_std"),
        f.max(axis=0).add_suffix("_max"),
        f.min(axis=0).add_suffix("_min"),
        _mean_abs_dev(f).add_suffix("_mad"),
        f.skew(axis=0).add_suffix("_skew"),
        f.kurt(axis=0).add_suffix("_kurt"),
        f.nunique(axis=0).add_suffix("_nunique"),
    ]
    parts.extend(quantiles[tag].add_suffix(f"_quantile_{tag}")
                 for tag, _level in quantile_levels)
    parts.extend([
        fft.mean(axis=0).add_suffix("_fft_power_mean"),
        fft.std(axis=0).add_suffix("_fft_power_std"),
        fft.min(axis=0).add_suffix("_fft_power_min"),
        fft.max(axis=0).add_suffix("_fft_power_max"),
        fft.iloc[:10000,:].sum(axis=0).add_suffix("_fft_power_sum_low"),
        fft.iloc[10000:20000,:].sum(axis=0).add_suffix("_fft_power_sum_middle"),
        fft.iloc[20000:,:].sum(axis=0).add_suffix("_fft_power_sum_high"),
        _mean_abs_dev(fft).add_suffix("_fft_power_mad"),
        fft.skew(axis=0).add_suffix("_fft_power_skew"),
        fft.kurt(axis=0).add_suffix("_fft_power_kurt"),
        fft.nunique(axis=0).add_suffix("_fft_power_nunique"),
    ])
    parts.extend(fft.quantile(level, axis=0).add_suffix(f"_fft_power_quantile_{tag}")
                 for tag, level in quantile_levels)
    parts.extend([
        # A negative product of neighbouring samples means the sign changed.
        ((f * f.shift()) < 0).sum(axis=0).add_suffix("_cross_0_count"),
        roll_mean.min(axis=0).add_suffix("_roll_mean_min"),
        roll_mean.max(axis=0).add_suffix("_roll_mean_max"),
        roll_dist.min(axis=0).add_suffix("_roll_dist_min"),
        roll_dist.max(axis=0).add_suffix("_roll_dist_max"),
        roll_dist_diff.min(axis=0).add_suffix("_roll_dist_diff_min"),
        roll_dist_diff.max(axis=0).add_suffix("_roll_dist_diff_max"),
    ])
    # NOTE(review): idxmin/idxmax give the row of the smallest/largest remaining
    # value, not the first/last row that passes the filter, which is what the
    # "_first"/"_last" suffixes suggest - confirm the intent before using them.
    for tag, masked in tail_masks:
        parts.append(_extreme_index(masked, "idxmin").add_suffix(f"_first_{tag}"))
        parts.append(_extreme_index(masked, "idxmax").add_suffix(f"_last_{tag}"))
    for tag, masked in abs_masks:
        parts.append(_extreme_index(masked, "idxmin").add_suffix(f"_abs_{tag}_min"))
        parts.append(_extreme_index(masked, "idxmax").add_suffix(f"_abs_{tag}_max"))

    return pd.concat(parts, axis=0)

In [None]:
# Test with small data

%time small_features = sorted_train.iloc[[0,1,2],:]["segment_id"].apply(extract)

display(small_features)

frame = small_features.iloc[:0]

In [None]:
#Extract features for train data

%time features = dd.from_pandas(sorted_train["segment_id"],npartitions=4).apply(extract,meta=frame).compute(scheduler="processes")

data = pd.concat((sorted_train,features),axis=1)

# Save features to resuse
data.to_csv("train_data.csv")

data

In [None]:
# Extract features for test data

%time _test = dd.from_pandas(test_["segment_id"],npartitions=4).apply(extract,dir_="test",meta=frame).compute(scheduler="processes")
test = pd.concat((test_,_test),axis=1)

# Save features to reuse
test.to_csv("test_data.csv")

test

In [None]:
# Check features: scatter each feature against time_to_eruption,
# one figure per feature suffix with one subplot per sensor

for _fs in fs:
    _columns = [f"sensor_{i}{_fs}" for i in range(1, 11)]
    data.plot(x="time_to_eruption", y=_columns,
              marker=".", linestyle="",
              figsize=(15, 15), subplots=True)
    plt.show()

In [None]:
# Check features: compare the train and test distribution of every feature
# column with empirical cumulative distribution curves


def _plot_ecdf(values, color, label):
    """Draw the cumulative fraction of valid points against the sorted values."""
    ordered = values.sort_values()
    n_valid = ordered.notna().sum()
    # The rank runs over all rows, while the divisor counts valid rows only.
    plt.plot(ordered, np.arange(1, ordered.shape[0] + 1) / n_valid,
             color=color, label=label, marker=".", linestyle=":", alpha=0.5)


for _fs in fs:
    for i in range(1, 11):
        _c = f"sensor_{i}{_fs}"
        _plot_ecdf(data[_c], "tab:blue", "train data")
        _plot_ecdf(test[_c], "tab:red", "test data")

        plt.title(_c)
        plt.legend()
        plt.show()

# Train

In [None]:
# Separate scalers for the feature matrix and for the target
scale_X = StandardScaler()
scale = StandardScaler()


# Hold out 5 % of the training rows for validation. The split is seeded so that
# re-running the notebook gives the same partition and comparable scores.
train_X, test_X, train_y, test_y = train_test_split(data.drop(columns=["segment_id","time_to_eruption"]),
                                                    data[["time_to_eruption"]],
                                                    test_size=0.05,
                                                    random_state=42)


# Model inputs: every sensor (1..10) combined with every suffix listed in `fs`
cols = [f"sensor_{i}{_fs}" for i in range(1,11) for _fs in fs]


# Scale X: fit on the training part only, then apply to validation and test data
scaled_train_X = pd.DataFrame(scale_X.fit_transform(train_X[cols]),index=train_X.index,columns=cols)
scaled_test_X  = pd.DataFrame(scale_X.transform(test_X[cols])     ,index=test_X.index ,columns=cols)
scaled_test    = pd.DataFrame(scale_X.transform(test[cols])       ,index=test.index   ,columns=cols)

# Scale y: the scaler works on a one-column 2-D input; [:,0] flattens the result
scaled_train_y = scale.fit_transform(train_y)[:,0]
scaled_test_y  = scale.transform(test_y)[:,0]


# LightGBM parameters
params = {'task' : 'train',
          'boosting_type' : 'gbdt',
          'objective' : 'regression',
          'metric' : 'mae',
          'verbose' : 0}


# Create GBM
# create dataset for lightgbm
lgb_train = lgb.Dataset(scaled_train_X, scaled_train_y)
lgb_eval  = lgb.Dataset(scaled_test_X , scaled_test_y , reference=lgb_train)

# Train LightGBM, evaluating on both the training and the held-out part
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=[lgb_train, lgb_eval],
                valid_names=["train","test"],
                early_stopping_rounds=30)
print(f"Best Iteration: {gbm.best_iteration}")
print(f"Best Score: {gbm.best_score}")

# Plot Training Results: predicted vs. true (scaled) target, with the identity line
plt.figure(figsize=(12,7))
plt.plot(scaled_train_y,gbm.predict(scaled_train_X),marker=".",linestyle="",color="tab:blue",label="train data")
plt.plot(scaled_test_y ,gbm.predict(scaled_test_X) ,marker=".",linestyle="",color="tab:red" ,label="test data")
plt.plot(np.arange(-2.0,2.0,0.1),np.arange(-2.0,2.0,0.1),color="tab:green")
plt.legend()
plt.xlabel("(Scaled) True time_to_eruption")
plt.ylabel("(Scaled) Pred time_to_eruption")
plt.show()

# Plot Feature Importance
lgb.plot_importance(gbm,figsize=(10,70))
plt.show()

# Create submission file

In [None]:
submit = pd.DataFrame(scale.inverse_transform(gbm.predict(scaled_test)),
                      index=test["segment_id"],
                      columns=["time_to_eruption"],
                      dtype="int")
submit.clip(lower=0,inplace=True)

display(submit)

submit.to_csv("submission.csv")

!cat submission.csv