In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Environment

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in matplotlib 3.6
# and the old aliases were later removed. Use whichever name this installation
# provides, falling back to the default style if neither is available.
plt.style.use(next(
    (name for name in ('seaborn-v0_8-white', 'seaborn-white') if name in plt.style.available),
    'default',
))
%matplotlib inline

# Kaggle Data

In [None]:
PATH = '/kaggle/input/predict-volcanic-eruptions-ingv-oe/'

!echo 'Files: train and test'
train_files = []
test_files  = []

for file in os.listdir(PATH+'/train/'):
    train_files.append(file)
    
for file in os.listdir(PATH+'/test/'):
    test_files.append(file)
    
print('Number of train files: {}'.format(len(train_files)))
print('Number of test  files: {}'.format(len(test_files )))

In [None]:
# Target table and the sample submission shipped with the competition data.
train = pd.read_csv(PATH+'train.csv')
sample_submission = pd.read_csv(PATH+'sample_submission.csv')

# Test segment ids are the file names under the test directory with the
# last four characters (the ".csv" extension) removed.
test_files = [
    filename[:-4]
    for _dirname, _subdirs, filenames in os.walk(PATH+'/test/')
    for filename in filenames
]

test = pd.DataFrame(test_files, columns=["segment_id"])

In [None]:
# Display the table loaded from train.csv.
train

In [None]:
# Display the test frame: one segment_id per file found under the test directory.
test

# Distribution of time to eruption

In [None]:
# Histogram (100 bins, black bar edges) of the target with a KDE curve on top.
# NOTE(review): distplot is deprecated in recent seaborn releases - consider
# migrating to histplot once the installed seaborn version is confirmed.
sns.distplot(
    train['time_to_eruption'],
    bins=100,
    hist=True,
    kde=True,
    color='blue',
    hist_kws={'edgecolor':'black'},
)

*Deriving the minimum and maximum time to eruption and their segment ids*

In [None]:
# Sort ascending by the target and show the first and last rows, i.e. the
# segments with the smallest and largest time_to_eruption.
display(train.sort_values(by='time_to_eruption').iloc[[0, -1]])

In [None]:
# Hard-coded segment ids; presumably the two rows shown by the sorted-table
# cell (smallest / largest time_to_eruption) - confirm against train.csv.
segment_id_min, segment_id_max = 601524801, 1923243961

# Read the sensor readings of both segments from the train directory.
df_segment_id_min, df_segment_id_max = (
    pd.read_csv(PATH+'/train/'+str(chosen_id)+'.csv')
    for chosen_id in (segment_id_min, segment_id_max)
)

Sensor data for the segment with the minimum time to eruption

In [None]:

# One panel per column, stacked vertically in a 10x1 grid.
df_segment_id_min.plot(
    subplots=True,
    layout=(10,1),
    figsize=(20,20),
    title='601524801 (min)',
    lw=1,
    rot=0,
)

plt.show()

In [None]:
# All columns drawn on a single shared axes (subplots disabled).
df_segment_id_min.plot(
    subplots=False,
    layout=(10,1),
    figsize=(20,20),
    title='601524801 (min)',
    lw=1,
    rot=0,
)

plt.show()

Sensor data for the segment with the maximum time to eruption

In [None]:

# One panel per column, stacked vertically in a 10x1 grid.
# The title previously read "(min)" although this frame is the max segment.
df_segment_id_max.plot(figsize=(20,20),
                       subplots=True,
                       layout=(10,1),
                       rot=0,
                       lw=1,
                       title='1923243961 (max)'
                      )

plt.show()

In [None]:
# All columns drawn on a single shared axes (subplots disabled), thicker lines.
df_segment_id_max.plot(
    subplots=False,
    layout=(10,1),
    figsize=(20,20),
    title='1923243961 (max)',
    lw=2,
    rot=0,
)

plt.show()

The function below, taken from Kostiantyn Isaienkov, derives summary features from the sensor signals.

In [None]:
def build_features(signal, ts, sensor_id):
    """Summarise one sensor signal as a single-row feature frame.

    Parameters
    ----------
    signal : pandas.Series
        Numeric readings of one sensor.
    ts : hashable
        Label used as the index of the returned row (callers pass the
        segment id).
    sensor_id : str
        Prefix for every column name, e.g. ``'sensor_1'``.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``ts`` with 23 float columns named
        ``f'{sensor_id}_<feature>'``: basic statistics, ten quantiles and
        four statistics of the real part of the FFT, in that order.
    """
    f_real = np.real(np.fft.fft(signal))
    mean = signal.mean()

    features = {
        'sum': signal.sum(),
        'mean': mean,
        'std': signal.std(),
        'var': signal.var(),
        'max': signal.max(),
        'min': signal.min(),
        'skew': signal.skew(),
        # Series.mad() was removed in pandas 2.0; this is the same quantity
        # (mean absolute deviation around the mean).
        'mad': (signal - mean).abs().mean(),
        'kurtosis': signal.kurtosis(),
    }

    # Quantile columns keep their original two-digit names (quantile99 ... quantile01).
    for pct in (99, 95, 85, 75, 55, 45, 25, 15, 5, 1):
        features[f'quantile{pct:02d}'] = np.quantile(signal, pct / 100)

    features['fft_real_mean'] = f_real.mean()
    features['fft_real_std'] = f_real.std()
    features['fft_real_max'] = f_real.max()
    features['fft_real_min'] = f_real.min()

    # Build the row in one go instead of enlarging a DataFrame cell by cell,
    # which is very slow when repeated for every sensor of every segment.
    return pd.DataFrame(
        {f'{sensor_id}_{name}': float(value) for name, value in features.items()},
        index=[ts],
    )

# Building datasets

In [None]:
# Build one feature row per training segment, in the order of train.segment_id.
train_set = list()

# enumerate() already supplies the running position, so no manual counter
# initialisation or increment is needed.
for seg, segment_id in enumerate(train.segment_id):
    signals = pd.read_csv(PATH+'/train/'+str(segment_id)+'.csv')
    train_row = []

    # Progress message every 200 segments: running position and the actual id.
    if seg%200 == 0:
        print('Processing segment {} (segment_id={})'.format(seg, segment_id))

    for sensor in range(0, 10):
        sensor_id = f'sensor_{sensor+1}'
        # Missing readings are replaced with 0 before the features are computed.
        train_row.append(build_features(signals[sensor_id].fillna(0), segment_id, sensor_id))

    # Place the ten per-sensor feature frames side by side as a single row.
    train_row = pd.concat(train_row, axis=1)
    train_set.append(train_row)

train_set = pd.concat(train_set)

* Saving a backup of the raw training feature set (including columns that will be discarded later), just to be safe

In [None]:
# Keep an independent copy as the backup: a plain assignment would only alias
# the same DataFrame, so any in-place change would also alter the "backup".
train_set_safe = train_set.copy()


In [None]:
# Rename an 'index' column to 'segment_id'.
# NOTE(review): this only has an effect if train_set carries a column named
# 'index' (e.g. after a reset_index()); otherwise rename leaves the frame
# unchanged - confirm which state is intended.
train_set = train_set.rename(columns=dict(index='segment_id'))

In [None]:
# Display the per-segment training feature frame.
train_set


In [None]:
# Keep only the feature columns and align rows by position.
# errors='ignore' lets this run whether or not the helper columns
# 'segment_id' / 'level_0' exist (they are absent on a fresh kernel), and
# reset_index(drop=True) gives the features the same 0..n-1 index as `train`.
# The feature rows were built by iterating train.segment_id, so the positional
# order matches.
X = train_set.drop(columns=['segment_id', 'level_0'], errors='ignore').reset_index(drop=True)
# Everything from train except the target column.
Y = train.drop(['time_to_eruption'], axis=1)
# Index-based join: train's remaining columns first, then the feature columns.
X=Y.join(X)


In [None]:
# Display the joined frame: train's non-target columns followed by the features.
X

In [None]:
# From here on train_set refers to the joined frame X (same object, not a copy).
train_set=X

In [None]:
# Build one feature row per test segment, in the order of test.segment_id.
test_set = list()

# enumerate() already supplies the running position, so no manual counter
# initialisation or increment is needed.
for seg, segment_id in enumerate(test.segment_id):
    signals = pd.read_csv(PATH+'/test/'+str(segment_id)+'.csv')
    test_row = []

    # Progress message every 200 segments: running position and the actual id.
    if seg%200 == 0:
        print('Processing segment {} (segment_id={})'.format(seg, segment_id))

    for sensor in range(0, 10):
        sensor_id = f'sensor_{sensor+1}'
        # Missing readings are replaced with 0 before the features are computed.
        test_row.append(build_features(signals[sensor_id].fillna(0), segment_id, sensor_id))

    # Place the ten per-sensor feature frames side by side as a single row.
    test_row = pd.concat(test_row, axis=1)
    test_set.append(test_row)

test_set = pd.concat(test_set)

In [None]:
# Display the test feature frame (one row per test segment).
test_set