In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import HuberRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

import librosa
import librosa.display

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))
import warnings
warnings.filterwarnings("ignore")
        
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from scipy.ndimage import maximum_filter1d
from scipy.ndimage import minimum_filter1d

from datetime import datetime

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def reduce_mem(df):
    """Downcast every float64/float32 column of ``df`` to float16.

    The frame is modified in place and also returned, so the call can be
    used in an assignment. Columns of any other dtype are left untouched.
    """
    float_cols = [name for name in df.columns
                  if df[name].dtype in ['float64', 'float32']]
    for name in float_cols:
        df[name] = df[name].astype(np.float16)

    return df

In [None]:
# Root folder of the competition data, and the sub-folder holding one CSV per train segment.
V_PATH = '/kaggle/input/predict-volcanic-eruptions-ingv-oe/'
TRAIN_PATH = '{}train/'.format(V_PATH)

In [None]:
# Raw sensor channels found in every segment file.
SENSOR_COLS = ['sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6',
       'sensor_7', 'sensor_8', 'sensor_9', 'sensor_10']


# Names of the per-sensor rolling features created by get_rolling().
# SENSOR_RMEANS previously reused the '_rmin' suffix (copy-paste slip), so the
# rolling means were never aggregated and the rolling mins were aggregated twice.
SENSOR_RMEANS = [x+'_rmean' for x in SENSOR_COLS]
SENSOR_RSTDS = [x+'_rstd' for x in SENSOR_COLS]
SENSOR_RMINS = [x+'_rmin' for x in SENSOR_COLS]
SENSOR_RMAXES = [x+'_rmax' for x in SENSOR_COLS]
SENSOR_RGRADMEAN = [x+'_grad_rmean' for x in SENSOR_COLS]
SENSOR_RGRADSTD = [x+'_grad_rstd' for x in SENSOR_COLS]

# One list of column names per rolling statistic ...
SENSOR_RSTATS = [SENSOR_RMEANS, SENSOR_RSTDS, SENSOR_RMINS, SENSOR_RMAXES,
               SENSOR_RGRADMEAN,  SENSOR_RGRADSTD]

# ... and the matching output-column prefix for each list, in the same order as SENSOR_RSTATS.
ROLL_DESCR = ['rmean', 'rstd', 'rmin', 'rmax', 'grad_rmean','grad_rstd']

In [None]:
# Column -> dtype mapping handed to pd.read_csv so each sensor channel is read as float32.
dtypes_dict = {'sensor_{}'.format(idx): 'float32' for idx in range(1, 11)}

I don't have tons and tons of time, so this is not particularly neat or tidy, just trying to get some ideas of the data as quickly as possible

# 1. Inspect train and test CSV
There are 4431 train data files, each with a row in train

In [None]:
# Load the train index file and show its shape, column names and first rows.
train = pd.read_csv(V_PATH+'train.csv')
print(train.shape)
print(train.columns)
train.head(10)

Individual data files are around 60000 rows each

In [None]:
# Rough total row count in millions, assuming 60001 rows per segment file.
total_rows_estimate = 60001 * len(train) / 1000000
print('estimate of total TRAIN rows (millions)',total_rows_estimate)

In [None]:
# Load the sample submission; its rows are used below as the list of test segments.
sample_submission = pd.read_csv(V_PATH+'sample_submission.csv')
print(sample_submission.shape)
print(sample_submission.columns)

# Same rough estimate as for train (assumes 60001 rows per file); overwrites the train figure.
total_rows_estimate = 60001 * len(sample_submission) / 1000000
print('estimate of total TEST rows (millions)',total_rows_estimate)

sample_submission.head(5)

The rows of the train file show a pretty even distribution of times until next eruption

In [None]:
# Density of the target after dividing by its maximum (so the x axis runs 0 to 1).
fig, axes = plt.subplots(figsize=(10, 5))
axes.set_title('Distribution of Train time to eruption (scaled 0 to 1 = max)', size=16)

tte_scaled = train['time_to_eruption'] / train['time_to_eruption'].max()
sns.kdeplot(tte_scaled, color='Red')

axes.set_xlabel('Time To Eruption (Scaled 0 - 1)', size=12)
axes.set_ylabel('Density of Train Data', size=12)
sns.despine()

In [None]:
# Headline statistics of the target, rounded to whole units.
for stat_label, stat_value in (('Minimum time to eruption', train['time_to_eruption'].min()),
                               ('Max time to eruption', train['time_to_eruption'].max()),
                               ('Mean time to eruption', train['time_to_eruption'].mean())):
    print(stat_label, np.round(stat_value, 0))

In [None]:
#rescaling time to eruption to simplify looking at some of the numbers later
sz = train['time_to_eruption'].size-1
train['PCNT_TIME'] = np.round(train['time_to_eruption'].rank(method='max').apply(lambda x: 100.0*(x-1)/sz),0)
train.head(10)

In [None]:
def get_rolling(df, cols, window=50):
    """Add gradient and rolling-window features for each column in ``cols``.

    For every ``col`` the following columns are written to ``df`` (in place):
    ``_grad``, ``_grad_abs``, ``_rmin``, ``_rmax``, ``_rmean``, ``_rstd`` and,
    computed on the gradient of the absolute signal, ``_grad_rmin``,
    ``_grad_rmax``, ``_grad_rmean``, ``_grad_rstd``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named in ``cols``; it is mutated.
    cols : iterable of str
        Names of the columns to derive features from.
    window : int, default 50
        Size of the rolling / filter window, in rows.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    def _fill_gaps(series):
        # Back-fill first, then forward-fill whatever is still missing
        # (centred rolling windows leave NaNs at both ends).
        return series.bfill().ffill()

    for col in cols:
        df[col+'_grad'] = np.gradient(df[col])
        df[col+'_grad'] = _fill_gaps(df[col+'_grad'])

        df[col+'_grad_abs'] = np.gradient(np.abs(df[col]))
        # Fill the column just created; this line used to re-fill '_grad'
        # and left any NaNs in '_grad_abs' untouched.
        df[col+'_grad_abs'] = _fill_gaps(df[col+'_grad_abs'])

        df[col+'_rmin'] = minimum_filter1d(df[col].values, size=window)
        df[col+'_rmax'] = maximum_filter1d(df[col].values, size=window)

        df[col+'_rmin'] = _fill_gaps(df[col+'_rmin'])
        df[col+'_rmax'] = _fill_gaps(df[col+'_rmax'])

        df[col+'_rmean'] = _fill_gaps(df[col].rolling(window=window, center=True).mean())
        df[col+'_rstd'] = _fill_gaps(df[col].rolling(window=window, center=True).std())

        # Same statistics again, this time on the gradient of the absolute signal.
        df[col+'_grad_rmin'] = minimum_filter1d(df[col+'_grad_abs'].values, size=window)
        df[col+'_grad_rmax'] = maximum_filter1d(df[col+'_grad_abs'].values, size=window)

        df[col+'_grad_rmin'] = _fill_gaps(df[col+'_grad_rmin'])
        df[col+'_grad_rmax'] = _fill_gaps(df[col+'_grad_rmax'])

        df[col+'_grad_rmean'] = _fill_gaps(df[col+'_grad_abs'].rolling(window=window, center=True).mean())
        df[col+'_grad_rstd'] = _fill_gaps(df[col+'_grad_abs'].rolling(window=window, center=True).std())

    return df

# 2. Explore some sample data
Select some example train data frames to run some initial analysis, without needing to import all of the data - use some quantiles to ensure a mix of durations to next eruption (simplified - only for EDA)

Create some rolling data analysis as they are imported (get rolling mins, maxes, stds for each sensor)

Note: there are quite a few NAs / missing values in the data. For now these are forward- and back-filled within each file, and anything still missing after that is set to zero.

In [None]:
# Pick one train segment per selected percentile of time-to-eruption, add the
# rolling features to it and stack everything into a single frame.
select_quantiles = np.arange(0.01, 0.99, 0.02)
print('Quantiles list',select_quantiles)

segment_frames = []

for count,q in enumerate(select_quantiles):

    # round() before int(): q*100 is not always exact in floating point
    # (e.g. 0.29*100 evaluates to 28.999999999999996), and a bare int() would
    # then truncate to the percentile below the intended one.
    s_ID = train['segment_id'][train['PCNT_TIME']==int(round(q*100))].values[0]

    temp_df = pd.read_csv(TRAIN_PATH+str(s_ID)+'.csv', dtype=dtypes_dict)

    # Fill gaps from neighbouring rows; columns that are entirely missing end up as 0.
    temp_df = temp_df.ffill().bfill()
    temp_df = temp_df.fillna(value=0)

    temp_df = get_rolling(temp_df, SENSOR_COLS)

    # NOTE: 'time_to_eruption' here is the quantile q (0-1), not the raw target.
    temp_df['time_to_eruption'] = q
    temp_df['segment'] = count
    segment_frames.append(temp_df)

# Concatenate once at the end instead of growing the frame inside the loop.
loaded_dfs = pd.concat(segment_frames).reset_index(drop=True)
del segment_frames
print(' Total DF Size ',loaded_dfs.shape)
loaded_dfs.head(10)

# 2a - Ranges of Sensor Data
I realised that my earlier versions neglected any basic analysis of the range of values covered by each sensor in these example extracts.
Let's see if the sensors report similar ranges in this sample data.

In [None]:
# Largest absolute reading per sensor across the loaded sample.
print('Sensor Max values (absolute)')
loaded_dfs[SENSOR_COLS].abs().max()

In [None]:
# Mean absolute reading per sensor across the loaded sample.
print('Sensor Mean values (absolute)')
loaded_dfs[SENSOR_COLS].abs().mean()

In [None]:
# Median absolute reading per sensor across the loaded sample.
print('Sensor Median values (absolute)')
loaded_dfs[SENSOR_COLS].abs().median()

We can see there are some significant differences in the max and median sensor readings.
Next question - how do the distributions look if we plot them? I'm going to ignore zero values as these are 'missings'.

In [None]:
# One density curve per sensor of the absolute readings, ignoring exact zeros.
fig,axes = plt.subplots(figsize=(20,5))

for sc in SENSOR_COLS:
    nonzero_abs = loaded_dfs.loc[loaded_dfs[sc] != 0, sc].abs()
    # Label each curve explicitly so axes.legend() below has entries whether or
    # not the installed seaborn derives a label from the Series name.
    sns.kdeplot(nonzero_abs, ax=axes, label=sc)

axes.set_xlim(0, 2500)
axes.set_title('Absolute Sensor Reading Distributions (zero values dropped)', size=18)

axes.set_xlabel('Absolute Sensor Reading (graph clipped at max = 2500)', size=12)
axes.set_ylabel('Density of Sample Train Data', size=12)
axes.legend(fontsize=12)
sns.despine()

OK, so we definitely have some significant differences in the spread of readings. 

# 2b. Correlations between sensor readings

Let's check how well correlated the sensors are with one another.

In [None]:
# Pairwise correlation between the absolute readings of the sensors.
sensor_corr = np.abs(loaded_dfs[SENSOR_COLS]).corr()

In [None]:
# Heatmap of the sensor-to-sensor correlation matrix computed in the previous cell.
fig, axes = plt.subplots(figsize=(8, 8))

sns.heatmap(sensor_corr, ax=axes, annot=True, cbar=False,
            cmap='seismic_r', vmin=0, vmax=1)

axes.set_title('Sensor Signal (Absolute) Correlations', size=20)
for tick_axis in (axes.yaxis, axes.xaxis):
    tick_axis.set_tick_params(labelsize=14)

# Correlations between the signals seem to be modest. Are the correlations stronger closer to an eruption?

In [None]:
# Side-by-side correlation heatmaps: lower half of the time-to-eruption
# quantiles on the left, upper half on the right.
fig, axes = plt.subplots(ncols=2, figsize=(20, 8))

corr_panels = [
    (loaded_dfs['time_to_eruption'] < 0.5,
     'Sensor Signal (Absolute) Correlations Closer to Eruption'),
    (loaded_dfs['time_to_eruption'] >= 0.5,
     'Sensor Signal (Absolute) Correlations Further From Eruption'),
]

for panel_ax, (row_mask, panel_title) in zip(axes, corr_panels):
    sensor_corr = np.abs(loaded_dfs[SENSOR_COLS][row_mask]).corr()

    sns.heatmap(sensor_corr, annot=True, cbar=False, cmap='seismic_r',
                vmin=0, vmax=1, ax=panel_ax)

    panel_ax.set_title(panel_title, size=20)
    panel_ax.yaxis.set_tick_params(labelsize=12)
    panel_ax.xaxis.set_tick_params(labelsize=12)

plt.tight_layout()

Interesting. It looks like the correlation between signals is extremely poor when we are far away from an eruption, at least for this sample data.
It also looks to me like there are some plausible 'groupings' of sensors

In [None]:
# Same "closer to eruption" correlation matrix, with rows and columns reordered
# so that sensors which look related sit next to each other.
fig, axes = plt.subplots(ncols=1, figsize=(8, 8))

grouped_list = ['sensor_1','sensor_2','sensor_3','sensor_4','sensor_9',
               'sensor_5','sensor_10',
             'sensor_6','sensor_8','sensor_7'  ]

near_rows = loaded_dfs['time_to_eruption'] < 0.5
sensor_corr = np.abs(loaded_dfs[SENSOR_COLS][near_rows]).corr()
sensor_corr = sensor_corr.loc[grouped_list, grouped_list]

sns.heatmap(sensor_corr, ax=axes, annot=True, cbar=False,
            cmap='seismic_r', vmin=0, vmax=1)

axes.set_title('Sensor Signal (Absolute) Correlations - Reordered', size=20)
axes.yaxis.set_tick_params(labelsize=14)
axes.xaxis.set_tick_params(labelsize=14)

This kind of looks like 'blocks' to me. 
* Sensors 1-2-3-4-9
* Sensors 5-10
* Sensors 6-8-7

The last 3 only seem to share the characteristic that they are not really very well correlated with any of the others.
We'll have to see whether this can provide any useful outcome with predictions

Finally in this section, let's check how often each sensor has the largest absolute value.

In [None]:
print('Percentage of Rows where the sensor is the highest absolute value')

# 1-based position (within SENSOR_COLS) of the sensor with the largest absolute reading in each row.
abs_readings = np.abs(loaded_dfs[SENSOR_COLS].values)
loaded_dfs['strongest_signal'] = abs_readings.argmax(axis=1) + 1

strongest_counts = loaded_dfs['strongest_signal'].value_counts()
strongest_counts / strongest_counts.sum()

Sensors 10, 6 and 2 have the highest proportion of rows where they are the largest absolute value recorded.
How about any correlation with eruptions?

In [None]:
# Mean of the 'time_to_eruption' column (the 0-1 quantile assigned when the sample
# was loaded) for each value of 'strongest_signal'.
print('Average Time to Eruption grouped by strongest signal at point in time')
print('Note: time to eruption is scaled 0-1.0 for easier viewing')
loaded_dfs.groupby(['strongest_signal'])['time_to_eruption'].mean()

The best indicator from this data seems to be when signal 2 is the strongest. But overall it doesn't look all that helpful - many of the values are around the 0.5 mark

So we know there is some correlation between sensor readings, I guess this makes sense as we could guess they would be picking up the same events, to some extent. Let's compare how they record during some data which shows what look like some strong signals.

Note: this is not an attempt to start trying to identify actual real sensor locations etc, I'm assuming that comparing and analysing readings from multiple sensors is a generally applicable approach in this field.

In [None]:
# Plot every sensor trace of one hand-picked train segment.
s_ID=1424510231

print(train[train['segment_id']==s_ID])

fig,axes=plt.subplots(nrows=1,ncols=1,figsize=(20,6))
temp_df = pd.read_csv(TRAIN_PATH+str(s_ID)+'.csv')

for s in SENSOR_COLS:
    axes.plot(temp_df[s], linewidth=1)

axes.set_title('Readings from each Sensor for segment ID '+str(s_ID), size=18)
axes.legend(SENSOR_COLS, fontsize=12)

axes.set_xlabel('Time (total duration = one 10 minute segment)', size=14)
# The raw signed readings are plotted (no abs() is applied), so the label says so.
axes.set_ylabel('Sensor Reading', size=14)
sns.despine()

What does the correlation look like for this segment of data?

In [None]:
# Correlation of absolute readings within the single segment loaded above,
# using the same sensor ordering as the reordered heatmap.
fig, axes = plt.subplots(figsize=(8, 8))

segment_corr = np.abs(temp_df[SENSOR_COLS]).corr().loc[grouped_list, grouped_list].T
sns.heatmap(segment_corr, ax=axes, annot=True, cbar=False,
            cmap='seismic_r', vmin=0, vmax=1)

axes.set_title('Sensor Signal (Absolute) Correlations for Segment '+str(s_ID), size=20)

for tick_axis in (axes.yaxis, axes.xaxis):
    tick_axis.set_tick_params(labelsize=14)

Let's look at 3 comparisons:
* Sensors 1 and 3 show high correlation
* Sensors 4 and 6 show moderate correlation
* Sensors 7 and 8 are fairly well correlated between themselves, but less so with other sensors.

In [None]:
# Overlay two sensors of the selected segment.
fig,axes=plt.subplots(nrows=1,ncols=1,figsize=(20,6))
temp_df = pd.read_csv(TRAIN_PATH+str(s_ID)+'.csv')

plot_sensors=['sensor_1', 'sensor_3']

for s in plot_sensors:
    axes.plot(temp_df[s], linewidth=1)

axes.set_title('Selected Sensor Readings for segment ID '+str(s_ID), size=18)
axes.legend(plot_sensors, fontsize=12)

axes.set_xlabel('Time (total duration = one 10 minute segment)', size=14)
# The raw signed readings are plotted (no abs() is applied), so the label says so.
axes.set_ylabel('Sensor Reading', size=14)
sns.despine()

In [None]:
# Overlay two sensors of the selected segment.
fig,axes=plt.subplots(nrows=1,ncols=1,figsize=(20,6))
temp_df = pd.read_csv(TRAIN_PATH+str(s_ID)+'.csv')

plot_sensors=['sensor_4', 'sensor_6']

for s in plot_sensors:
    axes.plot(temp_df[s], linewidth=1)

axes.set_title('Selected Sensor Readings for segment ID '+str(s_ID), size=18)
axes.legend(plot_sensors, fontsize=12)

axes.set_xlabel('Time (total duration = one 10 minute segment)', size=14)
# The raw signed readings are plotted (no abs() is applied), so the label says so.
axes.set_ylabel('Sensor Reading', size=14)
sns.despine()

In [None]:
# Overlay three sensors of the selected segment.
fig,axes=plt.subplots(nrows=1,ncols=1,figsize=(20,6))
temp_df = pd.read_csv(TRAIN_PATH+str(s_ID)+'.csv')

plot_sensors=['sensor_7', 'sensor_8', 'sensor_1']

for s in plot_sensors:
    axes.plot(temp_df[s], linewidth=1)

axes.set_title('Selected Sensor Readings for segment ID '+str(s_ID), size=18)
axes.legend(plot_sensors, fontsize=12)

axes.set_xlabel('Time (total duration = one 10 minute segment)', size=14)
# The raw signed readings are plotted (no abs() is applied), so the label says so.
axes.set_ylabel('Sensor Reading', size=14)
sns.despine()

# 3. Distributions of data / gradients by time to next eruption


Red = Closer to Eruption, Green = further away

In [None]:
# Grid of density plots: one row per sensor, one column per feature, one curve
# per loaded segment, coloured by the segment's time-to-eruption quantile.
SEGMENTS = pd.Series(loaded_dfs['time_to_eruption'])
nr = len(SENSOR_COLS)
nc = 4
tg = nr * nc

fig,axes=plt.subplots(nrows=nr, ncols=nc,figsize=(20,nr*4))


color_grades = ['Red', 'darkorange','wheat','yellowgreen','seagreen']

# (column suffix, panel title suffix, extra kdeplot keyword arguments) per grid column.
# NOTE(review): `bw` is kept as in the original call; check it against the
# installed seaborn version, where the bandwidth keyword may have been renamed.
panel_specs = [('', ' Signal Values', {}),
               ('_grad_rmax', ' Grads RollMax', {'bw': 0.1}),
               ('_grad_rstd', ' Grads RollSTD', {'bw': 0.1}),
               ('_grad_rmin', ' Grads RollMin', {'bw': 0.1})]

for count, SC in enumerate(SENSOR_COLS):
    for panel, (suffix, title_suffix, kde_kwargs) in enumerate(panel_specs):
        ax = axes[count, panel]
        feature = loaded_dfs[SC+suffix]

        for Q in np.flip(select_quantiles):
            sns.kdeplot(feature[loaded_dfs['time_to_eruption']==Q], ax=ax,
                        alpha=0.7, color=color_grades[int(0.5+Q*4)], **kde_kwargs)

        # Clip the x axis to the central 90% of the values so the tails do not dominate.
        ax.set_xlim(feature.quantile(0.05), feature.quantile(0.95))

        # A legend is not always created by kdeplot; only remove it when present
        # instead of calling .remove() on None.
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()
        ax.set_title(SC+title_suffix)

plt.tight_layout()

The data looks quite scattered, and the sensors don't look that consistent. Sensor 5 readings look a bit odd in these data samples. It looks like there are some specific high concentrations of readings in a narrow range.

# 4. Check on some sample data for NA / missing values

There are quite a lot of missing values. Let's check how many / where these are. Just looking at the first 1000 data files for train and test to save some time and RAM

In [None]:
def get_nas(path, file_id):
    """Return the per-column count of missing values in one segment CSV.

    The file read is ``path + str(file_id) + '.csv'``; the result is a numpy
    array with one count per column, in file column order.
    """
    csv_file = path+str(file_id)+'.csv'
    segment = pd.read_csv(csv_file, dtype=dtypes_dict)
    na_counts = segment.isna().sum()
    return na_counts.values

In [None]:
%%time

SENSOR_COLS_MISSING = [x+'_missing' for x in SENSOR_COLS]
TEST_PATH = V_PATH + 'test/'

# Start every "<sensor>_missing" column at 0.0 in both index frames.
for SC in SENSOR_COLS_MISSING:
    train[SC] = 0.0
    sample_submission[SC] = 0.0

# Count missing values per sensor for the first 1000 train files only.
for count, i in enumerate(train.index[:1000]):
    s_ID = train.loc[i, 'segment_id']
    train.loc[i, SENSOR_COLS_MISSING] = get_nas(TRAIN_PATH, s_ID)
print('finished Train load')

# Same for the first 1000 test files listed in the sample submission.
for count, i in enumerate(sample_submission.index[:1000]):
    s_ID = sample_submission.loc[i, 'segment_id']
    sample_submission.loc[i, SENSOR_COLS_MISSING] = get_nas(TEST_PATH, s_ID)

gc.collect()
print('finished Test load')

In [None]:
# Scale the missing-value counts of each frame by that frame's largest count.
# Note this cell is not idempotent: re-running it divides already-scaled values again.
for index_frame in (sample_submission, train):
    largest_count = index_frame[SENSOR_COLS_MISSING].max().max()
    index_frame[SENSOR_COLS_MISSING] = index_frame[SENSOR_COLS_MISSING] / largest_count

In [None]:
# Summarise missing values per sensor for the train sample.
# Only the first 1000 train files were scanned for missing values; the remaining
# rows still hold the 0.0 placeholder, so the statistics are restricted to the
# scanned rows instead of being diluted by dividing over the whole frame.
# (Keep the 1000 here in step with the number of files scanned above.)
train_scanned = train[SENSOR_COLS_MISSING].iloc[:1000]

train_missings = pd.DataFrame(
    {'mean': train_scanned.mean(),                          # average scaled missing count
     'pc dfs with missing': (train_scanned > 0).mean()},    # share of files with any missing value
    index=SENSOR_COLS_MISSING,
    columns=['mean','pc dfs with missing'])

In [None]:
# Summarise missing values per sensor for the test sample.
# Only the first 1000 test files were scanned for missing values; the remaining
# rows still hold the 0.0 placeholder, so the statistics are restricted to the
# scanned rows instead of being diluted by dividing over the whole frame.
# (Keep the 1000 here in step with the number of files scanned above.)
test_scanned = sample_submission[SENSOR_COLS_MISSING].iloc[:1000]

test_missings = pd.DataFrame(
    {'mean': test_scanned.mean(),                           # average scaled missing count
     'pc dfs with missing': (test_scanned > 0).mean()},     # share of files with any missing value
    index=SENSOR_COLS_MISSING,
    columns=['mean','pc dfs with missing'])

It looks like the distributions of missing values are different in the Test data. We have a lot more missing values for sensors 9 and 10 in particular, it seems.

In [None]:
# Train and test missing-value summaries side by side on a shared colour scale.
fig, axes = plt.subplots(ncols=2, figsize=(15, 6), sharey=True)

missing_panels = [(train_missings, 'Sample Train DFs - Missings'),
                  (test_missings, 'Sample Test DFs - Missings')]

for panel_ax, (missings, panel_title) in zip(axes, missing_panels):
    sns.heatmap(missings.fillna(value=0), annot=True, cbar=False, cmap='seismic',
                fmt='.1%', annot_kws={"fontsize":12},
                vmin=0, vmax=0.05, ax=panel_ax)
    panel_ax.set_title(panel_title, size=18)
    panel_ax.yaxis.set_tick_params(labelsize=14)
    panel_ax.xaxis.set_tick_params(labelsize=14)

plt.tight_layout()

In [None]:
# The "<sensor>_missing" helper columns are no longer needed; remove them from both frames.
train = train.drop(columns=SENSOR_COLS_MISSING)
sample_submission = sample_submission.drop(columns=SENSOR_COLS_MISSING)

gc.collect()

# 5. Add in some analysis across sensors
Let's take some mins, maxes, means etc across rows (of original sensor data, and of the groups of rolling columns e.g. the max, min sensor rolling gradients) to see what information combining sensor readings can give us.

In [None]:
def get_stats(df, sensor_cols=SENSOR_COLS, rolling_cols=SENSOR_RSTATS):
    """Add row-wise statistics taken across sensors to ``df`` (in place).

    * ``max`` / ``min`` / ``std`` over ``sensor_cols``, and the same over their
      absolute values (``max_abs`` / ``min_abs`` / ``std_abs``).
    * For each list of columns in ``rolling_cols``, the row-wise max, min, std
      and mean across that list, named ``<prefix>_max`` etc., where the prefix
      is the entry of ``ROLL_DESCR`` at the same position.

    All new columns are stored as float16. Returns the same ``df`` object.
    """
    raw = df[sensor_cols]
    df['max'] = raw.max(axis=1).astype(np.float16)
    df['min'] = raw.min(axis=1).astype(np.float16)
    df['std'] = raw.std(axis=1).astype(np.float16)

    magnitude = np.abs(raw)
    df['max_abs'] = magnitude.max(axis=1).astype(np.float16)
    df['min_abs'] = magnitude.min(axis=1).astype(np.float16)
    df['std_abs'] = magnitude.std(axis=1).astype(np.float16)

    # One summary per GROUP of rolling columns, e.g. the max over all sensors' rolling mins.
    for count, group in enumerate(rolling_cols):
        prefix = ROLL_DESCR[count]
        group_values = df[group]
        df[prefix+'_max'] = group_values.max(axis=1).astype(np.float16)
        df[prefix+'_min'] = group_values.min(axis=1).astype(np.float16)
        df[prefix+'_std'] = group_values.std(axis=1).astype(np.float16)
        df[prefix+'_mean'] = group_values.mean(axis=1).astype(np.float16)

    gc.collect()
    return df

# Add the cross-sensor statistics, then downcast the float columns to save memory.
loaded_dfs = reduce_mem(
    get_stats(loaded_dfs, sensor_cols=SENSOR_COLS, rolling_cols=SENSOR_RSTATS)
)

In [None]:
# Full correlation matrix of every column in the loaded sample.
ldf_corr = loaded_dfs.corr()

Plot some correlations with our target column

In [None]:
# Order rows by their correlation with the target and move the target column to the front.
ldf_corr = ldf_corr.sort_values('time_to_eruption', ascending=False)

other_cols = ldf_corr.columns.drop('time_to_eruption').tolist()
ldf_corr = ldf_corr[['time_to_eruption'] + other_cols]

# Keep only features whose absolute correlation with the target exceeds 0.05.
filt = ldf_corr['time_to_eruption'].abs() > 0.05

fig, axes = plt.subplots(figsize=(20, 20))
sns.heatmap(ldf_corr[filt], annot=False, cmap='seismic', vmin=-1, vmax=1, cbar=False)

gc.collect()

axes.set_title('Correlation with Time to Eruption (all features with >5% correlation)', size=18)

Save some of the features which are >5% correlated for later use

In [None]:
# Names of the features that passed the correlation filter (the filter row index).
TOP_FTS = ldf_corr[filt].index
print(len(TOP_FTS))

Rolling features definitely seem useful (no surprise - as important variations in the data would likely last longer than 1 row, but less than ten minutes)

As a side note:

https://www.nature.com/articles/s41467-020-17375-2

"We processed 9 years of data, from 1 January 2011 to 1 January 2020 (Fig. 1b), to obtain four time series that capture different parts of the tremor signal (RSAM, DSAR, medium and high frequency bands—MF and HF—sampled every 10 min; see “Methods”)"

Interesting that this also looked at 10 minutes of data as being a good time frame for detection.    

In [None]:
# Zoom in on the features whose absolute correlation with the target exceeds 0.3.
ldf_corr = ldf_corr.sort_values('time_to_eruption', ascending=False)

filt = ldf_corr['time_to_eruption'].abs() > 0.3

fig, axes = plt.subplots(figsize=(10, 7))
sns.heatmap(ldf_corr.loc[filt, ['time_to_eruption', 'segment']],
            annot=True, cmap='seismic', vmin=-1, vmax=1, cbar=False)

gc.collect()

axes.set_title('Correlation with Time to Eruption (all features with >30% correlation)')

It seems that Gradient Rolling Standard Deviation min() has a negative correlation with time to eruption - i.e. when it is higher, eruptions are closer.
# 6. Look at some segments (10 min batches)

Let's look at how the gradient rolling std min is distributed across sub-segments (1000 rows each).

In [None]:
#create subsegments of 1000 rows
loaded_dfs['sub_segment'] = loaded_dfs.index // 1000
segment_means = loaded_dfs.groupby(['sub_segment'])['grad_rstd_min'].mean().sort_values(ascending=False)

#lets ignore any where this value is zero (possibly a filled NA or other error)
segment_means = segment_means[segment_means>0]
fig,axes=plt.subplots(figsize=(15,5))
sns.kdeplot(segment_means.values,ax=axes, color='Red')
axes.set_title('Sub Segment Gradient Rolling STD Mins', size=18)

axes.set_xlabel('Gradient Rolling STD Mins', size=12)
axes.set_ylabel('Density of Sample Train Data', size=12)
sns.despine()

In [None]:
# segment_means is sorted in descending order, so head() gives the largest values.
print('Segments with higher Gradients')
print(segment_means.head(10))

In [None]:
# Ten entries around the middle of the sorted series.
print('Segments with medium Gradients')
print(segment_means[len(segment_means)//2-5:len(segment_means)//2+5])

In [None]:
# segment_means is sorted in descending order, so tail() gives the smallest (non-zero) values.
print('Segments with lower Gradients')
print(segment_means.tail(10))

Let's check out sensor readings from 3 contrasting segments.

In [None]:
# Sub-segment ids picked by hand from the high / medium / low listings above.
SEGMENT_HIGH = 179
SEGMENT_MEDIUM = 445
SEGMENT_LOW = 1609

nr = len(SENSOR_COLS)
nc = 2
tg = nr * nc

TITLES = ['Close to Eruption','Mid Point', 'Far from Eruption']
colors= ['Red','Orange','Green']

fig,axes=plt.subplots(nrows=nr, ncols=nc,figsize=(20,nr*4))

# Left column: raw readings; right column: rolling std of the absolute-signal gradient.
for count1, SEG in enumerate([SEGMENT_HIGH, SEGMENT_MEDIUM,SEGMENT_LOW]):

    seg_filt = loaded_dfs['sub_segment']==SEG
    # Vectorised count of selected rows (Python's sum() iterates the Series element by element).
    x_positions = range(int(seg_filt.sum()))

    for count, SC in enumerate(SENSOR_COLS):

        sns.lineplot(x=x_positions, y=loaded_dfs[SC][seg_filt], ax=axes[count, 0],
                        color=colors[count1])

        sns.lineplot(x=x_positions, y=loaded_dfs[SC+'_grad_rstd'][seg_filt], ax=axes[count, 1],
                     color=colors[count1])

# Titles and legends only need setting once, after all three lines are drawn.
for count, SC in enumerate(SENSOR_COLS):
    axes[count, 0].set_title(SC + ' Readings', size=18)
    axes[count, 0].legend(TITLES, fontsize=12)

    # The plotted column is '<sensor>_grad_rstd', i.e. the rolling STD, not the rolling max.
    axes[count, 1].set_title(SC+' Gradients Rolling STD', size=18)
    axes[count, 1].legend(TITLES, fontsize=12)

plt.tight_layout()

**And the minimum across sensors
Note: the green line probably includes an error (missing sensor data) - will need to tackle NA values**

In [None]:
# Compare 'grad_rstd_min' over the three chosen sub-segments.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(12, 7))

for count1, SEG in enumerate([SEGMENT_HIGH, SEGMENT_MEDIUM, SEGMENT_LOW]):
    seg_filt = loaded_dfs['sub_segment'] == SEG
    row_positions = range(sum(seg_filt))
    sns.lineplot(x=row_positions, y=loaded_dfs['grad_rstd_min'][seg_filt],
                 ax=axes, color=colors[count1])

axes.legend(TITLES, fontsize=12)

axes.set_title('Gradient Rolling STD Mins Comparison', size=18)
axes.set_xlabel('Time (one subsegment, 1000 rows)', size=14)
axes.set_ylabel('Gradient Rolling STD Mins', size=14)

**The mean, to mitigate the issue with missing sensor data**

In [None]:
# Compare 'grad_rstd_mean' over the three chosen sub-segments.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(12, 7))

for count1, SEG in enumerate([SEGMENT_HIGH, SEGMENT_MEDIUM, SEGMENT_LOW]):
    seg_filt = loaded_dfs['sub_segment'] == SEG
    row_positions = range(sum(seg_filt))
    sns.lineplot(x=row_positions, y=loaded_dfs['grad_rstd_mean'][seg_filt],
                 ax=axes, color=colors[count1])

axes.legend(TITLES, fontsize=12)

axes.set_title('Gradient Rolling STD Means Comparison', size=18)
axes.set_xlabel('Time (one subsegment, 1000 rows)', size=14)
axes.set_ylabel('Gradient Rolling STD Means', size=14)

# 7. Spectrograms of sensor data

As I wasn't really sure the best way to generate the spectrogram, I've borrowed some code from here:
https://www.kaggle.com/michael422/spectrogram-convolution/notebook

In [None]:
from scipy.signal import spectrogram

def return_spectrogram(sig_in, dsamp):
    """Compute a log-power spectrogram of a 1-D signal.

    Parameters
    ----------
    sig_in : array-like
        The signal samples.
    dsamp : int
        Divisor applied to the nominal 4,000,000 sample rate to get ``fs``.
        NOTE(review): the 4 MHz figure is inherited from the borrowed code; the
        notebook describes segments of roughly 60000 rows per 10 minutes - confirm
        the intended rate. It only affects the returned frequency/time axes and
        the density scaling, not the shape of the result.

    Returns
    -------
    (f, t, log_Sxx)
        Frequencies, segment times and ``log(Sxx + 1e-11)`` as returned by
        ``scipy.signal.spectrogram`` with a 256-sample triangular window,
        64-sample overlap, linear detrending and density scaling.
    """
    segment_len = 256  # scipy's default nperseg
    spec_kwargs = dict(
        nperseg=segment_len,
        noverlap=segment_len // 4,   # scipy's default would be nperseg // 8
        fs=4000000 // dsamp,
        window='triang',
        scaling='density',           # alternative: 'spectrum'
        detrend='linear',            # alternatives: 'constant', False
    )
    f, t, Sxx = spectrogram(sig_in, **spec_kwargs)
    # Small offset keeps log() finite where the power is exactly zero.
    return f, t, np.log(Sxx + 1e-11)

def plot_segment(segment):
    """Plot signal (left) and spectrogram (right) of every sensor for one loaded segment.

    ``segment`` is matched against the 'segment' column of ``loaded_dfs``.
    The printed time to eruption is the segment's stored 0-1 quantile multiplied
    by the largest raw target in ``train``.
    NOTE(review): that product is only an approximation of the real target -
    look it up in ``train`` if an exact value is needed.
    """
    in_segment = loaded_dfs['segment']==segment
    n_sensors = len(SENSOR_COLS)

    fig, axes = plt.subplots(ncols=2, nrows=n_sensors, figsize=(20, 2.5*n_sensors))

    time_to_eruption = loaded_dfs['time_to_eruption'][in_segment].mean() * train['time_to_eruption'].max()
    print('Spectrograms at Time to Eruption: ', time_to_eruption)

    for row, s in enumerate(SENSOR_COLS):
        sensor_data = loaded_dfs[s][in_segment].copy()
        signal_ax, spec_ax = axes[row, 0], axes[row, 1]

        # Only the log-power matrix (third return value) is drawn.
        log_power = return_spectrogram(sensor_data.values, 100)[2]
        spec_ax.imshow(log_power, cmap='seismic')
        spec_ax.set_title(s + ' Spectrogram', size=16)

        signal_ax.plot(sensor_data, color='Red')
        signal_ax.set_title(s + ' Signal', size=16)

    plt.tight_layout()

In [None]:
# First loaded segment (lowest time-to-eruption quantile in the sample).
plot_segment(0)

In [None]:
# Eleventh loaded segment (position 10 in select_quantiles).
plot_segment(10)

In [None]:
# Loaded segment at position 40 in select_quantiles (further from eruption).
plot_segment(40)

# 8. Check out some possible features
As it's going to be more challenging to load all train data (millions of rows), let's first see which statistics from each 10-minute section of data are most useful.

In [None]:
#credit - stack overflow

def percentile(n):
    """Build a named percentile aggregator for use in ``groupby(...).agg``.

    Parameters
    ----------
    n : float
        Either a fraction strictly between 0 and 1 (``0.25`` means the 25th
        percentile) or a value on the usual 0-100 percentile scale.

    Returns
    -------
    callable
        ``percentile_(x)`` returning the requested percentile of ``x``. Its
        ``__name__`` is ``'percentile_<n>'`` so each aggregated column gets a
        distinct label.
    """
    # np.percentile expects q in [0, 100]. This notebook calls the helper with
    # fractions (0.1, 0.25, 0.75, 0.9); passed through unchanged they would
    # select the 0.1th ... 0.9th percentiles, i.e. values right next to the
    # minimum, so fractions are rescaled here.
    q = n * 100 if 0 < n < 1 else n

    def percentile_(x):
        return np.percentile(x, q)

    # The label keeps the caller's original value so column names are unchanged.
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_

In [None]:
gc.collect()

# One row per loaded segment: nine summary statistics for every TOP_FTS column.
segment_summaries = (
    loaded_dfs
    .groupby(['segment'])[TOP_FTS]
    .agg(['max', 'skew', 'std', 'min', 'mean',
          percentile(0.1), percentile(0.25),
          percentile(0.75), percentile(0.9)])
)

# Collapse the (feature, statistic) column pairs into single string labels.
segment_summaries.columns = [feature + stat for feature, stat in segment_summaries.columns]

# Correlation matrix with rows ordered by correlation with the mean target.
segment_summaries_corr = segment_summaries.corr().sort_values('time_to_eruptionmean')

# Candidate features: summary columns whose name contains none of drop_fts.
drop_fts = ['eruption', 'segment']
FTS = list(segment_summaries_corr.columns)
for d in drop_fts:
    FTS = [name for name in FTS if d not in name]

gc.collect()

Let's look at some of the correlations with our target, time to eruption. We don't have all that much train data loaded for the exploration, so these correlations probably depend heavily on the specific sections of data we happen to be looking at. But hopefully they will give us some ideas.

In [None]:
# Rows to display: candidate features whose absolute correlation with the
# mean target exceeds 0.3.
filt = (
    segment_summaries_corr['time_to_eruptionmean'].abs().gt(0.3)
    & segment_summaries_corr.index.isin(FTS)
)

fig, axes = plt.subplots(figsize=(20, 20))
sns.heatmap(
    segment_summaries_corr.loc[filt],
    ax=axes,
    cmap='seismic',
    vmin=-1,
    vmax=1,
    annot=False,
    cbar=False,
)

gc.collect()

axes.set_title('Segment Level Correlation with Time to Eruption (all features with >30% correlation)', size=20)

Trying to reduce our feature list a bit further for next step to reduce calculation time

Based on this set of data, it looks like quite a lot of the most positively correlated features are rolling mins and gradients, and a lot of the negatively correlated (closer to eruption) features are percentiles. This is a small subset of the data though, so we'll have to explore further.


Let's try looking at subsegments of the data. This could also help us think about which time frames / windows are useful to apply.
To do this, I'm going to try to group by varying 'window' lengths, and look at some average / max correlations of the groupby summary data. I'm just trying some large window sizes, because much smaller sizes take too long (maybe I can think of ways to speed up...). Based on this analysis, with this quite simple approach, it doesn't seem to help to group summary stats into a window of 1000 rather than 10000 or 30000. Doubtless more sophisticated approaches looking at specific events in the sensor data would do much better on shorter batches.

Note: again, we have to keep in mind that this is far from a perfect analysis, as it uses only a limited set of train data.

In [None]:
windows_test = [1000, 5000, 10000, 30000]
# Explicit float dtype: an empty Series created without one warns on older
# pandas and falls back to object dtype on newer releases.
windows_test_results = pd.Series(index=windows_test, dtype=float)
fig, axes = plt.subplots(figsize=(20, 6))

for wt in windows_test:
    print('checking window size', wt)
    # Consecutive runs of `wt` rows (by row index) form one sub-segment.
    loaded_dfs['sub_segment'] = loaded_dfs.index // wt

    segment_summaries = loaded_dfs.groupby(['sub_segment'])[TOP_FTS].agg(['mean','max','skew','std','min',
                                    percentile(0.1),percentile(0.25),
                                        percentile(0.75),percentile(0.9)])

    segment_summaries.columns = [a+b for a,b in segment_summaries.columns]
    # Undefined correlations (e.g. constant columns) are treated as 0.
    segment_summaries_corr = (
        segment_summaries.corr()
        .fillna(value=0)
        .sort_values('time_to_eruptionmean')
    )

    # Computed once and reused for the printout, the stored result and the plot.
    abs_target_corr = np.abs(segment_summaries_corr.loc[FTS, 'time_to_eruptionmean'])
    mean_abs_corr = abs_target_corr.mean()

    print('Mean Correlation', mean_abs_corr)
    windows_test_results[wt] = mean_abs_corr

    # Label each curve so the legend cannot drift out of step with the lines.
    sns.kdeplot(abs_target_corr, ax=axes, label=str(wt))

axes.legend(fontsize=16)
axes.set_title('Distribution of feature correlations by Window size (note: negative correlations plotted as absolutes)', size=18)


axes.set_xlabel('Feature Correlation with proximity of eruption', size=12)
axes.set_ylabel('Density of Features', size=12)
sns.despine()

# 9. Analyse Gradient feature(s)
Note: the sections of the data are roughly ordered with the closest to eruption on the left, but the data is not consecutive, so this is indicative rather than a thorough analysis.

In [None]:
# Sub-segments of 100 consecutive rows (by row index).
loaded_dfs['sub_segment'] = loaded_dfs.index // 100

# Select the column with a list: tuple keys on a groupby selection are deprecated.
segment_summaries = loaded_dfs.groupby(['sub_segment'])[['grad_rstd_min']].agg(['mean','max','skew','std','min',
                                    percentile(0.1),percentile(0.25),
                                        percentile(0.75),percentile(0.9)])

# Only one feature is summarised, so keep just the statistic name per column.
segment_summaries.columns = [b for a,b in segment_summaries.columns]

# One panel per summary statistic.
fig,axes=plt.subplots(nrows=segment_summaries.shape[1],
                     figsize=(20,segment_summaries.shape[1]*7))

for count,c in enumerate(segment_summaries.columns):
    axes[count].plot(segment_summaries.loc[:,c].values, color='Red')
    axes[count].set_title('Grad. Rolling Std ' + c+' Left Side = Closer To Eruption', size=18)

We have some quite high peaks on the far right (far from next eruption).
I don't have any domain knowledge, but could these be tremors shortly after an eruption? Googling the subject suggests that aftershocks are possible. This could be a source of confusion for our modelling.
More in line with what might be expected, there are some increasingly large peaks on the left (close to an eruption)

Let's see if it's possible to cluster some of the raw data based on some top features. This does seem to reveal some smaller clusters closer to eruption - probably some of the 'events' visible in the graphs above.

In [None]:
# Columns of loaded_dfs used as clustering inputs.
CLUSTER_FTS = [
    'min',
    'sensor_2_grad_rmin',
    'sensor_5_grad_rmin',
    'sensor_3_grad_rmin',
    'grad_rstd_min',
    'rstd_min',
    'rstd_mean',
    'min_abs',
]
gc.collect()

StSc = StandardScaler()
km = KMeans(random_state=42, n_clusters=8)
# Standardise the inputs, then fit the 8-cluster model on the scaled rows.
km.fit(StSc.fit_transform(loaded_dfs[CLUSTER_FTS]))
gc.collect()

# Cluster assignment of every row, stored as strings.
loaded_dfs['km_labels'] = km.labels_.astype(str)

# Mean target and number of rows per cluster.
loaded_dfs.groupby(['km_labels'])['time_to_eruption'].agg(['mean', 'count'])

In [None]:
fig, axes = plt.subplots(figsize=(12, 8))

# Row counts per (time_to_eruption, cluster): one row per distinct
# time_to_eruption value, one column per cluster label, 0 where absent.
km_summ = (
    loaded_dfs
    .groupby(['km_labels', 'time_to_eruption'])['segment']
    .count()
    .unstack('km_labels')
    .fillna(value=0)
)

# Cluster labels ordered by their mean time_to_eruption (ascending).
order = (
    loaded_dfs
    .groupby(['km_labels'])['time_to_eruption']
    .agg(['mean', 'count'])
    .sort_values('mean')
    .index
)

# Round the row labels to two decimals.
km_summ.index = np.round(km_summ.index, 2)
sns.heatmap(km_summ[order], ax=axes, cmap='seismic', vmin=0, annot=False, cbar=False)

axes.set_title('Clustered & Sorted by mean label proximity to eruption', size=18)

axes.set_xlabel('KMeans Label', size=14)
axes.set_ylabel('Proximity to Eruption (lower = closer)', size=14)

Let's look at the range of sensor readings for each of these 'clusters' - we'd probably guess that the clusters close to eruption will show a wider spread
Darker lines indicate closer to eruption

In [None]:
fig,axes=plt.subplots(nrows=len(SENSOR_COLS),
                     figsize=(15,5*len(SENSOR_COLS)),
                     sharex=True)

# Mean time_to_eruption of the rows in each cluster.
label_prox = loaded_dfs.groupby(['km_labels'])['time_to_eruption'].mean()
# Draw clusters from lowest to highest mean time_to_eruption so the legend
# reads in proximity order.
labels_by_prox = label_prox.sort_values().index

for count,s in enumerate(SENSOR_COLS):

    for ksl in labels_by_prox:
        # Each curve carries its own label. Previously the legend labels came
        # from a separately sorted list while curves were drawn in unique()
        # order, so labels could be attached to the wrong curve.
        # assumes time_to_eruption lies in [0, 1] and color_grades has at least
        # 5 entries - TODO confirm
        sns.kdeplot(loaded_dfs[s][loaded_dfs['km_labels']==ksl].values,ax=axes[count],
                   color=color_grades[int(0.5+label_prox[ksl]*4)], alpha=1,
                   label=ksl)
    axes[count].set_xlim(-5000,5000)
    axes[count].set_title(s + ' Distribution of Readings by Cluster Label - Red = Closer To Eruption', size=16)
    axes[count].legend()

gc.collect()

Some further analysis of sub-segments to see what correlates well with approaching eruptions

In [None]:
# Mean of every numeric column within each sub-segment. Non-numeric columns
# (the string-valued 'km_labels') are excluded explicitly: averaging them
# raises on recent pandas and was only silently dropped on older versions.
segment_stats = loaded_dfs.groupby(['sub_segment'])[
    loaded_dfs.select_dtypes(include='number').columns
].mean()

correlations=pd.Series(index=segment_stats.columns,
                      data=0.0)

# Pearson correlation of each averaged column with the averaged target.
for i in correlations.index:
    correlations[i] = np.corrcoef(segment_stats['time_to_eruption'],
                                 segment_stats[i])[0,1]

# Ten most positively correlated columns.
correlations.sort_values(ascending=False).head(10)

In [None]:
# Last ten entries of the descending sort, i.e. the lowest correlations
correlations.sort_values(ascending=False).tail(10)

# 10. Train vs Test Sensor Ranges
The predictions so far have shown a different shape for train and test. Let's load some randomly sampled data

In [None]:
TEST_PATH = V_PATH + 'test/'
# Reproducible sample of 100 row labels from sample_submission.
test_examples=pd.Series(sample_submission.index).sample(n=100,random_state=42).values
print('Examples list ',test_examples)

# Collect the per-file frames and concatenate once: calling pd.concat inside
# the loop re-copies everything loaded so far on every iteration.
test_frames = []

for count,q in enumerate(test_examples):

    s_ID = sample_submission.loc[q,'segment_id']

    temp_df = pd.read_csv(TEST_PATH+str(s_ID)+'.csv', dtype=dtypes_dict)

    # Fill gaps forward, then backward; columns with no readings at all end up 0.
    # (.ffill()/.bfill() replace the deprecated fillna(method=...) spelling.)
    temp_df = temp_df.ffill().bfill().fillna(value=0)

    # This column holds the sampled sample_submission row label.
    temp_df['time_to_eruption'] = q
    temp_df['segment'] = count
    test_frames.append(temp_df)

loaded_test_dfs = pd.concat(test_frames, ignore_index=True)
# Drop the per-file copies so only the combined frame stays in memory.
del test_frames
gc.collect()

print(' Total DF Size ',loaded_test_dfs.shape)
loaded_test_dfs.head(10)

In [None]:
# Reproducible sample of 100 row labels from train.
train_examples=pd.Series(train.index).sample(n=100,random_state=42).values
# Print the train sample (the original printed test_examples by mistake).
print('Examples list',train_examples)

# Collect the per-file frames and concatenate once: calling pd.concat inside
# the loop re-copies everything loaded so far on every iteration.
train_frames = []

for count,q in enumerate(train_examples):

    s_ID = train.loc[q,'segment_id']

    temp_df = pd.read_csv(TRAIN_PATH+str(s_ID)+'.csv', dtype=dtypes_dict)

    # Fill gaps forward, then backward; columns with no readings at all end up 0.
    # (.ffill()/.bfill() replace the deprecated fillna(method=...) spelling.)
    temp_df = temp_df.ffill().bfill().fillna(value=0)

    # NOTE(review): q is the sampled row label of `train`, not the value of
    # train['time_to_eruption'] - confirm this is intended.
    temp_df['time_to_eruption'] = q
    temp_df['segment'] = count
    train_frames.append(temp_df)

loaded_train_dfs = pd.concat(train_frames, ignore_index=True)
# Drop the per-file copies so only the combined frame stays in memory.
del train_frames
gc.collect()

print(' Total DF Size ',loaded_train_dfs.shape)
loaded_train_dfs.head(10)

Examine some sample distributions of train vs test sensor data
Hiding readings of 0 or readings which are very extreme, to better view the central distributions

In [None]:
nc=2
# Enough rows for every sensor (ceiling division) instead of a hard-coded 10;
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
n_rows = -(-len(SENSOR_COLS) // nc)
fig,axes=plt.subplots(nrows=n_rows,ncols=nc,figsize=(20,18),squeeze=False)

for count,S in enumerate(SENSOR_COLS):
    ax = axes[count//nc,count%nc]
    train_vals = loaded_train_dfs[S]
    test_vals = loaded_test_dfs[S]

    # Exact zeros and readings with magnitude >= 6000 are left out so the
    # central part of each distribution is visible.
    sns.kdeplot(train_vals[(train_vals!=0) & (np.abs(train_vals)<6000)],color='Green',
               ax=ax)

    sns.kdeplot(test_vals[(test_vals!=0) & (np.abs(test_vals)<6000)],color='Red',
               ax=ax)

    ax.set_xlim(-5000,5000)
    ax.legend(['Sample Train data', 'Sample Test data'], fontsize=12)
    ax.set_title(S, size=18)
    ax.set_xlabel('Sensor Reading', size=12)

plt.tight_layout()

Within these randomly selected examples, it looks like the distributions are pretty similar, though the test data does look more clustered around the centre for 8 of the 10 sensors. This could imply that the average test time to the next eruption is a bit higher, if it is true of the whole Test data set.
The NAs were filtered out though, and the analysis further up highlighted that some sensors could have a lot more missing values in Test data.