# EDA for Volcanic Eruption Competition

### Goals

* For competition https://www.kaggle.com/c/predict-volcanic-eruptions-ingv-oe/data

### Comments

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path
import sqlite3
from tqdm.notebook import tqdm
tqdm.pandas()
import missingno as msno
from multiprocessing import Pool
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("talk", font_scale=1.4)
sns.set_style('whitegrid')



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!apt-get install tree

# Step I: Business Goal

* To enable early warning of volcanic eruptions, the goal is to predict when a volcano's next eruption will occur.
* You'll analyze a large geophysical dataset collected by sensors deployed on active volcanoes
* Identify signatures in seismic waveforms that characterize the development of an eruption. 

* Metric: mean absolute error (MAE) of the predicted time until eruption
* Final submission deadline: December 30, 2020

* Hypotheses
    * Sensor signatures indicate the development towards an eruption
    * The sensor signatures for eruptions are similar across different volcanoes



# Step II: Data Extraction

* The data consists of readings from several seismic sensors around a volcano; the challenge is to estimate how long it will be until the next eruption.
* The data represent a classic signal processing setup that has resisted traditional methods.
* Each file contains 10 minutes of logs from 10 different sensors arrayed around a volcano.


In [None]:
! tree /kaggle/input/predict-volcanic-eruptions-ingv-oe -L 1

In [None]:
erruptions = pd.read_csv("/kaggle/input/predict-volcanic-eruptions-ingv-oe/train.csv")

In [None]:
erruptions.head()

In [None]:
# Walk the whole input tree once and bucket every file by the split folder
# ('train/' or 'test/') that appears in its full path.
sensors_path_train, sensors_path_test = [], []

for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        as_path = Path(full_path)
        if 'train/' in full_path:
            sensors_path_train.append(as_path)
        if 'test/' in full_path:
            sensors_path_test.append(as_path)

In [None]:
sensors_train = pd.Series(sensors_path_train).to_frame('path')

sensors_test = pd.Series(sensors_path_test).to_frame('path')

In [None]:
sensors_train['segment_id'] = sensors_train['path'].apply(lambda path: int(path.stem))
sensors_test['segment_id'] = sensors_test['path'].apply(lambda path: int(path.stem))

In [None]:
sensors_train.head()

In [None]:
sensors_train_erupt = pd.merge(sensors_train, erruptions, on ='segment_id', how='left')

In [None]:
sensors_train_erupt.head()

In [None]:
sensors_train.shape

Read one 10-min file/sensor time series for exploration 

In [None]:
sensor_sample = pd.read_csv(sensors_train['path'].iloc[0])
sensor_sample.shape

In [None]:
f'Total number of sensor rows: {sensors_train.shape[0] * sensor_sample.shape[0]} x 10 sensor data columns'

Dataset too large to load into the Kaggle Machine memory of 16GB (cpu) at once.

In [None]:
storage_sql = False  # flip to True to ingest sensor readings into SQLite

if storage_sql:

    create_db = False  # flip to True on the first run to create the table
    if create_db:
        conn = sqlite3.connect('train2.db')
        print("Opened database successfully")

        # NOTE(review): every sensor column is declared NOT NULL, so cycles with
        # missing readings will presumably be rejected on insert — confirm before
        # ingesting more than the first cycle.
        conn.execute('''CREATE TABLE sensors
                 (SERIE         INT    NOT NULL,
                 S1           REAL    NOT NULL,
                 S2           REAL    NOT NULL,
                 S3           REAL    NOT NULL,
                 S4           REAL    NOT NULL,
                 S5           REAL    NOT NULL,
                 S6           REAL    NOT NULL,
                 S7          REAL    NOT NULL,
                 S8           REAL    NOT NULL,
                 S9           REAL    NOT NULL,
                 S10           REAL    NOT NULL,
                 ERRUPT          REAL    NOT NULL);''')
        print("Table created successfully")
        conn.close()

    def write_to_sql(bunch):
        """Bulk-insert rows into the ``sensors`` table.

        ``bunch`` is a list of 12-tuples laid out as
        (serie, S1, ..., S10, ERRUPT). An empty list is a no-op.
        Uses the cursor ``cur`` opened further down in this cell.
        """
        if not bunch:
            return
        assert type(bunch[0]) == tuple
        cur.executemany("INSERT INTO sensors VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", bunch)

    def ingest_data(row):
        """Read the cycle file referenced by ``row`` and insert all its readings.

        ``row`` must provide 'path', 'segment_id' and 'time_to_eruption'.
        """
        sensor_series = pd.read_csv(row['path'])

        sensor_series['time_to_eruption'] = row['time_to_eruption']
        sensor_series['segment_id'] = str(row['segment_id'])

        # Move segment_id (appended last) to the front so the column order
        # matches the table: id, the sensor columns, time_to_eruption.
        sensor_series = sensor_series[['segment_id'] + sensor_series.columns.to_list()[:-1]]

        # itertuples yields plain tuples without building a Series per row.
        to_ingest = list(sensor_series.itertuples(index=False, name=None))
        write_to_sql(to_ingest)

    conn = sqlite3.connect('train2.db')
    print("Opened database successfully")
    cur = conn.cursor()  # the cursor write_to_sql inserts through
    try:
        _ = sensors_train_erupt.head(1).progress_apply(ingest_data, axis=1)
        conn.commit()  # without this the inserted rows are discarded on close
    finally:
        conn.close()

# Step III: Meet and Greet the Data

In the following I analyze the high-level sensor data as well as 1 sample cycle file.


* Missing values identified via pandas: 
    * the sensor sample cycle file has sensor_9 with about 1/6 of missing data.
    * All train cycle series have a time-to-eruption, no missing values

* According to sample file sensor values are varying, can be positive and negative.

* Data types in proper format, tested sensor cycle sample
    * segment_id: discrete numeric identifier
    * time to eruption: discrete numeric. Unit is not defined.
    * sensor data: 
        * documentation suggests "you may find that you still need to load the data as float32 due to the presence of some nulls"
 
* Assumptions:
    *  The readings have been normalized within each segment, in part to ensure that the readings fall within the range of int16 values.
    


In [None]:
sensors_train_erupt.head()

In [None]:
sensors_train_erupt.sample(10, random_state=42)

In [None]:
sensor_sample.shape

In [None]:
sensor_sample.head()

In [None]:
sensor_sample.sample(10, random_state=42)

In [None]:
sensors_train_erupt.info()

In [None]:
sensor_sample.info()

Missing values? Missingness is not randomly distributed but occurs in certain time intervals.

In [None]:
msno.matrix(sensor_sample)

### Working with all sensor cycle files

* Check for number of steps per cycle: Is it always 60001 as in sample above? - yes always.
* Always same data types for cycle files? - yes.
* How many missing values are there in other cycle files? Missingness is common: 10% have missing values, and 5% of sensors have no sensor values at all. It is best to use an algorithm that can deal with missing values.

In [None]:
def extract_aggregates(sensor_cycle):
    """Summarise the structure of one sensor-cycle DataFrame.

    Returns a three-item list: the number of rows, the dtype name of every
    column as a list of strings, and a NumPy array holding the number of
    missing values in every column.
    """
    n_steps = len(sensor_cycle.index)
    dtype_names = list(map(str, sensor_cycle.dtypes.values))
    nan_counts = sensor_cycle.isna().sum().values
    return [n_steps, dtype_names, nan_counts]

def process_cycle(path):
    """Load the cycle CSV at ``path`` and return what ``extract_aggregates`` reports for it."""
    return extract_aggregates(pd.read_csv(path))

In [None]:
paths_train = [str(path) for path in sensors_train_erupt['path'].values]

In [None]:
with Pool(4) as p:
  aggs = list(tqdm(p.imap(process_cycle, paths_train), total=len(paths_train)))

In [None]:
len(aggs)

In [None]:
aggs_stats = pd.DataFrame(aggs, columns=['num_rows', 'dtypes', 'missingness'])

In [None]:
aggs_stats.head()

In [None]:
aggs_stats = sensors_train_erupt.join(aggs_stats)

In [None]:
aggs_stats.head()

In [None]:
aggs_stats['num_rows'].unique()

In [None]:
aggs_stats['dtypes'].apply(lambda dtypes_cycle:  dtypes_cycle== aggs_stats['dtypes'].iloc[0]).all()

### Missing Values

In [None]:
missing_cycles = pd.DataFrame(np.array(aggs_stats['missingness'].to_list()), columns=sensor_sample.columns)

In [None]:
missing_cycles.head()

If a sensor has missing values, are all of its values missing for the whole cycle?

In [None]:
# A (cycle, sensor) pair counts as affected as soon as one reading is missing.
sensors_missing_values = (missing_cycles > 0).sum().sum()
affected_pct = round(100 * sensors_missing_values / missing_cycles.size, 2)
print(f'{affected_pct}% of sensors have missing data')

In [None]:
# Among (cycle, sensor) pairs with any missing reading, how many are missing
# every reading? The row count comes from the loaded sample instead of a
# hard-coded literal; the check above confirmed all cycles have the same length.
rows_per_cycle = sensor_sample.shape[0]
sensors_missing_values = (missing_cycles > 0).sum().sum()
sensors_all_missing_values = (missing_cycles == rows_per_cycle).sum().sum()
print(f'{round(100*sensors_all_missing_values/sensors_missing_values, 2)}% of sensors with missing data have not sensor values at all.')

Missingness analysis:
* Different cycles have different sensor values missing.
* We expect that the test set will also contain sensors with no values at all. These are more difficult to impute, if imputation becomes necessary.

In [None]:
fig, ax = plt.subplots(figsize=(20,8))
sns.heatmap(missing_cycles, ax=ax)
ax.set_ylabel('sensor cycles')
ax.set_title('Number of missing values accross sensor cycles')

In [None]:
ax = missing_cycles.applymap(lambda val: val>0).sum().div(missing_cycles.shape[0]).plot(kind='bar')
ax.set_title('fraction cycles with missing values')

> Some sensors seem to be more prone to missing values. In particular, `sensor_9` has missing values in 30% of all cycles.

In [None]:
missing_cycles.applymap(lambda val: val>0).sum(axis=1).value_counts()

In [None]:
aggs_stats[missing_cycles.applymap(lambda val: val>0).sum(axis=1)==10]

Some of the cycles below have 4 or more sensors with significant missing values. Missingness is not a big problem for the other cycles.

In [None]:
aggs_stats[missing_cycles.applymap(lambda val: val>0).sum(axis=1)>3][['segment_id', 'missingness']]

In [None]:
del aggs_stats

In [None]:
del aggs

# Step IV: Univariate Analysis

* Sensor cycle sample insights
    * no obvious correlations, but there appear to be non-linear effects
    * 

In [None]:
sensors_train_erupt.head()

In [None]:
sensors_train_erupt[['time_to_eruption']].describe().T

In [None]:
with sns.axes_style("ticks"):
    fig, ax = plt.subplots(figsize=(16,4))
    sns.distplot(np.log10(sensors_train_erupt['time_to_eruption']), ax=ax, kde=True, hist=True)
    #ax.set_xscale('log')
    ax.set_xlabel('log time to eruption')
    plt.yticks(np.arange(0,2,0.25))
    plt.grid()

In [None]:
fig, ax = plt.subplots(figsize=(16,4))
sns.boxplot(np.log10(sensors_train_erupt['time_to_eruption']), ax=ax)
ax.set_xlabel('log time to eruption')

Using the IQR rule, the outliers are the cycles with short times until eruption.

In [None]:
sensors_train_erupt[['path', 'segment_id']].astype('O').describe()

### Investigating Sensor Cycle Sample

* All data appears normalized to median equal to zero (reference normalization)
* 

In [None]:
sensor_sample.describe().T

Is there any correlation between sensor data?

In [None]:
sns.pairplot(sensor_sample)

> No linear correlation between sensor values, but there appear to be some non-linear correlations (the shapes are peculiar).

In [None]:
with sns.plotting_context("talk", font_scale=1):
    fig, ax = plt.subplots(figsize=(16,8))
    sns.heatmap(sensor_sample.corr(), annot=True, fmt=".2", ax=ax)

In [None]:
sensor_sample.head()

In [None]:
to_plot = sensor_sample.unstack() 
to_plot = to_plot.droplevel(1).reset_index()

g = sns.FacetGrid(to_plot, col='index', col_wrap=3, sharex=False, sharey=False, aspect=1.2, height=6)
g.map(sns.distplot, 0)

### Time Series Analysis

Step size: 60 s × 10 min × 100 samples/s ≈ 60,000 samples per 10-minute file, i.e. 10 ms between consecutive sensor readings.

In [None]:
sensor_sample.shape

In [None]:
with sns.plotting_context("talk", font_scale=1):
    fig, axes = plt.subplots(len(sensor_sample.columns), figsize=(30, 20), sharex=True)
    axes = axes.flatten()
    for i, col in enumerate(sensor_sample.columns):
        sns.lineplot(data=sensor_sample, x=sensor_sample.index, y=col, ax=axes[i])
        axes[i].set_ylabel(col); axes[i].set_xlabel('')

> Normalization effect visible.
> * There appear to be real signals (e.g. sensor_5 and sensor_10 have larger variances during certain time periods) rather than only simple white noise. We measure something!
> * The magnitude of change also differs between sensors, with large differences in variance, e.g. sensor_1 vs sensor_2.
> * No seasonal effects are expected, so no decomposition is done.
> * Given the nature of the problem, no investigation of autoregressive components or similar is needed.


In [None]:
sensor_sample.isna().sum()

In [None]:
rolmean = sensor_sample.rolling(window=100).mean()# 1s averages

In [None]:
# Overlay the raw sensor_1 trace with its 100-sample rolling mean. Each line
# gets its own label, and the legend is drawn so the labels are visible.
plt.figure(figsize=(16,8))
plt.plot(sensor_sample['sensor_1'], color='blue', label='Actual Series')
plt.plot(rolmean['sensor_1'], color='red', label='Rolling mean (window=100)')
plt.legend()

In [None]:
# ADF test
def adf_test(series):
    """Run the Augmented Dickey-Fuller test on ``series`` and print a summary.

    Missing values are dropped first: sensor columns in this dataset can
    contain NaNs, and ``adfuller`` cannot handle them. The printed Series
    holds the test statistic, p-value, lags used, number of observations and
    one critical value per significance level. Nothing is returned.
    """
    from statsmodels.tsa.stattools import adfuller
    clean = pd.Series(series).dropna()
    test = adfuller(clean)
    output = pd.Series(test[0:4], index=['Test Statistic','p-value','Lags Used','No. of Observations'])
    # test[4] maps each significance level to its critical value
    for key, value in test[4].items():
        output['Critical Value (%s)'%key] = value
    print(output)
adf_test(sensor_sample['sensor_1'])

The ADF test indicates stationarity, whereas the 1-s rolling mean plotted above did not look stationary.

In [None]:
pd.date_range("00:00", "01:00", freq="10ms").time.shape

### Extraction Sensor Features and comparing with predictor

* Using quantile binning to extract features from time series/summarise the time series for each cycle. Maybe choose other quantiles in the future.
* Investigate the pearson correlation between sensor values for each cycle.

#### 1. Extraction

In [None]:
def extract_stats(sensor_cycle, cycle_ID, abs_threshold=0.4):
    """Compute summary statistics and a correlation count for one cycle.

    Parameters
    ----------
    sensor_cycle : DataFrame with one column per sensor.
    cycle_ID : identifier stored in the 'segment_id' column of the result.
    abs_threshold : two sensors count as linearly correlated when the absolute
        value of their pairwise correlation exceeds this value (default 0.4).

    Returns
    -------
    list ``[description, corr_signal]`` where ``description`` is the transposed
    ``describe()`` table (one row per sensor) plus a 'segment_id' column, and
    ``corr_signal`` is the number of distinct sensor pairs above the threshold.
    """
    # extract sensor stats for the cycle
    description = sensor_cycle.describe().T
    description['segment_id'] = cycle_ID

    # Undefined correlations (e.g. all-NaN or constant sensors) count as 0.
    # Take an explicit writable copy because fill_diagonal mutates in place.
    corr = sensor_cycle.corr().fillna(0).to_numpy(copy=True)
    np.fill_diagonal(corr, 0)  # ignore each sensor's correlation with itself
    # The matrix is symmetric, so every pair appears twice.
    corr_signal = (np.abs(corr) > abs_threshold).sum() / 2.

    return [description, corr_signal]

def process_cycle_stats(path):
    """Load the cycle CSV at ``path`` and return ``extract_stats`` for it.

    ``path`` may be a ``pathlib.Path`` or a plain string; the file stem
    (file name without extension) is used as the cycle identifier.
    """
    sensor_cycle = pd.read_csv(path)

    # Wrapping in Path leaves Path inputs unchanged and lets str paths work too.
    return extract_stats(sensor_cycle, Path(path).stem)

In [None]:
with Pool(4) as p:
  aggs = list(tqdm(p.imap(process_cycle_stats, sensors_train_erupt['path']), total=len(paths_train)))

In [None]:
summary_stats_cycles = pd.concat([val[0] for val in aggs])
summary_stats_cycles.shape

In [None]:
summary_stats_cycles['segment_id'] = summary_stats_cycles['segment_id'].astype(int)

#### 2. Analysis

In [None]:
summary_stats_cycles.head().drop(columns=['segment_id', 'count']).stack().to_frame().T

In [None]:
stats_by_cycle = summary_stats_cycles.groupby('segment_id').apply(lambda cycle: cycle.drop(columns=['segment_id', 'count']).stack().to_frame().T)

cols = [col[0]+'_'+col[1] for col in stats_by_cycle.columns]
stats_by_cycle.columns = cols
stats_by_cycle.head()

Distribution of maximum and minimum values per cycle

In [None]:
sensor_max = [col for col in stats_by_cycle.columns if 'max' in col]
to_plot = stats_by_cycle[sensor_max].stack().reset_index().drop(columns=['level_1', 'segment_id'])

with sns.plotting_context("talk", font_scale=0.7):
    g = sns.FacetGrid(to_plot, col="level_2", col_wrap=3, sharex=False, sharey=False,  aspect=1.5, height=4)
    g.map(sns.distplot, 0)

> Roughly normal in shape, with a long tail towards high values. Outliers are indicated. It could be beneficial to apply a log transform or a general power transform (Box-Cox).

In [None]:
sensor_max = [col for col in stats_by_cycle.columns if 'max' in col]
to_plot = stats_by_cycle[sensor_max].stack().reset_index().drop(columns=['level_1', 'segment_id'])

with sns.plotting_context("talk", font_scale=0.7):
    g = sns.FacetGrid(to_plot, col="level_2", col_wrap=3, sharex=False, sharey=False,  aspect=1.5, height=4)
    g.map(sns.boxplot, 0)

In [None]:
sensor_min = [col for col in stats_by_cycle.columns if 'min' in col]
to_plot = stats_by_cycle[sensor_min].stack().reset_index().drop(columns=['level_1', 'segment_id'])

with sns.plotting_context("talk", font_scale=0.7):
    g = sns.FacetGrid(to_plot, col="level_2", col_wrap=3, sharex=False, sharey=False,  aspect=1.5, height=4)
    g.map(sns.distplot, 0)

> Again, there are clearly strong outliers for some sensor series, with some very large and very small values. These could be badly calibrated or malfunctioning sensors. Should I remove them? These outliers could also be related to the time to eruption, as a closer eruption time may lead to more and stronger signals.

#### Outlier sensors

In [None]:
def outlier_iqr(series, whisker=1.5):
    """Print how many values of ``series`` fall outside the IQR fences.

    The fences are ``q25 - whisker * IQR`` and ``q75 + whisker * IQR``; values
    must lie strictly inside them to be kept. ``whisker`` defaults to the
    conventional 1.5. Nothing is returned.
    """
    q25 = series.quantile(q=0.25)
    q75 = series.quantile(q=0.75)
    iqr = q75 - q25
    low_iqr = q25 - whisker * iqr
    high_iqr = q75 + whisker * iqr
    # Filter the series itself, not a global frame, so the function works for
    # any Series regardless of what else is defined in the notebook.
    to_keep = series[(series > low_iqr) & (series < high_iqr)]
    print(f'drop {series.shape[0]-to_keep.shape[0]} out of {series.shape[0]}')

In [None]:
outlier_iqr(stats_by_cycle['sensor_1_max'])

The 75% quantile might be better suited to identifying vastly different cycles/sensor setups:

In [None]:
outlier_iqr(stats_by_cycle['sensor_1_75%'])

In [None]:
sensor_75 = [col for col in stats_by_cycle.columns if '75%' in col]
to_plot = stats_by_cycle[sensor_75].stack().reset_index().drop(columns=['level_1', 'segment_id'])

with sns.plotting_context("talk", font_scale=0.7):
    g = sns.FacetGrid(to_plot, col="level_2", col_wrap=3, sharex=False, sharey=False,  aspect=1.5, height=4)
    g.map(sns.boxplot, 0)
    

# Step V: Multivariate Analysis

* Test how the extracted features depend on the time to eruption.



In [None]:
sensors_train_erupt_stats = pd.merge(sensors_train_erupt, stats_by_cycle.reset_index(), 
                                     on='segment_id', how='left').drop(columns=['path', 'level_1'])

### 2. Analysis of extracted features

In [None]:
sensors_train_erupt_stats.head(2)

In [None]:
cols_plot = [col for col in sensors_train_erupt_stats.columns if '25%' in col]
cols_plot

In [None]:
sensors_train_erupt_stats.head()

In [None]:
with sns.plotting_context("talk", font_scale=0.9):
    fig, ax = plt.subplots(figsize=(20,4))
    sensors_train_erupt_stats.corr()['time_to_eruption'].plot(kind='bar', ax=ax)
    #ax.xaxis.set_visible(False)
    plt.yticks([-0.5, 0, 0.5])
    ax.set_ylabel('person correlation')
    ax.set_title('correlation of sensor features with time to erruption')
    ax.set_ylim(-0.5, 0.5)

> There are no strong correlations, only weak correlations for some features. The strength of this weak correlation varies between sensors and statistics (e.g. std, 25%, 75%, max). The median is always zero due to normalization, and the mean is also very small.

In [None]:
sensors_train_erupt_stats['time_to_eruption_log'] = np.log10(sensors_train_erupt_stats['time_to_eruption'])

Investigating the maximum values per series. Could investigate more sensor stats.

In [None]:
# One scatter panel per "max" feature against the log time to eruption.
with sns.plotting_context("talk", font_scale=0.5):

    col_max_features = [col for col in sensors_train_erupt_stats.columns if 'max' in col]
    fig, axes = plt.subplots(len(col_max_features), figsize=(20, 14))
    axes = np.atleast_1d(axes).ravel()  # also works when only one feature matches
    for i, col in enumerate(col_max_features):
        sns.scatterplot(data=sensors_train_erupt_stats, x='time_to_eruption_log', y=col, ax=axes[i])
    # Title this figure; the bare `ax` is left over from an earlier cell's figure.
    fig.suptitle('Maximum sensor values for eruption times')

In [None]:
# One scatter panel per "mean" feature against the log time to eruption.
with sns.plotting_context("talk", font_scale=0.5):

    # Variable name kept from the cell above; here it holds the *mean* features.
    col_max_features = [col for col in sensors_train_erupt_stats.columns if 'mean' in col]
    fig, axes = plt.subplots(len(col_max_features), figsize=(20, 14))
    axes = np.atleast_1d(axes).ravel()  # also works when only one feature matches
    for i, col in enumerate(col_max_features):
        sns.scatterplot(data=sensors_train_erupt_stats, x='time_to_eruption_log', y=col, ax=axes[i])
    # Title this figure; the bare `ax` is left over from an earlier cell's figure.
    fig.suptitle('Average sensor values for eruption times')

### One could investigate how the correlation of sensor values relates to the time until eruption.


In [None]:
sensors_train_erupt['corr_sensors'] = [val[1] for val in aggs]

Most cycles show no linear correlation between sensors. Fewer than 2% of cycles do, so this feature is not relevant here.

In [None]:
sensors_train_erupt['corr_sensors'].value_counts(normalize=True).head()

## Extraction of time series for certain times until eruption

In [None]:
sensors_train_erupt.head()

In [None]:
with sns.axes_style("ticks"):
    fig, ax = plt.subplots(figsize=(16,4))
    sns.distplot(np.log10(sensors_train_erupt['time_to_eruption']), ax=ax, kde=True, hist=True)
    #ax.set_xscale('log')
    ax.set_xlabel('log time to eruption')
    plt.yticks(np.arange(0,2,0.25))
    plt.grid()

In [None]:
fig, ax = plt.subplots(figsize=(16,4))
sns.boxplot(np.log10(sensors_train_erupt['time_to_eruption']), ax=ax)
ax.set_xlabel('log time to eruption')

In [None]:
sensors_train_erupt['time_to_eruption_log'] = np.log10(sensors_train_erupt['time_to_eruption'])

Choose quantiles to represent log distribution

In [None]:
pd.qcut(sensors_train_erupt['time_to_eruption_log'], q=[1e-8, 1e-7, 1e-4, 1e-3, 1e-2, 0.1, 1]).cat.categories

In [None]:
pd.qcut(sensors_train_erupt['time_to_eruption_log'], q=[1e-4, 1e-3, 1e-2, 0.1, 1]).value_counts()

In [None]:
sensors_train_erupt['q_time_to_eruption'] = pd.qcut(sensors_train_erupt['time_to_eruption_log'], q=[1e-4, 1e-3, 1e-2, 0.1, 1], 
                                                    labels=[1e-3, 1e-2, 0.1, 1]).astype(float)

In [None]:
sensors_train_erupt.head(2)

In [None]:
sensors_train_erupt['q_time_to_eruption'].value_counts()

In [None]:
sensors_train_erupt.sort_values('time_to_eruption_log').head(1)

In [None]:
# The first qcut edge is the 1e-4 quantile, so segments below it fall outside
# every bin and are left as NaN. Give them their own lowest bin label instead
# of hard-coding a segment id (presumably only the minimum-time segment shown
# by the sort above is affected — confirm with the value_counts above).
below_first_edge = sensors_train_erupt['q_time_to_eruption'].isna()
sensors_train_erupt.loc[below_first_edge, 'q_time_to_eruption'] = 1e-7

Sample cycle for each time_to_eruption bin:

In [None]:
cycle_samples = sensors_train_erupt.groupby('q_time_to_eruption').apply(lambda x: x.sample(1, random_state=42)).drop(
    columns='q_time_to_eruption').reset_index().sort_values('q_time_to_eruption').drop(columns='level_1')
cycle_samples

Get all sensor data for the samples

In [None]:
cycle_samples_sensors = []
for i, row in cycle_samples.iterrows():
    sens_tmp = pd.read_csv(row['path'])
    sens_tmp['segment_id'] = row['segment_id']
    sens_tmp['step'] = sens_tmp.index
    cycle_samples_sensors.append(sens_tmp)

In [None]:
cycle_samples_sensors = pd.merge(pd.concat(cycle_samples_sensors), cycle_samples, on='segment_id', how= 'left').drop(columns='path')

In [None]:
cycle_samples_sensors.head()

In [None]:
cycle_samples_sensors.shape

In [None]:
sensor_cols = [col for col in cycle_samples_sensors.columns if 'sensor_' in col]

In [None]:
sensor_cols_plot = cycle_samples_sensors.melt(id_vars=['q_time_to_eruption', 'time_to_eruption_log', 'step'], 
        var_name="sensors", value_vars=sensor_cols,
        value_name="Value")
sensor_cols_plot.head(2)

In [None]:
sensor_cols_plot['time_to_eruption_log'] = sensor_cols_plot['time_to_eruption_log'].apply(lambda x: round(x, 2))

In [None]:
cycle_samples_sensors.head()

In [None]:


# One panel per sensor; inside each panel one line per sampled cycle, keyed by
# its (rounded) log time to eruption.
colors = ['y', 'r', 'b', 'g', 'k']
with sns.plotting_context("talk", font_scale=0.8):

    fig, axes = plt.subplots(10, 1, figsize=(16,40))
    axes = axes.flatten()
    for j, sensor_type in enumerate(sensor_cols):
        panel = axes[j]
        plot_subset = sensor_cols_plot[sensor_cols_plot['sensors'] == sensor_type]
        for i, (label, df) in enumerate(plot_subset.groupby('time_to_eruption_log')):
            df.reset_index(drop=True).Value.plot(ax=panel, label=label, color=colors[i])
            panel.set_title(sensor_type)
            panel.legend()
            #axes[1].legend()
            #axes[1].set_xlim(-2500, 2500)

> Signals at different times until eruption do NOT differ greatly. One can spot some time ranges with signal patterns attributed to shorter times to eruption, at 3.8 and 4.46, but they are not clearly distinct from patterns at other times.

### Looking at 10 signals per binned time-to-erruption

In [None]:
sensors_train_erupt['q_time_to_eruption'].value_counts()

In [None]:
def draw_sample(df, n=10, random_state=42):
    """Return at most ``n`` rows of ``df``.

    Groups with ``n`` rows or fewer are returned unchanged; larger groups are
    reduced to a reproducible random sample of ``n`` rows.
    """
    if df.shape[0] <= n:
        return df
    return df.sample(n, random_state=random_state)
    
cycle_samples = sensors_train_erupt.groupby('q_time_to_eruption').apply(draw_sample).drop(
    columns='q_time_to_eruption').reset_index().sort_values('q_time_to_eruption').drop(columns='level_1')
cycle_samples

In [None]:
def get_sensor_samples(sensor):
    """Collect the readings of one sensor for every cycle in ``cycle_samples``.

    Each cycle file is read and reduced to the columns ``sensor``, 'step' (the
    row position within the file) and 'segment_id'. The stacked result is
    left-merged with ``cycle_samples`` on 'segment_id' and returned without
    the 'path' column.
    """
    def _load(sample_row):
        readings = pd.read_csv(sample_row['path'])
        readings['segment_id'] = sample_row['segment_id']
        readings['step'] = readings.index
        return readings[[sensor, 'step', 'segment_id']]

    per_cycle = [_load(sample_row) for _, sample_row in cycle_samples.iterrows()]
    stacked = pd.concat(per_cycle)
    merged = pd.merge(stacked, cycle_samples, on='segment_id', how='left')
    return merged.drop(columns='path')

In [None]:
cycle_samples_sensor = get_sensor_samples('sensor_1')
cycle_samples_sensor.shape

In [None]:
cycle_samples_sensor.head()

Looking just at sensor 1.

In [None]:
#colors = ['y', 'r', 'b', 'g', 'k']

with sns.plotting_context("talk", font_scale=0.8):


    fig, axes = plt.subplots(5, 1, figsize=(16,10), sharex=True, sharey=True)
    fig.suptitle("sensor_1")
    
    i=-1
    for label, df in cycle_samples_sensor.groupby('q_time_to_eruption'):
        i+=1
        df.reset_index(drop=True).groupby('segment_id').plot('step','sensor_1',ax=axes[i])
        
        #sns.lineplot(data=df.reset_index(drop=True), x='step', y='sensor_1', hue='segment_id', ax=axes[i])
        
        ax=axes[i].set_title(f"log time erruption: {round(df['time_to_eruption_log'].mean(), 2)}")
        #df.Value.plot(kind="kde", ax=axes[1], label=label, color=colors[i])
        ax=axes[i].legend().set_visible(False)
        #axes[i].legend()
        #axes[1].set_xlim(-2500, 2500)

> There are some cycles shown here where the sensor has extremely large values. This is related to my outlier analysis before. Again I ask: are these simply differently calibrated sensors or malfunctioning sensors?

> One can also see that the red sensor for eruption 6.35 seems to hit a threshold.

In [None]:
# Segments whose sensor_1 upper quartile exceeds 1000 are treated as extreme.
# The per-segment describe() table is computed once and reused for the filter.
sensor_1_stats = cycle_samples_sensor.groupby('segment_id')['sensor_1'].describe()
filter_extreme = sensor_1_stats[sensor_1_stats['75%'] > 1000].index.to_list()
filter_extreme

In [None]:
#colors = ['y', 'r', 'b', 'g', 'k']

with sns.plotting_context("talk", font_scale=0.8):


    fig, axes = plt.subplots(5, 1, figsize=(16,10), sharex=True, sharey=True)
    fig.suptitle("sensor_1")
    
    i=-1
    for label, df in cycle_samples_sensor[~cycle_samples_sensor['segment_id'].isin(filter_extreme)].groupby('q_time_to_eruption'):
        i+=1
        df.reset_index(drop=True).groupby('segment_id').plot('step','sensor_1',ax=axes[i])
        
        #sns.lineplot(data=df.reset_index(drop=True), x='step', y='sensor_1', hue='segment_id', ax=axes[i])
        
        ax=axes[i].set_title(f"log time erruption: {round(df['time_to_eruption_log'].mean(), 2)}")
        #df.Value.plot(kind="kde", ax=axes[1], label=label, color=colors[i])
        ax=axes[i].legend().set_visible(False)
        #axes[i].legend()
        axes[i].set_ylim(-2500, 2500)

# Step VI. Downsample Series

* Downsample series to 1-s intervals, which is a factor of 100 less in data.
* write all downsampled series into one csv file.
* Which kind of aggregation used for series?

In [None]:
rolmean = sensor_sample.rolling(window=100).mean()# 1s averages
plt.plot(sensor_sample['sensor_1'], color='blue', label='Actual Series')
plt.plot(rolmean['sensor_1'], color='red', label='Actual Series')

In [None]:
sensor_sample.shape, rolmean.shape

In [None]:
plt.plot(rolmean['sensor_1'], color='red', label='Actual Series')
plt.plot(rolmean[0:-1:100]['sensor_1'], color='green', label='Actual Series')

> A lot of the high-amplitude signal gets removed when taking a rolling average over 1 s.

In [None]:
# Rolling maximum over 100 samples (1 s), overlaid on the raw sensor_1 trace.
rolmax = sensor_sample.rolling(window=100).max()
plt.plot(sensor_sample['sensor_1'], color='blue', label='Actual Series')
plt.plot(rolmax['sensor_1'], color='red', label='Rolling max (window=100)')
plt.legend()

In [None]:
rolsample = sensor_sample[0:-1:100]
plt.plot(sensor_sample['sensor_1'], color='blue', label='Actual Series')
plt.plot(rolsample['sensor_1'], color='red', label='Actual Series')

> The above corresponds to one measurement every 1 s. Also notice the difference from the rolling mean window, which does not preserve the maximum signal values.

In [None]:
import csv

In [None]:
# DataFrame exposes its column labels as `.columns`; `.cols` raises AttributeError.
sensor_sample.columns

In [None]:
# Scratch check of the csv writer. The context manager closes the file handle,
# which the bare open() call never did.
# NOTE(review): "low_freq_train.cv" looks like a typo for ".csv"; the name is
# kept as-is because the export further down writes a different file.
with open("low_freq_train.cv", "w", newline="") as handle:
    writer = csv.writer(handle)
    writer.writerow([])

In [None]:
sensors_train_erupt.head()

In [None]:
sensor_cols = sensor_sample.columns.to_list()
sensor_cols

In [None]:
# Downsample every training cycle to one row per 100 samples and append all of
# them to a single CSV.
fname_train = "train_low_freq.csv"
for n_written, (i, row) in enumerate(tqdm(sensors_train_erupt.iterrows(), total=sensors_train_erupt.shape[0])):

    # NOTE(review): a 100-sample rolling mean has no value for the first 99
    # rows, so the row kept at step 0 is NaN for every sensor. Consider
    # `.iloc[99::100]`, changed in the train and test loops together.
    sens_tmp = (
        pd.read_csv(row['path'])
        .rolling(window=100).mean()
        .iloc[0:-1:100]
        .assign(step=lambda frame: frame.index)  # original row position
        .reset_index(drop=True)
        .assign(segment_id=row['segment_id'],
                time_to_eruption_log=row['time_to_eruption_log'])
    )

    # Start a fresh file with a header on the first iteration, whatever the
    # index label of the first row is; append without a header afterwards.
    first_chunk = n_written == 0
    sens_tmp.to_csv(fname_train, index=False, mode='w' if first_chunk else 'a', header=first_chunk)

In [None]:
sensors_test.head()

In [None]:
# Downsample every test cycle to one row per 100 samples and append all of
# them to a single CSV.
fname_test = "test_low_freq.csv"
for n_written, (i, row) in enumerate(tqdm(sensors_test.iterrows(), total=sensors_test.shape[0])):

    # NOTE(review): a 100-sample rolling mean has no value for the first 99
    # rows, so the row kept at step 0 is NaN for every sensor. Consider
    # `.iloc[99::100]`, changed in the train and test loops together.
    sens_tmp = (
        pd.read_csv(row['path'])
        .rolling(window=100).mean()
        .iloc[0:-1:100]
        .assign(step=lambda frame: frame.index)  # original row position
        .reset_index(drop=True)
        .assign(segment_id=row['segment_id'])
    )

    # Start a fresh file with a header on the first iteration, whatever the
    # index label of the first row is; append without a header afterwards.
    first_chunk = n_written == 0
    sens_tmp.to_csv(fname_test, index=False, mode='w' if first_chunk else 'a', header=first_chunk)

In [None]:
train_check = pd.read_csv(fname_train)
train_check.shape

In [None]:
train_check['segment_id'].nunique()

In [None]:
test_check = pd.read_csv(fname_test)
test_check.head()

In [None]:
test_check['segment_id'].nunique()