The datasets are saved here: <a href="https://www.kaggle.com/damoonshahhosseini/volcano-pca">Dataset</a>

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

# Directories holding the per-segment CSV files. Later cells build each file
# name as f'{base_path}{segment_id}.csv', so the trailing slash is required.
train_path = '/kaggle/input/predict-volcanic-eruptions-ingv-oe/train/'
test_path = '/kaggle/input/predict-volcanic-eruptions-ingv-oe/test/'

# Tables listing the segments: train.csv for training, and the sample
# submission for the test set (only their 'segment_id' column is used here)
train = pd.read_csv('/kaggle/input/predict-volcanic-eruptions-ingv-oe/train.csv')
sample_submission = pd.read_csv('/kaggle/input/predict-volcanic-eruptions-ingv-oe/sample_submission.csv')

# Segment ids; each id names one CSV file under train_path / test_path
train_ids = train['segment_id']
test_ids = sample_submission['segment_id']

# Data Preprocessing:
- The data is normalized; dimensionality reduction will not be useful since the data is a series of continuous signals.
- Missing values should be imputed with zero, since a missing value just means that no signal was detected.
- Features for each signal:
    - min / max
    - std / mean / mad / skew / Kurtosis
    - quantiles: 1, 5, 10, 20, 25, 40, 50, 65, 75, 80, 85, 90, 95, 99
    - IQR, OMin: Q1 - 1.5 IQR, OMax: Q3 + 1.5 IQR

In [None]:
def extract_features(col):
    """
        Returns a one row DataFrame of summary statistics for
        a single signal.
        
        input:
            col: the column (signal) used for feature
                extraction; a pd.Series.
            
        return:
            pd.DataFrame object with a single row (index 0) and 16
            columns: max, min, std, mean, mad, skew, kurtosis and
            the 1/5/10/25/50/75/90/95/99 percent quantiles.
    """
    mean = col.mean()

    # Extremes, spread and shape of the distribution
    features = {
        'max': col.max(),
        'min': col.min(),
        'std': col.std(),
        'mean': mean,
        # Series.mad() was removed in pandas 2.0, so the mean absolute
        # deviation around the mean is computed explicitly (NaNs skipped,
        # as Series.mad() did by default).
        'mad': (col - mean).abs().mean(),
        'skew': col.skew(),
        'kurtosis': col.kurtosis(),
    }

    # Quantiles: one vectorised call instead of one pass per level
    quantile_levels = (
        ('q-01', 0.01),
        ('q-05', 0.05),
        ('q-10', 0.1),
        ('q-25', 0.25),
        ('q-50', 0.5),
        ('q-75', 0.75),
        ('q-90', 0.90),
        ('q-95', 0.95),
        ('q-99', 0.99),
    )
    quantile_values = np.quantile(col, [level for _, level in quantile_levels])
    for (name, _), value in zip(quantile_levels, quantile_values):
        features[name] = value

    return pd.DataFrame(features, index=range(1))

def reduce_data(data):
    """
        Summarise every column of a dataframe with extract_features.
        
        input: a dataframe whose columns are the signals
        return: a dataframe with one row of features per input column,
            in column order (every row keeps index 0), or None when
            the input has no columns
    """
    # One single-row feature frame per signal
    per_signal = [extract_features(data[name]) for name in data.columns]

    if not per_signal:
        return None

    # Stack the rows in a single concatenation
    return pd.concat(per_signal)

def process_data(ids, file_name, base_path):
    """
        Build the feature array for a list of segments and save it.
        
        input:
            ids: segment ids; each one names a CSV file in base_path
            file_name: name (without extension) of the .npy file to write
            base_path: directory prefix of the segment CSV files
        
        The saved array has shape (len(ids), 10, 16), so each segment
        is expected to reduce to a 10 x 16 feature table.
    """
    n_segments = len(ids)
    features = np.empty(shape=(n_segments, 10, 16))

    for row, segment_id in enumerate(ids):
        # Missing readings are replaced by zero before feature extraction
        segment = pd.read_csv(f'{base_path}{segment_id}.csv')
        features[row] = reduce_data(segment.fillna(0))

    # save the data
    np.save(f'{file_name}.npy', features)

In [None]:
# Build and save the feature arrays: test set first, then the training set
for segment_ids, output_name, source_dir in (
    (test_ids, 'test', test_path),
    (train_ids, 'train', train_path),
):
    process_data(segment_ids, output_name, source_dir)

# Rolling Dataset
- Breaking the data into smaller chunks and recording how its measures (mean, std, etc.) change across them can help to make more accurate predictions.

In [None]:
def extract_rolling_features(rolling_data, rolling_type):
    """
        Summarise each column of an already-computed rolling statistic.
        
        input:
            rolling_data: dataframe holding one rolling statistic
                per column
            rolling_type: prefix for the output column names
                (e.g. 'rmean', 'rstd', 'rskew', 'rres')
        
        return: 
            dataframe with one row per input column and four columns:
            the mean, std, skew and range (max - min) of that column
    """
    column_mean = rolling_data.mean()
    column_std = rolling_data.std()
    column_skew = rolling_data.skew()
    column_range = rolling_data.max() - rolling_data.min()

    # The dict keys become the output column names
    summaries = {
        f'{rolling_type}_mean': column_mean,
        f'{rolling_type}_std': column_std,
        f'{rolling_type}_skew': column_skew,
        f'{rolling_type}_residual': column_range,
    }
    return pd.concat(summaries, axis=1)
    
def rolling_analysis(data, window_size=300):
    """
        Computes rolling statistics over forward-looking windows and
        summarises each of them per column.
        
        input:
            data: Given data (dataframe, one signal per column)
            window_size: number of rows in each forward-looking
                window; defaults to 300
        
        return:
            dataframe with one row per input column and 16 columns:
            the mean / std / skew / range of the rolling mean ('rmean'),
            rolling std ('rstd'), rolling skew ('rskew') and rolling
            range ('rres'). Missing values are replaced by 0.
    """
    # Forward-looking windows: each window starts at the current row
    indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=window_size)
    # min_periods=1 keeps the shorter windows at the end of the data
    rolling_data = data.rolling(window=indexer, min_periods=1)
    
    # Rolling statistics, one value per row and column
    rolling_mean = rolling_data.mean() # Rolling averages
    rolling_std = rolling_data.std() # Rolling std
    rolling_skew = rolling_data.skew() # Rolling skew
    rolling_residual = rolling_data.max() - rolling_data.min() # Rolling range (max - min)
    
    # Summarise each rolling statistic per column
    mean_feats = extract_rolling_features(rolling_mean, 'rmean')
    std_feats = extract_rolling_features(rolling_std, 'rstd')
    skew_feats = extract_rolling_features(rolling_skew, 'rskew')
    residual_feats = extract_rolling_features(rolling_residual, 'rres')
    
    # Side-by-side feature groups; any remaining NaN becomes 0
    return pd.concat([mean_feats, std_feats, skew_feats, residual_feats], axis=1).fillna(0)

def process_rolling_data(ids, file_name, base_path):
    """
        Build the rolling-feature array for a list of segments and save it.
        
        input:
            ids: segment ids; each one names a CSV file in base_path
            file_name: name (without extension) of the .npy file to write
            base_path: directory prefix of the segment CSV files
        
        The saved array has shape (len(ids), 10, 16), so each segment
        is expected to yield a 10 x 16 table from rolling_analysis.
    """
    n_segments = len(ids)
    features = np.empty(shape=(n_segments, 10, 16))

    for row, segment_id in enumerate(ids):
        # Missing readings are replaced by zero before the rolling analysis
        segment = pd.read_csv(f'{base_path}{segment_id}.csv')
        features[row] = rolling_analysis(segment.fillna(0))

    # save the data
    np.save(f'{file_name}.npy', features)

In [None]:
# Build and save the rolling-feature arrays: training set first, then the test set
for segment_ids, output_name, source_dir in (
    (train_ids, 'rolling_train', train_path),
    (test_ids, 'rolling_test', test_path),
):
    process_rolling_data(segment_ids, output_name, source_dir)