# Mel Spectrogram Data Processing for INGV

Given that the dataset is time-series data, where each column holds the readings of one seismic sensor, we have found it effective to treat each column as a waveform and use the STFT to convert it into a spectrogram.

In [None]:
# IMPORTS

import numpy as np
import pandas as pd
import librosa
import librosa.display
from scipy import signal
from scipy.fft import fft, ifft
from scipy import stats
from tqdm.auto import *
from collections import defaultdict
tqdm.get_lock().locks = []
import matplotlib.pyplot as plt

In [None]:
# Segment index tables: train.csv for training, and the sample submission,
# which is used further down as the list of test segments to process.
train, test = (
    pd.read_csv(index_csv)
    for index_csv in (
        '/kaggle/input/predict-volcanic-eruptions-ingv-oe/train.csv',
        '/kaggle/input/predict-volcanic-eruptions-ingv-oe/sample_submission.csv',
    )
)
train.head()

In [None]:
# Use the first segment listed in `train` as a worked example
train_segment_id_0 = train.loc[0, 'segment_id']
sample_csv_path = f'/kaggle/input/predict-volcanic-eruptions-ingv-oe/train/{train_segment_id_0}.csv'
train_dataset_0 = pd.read_csv(sample_csv_path, memory_map=True)
train_dataset_0.head()

## Spectrogram Generation
Let's now define a function to generate Mel Spectrograms and tweak some params

In [None]:
def mel_spectrogram(sensor, sr):
    """Convert one sensor trace into a log-scaled (dB) Mel spectrogram.

    Parameters
    ----------
    sensor : array-like
        1-D sequence of readings from a single sensor.
    sr : int
        Nominal sample rate handed to librosa; it determines how the Mel
        filter bank is laid out across frequency.

    Returns
    -------
    np.ndarray
        ``librosa.power_to_db`` applied to the Mel power spectrogram
        (256 Mel bands along the first axis).
    """
    # librosa rejects non-floating-point audio, so promote integer readings
    # (e.g. a CSV column parsed as int64). Float input passes through untouched.
    sensor = np.asarray(sensor)
    if not np.issubdtype(sensor.dtype, np.floating):
        sensor = sensor.astype(np.float64)

    # n_mels and hop_length are chosen to give 256x256 spectrograms:
    # hop_length = 60001 // 256 + 1 = 235, i.e. 256 frames for a 60001-sample trace.
    spec = librosa.feature.melspectrogram(y=sensor, sr=sr, n_mels=256, hop_length=60001//256+1)
    return librosa.power_to_db(spec)

In [None]:
# Build three spectrograms of the same sensor trace, one per candidate
# sample rate, purely for visual comparison
sample_trace = train_dataset_0['sensor_1'].values
spec12000, spec48000, spec96000 = (
    mel_spectrogram(sample_trace, rate) for rate in (12000, 48000, 96000)
)

In [None]:
# Pass the parameters the spectrogram was generated with; otherwise specshow
# falls back to its own sr/hop_length defaults and mislabels both axes.
# fmax is sr/2, the upper edge melspectrogram uses when no fmax is given.
librosa.display.specshow(spec12000, sr=12000, hop_length=60001//256+1, fmax=12000/2, y_axis='mel', x_axis='time');

In [None]:
# Pass the parameters the spectrogram was generated with; otherwise specshow
# falls back to its own sr/hop_length defaults and mislabels both axes.
# fmax is sr/2, the upper edge melspectrogram uses when no fmax is given.
librosa.display.specshow(spec48000, sr=48000, hop_length=60001//256+1, fmax=48000/2, y_axis='mel', x_axis='time');

In [None]:
# Pass the parameters the spectrogram was generated with; otherwise specshow
# falls back to its own sr/hop_length defaults and mislabels both axes.
# fmax is sr/2, the upper edge melspectrogram uses when no fmax is given.
librosa.display.specshow(spec96000, sr=96000, hop_length=60001//256+1, fmax=96000/2, y_axis='mel', x_axis='time');

As seen, increasing the sample rate (12000Hz -> **48000Hz**) stretches the lower frequencies, which we hypothesise hold more info (from visual observation).  
Further increasing the value past 60k will result in insufficient samples for the STFT as can be seen in the 96000Hz spectrogram.

## Process all train and test datasets
By writing a simple function, we can iterate through the whole train and test dataset to process all data.  
In this case, we output each set of spectrograms (10 sensors per dataset) as uint8 arrays stored in compressed `npz` format.

In [None]:
# Create output directories
!mkdir train test

In [None]:
# Redefine spectrogram function with chosen sample rate
def spectrogram(sensor):
    """Return the dB-scaled Mel spectrogram of one sensor trace at sr=48000.

    Parameters
    ----------
    sensor : array-like
        1-D sequence of readings from a single sensor.

    Returns
    -------
    np.ndarray
        ``librosa.power_to_db`` applied to the Mel power spectrogram
        (256 Mel bands along the first axis).
    """
    # librosa rejects non-floating-point audio, so promote integer readings
    # (e.g. a CSV column parsed as int64). Float input passes through untouched.
    sensor = np.asarray(sensor)
    if not np.issubdtype(sensor.dtype, np.floating):
        sensor = sensor.astype(np.float64)

    # hop_length = 60001 // 256 + 1 = 235, i.e. 256 frames for a 60001-sample trace
    spec = librosa.feature.melspectrogram(y=sensor, sr=48000, n_mels=256, hop_length=60001//256+1)
    return librosa.power_to_db(spec)

In [None]:
def generate_spectrograms_for_df(df, dataset):
    """Compute and save the Mel spectrograms of every segment listed in ``df``.

    For each ``segment_id`` the matching CSV is read from the competition
    input folder, missing readings are replaced by 0, one spectrogram per
    sensor (``sensor_1`` .. ``sensor_10``) is computed with ``spectrogram``,
    and the stack is written to ``{dataset}/{segment_id}-spec.npz`` as uint8.

    Parameters
    ----------
    df : pd.DataFrame
        Table with a ``segment_id`` column.
    dataset : str
        Name of the input sub-folder to read from (``'train'`` or ``'test'``);
        also used as the output directory.
    """
    import os

    # Don't rely on a separate cell having created the output directory
    os.makedirs(dataset, exist_ok=True)

    # Progress bar: this loop reads and transforms one CSV per segment
    for segment_id in tqdm(df['segment_id'].values, desc=dataset):
        sensors = pd.read_csv(f'/kaggle/input/predict-volcanic-eruptions-ingv-oe/{dataset}/{segment_id}.csv', memory_map=True)
        # Missing readings become 0 (reassigned rather than mutated in place)
        sensors = sensors.fillna(0)

        signals = [
            spectrogram(sensors[f'sensor_{sensor_idx}'].values)
            for sensor_idx in range(1, 11)
        ]

        # Cast to uint8 to save memory. dB values can fall outside 0..255
        # (with librosa's defaults a zero-filled sensor maps to a negative dB
        # level), and converting such floats to an unsigned integer is not
        # well defined, so clamp before casting.
        signals = np.clip(np.array(signals), 0, 255).astype('uint8')

        np.savez_compressed(f'{dataset}/{segment_id}-spec.npz', signals)

In [None]:
# Process both splits; the second element is the input sub-folder / output directory name
for frame, split in ((train, 'train'), (test, 'test')):
    generate_spectrograms_for_df(frame, split)

## Loading Data
To load data for training, you can use the following code:

In [None]:
segment_id = 1136037770
# 'arr_0' is the key np.savez_compressed assigns to a positional array.
# The result is named `spectrograms` (not `spectrogram`) so that it does not
# overwrite the spectrogram() function this notebook defines.
spectrograms = np.load(f'train/{segment_id}-spec.npz')['arr_0'].astype('float32')