In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import glob
from tqdm import tqdm
from numba import njit
import time

## Investigating train.csv

`train.csv` contains the target value for each segment of sensor data. For example, the row `[1136037770, 12262005]` tells us that, given the 10 minutes of data from 10 sensors in `1136037770.csv`, 12262005 time units (the unit is unspecified) remain until the eruption.

In [None]:
train = pd.read_csv('/kaggle/input/predict-volcanic-eruptions-ingv-oe/train.csv')

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called on its own; wrapping it in print() would append a stray "None".
print('info:')
train.info()

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called on its own; passing it to print() would print the report followed by "None".
print('info:')
train.info()
print('-+-'*30)
# Summary statistics of the target column.
print('Statistics: \n',train['time_to_eruption'].describe(),
      '\nskewness:', train['time_to_eruption'].skew(),
      '\nkurtosis: ', train['time_to_eruption'].kurtosis(),
      '\nIQR:', train['time_to_eruption'].quantile(0.75) - train['time_to_eruption'].quantile(.25),
      '\nrange: ', train['time_to_eruption'].max() - train['time_to_eruption'].min())
print('-+-'*30)
print('train.head:\n',train.head())

There are no missing values (of course!).

Ideas to investigate:
1. Sort the `time_to_eruption` values and look at the relation between volcano activity and the remaining time.




In [None]:
#Let's look at the histogram of the target value
px.histogram(train,
             x='time_to_eruption',
             nbins=200)

It seems that `time_to_eruption` is (roughly) uniformly distributed.

In [None]:
px.line(train, 
        x=train.index, 
        y='time_to_eruption',
        log_y=True)

Let's sort values by `time_to_eruption`

In [None]:
# Order the segments from the largest to the smallest time_to_eruption.
# reset_index() renumbers the rows 0..n-1 and keeps the previous row labels
# in an 'index' column (that column is deliberately not dropped here).
sorted_df = (
    train
    .sort_values(by='time_to_eruption', ascending=False)
    .reset_index()
)

In [None]:
px.line(sorted_df, 
        x=sorted_df.index,
        y=(sorted_df['time_to_eruption']))

In [None]:
px.line(sorted_df, 
        x=sorted_df.index,
        y=(sorted_df['time_to_eruption']),
        log_y=True
       )

In [None]:
sorted_df['step'] = sorted_df['time_to_eruption'].shift(-1) - sorted_df['time_to_eruption']

In [None]:
px.line(sorted_df, x=sorted_df.index,
        y=sorted_df['step'])

## Investigating Sensors Data

Let's see what a single file looks like.

In [None]:
sensor_path = '/kaggle/input/predict-volcanic-eruptions-ingv-oe/train'


def read_sensor_data(path=sensor_path, fname='1000015382.csv'):
    """Load one segment CSV as a DataFrame of nullable 16-bit integers.

    Parameters
    ----------
    path : str
        Directory that holds the segment files. The default is the value
        ``sensor_path`` had when this function was defined.
    fname : str
        Name of the CSV file inside ``path``.

    Returns
    -------
    pandas.DataFrame
        The file contents, every column read with the ``Int16`` dtype.
    """
    csv_location = '/'.join((path, fname))
    return pd.read_csv(csv_location, dtype='Int16')

In [None]:
sensor_df = read_sensor_data()
sensor_df.head()

In [None]:
sensor_df.info()

In [None]:
sensor_df.describe()

In [None]:
def plot_sensor_data(df, size=(1000, 1000), fixed_range=(-5000, 5000)):
    """Draw every column of ``df`` as a line plot in its own stacked subplot.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per sensor; the index is used as the x axis. Missing
        values are replaced by 0 for plotting only (``df`` is not modified).
    size : sequence of two numbers
        Figure ``(width, height)`` in pixels.
    fixed_range : sequence of two numbers or None
        Common y-axis range applied to every subplot. ``None`` leaves the
        range unset so each subplot is auto-scaled.

    Returns
    -------
    plotly.graph_objects.Figure
        The assembled figure (not shown; call ``fig.show()``).
    """
    n_sensors = df.shape[1]

    # One row per column instead of a hard-coded 10, so frames with a different
    # number of sensors also work; make_subplots needs at least one row.
    fig = make_subplots(rows=max(n_sensors, 1),
                        cols=1,
                        shared_xaxes=True,
                        vertical_spacing=0.03,
                        subplot_titles=df.columns.to_list())

    for i in range(n_sensors):
        fig.add_trace(go.Scatter(x=df.index,
                                 y=df.iloc[:, i].fillna(0),
                                 mode='lines',
                                 name=df.columns[i]),
                      row=i+1,
                      col=1)

    # update_yaxes without a selector touches every y axis, so a single call
    # after the loop is enough.
    fig.update_yaxes(range=None if fixed_range is None else list(fixed_range))

    fig.layout.update(
        {'width': size[0],
         'height': size[1],
         'showlegend': False})
    return fig


In [None]:
fig = plot_sensor_data(sensor_df, fixed_range=None)
fig.show()

In [None]:
def plot_sensors_hist(df, shape=(5, 2)):
    """Show a grid of histograms, one cell per column of ``df``.

    Each cell holds the plain histogram of the column and, on a secondary
    y axis, its cumulative histogram normalised to probability. Missing
    values are replaced by 0 before binning (``df`` is not modified).

    Parameters
    ----------
    df : pandas.DataFrame
        One column per sensor.
    shape : sequence of two ints
        Grid layout as ``(rows, cols)``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    n_rows, n_cols = shape[0], shape[1]

    fig = make_subplots(rows=n_rows,
                        cols=n_cols,
                        vertical_spacing=0.09,
                        specs=[[{"secondary_y": True} for _ in range(n_cols)]
                               for _ in range(n_rows)],
                        subplot_titles=df.columns.to_list())

    for i in range(df.shape[1]):
        # make_subplots assigns subplot_titles in row-major order, so the traces
        # must be placed row-major as well; the previous column-major placement
        # put histograms under another sensor's title.
        row, col = divmod(i, n_cols)
        row += 1
        col += 1

        values = df.iloc[:, i].fillna(0)

        fig.add_trace(go.Histogram(x=values,
                                   name=df.columns[i]),
                      row=row,
                      col=col)
        fig.add_trace(go.Histogram(x=values,
                                   name=df.columns[i],
                                   cumulative={'enabled': True},
                                   histnorm='probability',
                                   opacity=0.5),
                      row=row,
                      col=col,
                      secondary_y=True)

    fig.layout.update(
        {'width': n_cols*280,
         'height': n_rows*280,
         'showlegend': False})
    fig.show()

In [None]:
plot_sensors_hist(sensor_df, shape=[2,5])


## Preparing Data

If an entire sensor is `Null` we will drop it.

In [None]:
sensor_path = '/kaggle/input/predict-volcanic-eruptions-ingv-oe/train'

In [None]:
# 15G training data!
!du -h -d 1 /kaggle/input/predict-volcanic-eruptions-ingv-oe/train/

In [None]:
# collect file names
# glob.glob gives no ordering guarantee (it depends on the file system), so the
# list is sorted to make slices such as sensors_files[:1000] pick the same
# segments on every run.
sensors_files = sorted(glob.glob(f"{sensor_path}/*"))
print(len(sensors_files))

### Missing Data

In [None]:
# counting missing values and columns

# meta_df = pd.DataFrame(np.zeros((10,2)), columns=['full_null', 'partial_null'], 
#              index=[f'sensor_{i+1}' for i in range(10)])
# for f in tqdm(sensors_files):
#     df = pd.read_csv(f, dtype='Int16')
    
#     partial_null = df.columns[df.isnull().any()].to_list()
#     full_null = df.columns[df.isnull().all()].to_list()
    
#     if partial_null:
#         meta_df.loc[partial_null,'partial_null'] +=1
    
#     if full_null:
#         meta_df.loc[full_null,'full_null'] +=1

In [None]:
# save this data so that the counting process does not have to be run again
# meta_df.to_csv('/kaggle/working/null_data.csv')
meta_df = pd.read_csv('../input/null-data/null_data.csv',index_col=0)

In [None]:
 meta_df.info()

In [None]:
meta_df

In [None]:
def plot_group_bar(df, columns, title='Count of missing data for each sensor'):
    """Show a bar chart with one bar series per requested column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index supplies the x values (labelled "Sensors").
    columns : iterable of str
        Names of the columns of ``df`` to plot; each becomes one bar trace.
    title : str
        Figure title.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    x = df.index
    # Read the values from the `df` argument; the previous version read the
    # global `meta_df` and silently ignored the frame that was passed in.
    traces = [go.Bar(x=x, y=df[i], name=i) for i in columns]

    fig = go.Figure(data=traces, layout={'title': title,
                                         'yaxis': {'title': 'Count'},
                                         'xaxis': {'title': 'Sensors'}})
    fig.show()

In [None]:
plot_group_bar(meta_df, ['full_null', 'partial_null'])

In [None]:
meta_df['full_null_percent'] = meta_df['full_null'] / train.shape[0] * 100
meta_df['partial_null_percent'] = meta_df['partial_null'] / train.shape[0] * 100

In [None]:
meta_df

In [None]:
plot_group_bar(meta_df, ['full_null_percent', 'partial_null_percent'], title='Count of missing data for each sensor(%)')

Nearly 20% of the `sensor_2` data is missing, and almost 10% of `sensor_3`, `sensor_5`, and `sensor_8` is missing.
`sensor_4` and `sensor_6` are never entirely missing, and `sensor_2` and `sensor_9` have partially missing data across the entire dataset.

If a sensor is entirely missing, we drop it and simply do not use it. For partially missing values, I think back-filling (`Back fill`) is a better strategy than filling with `0s`.

### Transformation and Feature Extraction
We will use the continuous wavelet transform (`cwt`) to turn the raw data into useful features, then use `PCA` to keep the most useful ones.
We accomplish this with a custom `sklearn` transformer.

[wavelet reference](http://ataspinar.com/2018/12/21/a-guide-for-using-the-wavelet-transform-in-machine-learning/)

In [None]:
import pywt
from skimage.transform import resize
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, TransformerMixin


In [None]:
def extract_features(df: pd.DataFrame, scale: int= 64, wavelet: str='morl', comps: int=5):
    """Turn one segment (DataFrame, one column per sensor) into a flat feature vector.

    For every column the continuous wavelet transform is computed at scales
    ``1..scale``; the coefficient matrix is then reduced with a PCA of
    ``comps`` components and flattened. The per-column vectors are
    concatenated in column order.

    Note: a later cell of this notebook defines another ``extract_features``
    that takes a NumPy array; once that cell has run it replaces this one.

    Parameters
    ----------
    df : pandas.DataFrame
        Sensor readings; not modified.
    scale : int
        Largest wavelet scale (scales are ``1..scale``).
    wavelet : str
        Wavelet name passed to ``pywt.cwt``.
    comps : int
        Number of PCA components kept per column.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``scale * comps * df.shape[1]``
        (64 * 5 * 10 = 3200 with the defaults and 10 sensor columns).
    """
    scales = np.arange(1, scale + 1)
    pca = PCA(n_components=comps)

    # The previous `df.fillna(method='bfill').fillna(0, inplace=True)` filled a
    # temporary copy and threw it away, so NaNs reached the transform untouched.
    # Back-fill gaps, then zero whatever is still missing (e.g. trailing gaps or
    # all-null columns), and keep the result.
    filled = df.bfill().fillna(0)

    per_sensor = []
    for i in range(filled.shape[1]):
        # Plain float array so nullable integer columns are handled as well.
        signal = filled.iloc[:, i].to_numpy(dtype='float64')
        coeff, _freq = pywt.cwt(signal, scales, wavelet)
        coeff = np.nan_to_num(coeff)
        per_sensor.append(pca.fit_transform(coeff).flatten())

    # Collect in a list and concatenate once instead of re-stacking per column.
    if not per_sensor:
        return np.empty(0)
    return np.concatenate(per_sensor)


In [None]:
def load_training_data(glob_list: list):
    """Extract features for every segment file and save them as one CSV.

    Each file is read with ``pd.read_csv``, passed to ``extract_features``
    and stored as one row: the integer segment id (file name without
    extension) followed by the feature values. The table is written to
    ``/kaggle/working/train_data.csv`` after every 100th file (as a
    checkpoint, with a timestamp printed) and once more at the end.

    Parameters
    ----------
    glob_list : list of str
        Paths of the segment CSV files, using ``/`` as separator and a
        numeric file stem.

    Returns
    -------
    pandas.DataFrame
        Columns ``'segment_id'`` followed by ``1..n_features``.

    Raises
    ------
    ValueError
        If a file stem cannot be parsed as an integer segment id.
    """
    out_path = '/kaggle/working/train_data.csv'
    rows = []

    def _as_frame():
        # Width comes from the data; 3200 is only the fallback for an empty
        # input so the returned frame still has the expected columns.
        n_features = len(rows[0]) - 1 if rows else 3200
        columns = ['segment_id'] + list(range(1, n_features + 1))
        table = np.asarray(rows, dtype=float).reshape(-1, n_features + 1)
        frame = pd.DataFrame(table, columns=columns)
        frame['segment_id'] = frame['segment_id'].astype('int64')
        return frame

    for i, file in enumerate(glob_list):
        segment_id = int(file.split('/')[-1].split('.')[0])

        df = pd.read_csv(file)

        features = extract_features(df)

        # Prepend the id so each row matches the 'segment_id' + features column
        # layout. Previously this step was commented out, so a 3200-long vector
        # was stacked onto a 3201-wide buffer and np.vstack raised.
        rows.append(np.append(segment_id, features))

        if i % 100 == 0:
            print(time.time())
            _as_frame().to_csv(out_path)

    df = _as_frame()
    df.to_csv(out_path)
    return df

In [None]:
data = load_training_data(sensors_files)

In [None]:
def load_data_chunks(glob_list):
    """Read every file in ``glob_list`` into one 3-D float array.

    Each file is parsed with ``np.genfromtxt`` (comma separated); the first
    parsed row is dropped (presumably the column-name header line) and the
    rest is written into a preallocated array.

    Parameters
    ----------
    glob_list : sequence of str
        Paths of the CSV files to load.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(glob_list), 60001, 10)``.
    """
    n_files = len(glob_list)
    stacked = np.empty((n_files, 60001, 10))
    progress = tqdm(enumerate(glob_list), total=n_files)
    for slot, csv_path in progress:
        parsed = np.genfromtxt(csv_path, delimiter=',')
        stacked[slot, :, :] = parsed[1:, :]
    return stacked
    

In [None]:
x = load_data_chunks(sensors_files[:1000])

In [None]:
def ffill(arr):
    """Forward-fill NaNs along axis 1 of a 2-D array and zero what is left.

    Every NaN is replaced by the closest non-NaN value to its left in the
    same row. NaNs with no valid value to their left (leading NaNs) become
    0 through ``np.nan_to_num``, which also replaces infinities by large
    finite numbers.

    Parameters
    ----------
    arr : array-like, 2-D
        Input values; the caller's array is left untouched.

    Returns
    -------
    numpy.ndarray
        A new array with the same shape as ``arr``.
    """
    # Work on a private copy: the in-place assignment below used to overwrite
    # the caller's array as a side effect.
    arr = np.array(arr, copy=True)
    mask = np.isnan(arr)
    # For each position: its own column index when valid, 0 when NaN ...
    idx = np.where(~mask, np.arange(mask.shape[1]), 0)
    # ... so the running maximum is the column of the last valid value so far.
    np.maximum.accumulate(idx, axis=1, out=idx)
    arr[mask] = arr[np.nonzero(mask)[0], idx[mask]]
    # `arr` is already our own copy, so no second copy is needed here.
    return np.nan_to_num(arr, copy=False)

In [None]:
def extract_features(arr, scale: int= 64, wavelet: str='morl', comps: int=5):
    """Turn one segment (2-D array, one column per sensor) into a flat feature vector.

    For every column the continuous wavelet transform is computed at scales
    ``1..scale``; the coefficient matrix is reduced with a PCA of ``comps``
    components and flattened. The per-column vectors are concatenated in
    column order.

    Note: this redefines the DataFrame-based ``extract_features`` from an
    earlier cell; after this cell runs, the name refers to this version.

    Parameters
    ----------
    arr : numpy.ndarray
        2-D array whose columns are the individual signals (rows are the
        samples of each signal); not modified.
    scale : int
        Largest wavelet scale (scales are ``1..scale``).
    wavelet : str
        Wavelet name passed to ``pywt.cwt``.
    comps : int
        Number of PCA components kept per column.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``scale * comps * arr.shape[1]``.
    """
    scales = np.arange(1, scale + 1)
    pca = PCA(n_components=comps)

    # ffill() fills along axis 1. Each signal is a column here (see
    # `arr[:, i]` below), so filling `arr` directly would copy values from
    # neighbouring sensors into the gaps. Transpose so the fill runs along each
    # signal; the copy keeps ffill from writing into the caller's array.
    arr = ffill(np.asarray(arr).T.copy()).T

    per_sensor = []
    for i in range(arr.shape[1]):
        signal = arr[:, i]
        coeff, _freq = pywt.cwt(signal, scales, wavelet)
        coeff = np.nan_to_num(coeff)
        per_sensor.append(pca.fit_transform(coeff).flatten())

    # Collect in a list and concatenate once instead of re-stacking per column.
    if not per_sensor:
        return np.empty(0)
    return np.concatenate(per_sensor)

In [None]:
def make_train_set(ndarr):
    """Build the feature matrix for a stack of segments.

    ``extract_features`` is applied to each ``ndarr[k, :, :]`` and the result
    is stored as row ``k`` of the output.

    Parameters
    ----------
    ndarr : numpy.ndarray
        3-D array whose first axis indexes the segments.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(ndarr.shape[0], 3200)``.
    """
    n_segments = ndarr.shape[0]
    feature_matrix = np.empty((n_segments, 3200))
    for k in tqdm(range(n_segments)):
        segment = ndarr[k, :, :]
        feature_matrix[k, :] = extract_features(segment)
    return feature_matrix

In [None]:
f = make_train_set(x)

In [None]:
df = pd.DataFrame(f)
df['segment_id'] = [int(i.split('/')[-1].split('.')[0]) for i in sensors_files[:1000]]

In [None]:
df.head()

In [None]:
# Merge on the original integer ids. Casting `train` to float32 first (24-bit
# mantissa) rounds 10-digit segment ids such as 1136037770, so the keys no
# longer match the ids parsed from the file names and rows are lost in the
# merge. It would also round any time_to_eruption value above 2**24.
df = df.merge(train, on='segment_id')

In [None]:
df.to_csv('/kaggle/working/train.csv')

In [None]:
# NOTE(review): `coeff` is not assigned at notebook level in any cell visible
# here (it only exists as a local inside extract_features), so on a fresh
# kernel this cell raises NameError. Presumably it came from a cell that was
# deleted or lives elsewhere. TODO confirm and define it before this cell.
fig = px.imshow(coeff)
fig.update_layout(width=400, height=1000)
fig.show()

In [None]:
# Continuous wavelet transform (Morlet, scales 1..64) of sensor_1.
# `fillna(method=...)` is deprecated in recent pandas, so `.bfill()` is used;
# the extra `.fillna(0)` zeroes gaps that back-filling cannot reach (trailing
# or all-null data), so no NaN is fed to the transform.
coeff_1, freq = pywt.cwt(sensor_df['sensor_1'].bfill().fillna(0).astype(np.float32), np.arange(1, 65), 'morl')

In [None]:
pca = PCA(n_components=0.95)

In [None]:
# NOTE(review): `coeff` is indexed here as a 3-D array, but no visible cell
# assigns it at notebook level, so this fails on a fresh kernel. Presumably it
# is a stack of per-sensor wavelet coefficients. TODO confirm its origin.
c = pca.fit_transform(coeff[:, :, 1 ].T)

In [None]:
coeff[:, :, 9]

In [None]:
coeff_1

In [None]:
df = pd.DataFrame(data=np.zeros((2,2)), columns=['a','b'])

In [None]:
df.iloc[0,0] = 1

In [None]:
import pandas as pd
pd.read_csv('/kaggle/working/train_data.csv')

## Using CNN
