In [None]:
import pandas as pd
import numpy as np
import math
import datetime
from fastai.tabular.all import *
# from fastai.tabular import *
from fastai.imports import *
from fastai.metrics import error_rate
#from fastai.callbacks import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
import scipy.stats as spstats

In [None]:
# Root directory of the competition data; later cells build file names by
# concatenating onto this string, so the trailing slash is required.
path = '/kaggle/input/predict-volcanic-eruptions-ingv-oe/'
# NOTE(review): no cell visible in this notebook reads output_path, and
# 'modles' looks like a typo for 'models' - confirm before relying on it.
output_path = '/kaggle/output/kaggle/working/modles/'

**The dataframe below consists of the segment ID and the target value, i.e. the time left until the volcano erupts.
For each segment ID we are provided with a CSV file containing 10 minutes of readings from 10 different sensors.**

In [None]:
train = pd.read_csv(path+"train.csv")
train

In [None]:
# Read the sample submission through the shared `path` prefix, like every
# other data file in this notebook, instead of a path relative to the
# current working directory.
sample_submission = pd.read_csv(path+"sample_submission.csv")
sample_submission

In [None]:
train['time_to_eruption'].describe()

In [None]:
train_readings = glob.glob(path+"train/*")
len(train_readings)

In [None]:
test_readings = glob.glob(path+"test/*")
len(test_readings)

In [None]:
train_readings[0]

In [None]:
sensor_file = pd.read_csv(train_readings[0])
sensor_file

For each segment ID, we have 10 sensors, with 60001 readings from each of them.

### Feature Creation

In [None]:
def create_features(df,signal,seg_id,sensor_id):
    """Write summary statistics of one sensor signal into row ``seg_id`` of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table whose index contains ``seg_id``. It is modified in
        place; columns that do not exist yet are created on first write.
    signal : pandas.Series
        Readings of a single sensor. Series methods such as ``skew`` and
        ``kurtosis`` are called on it, so a plain array is not enough.
    seg_id
        Index label of the row to fill.
    sensor_id : str
        Prefix for every column written, e.g. ``'sensor_1'``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, to allow ``df = create_features(df, ...)``.

    The columns written are ``{sensor_id}_<name>`` for, in order: sum, mean,
    std, var, max, min, skew, mad, kurtosis, ten quantiles
    (99/95/85/75/55/45/25/15/05/01) and seven statistics of the real part of
    the signal's FFT.
    """
    # Only the real part of the spectrum is summarised.
    f_real = np.real(np.fft.fft(signal))

    # Insertion order of this dict is the column order of the result.
    features = {
        'sum': signal.sum(),
        'mean': signal.mean(),
        'std': signal.std(),
        'var': signal.var(),
        'max': signal.max(),
        'min': signal.min(),
        'skew': signal.skew(),
        # Series.mad() no longer exists in pandas 2.x; this is the same
        # quantity (mean absolute deviation around the mean).
        'mad': (signal - signal.mean()).abs().mean(),
        'kurtosis': signal.kurtosis(),
    }

    for pct in (99, 95, 85, 75, 55, 45, 25, 15, 5, 1):
        features[f'quantile{pct:02d}'] = np.quantile(signal, pct / 100)

    features['fft_real_mean'] = f_real.mean()
    features['fft_real_std'] = f_real.std()
    features['fft_real_max'] = f_real.max()
    features['fft_real_min'] = f_real.min()
    features['fft_real_median'] = np.median(f_real)
    features['fft_real_skew'] = spstats.skew(f_real)
    features['fft_real_kurtosis'] = spstats.kurtosis(f_real)

    # One scalar write per feature, exactly as before, so column creation
    # and dtype handling are unchanged.
    for name, value in features.items():
        df.loc[seg_id, f'{sensor_id}_{name}'] = value

    return df

####  Create features for Training Data

In [None]:
# Reload the labels and start from a frame that has the segment ids as its
# index and no columns; create_features adds the columns as it goes.
train = pd.read_csv(path+'train.csv')
train_df = pd.DataFrame({'segment_id': train.segment_id}).set_index('segment_id')

for n_done, seg in enumerate(train.segment_id):
    signals = pd.read_csv(path+f'train/{seg}.csv')
    # Coarse progress indicator: one line every 500 segments.
    if n_done % 500 == 0:
        print(n_done)
    for sensor_id in (f'sensor_{k}' for k in range(1, 11)):
        # Missing readings are replaced by 0 before the statistics are taken.
        filled = signals[sensor_id].fillna(0)
        train_df = create_features(train_df, filled, seg, sensor_id)

In [None]:
train_df = pd.merge(train_df.reset_index(), train, on=['segment_id'], how='left').set_index('segment_id')
train_df

In [None]:
train_df = train_df.reset_index()

In [None]:
y = train_df['time_to_eruption']
train_df = train_df.drop(['segment_id'], axis = 1)
train_df

#### Create features for Test Data

In [None]:
# The sample submission lists the test segment ids; start from a frame that
# has them as its index and no columns, then fill it segment by segment.
test = pd.read_csv(path+'sample_submission.csv')
test_df = pd.DataFrame({'segment_id': test.segment_id}).set_index('segment_id')

for n_done, seg in enumerate(test.segment_id):
    signals = pd.read_csv(path+f'test/{seg}.csv')
    # Coarse progress indicator: one line every 500 segments.
    if n_done % 500 == 0:
        print(n_done)
    for sensor_id in (f'sensor_{k}' for k in range(1, 11)):
        # Missing readings are replaced by 0 before the statistics are taken.
        filled = signals[sensor_id].fillna(0)
        test_df = create_features(test_df, filled, seg, sensor_id)

In [None]:
test_df

In [None]:
test_df = test_df.reset_index()
test_set = test_df
test_df = test_df.drop(['segment_id'], axis = 1)
test_df

In [None]:
test_df

In [None]:
train_df.columns

In [None]:
for i in list(train_df.columns):
    print(i)

In [None]:
# Every column except the regression target is a continuous feature.
# Selecting by name (instead of popping the last element) stays correct
# even if the column order of train_df changes.
cont_names = [col for col in train_df.columns if col != 'time_to_eruption']
cont_names

In [None]:
cat_names = []
procs = [Categorify, FillMissing, Normalize]

In [None]:
# Hold out 20% of the rows for validation. The seed is fixed so that the
# split, and therefore the validation metric used to pick the saved model,
# is the same on every run of the notebook.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(train_df))

In [None]:
# Build the preprocessed tabular dataset. The preprocessing steps come from
# the `procs` list defined in the previous cell rather than a second copy
# of the same literal, so the two cannot drift apart.
to = TabularPandas(train_df, procs=procs,
                   cat_names = cat_names,
                   cont_names = cont_names,
                   y_names='time_to_eruption',
                   splits=splits)

In [None]:
dls = to.dataloaders(bs=64)

In [None]:
dls.show_batch()

In [None]:
# Model hyper-parameters (per-layer dropout `ps`, embedding dropout
# `embed_p`) must be passed through `config`: tabular_learner hands any
# other keyword straight to Learner, which does not accept `ps`/`emb_drop`.
learn = tabular_learner(
    dls,
    layers=[200,100],
    config=tabular_config(ps=[0.001,0.01], embed_p=0.01),
    metrics=mae,
)

In [None]:
learn.model

In [None]:
learn.lr_find(suggestions=True)

In [None]:
learn.model_dir='/kaggle/working/' 

In [None]:
# One-cycle training; SaveModelCallback is asked to monitor 'mae' with
# np.less (lower is better) and to write under the name "stage-1", which the
# next cell loads back.
# NOTE(review): the hard-coded learning rate looks like a value copied from
# an earlier lr_find run - confirm it is still appropriate.
learn.fit_one_cycle(50,0.33113112449646,cbs=SaveModelCallback(monitor='mae', comp=np.less, fname="stage-1"))

In [None]:
learn.load('stage-1')

In [None]:
test_df

In [None]:
dl = learn.dls.test_dl(test_df)

In [None]:
preds = learn.get_preds(dl=dl)

In [None]:
preds[0]

In [None]:
# Take the first element of every prediction row to get a flat list with
# one value per test segment.
test_preds = [row[0] for row in np.array(preds[0])]

In [None]:
test

In [None]:
test['time_to_eruption'] = test_preds

In [None]:
test

In [None]:
test.to_csv('submission.csv', index=False)

## Work in Progress