In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dask.dataframe as dd
import time
import os
from DataPreparation.dataset_preparation import get_LANL_dataset
from Utilities.evaluation_utils import save_submission

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# --- Notebook configuration -------------------------------------------------
# Location of the LANL earthquake data, relative to this notebook.
data_dir = '../Data/LANL_Earthquake_prediction/'

# Forwarded to get_LANL_dataset; with 0.2 the loader reported a validation
# frame holding 20% of all rows.
validation_split = 0.2

# Loader / output switches.
# NOTE: `numpy` here is a plain boolean flag, not the numpy package (that is
# imported as `np`). When False the loader result is read as dask frames
# ('train_df' / 'val_df'), which is what the rest of this notebook relies on.
numpy = False
# Forwarded to get_LANL_dataset unchanged -- presumably toggles feature
# normalisation; confirm against the loader.
normalize = False
# Print the dataset description after loading.
verbose = True

In [3]:
# Load the dataset. The loader hands back a dict of splits plus a second value
# (X_train_moments -- presumably training-set statistics; confirm in the loader).
data, X_train_moments = get_LANL_dataset(data_dir, validation_split, numpy, normalize)

# The test segments live under the same key for both return formats.
test_dict = data['test_dict']

# Unpack the train/validation splits according to the requested format.
if not numpy:
    train_df, val_df = data['train_df'], data['val_df']
else:
    X_train, y_train = data['X_train'], data['y_train']
    X_val, y_val = data['X_val'], data['y_val']

# Optional summary: the header and footer are shared, only the middle section
# depends on the format.
if verbose:
    print('')
    print('----- Dataset Description -----')
    if numpy:
        print('X_train size: %d' % len(X_train))
        print('X_val size: %d' % len(X_val))
    else:
        # .count().compute() evaluates the per-column row counts of the frame.
        print('train_df:')
        print(train_df.count().compute())
        print('-----------')
        print('val_df:')
        print(val_df.count().compute())
        print('-----------')
    print('test_dict size: %d' % len(test_dict))
    print('-------------------------------')

Dask dataframes loaded.
Done.

----- Dataset Description -----
train_df:
acoustic_data      503316384
time_to_failure    503316384
dtype: int64
-----------
val_df:
acoustic_data      125829096
time_to_failure    125829096
dtype: int64
-----------
test_dict size: 2624
-------------------------------


### Compute Global Average

In [4]:
# The baseline "model" is one constant: the mean time_to_failure over the
# training frame. Nothing is evaluated until .compute() is called.
global_avg = (
    train_df['time_to_failure']
    .mean()
    .compute()
)
print('The global average time (s) until an earthquake hits is %.4f' % (global_avg,))

The global average time (s) until an earthquake hits is 5.6040


### Evaluation

In [5]:
# Absolute error of the constant prediction for every training row, stored as
# an extra 'AE' column on train_df; its mean is the training MAE.
train_df['AE'] = (train_df['time_to_failure'] - global_avg).abs()
train_MAE = (
    train_df['AE']
    .mean()
    .compute()
)
print('Train MAE: %.5f' % (train_MAE,))

Train MAE: 2.99182


In [6]:
# Same constant prediction scored on the validation frame: the per-row absolute
# error goes into an 'AE' column on val_df, and its mean is the validation MAE.
val_df['AE'] = (val_df['time_to_failure'] - global_avg).abs()
val_MAE = (
    val_df['AE']
    .mean()
    .compute()
)
print('Validation MAE: %.5f' % (val_MAE,))

Validation MAE: 3.25379


### Predictions for the Test Set

In [7]:
# Constant-baseline submission: every test segment id is mapped to the same
# prediction, global_avg. Only the keys of test_dict are needed, so the values
# are never touched and no loop variables are left behind in the kernel
# namespace.
test_prediction_dict = {seg_id: global_avg for seg_id in test_dict}

# Hand the {segment id -> prediction} mapping to the project helper under the
# name 'global_average_submission' (output location and format are decided by
# save_submission).
save_submission(test_prediction_dict, 'global_average_submission')

In [8]:
# Test-set MAE of the global-average baseline. This value is not computed in
# the notebook: it was copied by hand from the Kaggle leaderboard after
# submitting the results.
test_MAE = 2.795
print('Global Average Test MAE: %.5f' % (test_MAE,))
print('This got us to rank 1904 from 2043 people who had submitted results.')

Global Average Test MAE: 2.79500
This got us to rank 1904 from 2043 people who had submitted results.
