In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import HuberRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))
import warnings
warnings.filterwarnings("ignore")
        
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from scipy.ndimage import maximum_filter1d
from scipy.ndimage import minimum_filter1d

from datetime import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
# Shared scaler instances created once at import time.
# NOTE(review): neither scaler is referenced in the cells visible here — confirm they are used later.
StSc = StandardScaler()
MMS = MinMaxScaler()

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Print the current wall-clock time so the notebook's run duration can be read off the output.
print(datetime.now())

In [None]:
# Root of the competition input data and the sub-directory holding the
# per-segment training CSV files.
V_PATH = '/kaggle/input/predict-volcanic-eruptions-ingv-oe/'
TRAIN_PATH = f"{V_PATH}train/"

In [None]:
# Names of the ten raw sensor columns present in every segment file.
SENSOR_COLS = [f'sensor_{i}' for i in range(1, 11)]


# Per-sensor names of the rolling-window feature columns created by get_rolling.
# Fix: the rolling-mean group previously used the '_rmin' suffix, which made it
# an exact duplicate of SENSOR_RMINS and left the '_rmean' columns unused.
SENSOR_RMEANS = [x+'_rmean' for x in SENSOR_COLS]
SENSOR_RSTDS = [x+'_rstd' for x in SENSOR_COLS]
SENSOR_RMINS = [x+'_rmin' for x in SENSOR_COLS]
SENSOR_RMAXES = [x+'_rmax' for x in SENSOR_COLS]
SENSOR_RGRADMEAN = [x+'_grad_rmean' for x in SENSOR_COLS]
SENSOR_RGRADSTD = [x+'_grad_rstd' for x in SENSOR_COLS]

# Groups of rolling columns; get_stats reduces each group across sensors.
SENSOR_RSTATS = [SENSOR_RMEANS, SENSOR_RSTDS, SENSOR_RMINS, SENSOR_RMAXES,
               SENSOR_RGRADMEAN,  SENSOR_RGRADSTD]

# Label for each group in SENSOR_RSTATS, in the same order. The labels must be
# unique, otherwise a later group overwrites the columns of an earlier one.
ROLL_DESCR = ['rmean', 'rstd', 'rmin', 'rmax', 'grad_rmean','grad_rstd']

In [None]:
# Load the segment-level training table and take a first look at it.
train = pd.read_csv(f"{V_PATH}train.csv")
print(train.shape)
print(train.columns)
train.head(5)

In [None]:
# Rough size of the full training set, in millions of rows, using 60001 rows
# per segment as the multiplier.
total_rows_estimate = len(train) * 60001 / 1000000
print('estimate of total TRAIN rows (millions)', total_rows_estimate)

In [None]:
# Load the sample submission, which has one row per test segment.
sample_submission = pd.read_csv(f"{V_PATH}sample_submission.csv")
print(sample_submission.shape)
print(sample_submission.columns)

# Same size estimate as for the training set, now for the test segments.
total_rows_estimate = len(sample_submission) * 60001 / 1000000
print('estimate of total TEST rows (millions)', total_rows_estimate)

sample_submission.head(5)

In [None]:
# Examine the distribution of the target; values are divided by 1,000,000 to keep the axis readable.

sns.kdeplot(train['time_to_eruption'] / 1000000)

In [None]:
# Print the minimum, maximum and mean of the target column, in that order.
print(train['time_to_eruption'].min(), train['time_to_eruption'].max(), train['time_to_eruption'].mean())

In [None]:
# Convert the target into a normalised rank: (rank - 1) / (n - 1), so the
# smallest time_to_eruption maps to 0.0 and the largest to 1.0.
sz = train['time_to_eruption'].size-1
train['PCNT_TIME'] = (
    train['time_to_eruption']
    .rank(method='max')
    .apply(lambda rank: 1.0*(rank-1)/sz)
)
train.head(10)

In [None]:
def _fill_gaps(series):
    """Back-fill, then forward-fill, NaNs in ``series`` and return the result."""
    # Equivalent to the deprecated fillna(method='bfill').fillna(method='ffill').
    return series.bfill().ffill()


def _add_window_stats(df, source, prefix, window):
    """Add ``<prefix>_rmin``, ``_rmax``, ``_rmean`` and ``_rstd`` columns from ``df[source]``.

    Min and max use scipy's 1-D sliding filters; mean and std use a centred
    pandas rolling window, whose NaN edges are then filled from the nearest
    valid value.
    """
    values = df[source].values
    df[prefix + '_rmin'] = minimum_filter1d(values, size=window)
    df[prefix + '_rmax'] = maximum_filter1d(values, size=window)
    df[prefix + '_rmin'] = _fill_gaps(df[prefix + '_rmin'])
    df[prefix + '_rmax'] = _fill_gaps(df[prefix + '_rmax'])

    centred = df[source].rolling(window=window, center=True)
    df[prefix + '_rmean'] = _fill_gaps(centred.mean())
    df[prefix + '_rstd'] = _fill_gaps(centred.std())


def get_rolling(df, cols, window=50):
    """Add gradient and rolling-window feature columns for every column in ``cols``.

    For each ``col`` the following columns are appended, in this order:
    ``col_grad``, ``col_grad_abs``, ``col_rmin``, ``col_rmax``, ``col_rmean``,
    ``col_rstd``, ``col_grad_rmin``, ``col_grad_rmax``, ``col_grad_rmean``,
    ``col_grad_rstd``. The ``_grad_r*`` columns are window statistics of
    ``col_grad_abs`` (the gradient of the absolute signal).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed in ``cols``. It is modified in place.
    cols : iterable of str
        Names of the columns to derive features from.
    window : int, default 50
        Window size passed to the min/max filters and the rolling mean/std.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    for col in cols:
        df[col+'_grad'] = np.gradient(df[col])
        df[col+'_grad'] = _fill_gaps(df[col+'_grad'])

        df[col+'_grad_abs'] = np.gradient(np.abs(df[col]))
        # Fix: the original re-filled col_grad here, so col_grad_abs kept its NaNs.
        df[col+'_grad_abs'] = _fill_gaps(df[col+'_grad_abs'])

        # Window statistics of the raw signal ...
        _add_window_stats(df, col, col, window)
        # ... and of the absolute-signal gradient.
        _add_window_stats(df, col+'_grad_abs', col+'_grad', window)

    return df

def get_stats(df, sensor_cols=SENSOR_COLS, rolling_cols=SENSOR_RSTATS):
    """Add row-wise summary columns across sensors and across rolling-feature groups.

    Adds ``max``/``min``/``std`` of the raw sensor columns, the same three
    statistics of their absolute values (``*_abs``), and for every group in
    ``rolling_cols`` the row-wise ``max``/``min``/``std``/``mean`` of that group,
    named ``<label>_<stat>`` with the label taken from ``ROLL_DESCR`` at the
    same position. ``df`` is modified in place and returned.
    """
    # Row-wise statistics over the raw sensor readings.
    raw = df[sensor_cols]
    df['max'] = raw.max(axis=1)
    df['min'] = raw.min(axis=1)
    df['std'] = raw.std(axis=1)

    # The same statistics over the absolute readings.
    magnitude = np.abs(df[sensor_cols])
    df['max_abs'] = magnitude.max(axis=1)
    df['min_abs'] = magnitude.min(axis=1)
    df['std_abs'] = magnitude.std(axis=1)

    # One set of row-wise statistics per GROUP of rolling columns,
    # e.g. the max across all sensors' rolling minimums.
    for position, group_cols in enumerate(rolling_cols):
        label = ROLL_DESCR[position]
        group = df[group_cols]
        for stat in ('max', 'min', 'std', 'mean'):
            df[label + '_' + stat] = getattr(group, stat)(axis=1)

    return df

In [None]:
#credit - stack overflow

def percentile(n):
    """Return a one-argument callable that computes ``np.percentile(x, n)``.

    The callable's ``__name__`` is set to ``'percentile_<n>'`` so that callables
    built for different ``n`` carry distinct names (pandas uses the function
    name as the column label when it is passed to ``.agg``).
    """
    def _nth_percentile(values):
        return np.percentile(values, n)

    _nth_percentile.__name__ = 'percentile_%s' % n
    return _nth_percentile

In [None]:
#lets drop the rolling mean - does not seem that useful

def get_all_stats(df, cols, rolling_cols, window=50):
    """Build the per-segment feature row(s) for one frame of sensor readings.

    Runs ``get_rolling`` and ``get_stats`` on ``df``, then groups by the
    ``segment`` column and aggregates every other column with mean, max, skew,
    std and the 10th/25th/75th/90th percentiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Readings containing ``cols`` plus a ``segment`` column.
    cols : list of str
        Sensor column names.
    rolling_cols : list of list of str
        Groups of rolling-feature column names forwarded to ``get_stats``.
    window : int, default 50
        Rolling window size forwarded to ``get_rolling``.

    Returns
    -------
    pandas.DataFrame
        One row per segment, indexed by ``segment``. Column names are the
        source column name concatenated directly with the aggregation name,
        with no separator (e.g. ``sensor_1mean``, ``sensor_1percentile_10``).
    """
    df = get_rolling(df, cols, window=window)
    df = get_stats(df, sensor_cols=cols, rolling_cols=rolling_cols)

    feature_cols = [x for x in df.columns if x != 'segment']
    # Fix: np.percentile expects q in [0, 100]. The previous 0.1/0.25/0.75/0.9
    # were quantile fractions and therefore all landed next to the minimum.
    aggregations = ['mean', 'max', 'skew', 'std',
                    percentile(10), percentile(25),
                    percentile(75), percentile(90)]
    df = df.groupby(['segment'])[feature_cols].agg(aggregations)

    # Flatten the (column, aggregation) MultiIndex into single string labels.
    df.columns = [a+b for a, b in df.columns]
    return df

In [None]:
%%time
print(datetime.now())

# Process every training segment. For a quick experiment, replace this with a
# subsample, e.g. train.sample(n=200).index.
sample_index = train.index
print('number of samples', len(sample_index))

# Collect the per-segment feature frames and concatenate ONCE after the loop;
# concatenating inside the loop re-copies the accumulated frame every iteration.
segment_frames = []
rows_so_far = 0

for S in sample_index:

    s_ID = train.loc[S, 'segment_id']
    q = train.loc[S, 'PCNT_TIME']
    print(s_ID)

    temp_df = pd.read_csv(TRAIN_PATH+str(s_ID)+'.csv')
    print('count NAs Percent, ', temp_df.isna().sum().sum() / temp_df.size)
    # Fill gaps from neighbouring readings first; a column that is entirely
    # missing has no neighbours, so it falls back to 0.
    temp_df = temp_df.ffill().bfill()
    temp_df = temp_df.fillna(value=0)
    print('count NAs Percent, ', temp_df.isna().sum().sum() / temp_df.size)
    # NOTE: despite the column name, this stores the normalised rank (PCNT_TIME),
    # not the raw time_to_eruption value.
    temp_df['time_to_eruption'] = q
    temp_df['segment'] = s_ID

    temp_df = get_all_stats(temp_df, SENSOR_COLS, SENSOR_RSTATS, window=50)

    print(temp_df.shape)
    segment_frames.append(temp_df)
    rows_so_far += len(temp_df)
    print((rows_so_far, temp_df.shape[1]))
    print('    ')

# An empty index yields an empty frame, as the original accumulator did.
loaded_dfs = pd.concat(segment_frames, axis=0) if segment_frames else pd.DataFrame()
# The segment-id index is discarded here; rows stay in the order of sample_index.
loaded_dfs = loaded_dfs.reset_index(drop=True)
print(' Total DF Size ',loaded_dfs.shape)
loaded_dfs.head(10)

In [None]:
# Write the feature table to the working directory; index=True also writes the row index as the first CSV column.
loaded_dfs.to_csv('volcano_train_fts.csv', index=True)

In [None]:
# Print the current wall-clock time again, for comparison with the timestamps printed earlier in the notebook.
print(datetime.now())