In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import HuberRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgbm 
from lightgbm import LGBMRegressor

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))
#import warnings
#warnings.filterwarnings("ignore")
        
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from scipy.ndimage import maximum_filter1d
from scipy.ndimage import minimum_filter1d

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
# Module-level scaler instances, created once when this cell runs.
# NOTE(review): neither StSc nor MMS is referenced in the cells visible here -
# confirm they are still needed before keeping them.
StSc = StandardScaler()
MMS = MinMaxScaler()
from sklearn.model_selection import train_test_split, RandomizedSearchCV


In [None]:
# Root of the competition input; the train/ and test/ sub-folders are where the
# per-segment CSV files are read from further down.
V_PATH = '../input/predict-volcanic-eruptions-ingv-oe/'
TRAIN_PATH, TEST_PATH = ('{}{}/'.format(V_PATH, subdir) for subdir in ('train', 'test'))

In [None]:
# Sensor columns that the feature functions read from each segment frame.
SENSOR_COLS = ['sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6',
               'sensor_7', 'sensor_8', 'sensor_9', 'sensor_10']

# Names of the per-sensor rolling-window columns, one list per statistic.
# The suffixes must match the column names written by get_rolling().
# Fix: SENSOR_RMEANS used the '_rmin' suffix, so the "rmean" group silently
# duplicated the rolling-min group.
SENSOR_RMEANS = [x + '_rmean' for x in SENSOR_COLS]
SENSOR_RSTDS = [x + '_rstd' for x in SENSOR_COLS]
SENSOR_RMINS = [x + '_rmin' for x in SENSOR_COLS]
SENSOR_RMAXES = [x + '_rmax' for x in SENSOR_COLS]
SENSOR_RSKEWS = [x + '_rskew' for x in SENSOR_COLS]
SENSOR_RSUMS = [x + '_rsum' for x in SENSOR_COLS]
SENSOR_RVARS = [x + '_rvar' for x in SENSOR_COLS]
# Rolling MAD / kurtosis / gradient groups are intentionally not defined:
# the corresponding columns are not produced by get_rolling().

# Groups of rolling columns, in the same order as ROLL_DESCR below: entry i of
# ROLL_DESCR is the label used for the aggregates of entry i of SENSOR_RSTATS.
SENSOR_RSTATS = [SENSOR_RMEANS, SENSOR_RSTDS, SENSOR_RMINS, SENSOR_RMAXES,
                 SENSOR_RSKEWS, SENSOR_RSUMS, SENSOR_RVARS]

ROLL_DESCR = ['rmean', 'rstd', 'rmin', 'rmax', 'rskew', 'rsum', 'rvar']

In [None]:
# Load the training table, then show its shape, its columns and its first rows.
train = pd.read_csv('../input/predict-volcanic-eruptions-ingv-oe/train.csv')
for summary in (train.shape, train.columns):
    print(summary)

train.head(n=6)

In [None]:
# Keep only the first 200 rows of the training table for this run.
train_small = train.iloc[:200]
train_small

In [None]:
# Build the list of test segment ids from the CSV file names under TEST_PATH.
# - TEST_PATH is used instead of re-deriving the folder (the old V_PATH+'/test/'
#   produced a double slash).
# - Only *.csv files are considered and the extension is removed with splitext
#   rather than a blind [:-4] slice.
# - The ids are sorted so that any "first N" subset taken later is reproducible;
#   os.walk gives no ordering guarantee.
test_files = sorted(
    os.path.splitext(filename)[0]
    for _, _, filenames in os.walk(TEST_PATH)
    for filename in filenames
    if filename.endswith('.csv')
)

test = pd.DataFrame(test_files, columns=["segment_id"])
test.head(6)

In [None]:
# Keep only the first 200 test segment ids for this run.
test_small = test.iloc[:200]
test_small

In [None]:
# Load the sample submission file and preview its first five rows.
sample_submission = pd.read_csv(
    '../input/predict-volcanic-eruptions-ingv-oe/sample_submission.csv'
)
sample_submission.head(n=5)

In [None]:
# Histogram of the regression target (100 bins, counts on the y axis).
# Drawn with matplotlib directly: seaborn's distplot is deprecated, and with
# hist=True / kde=False it was only producing a plain histogram anyway.
fig, ax = plt.subplots()
ax.hist(train['time_to_eruption'], bins=100, color='blue', edgecolor='black')
ax.set(title='Distribution of time_to_eruption',
       xlabel='time_to_eruption',
       ylabel='Count')
plt.show()

In [None]:

#pandasDataFrame.rolling(window=window_size).apply(mad) 
#pandas.rolling_kurt(pandasDataFrame, window=window_size)

def get_rolling(df, cols, window=50):
    """Add rolling-window feature columns for every column named in ``cols``.

    For each ``col`` the following columns are written to ``df``:
    ``col_rmin``, ``col_rmax``, ``col_rmean``, ``col_rstd``, ``col_rsum``,
    ``col_rskew`` and ``col_rvar``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns in ``cols``. It is modified in place.
    cols : iterable of str
        Names of the columns to compute rolling features for.
    window : int, default 50
        Window length, in rows, used for every statistic.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    for col in cols:
        series = df[col]

        # Running min / max come from scipy's 1-D filters, which return a value
        # for every row (no NaN edges of their own).
        df[col+'_rmin'] = minimum_filter1d(series.values, size=window)
        df[col+'_rmax'] = maximum_filter1d(series.values, size=window)

        # .bfill().ffill() replaces the deprecated fillna(method=...) spelling;
        # it only changes anything if the filter output contains NaN.
        df[col+'_rmin'] = df[col+'_rmin'].bfill().ffill()
        df[col+'_rmax'] = df[col+'_rmax'].bfill().ffill()

        # Centred pandas window: rows near both ends, where the window is
        # incomplete, come out as NaN and are filled from the nearest valid
        # value (backward fill first, then forward fill).
        rolled = series.rolling(window=window, center=True)
        df[col+'_rmean'] = rolled.mean().bfill().ffill()
        df[col+'_rstd'] = rolled.std().bfill().ffill()

        df[col+'_rsum'] = rolled.sum().bfill().ffill()
        df[col+'_rskew'] = rolled.skew().bfill().ffill()

        df[col+'_rvar'] = rolled.var().bfill().ffill()  # rolling variance
        # Rolling mean-absolute-deviation and kurtosis are not computed.

    return df

def get_stats(df, sensor_cols=SENSOR_COLS, rolling_cols=SENSOR_RSTATS):
    """Add row-wise summary columns across sensors and across rolling groups.

    Two families of columns are written to ``df``:

    * ``max``, ``min``, ``std``, ``sum``, ``skew``, ``var`` and ``mad``:
      computed per row across the ``sensor_cols`` columns.
    * ``<descr>_max``, ``_mean``, ``_min``, ``_std``, ``_sum``, ``_skew`` and
      ``_var``: computed per row across each group of rolling columns, where
      ``<descr>`` is the ROLL_DESCR entry at the same position as the group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already contains ``sensor_cols`` and every column listed in
        ``rolling_cols``. It is modified in place.
    sensor_cols : list of str
        Raw sensor column names.
    rolling_cols : list of list of str
        Groups of rolling-feature column names, ordered like ROLL_DESCR.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    # Row-wise statistics over the raw sensor columns.
    sensors = df[sensor_cols]
    df['max'] = sensors.max(axis=1)
    df['min'] = sensors.min(axis=1)
    df['std'] = sensors.std(axis=1)
    df['sum'] = sensors.sum(axis=1)
    df['skew'] = sensors.skew(axis=1)
    df['var'] = sensors.var(axis=1)
    # DataFrame.mad() was removed in pandas 2.0; this is the same quantity:
    # the mean absolute deviation of each row from that row's mean.
    df['mad'] = sensors.sub(sensors.mean(axis=1), axis=0).abs().mean(axis=1)

    # One set of row-wise aggregates per GROUP of rolling columns,
    # e.g. the max over all sensors' rolling minima.
    for descr, group_cols in zip(ROLL_DESCR, rolling_cols):
        group = df[group_cols]
        df[descr+'_max'] = group.max(axis=1)
        df[descr+'_mean'] = group.mean(axis=1)
        df[descr+'_min'] = group.min(axis=1)
        df[descr+'_std'] = group.std(axis=1)
        df[descr+'_sum'] = group.sum(axis=1)
        df[descr+'_skew'] = group.skew(axis=1)
        df[descr+'_var'] = group.var(axis=1)

    return df

In [None]:
#credit - stack overflow

def percentile(n):
    """Return a one-argument aggregation function computing the ``n``-th percentile.

    ``n`` is handed to ``np.percentile`` unchanged. The returned function is
    renamed to ``'percentile_<n>'`` so it can be told apart from other
    percentile aggregators by its ``__name__``.
    """
    def percentile_(values):
        return np.percentile(a=values, q=n)

    percentile_.__name__ = 'percentile_%s' % n
    return percentile_

In [None]:
#lets drop the rolling mean - does not seem that useful

def get_all_stats(df, cols, rolling_cols, window=50):
    """Turn one segment's raw readings into a single row of aggregate features.

    Steps: add the rolling-window columns (get_rolling), add the row-wise
    summary columns (get_stats), then aggregate every column except
    ``'segment'`` per segment.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw readings with the columns in ``cols`` plus a ``'segment'`` column.
    cols : list of str
        Sensor column names.
    rolling_cols : list of list of str
        Groups of rolling-feature column names, as expected by get_stats.
    window : int, default 50
        Rolling window length passed to get_rolling.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``'segment'`` value (used as the index). Column
        names are the source column name immediately followed by the
        aggregation name, with no separator.
    """
    df = get_rolling(df, cols, window=window)
    df = get_stats(df, sensor_cols=cols, rolling_cols=rolling_cols)

    feature_cols = [x for x in df.columns if x != 'segment']

    # percentile() forwards its argument to np.percentile, which expects a
    # value on the 0-100 scale. The previous fractions (0.01 ... 0.99) therefore
    # produced the 0.01th ... 0.99th percentiles instead of the 1st ... 99th.
    aggregations = ['mean', 'max', 'skew', 'std', 'min', 'sum', 'var'] + [
        percentile(q) for q in (1, 10, 25, 50, 75, 90, 99)
    ]

    df = df.groupby(['segment'])[feature_cols].agg(aggregations)
    # Flatten the (column, aggregation) MultiIndex into plain strings.
    df.columns = [a+b for a, b in df.columns]
    return df

In [None]:
# Build the training feature table: one row of aggregate features per segment.
sample_index = train_small.index
train_feature_frames = []
for count, S in enumerate(sample_index):

    s_ID = train_small.loc[S, 'segment_id']
    temp_df = pd.read_csv(TRAIN_PATH+str(s_ID)+'.csv')
    temp_df = temp_df.fillna(value=0)  # missing readings are treated as 0

    temp_df['segment'] = s_ID
    # Collect the per-segment rows in a list and concatenate once afterwards;
    # concatenating inside the loop re-copies the growing frame every time.
    train_feature_frames.append(
        get_all_stats(temp_df, SENSOR_COLS, SENSOR_RSTATS, window=50))
    if count%50 == 0:
        print('Processing segment {} (segment_id={})'.format(count, s_ID))

loaded_dfs = pd.concat(train_feature_frames, axis=0)

# The segment id lives in the index produced by the groupby inside
# get_all_stats. Turn it into a 'segment_id' column instead of dropping it
# (reset_index(drop=True) discarded it, leaving nothing to join on).
loaded_dfs = loaded_dfs.rename_axis('segment_id').reset_index()

# Attach the target: the modelling cells need 'time_to_eruption' in this frame.
loaded_dfs = pd.merge(loaded_dfs, train_small, on='segment_id')

loaded_dfs.to_csv('volcano_train_small_fts.csv', index=True)
loaded_dfs.head(10)

In [None]:
# Build the test feature table: one row of aggregate features per segment.
sample_index = test_small.index
test_feature_frames = []
for count, S in enumerate(sample_index):

    s_ID = test_small.loc[S, 'segment_id']
    temp_df = pd.read_csv(TEST_PATH+str(s_ID)+'.csv')
    temp_df = temp_df.fillna(value=0)  # missing readings are treated as 0

    temp_df['segment'] = s_ID
    # Collect the per-segment rows in a list and concatenate once afterwards;
    # concatenating inside the loop re-copies the growing frame every time.
    test_feature_frames.append(
        get_all_stats(temp_df, SENSOR_COLS, SENSOR_RSTATS, window=50))
    if count%50 == 0:
        print('Processing segment {} (segment_id={})'.format(count, s_ID))

test_dfs = pd.concat(test_feature_frames, axis=0)

# The segment id lives in the index produced by the groupby inside
# get_all_stats. Turn it into a 'segment_id' column instead of dropping it:
# with reset_index(drop=True) the frame had no 'segment_id' column and the
# merge on that key failed. test_small only holds 'segment_id', so once the
# column exists no merge is needed.
test_dfs = test_dfs.rename_axis('segment_id').reset_index()

test_dfs.to_csv('volcano_test_small_fts.csv', index=True)
test_dfs.head(10)

In [None]:
# Separate the target from the feature columns, then hold out 20% of the rows
# for validation (fixed seed so the split is repeatable).
y = loaded_dfs['time_to_eruption']
X = loaded_dfs.drop(columns=['segment_id', 'time_to_eruption'])

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Hyper-parameters for the LightGBM regressor. Most are the library defaults;
# the ones that deviate are marked.
#
# Each setting is given once, under its scikit-learn-API name. The previous
# version also passed the native aliases with the same values
# (min_data_in_leaf = min_child_samples, feature_fraction = colsample_bytree,
# bagging_fraction = subsample), which only makes LightGBM warn about
# conflicting aliases. The `silent` argument is no longer passed because it is
# deprecated / removed in recent LightGBM releases.
params = {
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'max_depth': -1,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'subsample_for_bin': 200,   # 200000 is default
    'min_split_gain': 0.5,      # 0.0 is default
    'min_child_weight': 1e-3,
    'min_child_samples': 20,
    'subsample': 1,
    'colsample_bytree': 1.0,
    'random_state': 42,
    #'device': 'cpu', # you can use GPU to achieve faster learning
}

# Instantiate the regressor straight from the dict so the two cannot drift apart.
model_lgbm_regr = LGBMRegressor(**params)

# To view the names of all model parameters:
model_lgbm_regr.get_params().keys()

In [None]:
# Fit with MAE evaluated on both the training and the validation split.
# Early stopping (50 rounds) and the logging period (every 200 iterations) are
# given as callbacks: the `early_stopping_rounds=` / `verbose=` arguments of
# fit() were removed in LightGBM 4. Requires lightgbm >= 3.3 for log_evaluation.
model_lgbm_regr.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_metric="mae",
    callbacks=[
        lgbm.early_stopping(stopping_rounds=50),
        lgbm.log_evaluation(period=200),
    ],
)


In [None]:
# Predict on the test features; the id column is not a model input.
predictions = model_lgbm_regr.predict(test_dfs.drop('segment_id', axis=1))

In [None]:
# Assemble the submission table (segment id + predicted target) and write it out.
submission = pd.DataFrame({
    'segment_id': test_dfs["segment_id"],
    'time_to_eruption': predictions,
})
submission.to_csv('submission.csv', header=True, index=False)

In [None]:
# Preview the first six rows of the submission.
submission.head(n=6)