In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import HuberRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))
import warnings
warnings.filterwarnings("ignore")
        
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from scipy.ndimage import maximum_filter1d
from scipy.ndimage import minimum_filter1d

from datetime import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
# Scaler instances created up front. No later cell visible in this notebook uses them
# (the version notes below say scaling was removed because it is not needed for xgb).
StSc = StandardScaler()
MMS = MinMaxScaler()

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Note: I have shared the TRAIN data feed notebook so that the settings / code used are visible. 

https://www.kaggle.com/davidedwards1/volcano-train-fts-gen-v1/notebook

# This takes a while to run. If you have any ideas to make it faster and to get better features, feel free to share...

# This is a rough first attempt based on summary data for each train dataframe
# So far, a lower CV error has meant a lower LB (leaderboard) error whenever the CV error dropped significantly, BUT the CV error remains consistently lower than the LB error.
# 
# Comments
# Created a ton of features (rolling stats, summaries of rolling stats)
# So far, more features generally works better
# I'm loading these from another workbook/data feed as they take a while to calculate
# I've not spent much time fine-tuning, just some basic directional testing
# Huber/SGD etc. regression doesn't seem to work
# Tree models seem to work better at this stage
# I've chosen XGB as it has GPU option and can handle missing values
# Running multiple seeds
# 
# Version updates
# Replaced zeros with np.nan (missing)
# Removed scaling (not needed for xgb)
# Testing dropping features with a lot more NAs in test data set to see if this reduces CV absolute error compared to LB absolute error
# I have added a correlation feed - this simply uses the corr() function on each set of data to produce a 10 x 10 table of correlations between the sensors over the whole 60001 rows. This did not help.
# I tried a stronger cutoff for dropping features with more missing values in the test set. This did not help.

#added feature importance analysis at the end

In [None]:
# Root folder of the competition input data, and its `train/` sub-folder.
V_PATH = '/kaggle/input/predict-volcanic-eruptions-ingv-oe/'
TRAIN_PATH = f'{V_PATH}train/'

In [None]:
# Names of the ten sensor columns.
SENSOR_COLS = ['sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6',
       'sensor_7', 'sensor_8', 'sensor_9', 'sensor_10']

# Per-sensor names of the rolling-statistic columns created by get_rolling().
# The rolling-mean group uses the '_rmean' suffix; it previously used '_rmin',
# which made it an exact duplicate of SENSOR_RMINS and left the '_rmean'
# columns out of the group summaries computed in get_stats().
SENSOR_RMEANS = [x+'_rmean' for x in SENSOR_COLS]
SENSOR_RSTDS = [x+'_rstd' for x in SENSOR_COLS]
SENSOR_RMINS = [x+'_rmin' for x in SENSOR_COLS]
SENSOR_RMAXES = [x+'_rmax' for x in SENSOR_COLS]
SENSOR_RGRADMEAN = [x+'_grad_rmean' for x in SENSOR_COLS]
SENSOR_RGRADSTD = [x+'_grad_rstd' for x in SENSOR_COLS]

# Groups of rolling columns; get_stats() summarises each group across sensors.
SENSOR_RSTATS = [SENSOR_RMEANS, SENSOR_RSTDS, SENSOR_RMINS, SENSOR_RMAXES,
               SENSOR_RGRADMEAN,  SENSOR_RGRADSTD]

# Column-name prefix for each group above; must stay in the same order as
# SENSOR_RSTATS and contain no duplicates, otherwise one group's summary
# columns overwrite another's.
ROLL_DESCR = ['rmean', 'rstd', 'rmin', 'rmax', 'grad_rmean','grad_rstd']

In [None]:
# Read train.csv from the competition folder, then show its shape, columns and first rows.
train = pd.read_csv(f'{V_PATH}train.csv')
print(train.shape)
print(train.columns)
train.head(5)

In [None]:
# Rough size of the TRAIN data in millions of rows: 60001 rows for each entry in `train`.
total_rows_estimate = len(train) * 60001 / 1000000
print('estimate of total TRAIN rows (millions)',total_rows_estimate)

In [None]:
# Read the submission template; its length gives the number of TEST entries.
sample_submission = pd.read_csv(f'{V_PATH}sample_submission.csv')
print(sample_submission.shape)
print(sample_submission.columns)

# Rough size of the TEST data in millions of rows: 60001 rows per entry.
total_rows_estimate = len(sample_submission) * 60001 / 1000000
print('estimate of total TEST rows (millions)',total_rows_estimate)

sample_submission.head(5)

In [None]:
#examine the distribution of time until eruption
# (the target is divided by 1,000,000 only to keep the axis values readable)

sns.kdeplot(train['time_to_eruption'] / 1000000)

In [None]:
# Minimum, maximum and mean of the raw target.
print(train['time_to_eruption'].min(), train['time_to_eruption'].max(), train['time_to_eruption'].mean())

In [None]:
# Relative rank of each target value: (rank - 1) / (n - 1), so the smallest
# value maps to 0 and the largest to 1 (ties share the highest rank).
sz = train['time_to_eruption'].size-1
target_ranks = train['time_to_eruption'].rank(method='max')
train['PCNT_TIME'] = target_ranks.apply(lambda rank: 1.0*(rank-1)/sz)
train.head(10)

In [None]:
def _fill_gaps(series):
    """Return ``series`` with NaNs back-filled first, then forward-filled."""
    return series.bfill().ffill()


def get_rolling(df, cols, window=50):
    """Add gradient and rolling-window feature columns for each column in ``cols``.

    For every ``col`` the following columns are written into ``df`` (in place):

    * ``col_grad``      - ``np.gradient`` of the signal
    * ``col_grad_abs``  - ``np.gradient`` of the absolute signal
    * ``col_rmin`` / ``col_rmax``   - sliding minimum / maximum of the signal
    * ``col_rmean`` / ``col_rstd``  - centred rolling mean / std of the signal
    * ``col_grad_rmin`` / ``col_grad_rmax`` / ``col_grad_rmean`` / ``col_grad_rstd``
      - the same four statistics computed on ``col_grad_abs``

    NaNs in every derived column (from missing input values or from the
    incomplete windows at the edges of the centred rolling statistics) are
    back-filled and then forward-filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named in ``cols``. It is modified in place.
    cols : iterable of str
        Columns to derive features from.
    window : int, default 50
        Window length used for all sliding/rolling statistics.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    for col in cols:
        signal = df[col].values

        df[col+'_grad'] = np.gradient(signal)
        df[col+'_grad'] = _fill_gaps(df[col+'_grad'])

        # The fill must target '_grad_abs' here. It used to re-fill '_grad',
        # leaving NaNs in '_grad_abs' that leaked into every '_grad_r*' column.
        df[col+'_grad_abs'] = np.gradient(np.abs(signal))
        df[col+'_grad_abs'] = _fill_gaps(df[col+'_grad_abs'])

        # Sliding min/max of the signal (scipy filters, default boundary handling).
        df[col+'_rmin'] = minimum_filter1d(signal, size=window)
        df[col+'_rmax'] = maximum_filter1d(signal, size=window)
        df[col+'_rmin'] = _fill_gaps(df[col+'_rmin'])
        df[col+'_rmax'] = _fill_gaps(df[col+'_rmax'])

        # Centred rolling mean/std; the edge rows are NaN until filled.
        df[col+'_rmean'] = _fill_gaps(df[col].rolling(window=window, center=True).mean())
        df[col+'_rstd'] = _fill_gaps(df[col].rolling(window=window, center=True).std())

        #add also for gradients
        df[col+'_grad_rmin'] = minimum_filter1d(df[col+'_grad_abs'].values, size=window)
        df[col+'_grad_rmax'] = maximum_filter1d(df[col+'_grad_abs'].values, size=window)
        df[col+'_grad_rmin'] = _fill_gaps(df[col+'_grad_rmin'])
        df[col+'_grad_rmax'] = _fill_gaps(df[col+'_grad_rmax'])

        df[col+'_grad_rmean'] = _fill_gaps(df[col+'_grad_abs'].rolling(window=window, center=True).mean())
        df[col+'_grad_rstd'] = _fill_gaps(df[col+'_grad_abs'].rolling(window=window, center=True).std())

    return df

def get_stats(df, sensor_cols=SENSOR_COLS, rolling_cols=SENSOR_RSTATS):
    """Add row-wise summary columns across sensors and across rolling-stat groups.

    Writes into ``df`` (in place) and returns it:

    * ``max`` / ``min`` / ``std`` - per-row statistics over ``sensor_cols``
    * ``max_abs`` / ``min_abs`` / ``std_abs`` - the same over absolute values
    * for the i-th group in ``rolling_cols``: ``<ROLL_DESCR[i]>_max``,
      ``_min``, ``_std`` and ``_mean`` - per-row statistics over that group
    """
    # Statistics over the original sensor columns, signed and absolute.
    sensors = df[sensor_cols]
    sensors_abs = np.abs(sensors)

    df['max'] = sensors.max(axis=1)
    df['min'] = sensors.min(axis=1)
    df['std'] = sensors.std(axis=1)

    df['max_abs'] = sensors_abs.max(axis=1)
    df['min_abs'] = sensors_abs.min(axis=1)
    df['std_abs'] = sensors_abs.std(axis=1)

    # One set of summaries per GROUP of rolling columns, e.g. the max over all
    # sensors' rolling minimums; the prefix comes from ROLL_DESCR by position.
    for position, group in enumerate(rolling_cols):
        prefix = ROLL_DESCR[position]
        group_values = df[group]
        df[prefix+'_max'] = group_values.max(axis=1)
        df[prefix+'_min'] = group_values.min(axis=1)
        df[prefix+'_std'] = group_values.std(axis=1)
        df[prefix+'_mean'] = group_values.mean(axis=1)

    return df

In [None]:
#lets drop the rolling mean - does not seem that useful


def get_all_stats(df, cols, rolling_cols, window=50):
    """Build the rolling and summary features, then aggregate them per segment.

    Runs get_rolling() and get_stats() on ``df``, groups the result by the
    ``segment`` column and takes mean/max/min/std of every other column.
    The two-level result columns are flattened by concatenating the column
    name and the aggregate name with no separator (e.g. ``'max' + 'mean'``).
    """
    enriched = get_rolling(df, cols, window=window)
    enriched = get_stats(enriched, sensor_cols=cols, rolling_cols=rolling_cols)

    value_cols = [name for name in enriched.columns if name != 'segment']
    per_segment = enriched.groupby(['segment'])[value_cols].agg(['mean', 'max', 'min', 'std'])

    per_segment.columns = [feature + stat for feature, stat in per_segment.columns]
    return per_segment



In [None]:
# Pre-computed TRAIN features, read from a separate dataset (the intro notes say they are
# generated in another notebook because they are slow to calculate).
# The first CSV column is used as the frame index.
loaded_dfs = pd.read_csv('/kaggle/input/volcano-train-fts/volcano_train_fts.csv',index_col=0)
print(loaded_dfs.shape)
loaded_dfs.head(10)

In [None]:
# Pre-computed TEST features, read the same way (first CSV column becomes the index).
test_dfs = pd.read_csv('/kaggle/input/volcano-test-features/volcano_test_fts.csv',index_col=0)
print(test_dfs.shape)
test_dfs.head(10)

In [None]:
# Substrings that mark a column as target/identifier rather than a model input.
NON_FTS = ['time_to_eruption', 'segment']
LABEL = 'time_to_eruptionmean'

# Candidate features: every loaded column whose name contains none of the NON_FTS substrings.
REGRESSION_FTS = [
    name for name in loaded_dfs.columns
    if not any(marker in name for marker in NON_FTS)
]
print('Number of features,', len(REGRESSION_FTS))

In [None]:
#it looks from the EDA like some sensors may be quite reasonably correlated
#lets try to create some features by examining differences between stats of sensor 2 and sensor 4
#these sensors (based on some limited sample data) looked much better correlated close to eruptions

s1 = 'sensor_2'
s2 = 'sensor_4'

# Features are matched by substring, so any feature name containing the sensor name is included.
s1_feats = [x for x in REGRESSION_FTS if s1 in x]
s2_feats = [x for x in REGRESSION_FTS if s2 in x]

# Quick look at the first ten names of each list.
print(s1_feats[0:10])
print(s2_feats[0:10])

In [None]:
# Add "<sensor_2 feature> minus <matching sensor_4 feature>" columns to train and test.
# The partner is found by NAME (swap the sensor name inside the feature name) rather than
# by list position, so a difference in order or length between s1_feats and s2_feats can
# no longer pair unrelated statistics. Features without a partner in both frames are skipped.
for sd1 in s1_feats:
    sd2 = sd1.replace(s1, s2)
    if sd2 == sd1 or sd2 not in loaded_dfs.columns or sd2 not in test_dfs.columns:
        continue

    delta_name = sd1+'_delta_'+s2
    loaded_dfs[delta_name] = loaded_dfs[sd1] - loaded_dfs[sd2]
    test_dfs[delta_name] = test_dfs[sd1] - test_dfs[sd2]

    # Guard against duplicates so that re-running this cell does not grow the feature list.
    if delta_name not in REGRESSION_FTS:
        REGRESSION_FTS.append(delta_name)

In [None]:
# Largest value of the label column in the loaded TRAIN features.
loaded_dfs['time_to_eruptionmean'].max()

In [None]:
# Name of the regression target column in loaded_dfs (same value as set in the earlier cell).
LABEL = 'time_to_eruptionmean'

# Density of the label.
sns.kdeplot(loaded_dfs[LABEL])

In [None]:
# Replace missing values with 0 in both frames — presumably so the f_regression screening
# in the next cell receives no NaNs (TODO confirm). A later cell turns zeros in the
# selected features back into NaN.
loaded_dfs = loaded_dfs.fillna(value=0)
test_dfs = test_dfs.fillna(value=0)

In [None]:
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import f_regression

# Univariate F-statistic of every candidate feature against the label
# (first element of f_regression's return value).
regression_importance = pd.Series(
    f_regression(loaded_dfs[REGRESSION_FTS], loaded_dfs[LABEL])[0],
    index=REGRESSION_FTS,
)

# Keep only features scoring above the 25th percentile of the F-statistics.
CUTOFF = regression_importance.quantile(0.25)
above_cutoff = regression_importance > CUTOFF
print('Number of features over cutoff', sum(above_cutoff))
SEL_FTS = regression_importance.index[above_cutoff]

sns.kdeplot(regression_importance)
regression_importance.sort_values(ascending=False).head(20)

In [None]:
# When enabled, every 0 in the selected features becomes NaN ("missing") in both frames.
# This also covers the zeros written by the earlier fillna(0).
FILL_ZEROS=True
if FILL_ZEROS:
    loaded_dfs[SEL_FTS] = loaded_dfs[SEL_FTS].replace(0, np.nan)
    test_dfs[SEL_FTS] = test_dfs[SEL_FTS].replace(0, np.nan)

In [None]:
# Share of missing values per selected feature in TRAIN and TEST, and the gap between them.
# Each share is the mean of the NA mask of its OWN frame. The Train share used to be divided
# by len(train), which is only correct if `train` and `loaded_dfs` have the same row count.
na_analysis = pd.DataFrame(index=SEL_FTS,
                          data=0.0, columns=['Train', 'Test'])

na_analysis['Train'] = loaded_dfs[SEL_FTS].isna().mean().values
na_analysis['Test'] = test_dfs[SEL_FTS].isna().mean().values
na_analysis['Delta'] = na_analysis['Test'] - na_analysis['Train']

fig,axes=plt.subplots(figsize=(10,4))
sns.kdeplot(na_analysis['Train'], color='Green', ax=axes)
sns.kdeplot(na_analysis['Test'], color='Red', ax=axes)
sns.kdeplot(na_analysis['Delta'], color='Blue', ax=axes)
axes.set_title('Distribution of Zeros/NAs')

In [None]:
# Features ordered by how much more often they are missing in TEST than in TRAIN.
na_analysis['Delta'].sort_values()

In [None]:
# Features whose NA share in TEST exceeds the TRAIN share by more than 0.2.
drop_fts = na_analysis.index[na_analysis['Delta']>0.2].tolist()
print(len(drop_fts))

In [None]:
# Show the features that will be dropped.
drop_fts

In [None]:
# Remove the dropped features from the selection; SEL_FTS is a plain list from here on.
_dropped = set(drop_fts)
SEL_FTS = [ft for ft in SEL_FTS if ft not in _dropped]
print(len(SEL_FTS))

In [None]:
# Discretise the label for stratified folds: label * 20 rounded to the nearest integer,
# i.e. one bin per 0.05 of label value.
loaded_dfs['label_strat'] = (loaded_dfs[LABEL] * 20).round(0)
loaded_dfs['label_strat'].value_counts()

In [None]:
from sklearn.model_selection import StratifiedKFold
# Number of CV folds; note the splitter below is named `skf5` but uses NFOLDS (10) splits.
NFOLDS=10
# Neither shuffle nor random_state is passed, so the library defaults apply.
skf5 = StratifiedKFold(n_splits=NFOLDS)

In [None]:
# Columns present in the TRAIN features but absent from the TEST features.
loaded_dfs.columns[~loaded_dfs.columns.isin(test_dfs.columns)]

In [None]:
#run xgb with multiple seeds and gpu support

# Extract the model inputs as plain arrays ONCE (they do not change between seeds or folds).
# StratifiedKFold.split yields positional row indices, so they are applied to these arrays
# instead of the label-based `.loc`, which only matches when the frame index is exactly 0..n-1.
X_train = loaded_dfs[SEL_FTS].values
y_train = loaded_dfs['time_to_eruptionmean'].values
X_test = test_dfs[SEL_FTS].values

# Reference error: MAE of always predicting the mean label.
baseline_error = mean_absolute_error(y_train,
                  np.full((len(loaded_dfs),), loaded_dfs['time_to_eruptionmean'].mean()))

print('baseline error', baseline_error)

# Running SUMS of out-of-fold and test predictions; turned into averages after the loops.
predictions = np.zeros((len(loaded_dfs),))
test_predictions = np.zeros((len(test_dfs),))

# Running sum of per-fit feature importances (not normalised; used for ranking only).
ft_imps=pd.Series(index=SEL_FTS,
                  data=0.0)

rslist=range(20)
rs_errors=[]

for count1, RS in enumerate(rslist):
    # NOTE(review): newer xgboost releases deprecate tree_method='gpu_hist' in favour of
    # tree_method='hist' with device='cuda' — confirm against the installed version.
    xgbr = xgb.XGBRegressor(random_state=RS,
                           tree_method='gpu_hist' ,
                            colsample_bytree=0.5,
                            reg_alpha=0.1,
                            missing =np.nan,
                            subsample=0.75
                       )
    models = [xgbr]
    for count,mod in enumerate(models):
        # How many out-of-fold predictions each row has received once this model's folds
        # are done. Equals (count+1)*(count1+1) for the single-model list used here,
        # and stays correct if more models are added.
        runs_so_far = count1 * len(models) + count + 1

        for trn_idx, val_idx in skf5.split(X_train, loaded_dfs['label_strat']):
            print('run fold')
            mod.fit(X_train[trn_idx], y_train[trn_idx])

            predictions[val_idx] += mod.predict(X_train[val_idx])

            # MAE of the running average on this validation fold.
            print('Fold val Error',mean_absolute_error(y_train[val_idx],
                          predictions[val_idx]/runs_so_far))

            test_predictions += mod.predict(X_test)

            ft_imps += mod.feature_importances_

        print('Error end of model run',mean_absolute_error(y_train,
                          predictions/runs_so_far))

    # CV error of the averaged predictions after this seed, for the error-vs-seed plot.
    rs_errors+=[mean_absolute_error(y_train,
                          predictions/(len(models) * (count1+1)))]

# Each train row is predicted once per (model, seed); each test row once per (model, seed, fold).
predictions = predictions/(len(models)* len(rslist))
test_predictions = test_predictions / (len(models)*NFOLDS * len(rslist))

# Negative predictions are clipped to zero.
predictions = np.clip(predictions, 0, None)
test_predictions = np.clip(test_predictions, 0, None)

print(mean_absolute_error(y_train,
                      predictions))

# CV error converted to raw target units by multiplying with the maximum raw target.
print('Scaled CV error',mean_absolute_error(y_train,
                      predictions) * train['time_to_eruption'].max())


# Left panel: averaged CV prediction vs. actual label for every train row.
# Right panel: CV error of the running average after each random seed.
fig,axes=plt.subplots(nrows=1,ncols=2,figsize=(18,6))
axes[0].scatter(x=loaded_dfs['time_to_eruptionmean'],
           y=predictions, color='Red')
sns.lineplot(x=range(len(rs_errors)),
           y=np.array(rs_errors), ax=axes[1])

axes[0].set_title('CV predictions vs actual time to eruption')
axes[1].set_title('CV error vs random seed cycle')

In [None]:
# Same scaled CV error as printed at the end of the training cell, repeated in its own cell.
print('Scaled CV error',mean_absolute_error(loaded_dfs['time_to_eruptionmean'],
                      predictions) * train['time_to_eruption'].max())

In [None]:
# Compare the distribution of CV predictions (green) with the test predictions (red).
sns.kdeplot(predictions, color='Green')
sns.kdeplot(test_predictions, color='Red')

# let's check feature importance


In [None]:
# Distribution of the accumulated (summed over all fits) feature importances.
sns.kdeplot(ft_imps, color='Green')

In [None]:
# Horizontal bar chart of the 20 features with the largest accumulated importance.
fig,axes=plt.subplots(figsize=(8,20))
ft_imps=ft_imps.sort_values(ascending=False)
axes.barh(y=ft_imps.index[:20], width=ft_imps.iloc[:20])

In [None]:
# The two lengths should match before predictions are written into the template.
print(len(test_predictions), len(sample_submission))

In [None]:
# Template rows before the predictions are filled in.
sample_submission.head(10)

In [None]:
# Convert predictions to raw target units by multiplying with the maximum raw target.
# NOTE(review): this assumes the label was scaled by that same maximum when the features
# were generated, and that test_dfs rows are in the same order as sample_submission — confirm.
sample_submission['time_to_eruption'] = test_predictions * train['time_to_eruption'].max()

In [None]:
# Actual train targets (green) vs. submitted test predictions (red).
sns.kdeplot(train['time_to_eruption'], color='Green')
sns.kdeplot(sample_submission['time_to_eruption'], color='Red')

In [None]:
# Template rows after the predictions have been filled in.
sample_submission.head(10)

In [None]:
# Write the submission file to the current working directory, without the index column.
sample_submission.to_csv('submission.csv', index=False)

In [None]:
# Timestamp at the end of the run.
print(datetime.now())