* Thanks to Chris for his clean dataset
* Thanks to https://www.kaggle.com/martxelo/fe-and-ensemble-mlp-and-lgbm for signal processing features
* Thanks to https://www.kaggle.com/jazivxt/physically-possible for aggregate features
    
Hyperparameters were obtained from a simple local Bayesian optimization (can be improved)

Feature selection can improve score

More feature engineering can improve score

I hope this kernel helps you in your work

In [0]:
# Mount Google Drive at /content/drive (Colab only); the CSV files are read from a Drive folder later on.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
!pip install slackweb



In [0]:
import numpy as np 
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
warnings.filterwarnings('ignore')
import lightgbm as lgb
from sklearn.model_selection import GroupKFold, StratifiedKFold, train_test_split
from sklearn import metrics
from tqdm import tqdm
from scipy import signal
import slackweb
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 10000)

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [0]:
def read_data(data_dir='./drive/My Drive/Colab Notebooks/liverpool-ion-switching'):
    '''
    Load the training set, the testing set and the sample submission.

    Parameters
    ----------
    data_dir : str
        Directory holding ``train_clean.csv``, ``test_clean.csv`` and
        ``sample_submission.csv``. Defaults to the Drive folder this
        notebook was written against, so ``read_data()`` behaves as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, submission)``.
    '''
    print('Reading training, testing and submission data...')
    train = pd.read_csv(f'{data_dir}/train_clean.csv')
    test = pd.read_csv(f'{data_dir}/test_clean.csv')
    # `time` is read as text so pandas does not parse it into floats
    submission = pd.read_csv(f'{data_dir}/sample_submission.csv', dtype={'time':str})
    print('Train set has {} rows and {} columns'.format(train.shape[0], train.shape[1]))
    print('Test set has {} rows and {} columns'.format(test.shape[0], test.shape[1]))
    return train, test, submission

def get_batch(train, test, batch=50, total_batches=14):
    '''
    Label every row of train and test with the time batch it belongs to.

    Rows whose ``time`` lies in ``(i * batch, (i + 1) * batch]`` get
    ``batch = i + 1`` for ``i`` in ``0 .. total_batches - 1``; rows outside
    every interval keep ``NaN``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``time`` column. They are not modified.
    batch : number
        Width of one batch, in the units of ``time``.
    total_batches : int
        Number of consecutive batches to label, counted from ``time = 0``.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train, test)`` frames carrying an extra ``batch`` column.
        Because the two frames are concatenated internally, each result has
        the union of both inputs' columns (missing values are NaN).
    '''
    # tag copies rather than the inputs, so the caller's frames do not gain a `set` column
    data = pd.concat([train.assign(set='train'), test.assign(set='test')])
    for i in range(int(total_batches)):
        in_batch = (data['time'] > i * batch) & (data['time'] <= (i + 1) * batch)
        data.loc[in_batch, 'batch'] = i + 1
    # drop() without inplace returns fresh frames instead of mutating filtered slices
    train = data[data['set'] == 'train'].drop(columns=['set'])
    test = data[data['set'] == 'test'].drop(columns=['set'])
    return train, test

def reduce_mem_usage(df, verbose=True):
    '''
    Downcast the numeric columns of `df` in place to save memory.

    Each int/float column (except `open_channels`, which is never touched) is
    cast to the first dtype of its ladder whose limits strictly contain the
    column's min and max. Integer columns that fit no candidate are left
    alone; float columns fall back to float64.

    Prints the final memory footprint when `verbose` is true and returns the
    same `df` object.
    '''
    numeric_names = ('int16', 'int32', 'int64', 'float16', 'float32', 'float64')
    int_ladder = (np.int8, np.int16, np.int32, np.int64)
    float_ladder = (np.float16, np.float32)

    mem_before = df.memory_usage().sum() / 1024**2
    for name in df.columns:
        if name == 'open_channels':
            continue
        kind = df[name].dtypes
        if kind not in numeric_names:
            continue

        lo = df[name].min()
        hi = df[name].max()
        if str(kind)[:3] == 'int':
            ladder, limits, fallback = int_ladder, np.iinfo, None
        else:
            ladder, limits, fallback = float_ladder, np.finfo, np.float64

        # strict comparisons: a value sitting exactly on a dtype limit moves to the next dtype
        target = next(
            (t for t in ladder if lo > limits(t).min and hi < limits(t).max),
            fallback,
        )
        if target is not None:
            df[name] = df[name].astype(target)

    mem_after = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(mem_after, 100 * (mem_before - mem_after) / mem_before))
    return df

In [0]:
# signal processing features
def calc_gradients(s, n_grads = 4):
    '''
    Successive numerical derivatives of a pandas series.
    Column `grad_k` holds np.gradient applied k times to the values of `s`,
    so every column has as many samples as the input.
    '''
    derivatives = {}
    current = s.values
    for order in range(1, n_grads + 1):
        current = np.gradient(current)
        derivatives['grad_' + str(order)] = current

    return pd.DataFrame(derivatives)

def _calc_butter_features(s, wns, btype, prefix):
    '''
    Shared worker for calc_low_pass / calc_high_pass.

    For every cutoff in `wns` a first-order Butterworth filter of kind `btype`
    is applied twice to the values of `s`:
      * `<prefix>_lf_<wn>`: one forward pass (lfilter), started from the
        steady state of the first sample so there is no start-up transient;
      * `<prefix>_ff_<wn>`: forward-backward pass (filtfilt).
    `<wn>` is the cutoff formatted with four decimals.
    '''
    x = s.values
    columns = {}
    for wn in wns:
        b, a = signal.butter(1, Wn=wn, btype=btype)
        zi = signal.lfilter_zi(b, a)
        tag = '%.4f' % wn
        # lfilter returns (filtered, final_state) when zi is given; keep the signal only
        columns[prefix + '_lf_' + tag] = signal.lfilter(b, a, x, zi=zi*x[0])[0]
        columns[prefix + '_ff_' + tag] = signal.filtfilt(b, a, x)

    return pd.DataFrame(columns)

def calc_low_pass(s, n_filts=10):
    '''
    Applies low pass filters to the signal. Left delayed and no delayed.
    `n_filts` cutoffs are spaced logarithmically from 10**-2 to 10**-0.3.
    Returns a DataFrame with two columns (`lowpass_lf_*`, `lowpass_ff_*`) per cutoff.
    '''
    return _calc_butter_features(s, np.logspace(-2, -0.3, n_filts), 'low', 'lowpass')

def calc_high_pass(s, n_filts=10):
    '''
    Applies high pass filters to the signal. Left delayed and no delayed.
    `n_filts` cutoffs are spaced logarithmically from 10**-2 to 10**-0.1.
    Returns a DataFrame with two columns (`highpass_lf_*`, `highpass_ff_*`) per cutoff.
    '''
    return _calc_butter_features(s, np.logspace(-2, -0.1, n_filts), 'high', 'highpass')

def calc_ewm(s, windows=[10, 50, 100, 500, 1000]):
    '''
    Exponentially weighted mean and standard deviation of `s`.
    One `ewm_mean_<w>` / `ewm_std_<w>` column pair is produced for each span
    `w` in `windows`; missing values in the result are replaced by 0.
    '''
    columns = {}
    for span in windows:
        weighted = s.ewm(span=span, min_periods=1)
        columns['ewm_mean_' + str(span)] = weighted.mean()
        columns['ewm_std_' + str(span)] = weighted.std()

    # the std is NaN where it cannot be computed (e.g. the first sample): use 0 instead
    return pd.DataFrame(columns).fillna(value=0)


def add_features(s):
    '''
    Build the full feature table for one signal segment: the signal itself
    followed, column-wise, by its gradient, low-pass, high-pass and
    exponentially-weighted features.
    '''
    feature_blocks = [
        calc_gradients(s),
        calc_low_pass(s),
        calc_high_pass(s),
        calc_ewm(s),
    ]

    return pd.concat([s] + feature_blocks, axis=1)


def divide_and_add_features(s, signal_size=500000):
    '''
    Scale the signal by 1/15, cut it into consecutive bags of `signal_size`
    samples and compute the features of each bag independently.
    The per-bag tables are stacked row-wise; each bag is re-indexed from 0,
    so the returned index repeats.
    '''
    scaled = s / 15.0

    # int() truncates: samples after the last complete bag are not processed
    n_bags = int(scaled.shape[0]/signal_size)
    bags = [
        add_features(scaled[k*signal_size:(k+1)*signal_size].copy().reset_index(drop=True))
        for k in tqdm(range(n_bags))
    ]

    return pd.concat(bags, axis=0)

In [0]:
# rolling and aggregate batch features
def _add_window_stats(df, grouped, window, offset, suffix):
    '''
    Add, in place, the rolling mean/std/var/min/max of `signal` (computed per
    batch on the signal shifted by `offset`) plus the derived `norm` column.
    Column names are `signal<stat>_t<window><suffix>` and `norm_t<window><suffix>`.
    '''
    tag = '_t' + str(window) + suffix
    for stat in ('mean', 'std', 'var', 'min', 'max'):
        df['signal' + stat + tag] = grouped.transform(
            lambda x: getattr(x.shift(offset).rolling(window), stat)())
    low, high = df['signalmin' + tag], df['signalmax' + tag]
    # position of the current sample inside the window's [min, max] range,
    # rescaled by floor(max) - ceil(min)
    min_max = (df['signal'] - low) / (high - low)
    df['norm' + tag] = min_max * (np.floor(high) - np.ceil(low))


def rolling_features(train, test, windows=(1000, 5000, 10000, 20000, 40000, 80000)):
    '''
    Lag, lead and rolling-window features of `signal`, computed inside each `batch`.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with `signal` and `batch` columns. They are not modified.
    windows : iterable of int
        Rolling window lengths, in samples. The default reproduces the
        original hard-coded list.

    Returns
    -------
    tuple of pandas.DataFrame
        Copies of `train` and `test` with these columns appended:
        `lag_t1..3`, `lead_t1..3` and, for every window, the backward
        statistics (`signal<stat>_t<w>`, `norm_t<w>`) followed by the forward
        ones (same names with a `_lead` suffix).
    '''
    pre_train = train.copy()
    pre_test = test.copy()

    for df in [pre_train, pre_test]:
        # one groupby per frame, reused for every feature below
        grouped = df.groupby('batch')['signal']

        for step in (1, 2, 3):
            df['lag_t' + str(step)] = grouped.transform(lambda x: x.shift(step))
        for step in (1, 2, 3):
            df['lead_t' + str(step)] = grouped.transform(lambda x: x.shift(-step))

        for window in windows:
            # roll backwards: samples t-window .. t-1
            _add_window_stats(df, grouped, window, 1, '')
            # roll forward: samples t+2 .. t+window+1
            # NOTE(review): the forward window skips sample t+1, so it is not the
            # mirror image of the backward one (that would be shift(-window)).
            # Kept as-is because changing it alters the feature values - confirm intent.
            _add_window_stats(df, grouped, window, -window - 1, '_lead')

    return pre_train, pre_test

def static_batch_features(df, n):
    '''
    Aggregate statistics of `signal` over fixed blocks of `n` samples.

    Rows are ordered by `time`, and `round(time * 10000) - 1` is used as a
    zero-based sample position (the visible data has samples 0.0001 apart
    starting at 0.0001). From it the function derives:
      * `batch_<n>`         - block number (position // n);
      * `batch_index_<n>`   - position inside the block;
      * `batch_slices_<n>`  - tenth of the block the sample falls in;
      * `batch_slices2_<n>` - text key combining block and slice.
    For both the block and the block/slice key it computes the group mean,
    median, max, min, std, percentiles (10/25/75/90), skew, kurtosis, norm,
    mean absolute change, abs max/min, range, max-to-min ratio and abs average,
    and broadcasts them back to the rows. Finally every statistic also gets a
    `<name>_msignal` column holding the statistic minus `signal`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `time`, `signal` and `batch` columns. It is not modified;
        `batch` is dropped from the result.
    n : int
        Block length in samples.

    Returns
    -------
    pandas.DataFrame
        Time-sorted copy with a fresh 0..len-1 index and the columns above.
    '''
    batch_col = 'batch_' + str(n)
    index_col = 'batch_index_' + str(n)
    slices_col = 'batch_slices_' + str(n)
    key_col = 'batch_slices2_' + str(n)

    df = df.copy()
    df.drop('batch', inplace = True, axis = 1)
    df = df.sort_values(by=['time']).reset_index(drop=True)
    # Round before using the position: time * 10000 is only approximately an
    # integer in floating point, and a value a hair below a multiple of n
    # would otherwise be floor-divided into the previous block.
    df.index = (np.round(df.time * 10000) - 1).values
    df[batch_col] = df.index // n
    df[index_col] = df.index  - (df[batch_col] * n)
    df[slices_col] = df[index_col]  // (n / 10)
    # Same '<block>_<slice>' text as before, built in one pass over two columns
    # instead of a row-wise DataFrame.apply.
    df[key_col] = [
        '_'.join([str(block).zfill(3), str(part).zfill(3)])
        for block, part in zip(df[batch_col], df[slices_col])
    ]

    for c in [batch_col, key_col]:
        grouped = df.groupby([c])['signal']
        d = {}
        # -----------------------------------------------
        d['mean' + c] = grouped.mean()
        d['median' + c] = grouped.median()
        d['max' + c] = grouped.max()
        d['min' + c] = grouped.min()
        d['std' + c] = grouped.std()
        d['p10' + c] = grouped.apply(lambda x: np.percentile(x, 10))
        d['p25' + c] = grouped.apply(lambda x: np.percentile(x, 25))
        d['p75' + c] = grouped.apply(lambda x: np.percentile(x, 75))
        d['p90' + c] = grouped.apply(lambda x: np.percentile(x, 90))
        d['skew' + c] = grouped.apply(lambda x: pd.Series(x).skew())
        d['kurtosis' + c] = grouped.apply(lambda x: pd.Series(x).kurtosis())
        min_max = (d['mean' + c] - d['min' + c]) / (d['max' + c] - d['min' + c])
        d['norm' + c] = min_max * (np.floor(d['max' + c]) - np.ceil(d['min' + c]))
        d['mean_abs_chg' + c] = grouped.apply(lambda x: np.mean(np.abs(np.diff(x))))
        d['abs_max' + c] = grouped.apply(lambda x: np.max(np.abs(x)))
        d['abs_min' + c] = grouped.apply(lambda x: np.min(np.abs(x)))
        d['range' + c] = d['max' + c] - d['min' + c]
        d['maxtomin' + c] = d['max' + c] / d['min' + c]
        d['abs_avg' + c] = (d['abs_min' + c] + d['abs_max' + c]) / 2
        # -----------------------------------------------
        # broadcast every per-group statistic back onto the rows of its group
        for v in d:
            df[v] = df[c].map(d[v].to_dict())

    not_statistics = ['time', 'signal', 'open_channels', 'batch', batch_col,
                      index_col, slices_col, key_col]
    for c in [c1 for c1 in df.columns if c1 not in not_statistics]:
        df[c + '_msignal'] = df[c] - df['signal']

    df.reset_index(drop = True, inplace = True)

    return df

In [0]:
# simple lgbm with 5 stratified KFold (function has the option for a return for bayesian optimization, just ignore this)
def run_lgb(pre_train, pre_test, features, params):
    '''
    Train LightGBM with 5 shuffled stratified folds and predict the test set.

    Parameters
    ----------
    pre_train : pandas.DataFrame
        Training rows; must contain `features` and the `open_channels` target.
    pre_test : pandas.DataFrame
        Test rows; must contain `features`.
    features : list of str
        Columns fed to the model.
    params : dict
        LightGBM parameters passed to `lgb.train`.

    Returns
    -------
    numpy.ndarray
        Test predictions averaged over the folds, clipped to [0, 10] and
        rounded to integers. The out-of-fold RMSE (on raw predictions) and
        macro F1 (on clipped/rounded predictions) are printed.
    '''
    kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    target = 'open_channels'
    oof_pred = np.zeros(len(pre_train))
    y_pred = np.zeros(len(pre_test))
    # column selection does not depend on the fold: do it once
    x_test = pre_test[features]

    for fold, (tr_ind, val_ind) in enumerate(kf.split(pre_train, pre_train[target])):
        x_train, x_val = pre_train[features].iloc[tr_ind], pre_train[features].iloc[val_ind]
        # kf.split yields positions, so the target must be sliced positionally as well;
        # label-based [] only matched the features when the index was exactly 0..N-1
        y_train, y_val = pre_train[target].iloc[tr_ind], pre_train[target].iloc[val_ind]
        train_set = lgb.Dataset(x_train, y_train)
        val_set = lgb.Dataset(x_val, y_val)

        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments of
        # older LightGBM releases; newer ones expect callbacks instead - confirm against
        # the installed version before upgrading.
        model = lgb.train(params, train_set, num_boost_round = 10000, early_stopping_rounds = 50,
                         valid_sets = [train_set, val_set], verbose_eval = 100)

        oof_pred[val_ind] = model.predict(x_val)

        y_pred += model.predict(x_test) / kf.n_splits

    rmse_score = np.sqrt(metrics.mean_squared_error(pre_train[target], oof_pred))
    # want to clip and then round predictions (you can get a better performance using optimization to found the best cuts)
    oof_pred = np.round(np.clip(oof_pred, 0, 10)).astype(int)
    round_y_pred = np.round(np.clip(y_pred, 0, 10)).astype(int)
    f1 = metrics.f1_score(pre_train[target], oof_pred, average = 'macro')


    print(f'Our oof rmse score is {rmse_score}')
    print(f'Our oof macro f1 score is {f1}')
    return round_y_pred

In [0]:
!pip install pykalman



In [0]:
from pykalman import KalmanFilter

def Kalman1D(observations,damping=1):
    '''
    Smooth a 1-D sequence of observations with a scalar Kalman filter.

    The filter uses a state transition of 1 and a transition covariance of
    0.5; `damping` is used both as the observation covariance and as the
    initial state covariance, and the first observation is the initial state
    mean.

    Returns the first value returned by `KalmanFilter.smooth` (the smoothed
    states); the second (their covariances) is discarded.
    '''
    smoother = KalmanFilter(
            initial_state_mean=observations[0],
            initial_state_covariance=damping,
            observation_covariance=damping,
            transition_covariance=0.5*np.eye(1),
            transition_matrices=1
        )
    smoothed_states, _ = smoother.smooth(observations)
    return smoothed_states

In [0]:
# Feature engineering, part 1 (signal processing features).
# Start by loading the train/test signals and the sample submission.
train, test, submission = read_data()

Reading training, testing and submission data...
Train set has 5000000 rows and 3 columns
Test set has 2000000 rows and 2 columns


In [0]:
# Kalman Filter: replace the raw signal of both sets with its smoothed version.
# NOTE: this overwrites `signal` in place, so re-running the cell smooths the
# already-smoothed signal again; reload the data first.
# observation_covariance = .0015  (value tried earlier)
observation_covariance = .0006
train['signal'] = Kalman1D(train.signal.values,observation_covariance)
test['signal'] = Kalman1D(test.signal.values,observation_covariance)

# Optional "finished" ping on Slack. The webhook URL is a credential, so it is
# read from the SLACK_WEBHOOK_URL environment variable instead of being written
# into the notebook; when the variable is not set the notification is skipped.
# SECURITY: the URL that used to be hard-coded here has been published with the
# notebook and should be revoked in Slack.
slack_webhook_url = os.environ.get('SLACK_WEBHOOK_URL')
if slack_webhook_url:
    slack = slackweb.Slack(url=slack_webhook_url)
    # the message text is Japanese for "finished"
    slack.notify(text="おわった")

'ok'

In [0]:


# Signal-processing features for both sets (computed per bag of samples).
pre_train4 = divide_and_add_features(train['signal'])
pre_test4 = divide_and_add_features(test['signal'])

# Drop the scaled copy of `signal` that the feature tables carry, give the
# stacked bags one continuous 0..N-1 index, then shrink the dtypes.
pre_train4 = reduce_mem_usage(pre_train4.drop(columns=['signal']).reset_index(drop=True))
pre_test4 = reduce_mem_usage(pre_test4.drop(columns=['signal']).reset_index(drop=True))


100%|██████████| 10/10 [00:08<00:00,  1.21it/s]
100%|██████████| 4/4 [00:03<00:00,  1.28it/s]


Mem. usage decreased to 514.98 Mb (75.0% reduction)
Mem. usage decreased to 205.99 Mb (75.0% reduction)


In [0]:
# feature engineering part 2 (rolling and aggregate features)
train, test = get_batch(train, test)

# lag / lead / rolling-window features, downcast right after they are built
pre_train1, pre_test1 = map(reduce_mem_usage, rolling_features(train, test))

# static statistics over blocks of 25000 samples, train first, then test
pre_train2, pre_test2 = (
    reduce_mem_usage(static_batch_features(frame, 25000))
    for frame in (train, test)
)

Mem. usage decreased to 848.77 Mb (73.2% reduction)
Mem. usage decreased to 339.51 Mb (73.2% reduction)
Mem. usage decreased to 810.62 Mb (73.1% reduction)
Mem. usage decreased to 324.25 Mb (73.1% reduction)


In [0]:

# the raw frames are no longer needed once every feature table is built
del train, test
gc.collect()

# join features for training
# keep only the statistics from the static-batch table: the base columns already
# come with pre_train1 / pre_test1 and the block bookkeeping columns are not features
is_bookkeeping = pre_train2.columns.isin([
    'open_channels', 'signal', 'time', 'batch_25000',
    'batch_index_25000', 'batch_slices_25000', 'batch_slices2_25000',
])
feat2 = pre_train2.columns[~is_bookkeeping].tolist()
pre_train = pd.concat([pre_train1, pre_train2[feat2], pre_train4], axis = 1)
pre_test = pd.concat([pre_test1, pre_test2[feat2], pre_test4], axis = 1)
del pre_train1, pre_train2, pre_train4, pre_test1, pre_test2, pre_test4

In [0]:
# Inspect the assembled training feature table (the display is truncated to the first and last rows).
pre_train

Unnamed: 0,time,signal,open_channels,batch,lag_t1,lag_t2,lag_t3,lead_t1,lead_t2,lead_t3,signalmean_t1000,signalstd_t1000,signalvar_t1000,signalmin_t1000,signalmax_t1000,norm_t1000,signalmean_t1000_lead,signalstd_t1000_lead,signalvar_t1000_lead,signalmin_t1000_lead,signalmax_t1000_lead,norm_t1000_lead,signalmean_t5000,signalstd_t5000,signalvar_t5000,signalmin_t5000,signalmax_t5000,norm_t5000,signalmean_t5000_lead,signalstd_t5000_lead,signalvar_t5000_lead,signalmin_t5000_lead,signalmax_t5000_lead,norm_t5000_lead,signalmean_t10000,signalstd_t10000,signalvar_t10000,signalmin_t10000,signalmax_t10000,norm_t10000,signalmean_t10000_lead,signalstd_t10000_lead,signalvar_t10000_lead,signalmin_t10000_lead,signalmax_t10000_lead,norm_t10000_lead,signalmean_t20000,signalstd_t20000,signalvar_t20000,signalmin_t20000,signalmax_t20000,norm_t20000,signalmean_t20000_lead,signalstd_t20000_lead,signalvar_t20000_lead,signalmin_t20000_lead,signalmax_t20000_lead,norm_t20000_lead,signalmean_t40000,signalstd_t40000,signalvar_t40000,signalmin_t40000,signalmax_t40000,norm_t40000,signalmean_t40000_lead,signalstd_t40000_lead,signalvar_t40000_lead,signalmin_t40000_lead,signalmax_t40000_lead,norm_t40000_lead,signalmean_t80000,signalstd_t80000,signalvar_t80000,signalmin_t80000,signalmax_t80000,norm_t80000,signalmean_t80000_lead,signalstd_t80000_lead,signalvar_t80000_lead,signalmin_t80000_lead,signalmax_t80000_lead,norm_t80000_lead,meanbatch_25000,medianbatch_25000,maxbatch_25000,minbatch_25000,stdbatch_25000,p10batch_25000,p25batch_25000,p75batch_25000,p90batch_25000,skewbatch_25000,kurtosisbatch_25000,normbatch_25000,mean_abs_chgbatch_25000,abs_maxbatch_25000,abs_minbatch_25000,rangebatch_25000,maxtominbatch_25000,abs_avgbatch_25000,meanbatch_slices2_25000,medianbatch_slices2_25000,maxbatch_slices2_25000,minbatch_slices2_25000,stdbatch_slices2_25000,p10batch_slices2_25000,p25batch_slices2_25000,p75batch_slices2_25000,p90batch_slices2_25000,skewbatch_slices2_25000,kurtosisbatch_slices2_25000,normbatc
h_slices2_25000,mean_abs_chgbatch_slices2_25000,abs_maxbatch_slices2_25000,abs_minbatch_slices2_25000,rangebatch_slices2_25000,maxtominbatch_slices2_25000,abs_avgbatch_slices2_25000,meanbatch_25000_msignal,medianbatch_25000_msignal,maxbatch_25000_msignal,minbatch_25000_msignal,stdbatch_25000_msignal,p10batch_25000_msignal,p25batch_25000_msignal,p75batch_25000_msignal,p90batch_25000_msignal,skewbatch_25000_msignal,kurtosisbatch_25000_msignal,normbatch_25000_msignal,mean_abs_chgbatch_25000_msignal,abs_maxbatch_25000_msignal,abs_minbatch_25000_msignal,rangebatch_25000_msignal,maxtominbatch_25000_msignal,abs_avgbatch_25000_msignal,meanbatch_slices2_25000_msignal,medianbatch_slices2_25000_msignal,maxbatch_slices2_25000_msignal,minbatch_slices2_25000_msignal,stdbatch_slices2_25000_msignal,p10batch_slices2_25000_msignal,p25batch_slices2_25000_msignal,p75batch_slices2_25000_msignal,p90batch_slices2_25000_msignal,skewbatch_slices2_25000_msignal,kurtosisbatch_slices2_25000_msignal,normbatch_slices2_25000_msignal,mean_abs_chgbatch_slices2_25000_msignal,abs_maxbatch_slices2_25000_msignal,abs_minbatch_slices2_25000_msignal,rangebatch_slices2_25000_msignal,maxtominbatch_slices2_25000_msignal,abs_avgbatch_slices2_25000_msignal,grad_1,grad_2,grad_3,grad_4,lowpass_lf_0.0100,lowpass_ff_0.0100,lowpass_lf_0.0154,lowpass_ff_0.0154,lowpass_lf_0.0239,lowpass_ff_0.0239,lowpass_lf_0.0369,lowpass_ff_0.0369,lowpass_lf_0.0570,lowpass_ff_0.0570,lowpass_lf_0.0880,lowpass_ff_0.0880,lowpass_lf_0.1359,lowpass_ff_0.1359,lowpass_lf_0.2100,lowpass_ff_0.2100,lowpass_lf_0.3244,lowpass_ff_0.3244,lowpass_lf_0.5012,lowpass_ff_0.5012,highpass_lf_0.0100,highpass_ff_0.0100,highpass_lf_0.0163,highpass_ff_0.0163,highpass_lf_0.0264,highpass_ff_0.0264,highpass_lf_0.0430,highpass_ff_0.0430,highpass_lf_0.0699,highpass_ff_0.0699,highpass_lf_0.1136,highpass_ff_0.1136,highpass_lf_0.1848,highpass_ff_0.1848,highpass_lf_0.3005,highpass_ff_0.3005,highpass_lf_0.4885,highpass_ff_0.4885,highpass_lf_0.7943,highpass_ff_0.7943,
ewm_mean_10,ewm_std_10,ewm_mean_50,ewm_std_50,ewm_mean_100,ewm_std_100,ewm_mean_500,ewm_std_500,ewm_mean_1000,ewm_std_1000
0,0.0001,-2.759766,0.0,1.0,,,,-2.847656,-2.423828,-3.130859,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-2.683594,-2.693359,-0.783691,-3.625000,0.266602,-2.996094,-2.855469,-2.531250,-2.382812,1.015625,4.449219,0.663086,0.262451,3.625000,0.783691,2.841797,0.216064,2.205078,-2.693359,-2.693359,-1.919922,-3.447266,0.237793,-3.000000,-2.857422,-2.539062,-2.384766,0.023041,-0.050446,0.493408,0.257568,3.447266,1.919922,1.526367,0.557129,2.683594,0.076904,0.067810,1.977539,-0.865234,3.027344,-0.235718,-0.094238,0.228516,0.377197,3.775391,7.210938,3.423828,3.023438,6.386719,3.544922,5.601562,2.976562,4.964844,0.066895,0.067139,0.840332,-0.686523,2.998047,-0.240112,-0.095947,0.222534,0.376221,2.783203,2.710938,3.253906,3.017578,6.207031,4.679688,4.289062,3.318359,5.445312,-0.005825,0.017029,-0.018829,0.001498,-0.184082,-0.186157,-0.184082,-0.186401,-0.184082,-0.186157,-0.184082,-0.185547,-0.184082,-0.184814,-0.184082,-0.184326,-0.184082,-0.184082,-0.184082,-0.184082,-0.184082,-0.184082,-0.184082,-0.184082,0.000000,0.002165,-0.000000,0.002415,0.000000,0.002041,0.000000,0.001249,0.000000,0.000498,0.000000,0.000089,0.000000,-0.000009,0.000000,-0.000002,0.000000,-0.000000,0.000000,-0.000024,-0.184082,0.000000,-0.184082,0.000000,-0.184082,0.000000,-0.184082,0.000000,-0.184082,0.000000
1,0.0002,-2.847656,0.0,1.0,-2.759766,,,-2.423828,-3.130859,-3.144531,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-2.683594,-2.693359,-0.783691,-3.625000,0.266602,-2.996094,-2.855469,-2.531250,-2.382812,1.015625,4.449219,0.663086,0.262451,3.625000,0.783691,2.841797,0.216064,2.205078,-2.693359,-2.693359,-1.919922,-3.447266,0.237793,-3.000000,-2.857422,-2.539062,-2.384766,0.023041,-0.050446,0.493408,0.257568,3.447266,1.919922,1.526367,0.557129,2.683594,0.164307,0.155151,2.064453,-0.777832,3.115234,-0.148315,-0.006844,0.315918,0.464600,3.863281,7.296875,3.511719,3.111328,6.472656,3.630859,5.691406,3.064453,5.050781,0.154297,0.154541,0.927734,-0.599121,3.085938,-0.152832,-0.008598,0.309814,0.463623,2.871094,2.796875,3.341797,3.105469,6.296875,4.769531,4.375000,3.404297,5.531250,0.011208,-0.001787,-0.017319,0.013031,-0.184082,-0.186157,-0.184204,-0.186523,-0.184204,-0.186279,-0.184326,-0.185669,-0.184570,-0.185059,-0.184814,-0.184692,-0.185059,-0.184570,-0.185547,-0.184204,-0.186157,-0.182861,-0.187012,-0.181396,-0.005733,-0.003681,-0.005680,-0.003395,-0.005592,-0.003710,-0.005455,-0.004410,-0.005245,-0.005005,-0.004936,-0.005241,-0.004486,-0.005508,-0.003857,-0.006699,-0.002966,-0.008476,-0.001461,-0.004765,-0.187256,0.004120,-0.187012,0.004120,-0.187012,0.004120,-0.187012,0.004120,-0.187012,0.004120
2,0.0003,-2.423828,0.0,1.0,-2.847656,-2.759766,,-3.130859,-3.144531,-2.650391,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-2.683594,-2.693359,-0.783691,-3.625000,0.266602,-2.996094,-2.855469,-2.531250,-2.382812,1.015625,4.449219,0.663086,0.262451,3.625000,0.783691,2.841797,0.216064,2.205078,-2.693359,-2.693359,-1.919922,-3.447266,0.237793,-3.000000,-2.857422,-2.539062,-2.384766,0.023041,-0.050446,0.493408,0.257568,3.447266,1.919922,1.526367,0.557129,2.683594,-0.259521,-0.268555,1.640625,-1.201172,2.691406,-0.572266,-0.430420,-0.107849,0.040894,3.439453,6.875000,3.087891,2.687500,6.050781,3.207031,5.265625,2.640625,4.628906,-0.269531,-0.269287,0.503906,-1.022461,2.662109,-0.576660,-0.432373,-0.113831,0.039917,2.447266,2.373047,2.917969,2.681641,5.871094,4.343750,3.951172,2.982422,5.109375,-0.009399,-0.017609,0.007244,0.017929,-0.183838,-0.186157,-0.183838,-0.186523,-0.183594,-0.186401,-0.183472,-0.185913,-0.183105,-0.185425,-0.182495,-0.185425,-0.181763,-0.185669,-0.180542,-0.185791,-0.178711,-0.184448,-0.175781,-0.180420,0.022247,0.024551,0.022141,0.024872,0.021973,0.024643,0.021698,0.024078,0.021240,0.023743,0.020508,0.023941,0.019333,0.024246,0.017456,0.023239,0.014328,0.019135,0.007812,0.007660,-0.177002,0.015732,-0.178223,0.015083,-0.178345,0.014999,-0.178467,0.014931,-0.178467,0.014923
3,0.0004,-3.130859,0.0,1.0,-2.423828,-2.847656,-2.759766,-3.144531,-2.650391,-2.697266,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-2.683594,-2.693359,-0.783691,-3.625000,0.266602,-2.996094,-2.855469,-2.531250,-2.382812,1.015625,4.449219,0.663086,0.262451,3.625000,0.783691,2.841797,0.216064,2.205078,-2.693359,-2.693359,-1.919922,-3.447266,0.237793,-3.000000,-2.857422,-2.539062,-2.384766,0.023041,-0.050446,0.493408,0.257568,3.447266,1.919922,1.526367,0.557129,2.683594,0.446289,0.437256,2.345703,-0.495850,3.396484,0.133667,0.275146,0.597656,0.746582,4.144531,7.578125,3.792969,3.392578,6.757812,3.914062,5.972656,3.345703,5.335938,0.436279,0.436523,1.209961,-0.316895,3.367188,0.129272,0.273438,0.591797,0.745605,3.152344,3.080078,3.623047,3.386719,6.578125,5.050781,4.656250,3.687500,5.812500,-0.024017,0.012703,0.018539,-0.008575,-0.183960,-0.186157,-0.183838,-0.186523,-0.183716,-0.186523,-0.183594,-0.186157,-0.183472,-0.185913,-0.183228,-0.186401,-0.182983,-0.187866,-0.182861,-0.190308,-0.183350,-0.193359,-0.185181,-0.197144,-0.024750,-0.022507,-0.024826,-0.022125,-0.024948,-0.022247,-0.025116,-0.022629,-0.025360,-0.022568,-0.025620,-0.021484,-0.025787,-0.019180,-0.025497,-0.015915,-0.023682,-0.011742,-0.015701,-0.004917,-0.187378,0.021194,-0.186279,0.019745,-0.186157,0.019562,-0.186035,0.019424,-0.186035,0.019394
4,0.0005,-3.144531,0.0,1.0,-3.130859,-2.423828,-2.847656,-2.650391,-2.697266,-2.595703,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-2.683594,-2.693359,-0.783691,-3.625000,0.266602,-2.996094,-2.855469,-2.531250,-2.382812,1.015625,4.449219,0.663086,0.262451,3.625000,0.783691,2.841797,0.216064,2.205078,-2.693359,-2.693359,-1.919922,-3.447266,0.237793,-3.000000,-2.857422,-2.539062,-2.384766,0.023041,-0.050446,0.493408,0.257568,3.447266,1.919922,1.526367,0.557129,2.683594,0.461182,0.451904,2.361328,-0.480957,3.412109,0.148438,0.290039,0.612793,0.761230,4.160156,7.593750,3.808594,3.408203,6.769531,3.927734,5.988281,3.361328,5.347656,0.451172,0.451416,1.224609,-0.302002,3.382812,0.144043,0.288330,0.606445,0.760254,3.167969,3.093750,3.638672,3.402344,6.593750,5.066406,4.671875,3.701172,5.828125,0.016006,0.019470,-0.009903,-0.015884,-0.184692,-0.186157,-0.185059,-0.186523,-0.185547,-0.186523,-0.186401,-0.186279,-0.187622,-0.186157,-0.189575,-0.186646,-0.192261,-0.188232,-0.196289,-0.191040,-0.201904,-0.195312,-0.209229,-0.201172,-0.024963,-0.023514,-0.024551,-0.023087,-0.023911,-0.023163,-0.022858,-0.023483,-0.021210,-0.023376,-0.018631,-0.022186,-0.014687,-0.019547,-0.008919,-0.015190,-0.000930,-0.008850,0.007572,-0.001156,-0.193848,0.020752,-0.191284,0.020050,-0.191040,0.019943,-0.190796,0.019852,-0.190796,0.019836
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999995,500.0000,2.951172,7.0,10.0,4.359375,4.179688,2.402344,2.740234,4.519531,5.632812,3.738281,1.465820,2.150391,-1.220703,7.753906,3.718750,,,,,,,3.535156,1.693359,2.867188,-2.904297,8.078125,5.332031,,,,,,,3.601562,1.689453,2.853516,-3.345703,8.078125,6.062500,,,,,,,3.412109,1.767578,3.125000,-3.730469,8.078125,6.222656,,,,,,,3.408203,1.792969,3.214844,-4.128906,8.414062,6.773438,,,,,,,3.285156,1.804688,3.257812,-4.558594,8.46875,6.914062,,,,,,,3.412109,3.437500,8.085938,-3.730469,1.784180,0.988281,2.166016,4.667969,5.699219,-0.243408,-0.199219,6.648438,1.030273,8.085938,0.000317,11.820312,-2.167969,4.042969,3.320312,3.347656,7.910156,-2.904297,1.721680,1.004883,2.138672,4.496094,5.574219,-0.231689,-0.051697,5.179688,1.081055,7.910156,0.000317,10.820312,-2.724609,3.957031,0.461914,0.486084,5.136719,-6.679688,-1.166016,-1.962891,-0.783691,1.718750,2.748047,-3.193359,-3.150391,3.697266,-1.919922,5.136719,-2.951172,8.867188,-5.117188,1.092773,0.370361,0.396484,4.960938,-5.855469,-1.228516,-1.946289,-0.811523,1.543945,2.625000,-3.181641,-3.001953,2.228516,-1.869141,4.960938,-2.951172,7.867188,-5.675781,1.005859,-0.053986,0.046661,0.067383,-0.034576,0.297852,0.316406,0.299072,0.322754,0.292725,0.323242,0.280029,0.316895,0.266113,0.303955,0.255615,0.285156,0.250000,0.264404,0.249390,0.246338,0.250000,0.231445,0.243652,0.216675,-0.101196,-0.206543,-0.102051,-0.189453,-0.093445,-0.164429,-0.078186,-0.133301,-0.063782,-0.102478,-0.054962,-0.076111,-0.052490,-0.054504,-0.053345,-0.037231,-0.047729,-0.020859,-0.017242,-0.001806,0.256348,0.060181,0.297363,0.074646,0.290283,0.078979,0.258301,0.089722,0.247925,0.098022
4999996,500.0000,2.740234,7.0,10.0,2.951172,4.359375,4.179688,4.519531,5.632812,5.394531,3.736328,1.466797,2.150391,-1.220703,7.753906,3.531250,,,,,,,3.535156,1.693359,2.867188,-2.904297,8.078125,5.140625,,,,,,,3.601562,1.689453,2.853516,-3.345703,8.078125,5.859375,,,,,,,3.412109,1.767578,3.125000,-3.730469,8.078125,6.027344,,,,,,,3.408203,1.792969,3.214844,-4.128906,8.414062,6.574219,,,,,,,3.285156,1.804688,3.257812,-4.558594,8.46875,6.722656,,,,,,,3.412109,3.437500,8.085938,-3.730469,1.784180,0.988281,2.166016,4.667969,5.699219,-0.243408,-0.199219,6.648438,1.030273,8.085938,0.000317,11.820312,-2.167969,4.042969,3.320312,3.347656,7.910156,-2.904297,1.721680,1.004883,2.138672,4.496094,5.574219,-0.231689,-0.051697,5.179688,1.081055,7.910156,0.000317,10.820312,-2.724609,3.957031,0.671387,0.695801,5.347656,-6.472656,-0.956543,-1.752930,-0.574219,1.927734,2.958984,-2.984375,-2.939453,3.908203,-1.710938,5.347656,-2.740234,9.078125,-4.910156,1.302734,0.580078,0.605957,5.171875,-5.644531,-1.019531,-1.736328,-0.601562,1.753906,2.833984,-2.972656,-2.792969,2.439453,-1.659180,5.171875,-2.740234,8.078125,-5.464844,1.214844,0.052307,0.075195,-0.029099,-0.066528,0.294678,0.317139,0.293945,0.324219,0.285156,0.325928,0.270020,0.322266,0.253418,0.312744,0.239502,0.297119,0.228516,0.278320,0.218994,0.259033,0.206787,0.239258,0.189575,0.215698,-0.111816,-0.223999,-0.110535,-0.208130,-0.099426,-0.184937,-0.081360,-0.156006,-0.063721,-0.127686,-0.049988,-0.103577,-0.039093,-0.081787,-0.026535,-0.060150,-0.007973,-0.034515,0.005089,-0.007618,0.243042,0.062134,0.292725,0.076538,0.288330,0.079651,0.258057,0.089661,0.247803,0.097961
4999997,500.0000,4.519531,8.0,10.0,2.740234,2.951172,4.359375,5.632812,5.394531,,3.734375,1.466797,2.150391,-1.220703,7.753906,5.117188,,,,,,,3.535156,1.693359,2.867188,-2.904297,8.078125,6.757812,,,,,,,3.601562,1.689453,2.853516,-3.345703,8.078125,7.574219,,,,,,,3.412109,1.767578,3.125000,-3.730469,8.078125,7.683594,,,,,,,3.408203,1.792969,3.214844,-4.128906,8.414062,8.273438,,,,,,,3.285156,1.804688,3.257812,-4.558594,8.46875,8.359375,,,,,,,3.412109,3.437500,8.085938,-3.730469,1.784180,0.988281,2.166016,4.667969,5.699219,-0.243408,-0.199219,6.648438,1.030273,8.085938,0.000317,11.820312,-2.167969,4.042969,3.320312,3.347656,7.910156,-2.904297,1.721680,1.004883,2.138672,4.496094,5.574219,-0.231689,-0.051697,5.179688,1.081055,7.910156,0.000317,10.820312,-2.724609,3.957031,-1.107422,-1.083008,3.568359,-8.250000,-2.734375,-3.531250,-2.353516,0.149536,1.179688,-4.761719,-4.718750,2.128906,-3.488281,3.568359,-4.519531,7.296875,-6.687500,-0.475830,-1.199219,-1.172852,3.392578,-7.425781,-2.798828,-3.515625,-2.380859,-0.025009,1.054688,-4.750000,-4.570312,0.660156,-3.437500,3.392578,-4.519531,6.296875,-7.242188,-0.563477,0.096375,-0.011543,-0.065613,0.006187,0.292969,0.317871,0.291504,0.325684,0.282227,0.329102,0.267090,0.328857,0.251709,0.324463,0.240112,0.315674,0.233398,0.304443,0.230713,0.294678,0.232056,0.289307,0.242065,0.290283,0.008362,-0.109009,0.010567,-0.094727,0.022369,-0.073914,0.039978,-0.048462,0.055756,-0.025146,0.065735,-0.007927,0.070190,0.004326,0.069885,0.011414,0.060211,0.011292,0.027206,0.004154,0.253662,0.061005,0.293213,0.075012,0.288574,0.078857,0.258301,0.089539,0.247925,0.097900
4999998,500.0000,5.632812,9.0,10.0,4.519531,2.740234,2.951172,5.394531,,,3.736328,1.466797,2.152344,-1.220703,7.753906,6.109375,,,,,,,3.535156,1.693359,2.867188,-2.904297,8.078125,7.773438,,,,,,,3.601562,1.689453,2.853516,-3.345703,8.078125,8.640625,,,,,,,3.412109,1.767578,3.125000,-3.730469,8.078125,8.718750,,,,,,,3.408203,1.792969,3.214844,-4.128906,8.414062,9.343750,,,,,,,3.285156,1.804688,3.257812,-4.558594,8.46875,9.382812,,,,,,,3.412109,3.437500,8.085938,-3.730469,1.784180,0.988281,2.166016,4.667969,5.699219,-0.243408,-0.199219,6.648438,1.030273,8.085938,0.000317,11.820312,-2.167969,4.042969,3.320312,3.347656,7.910156,-2.904297,1.721680,1.004883,2.138672,4.496094,5.574219,-0.231689,-0.051697,5.179688,1.081055,7.910156,0.000317,10.820312,-2.724609,3.957031,-2.220703,-2.195312,2.455078,-9.359375,-3.847656,-4.644531,-3.466797,-0.963379,0.066528,-5.875000,-5.832031,1.015625,-4.601562,2.455078,-5.632812,6.183594,-7.800781,-1.588867,-2.312500,-2.285156,2.279297,-8.539062,-3.910156,-4.628906,-3.494141,-1.137695,-0.058075,-5.863281,-5.683594,-0.452637,-4.550781,2.279297,-5.632812,5.183594,-8.359375,-1.676758,0.029221,-0.056091,-0.016724,0.038361,0.294434,0.318604,0.293701,0.327393,0.286133,0.332764,0.274902,0.335938,0.265869,0.337402,0.264160,0.336426,0.270752,0.333252,0.285645,0.332520,0.308350,0.338135,0.338623,0.353027,0.081177,-0.038544,0.082397,-0.025970,0.091858,-0.007843,0.104431,0.013618,0.111511,0.031311,0.108521,0.040771,0.095032,0.043457,0.071777,0.039062,0.038849,0.023514,0.005054,0.002815,0.275879,0.074158,0.296387,0.075317,0.290283,0.079041,0.258789,0.089661,0.248169,0.097961


In [0]:





# Model inputs: every column of pre_train except the target ('open_channels')
# and the 'time' / 'batch' columns.
features = [
    name
    for name in pre_train.columns
    if name not in ('open_channels', 'time', 'batch')
]
print('Training with {} features'.format(len(features)))

# LightGBM hyperparameters. Per the original note these values came from a
# Bayesian optimization run on a 151-feature set.
# NOTE(review): the recorded run of this cell reports 205 features, so the
# tuned values may no longer match the current feature set -- consider re-tuning.
params = {
    # Task definition
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    # Runtime / reproducibility
    'n_jobs': -1,
    'seed': 236,
    # Tree shape and learning rate
    'num_leaves': 280,
    'learning_rate': 0.026623466966581126,
    'max_depth': 73,
    # Regularization
    'lambda_l1': 2.959759088169741,
    'lambda_l2': 1.331172832164913,
    # Row / column subsampling
    'bagging_fraction': 0.9655406551472153,
    'bagging_freq': 9,
    'colsample_bytree': 0.6867118652742716,
}

# Train and predict with run_lgb (defined elsewhere in the notebook).
# Presumably it returns rounded test-set predictions -- confirm against run_lgb.
round_y_pred = run_lgb(pre_train, pre_test, features, params)

# Store the predictions in the submission frame and write it to Drive.
submission['open_channels'] = round_y_pred
submission.to_csv(
    './drive/My Drive/Colab Notebooks/liverpool-ion-switching/submission.csv',
    index=False,
)

Training with 205 features
Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.236651	valid_1's rmse: 0.237879
[200]	training's rmse: 0.152341	valid_1's rmse: 0.155567
[300]	training's rmse: 0.150166	valid_1's rmse: 0.154734
[400]	training's rmse: 0.148817	valid_1's rmse: 0.154582
[500]	training's rmse: 0.14771	valid_1's rmse: 0.154517
[600]	training's rmse: 0.146627	valid_1's rmse: 0.154448
[700]	training's rmse: 0.145604	valid_1's rmse: 0.154394
[800]	training's rmse: 0.144635	valid_1's rmse: 0.154348
[900]	training's rmse: 0.143688	valid_1's rmse: 0.15429
[1000]	training's rmse: 0.14281	valid_1's rmse: 0.154262
[1100]	training's rmse: 0.141966	valid_1's rmse: 0.154244
[1200]	training's rmse: 0.141079	valid_1's rmse: 0.154225
[1300]	training's rmse: 0.140249	valid_1's rmse: 0.154204
Early stopping, best iteration is:
[1298]	training's rmse: 0.140268	valid_1's rmse: 0.154204
Training until validation scores don't improve for 50 rounds.
[100]	training