<a href="https://colab.research.google.com/github/shin04/ion-switching/blob/master/lightgbm_reg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the Kaggle command-line client used by the `!kaggle ...` cells in this notebook.
!pip install kaggle



In [2]:
from googleapiclient.discovery import build
import io, os
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth

# Authenticate the Colab session so the Drive API can be queried.
auth.authenticate_user()

# Look up a file named kaggle.json on the authenticated user's Google Drive.
drive_service = build('drive', 'v3')
results = drive_service.files().list(
        q="name = 'kaggle.json'", fields="files(id)").execute()
kaggle_api_key = results.get('files', [])
if not kaggle_api_key:
    # Fail with a clear message instead of an IndexError on kaggle_api_key[0].
    raise FileNotFoundError("No file named 'kaggle.json' was found on Google Drive")

filename = "/root/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Stream the first match to disk; the context manager closes the file even
# when a chunk download raises.
request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])
with io.FileIO(filename, 'wb') as fh:
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while done is False:
        status, done = downloader.next_chunk()
        print("Download %d%%." % int(status.progress() * 100))

# Octal 0o600 = read/write for the owner only. The decimal literal 600 that
# was used before is a different bit pattern (0o1130).
os.chmod(filename, 0o600)

Download 100%.


In [3]:
# Download the "data-without-drift" dataset archive into the working directory.
!kaggle datasets download -d cdeotte/data-without-drift

Downloading data-without-drift.zip to /content
 98% 54.0M/55.0M [00:01<00:00, 36.0MB/s]
100% 55.0M/55.0M [00:01<00:00, 54.1MB/s]


In [4]:
# Download the archive holding Y_train_proba.npy / Y_test_proba.npy (loaded by read_data).
!kaggle datasets download -d sggpls/ion-shifted-rfc-proba

Downloading ion-shifted-rfc-proba.zip to /content
 98% 217M/222M [00:04<00:00, 54.7MB/s]
100% 222M/222M [00:04<00:00, 57.1MB/s]


In [5]:
# Download the archive holding train_clean_kalman.csv / test_clean_kalman.csv (loaded by read_data).
!kaggle datasets download -d ragnar123/clean-kalman

Downloading clean-kalman.zip to /content
 87% 69.0M/79.0M [00:01<00:00, 37.6MB/s]
100% 79.0M/79.0M [00:01<00:00, 62.8MB/s]


In [6]:
# Download the competition files (train, test and sample submission archives).
!kaggle competitions download -c liverpool-ion-switching

Downloading train.csv.zip to /content
 34% 9.00M/26.6M [00:00<00:00, 27.3MB/s]
100% 26.6M/26.6M [00:00<00:00, 67.1MB/s]
Downloading test.csv.zip to /content
 50% 5.00M/9.91M [00:00<00:00, 17.7MB/s]
100% 9.91M/9.91M [00:00<00:00, 28.5MB/s]
Downloading sample_submission.csv.zip to /content
100% 4.27M/4.27M [00:00<00:00, 39.8MB/s]



In [7]:
# Extract every downloaded archive into the working directory; -o overwrites existing files without prompting.
!unzip -o '*.zip'

Archive:  test.csv.zip
  inflating: test.csv                

Archive:  train.csv.zip
  inflating: train.csv               

Archive:  sample_submission.csv.zip
  inflating: sample_submission.csv   

Archive:  ion-shifted-rfc-proba.zip
  inflating: Y_test_proba.npy        
  inflating: Y_train_proba.npy       

Archive:  data-without-drift.zip
  inflating: test_clean.csv          
  inflating: train_clean.csv         

Archive:  clean-kalman.zip
  inflating: test_clean_kalman.csv   
  inflating: train_clean_kalman.csv  

6 archives were successfully processed.


In [8]:
# Install tensorflow_addons, which the import cell loads as `tfa`.
!pip install tensorflow_addons



In [0]:
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, Input, Dense, Add, Multiply, BatchNormalization, Activation, Dropout
import pandas as pd
import numpy as np
import random
from tensorflow.keras.callbacks import Callback, LearningRateScheduler
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras import losses, models, optimizers
import tensorflow_addons as tfa
import gc
from tqdm import tqdm
from scipy import signal

import lightgbm as lgb

from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import f1_score

import warnings
# Silence every Python warning for the rest of the session.
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')
# Let pandas display up to 1000 columns and 500 rows before truncating.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 500)

import os
# Print the path of every file found under /kaggle/input.
# NOTE(review): the download cells unzip into the working directory, so this
# walk presumably lists nothing outside a Kaggle kernel -- confirm.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [0]:
# Configuration and main hyperparameters.
EPOCHS = 110  # NOTE(review): not referenced anywhere else in the visible notebook
NNBATCHSIZE = 16  # NOTE(review): not referenced anywhere else in the visible notebook
GROUP_BATCH_SIZE = 4000  # rows per 'group' handed to run_feat_engineering
SEED = 321  # NOTE(review): not referenced anywhere else in the visible notebook
LR = 0.001  # NOTE(review): not referenced anywhere else in the visible notebook
SPLITS = 5  # number of cross-validation folds passed to run_cv_model_by_batch

def seed_everything(seed):
    """Seed Python's `random`, NumPy and TensorFlow, and set PYTHONHASHSEED.

    Args:
        seed: integer seed handed to every generator.

    NOTE(review): no call to this function is visible in this notebook.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

In [0]:
# read data
def read_data():
    """Load the train/test signals, the sample submission and the class probabilities.

    Reads the three CSV files and the two .npy probability matrices from the
    current working directory and appends columns proba_0 ... proba_10 (one per
    matrix column) to the train and test frames.

    Returns:
        (train, test, sub) as pandas DataFrames.
    """
    signal_dtypes = {'time': np.float32, 'signal': np.float32}

    train = pd.read_csv('./train_clean_kalman.csv', dtype={**signal_dtypes, 'open_channels': np.int32})
    test = pd.read_csv('./test_clean_kalman.csv', dtype=signal_dtypes)
    sub = pd.read_csv('./sample_submission.csv', dtype={'time': np.float32})

    train_proba = np.load("./Y_train_proba.npy")
    test_proba = np.load("./Y_test_proba.npy")

    # Column k of each probability matrix becomes feature proba_k.
    for frame, proba in ((train, train_proba), (test, test_proba)):
        for k in range(11):
            frame[f"proba_{k}"] = proba[:, k]

    return train, test, sub

In [0]:
def batching_10(train, test):
    """Label every row of train and test with a 1-based 50-second batch number.

    Both frames are tagged with a temporary 'set' column (this column is also
    added to the frames passed in), stacked, and each row whose 'time' lies in
    (50*(k-1), 50*k] receives batch = k for k = 1..14. The stack is then split
    back into its train and test rows and the 'set' column is dropped.

    Args:
        train, test: DataFrames with a 'time' column.

    Returns:
        (train, test): the rows of the stacked frame, each with a 'batch'
        column. Because the frames are stacked, every column of either input
        appears in both outputs.
    """
    seconds_per_batch = 50
    n_batches = 14

    train['set'] = 'train'
    test['set'] = 'test'
    combined = pd.concat([train, test])

    for batch_no in range(1, int(n_batches) + 1):
        lower = (batch_no - 1) * seconds_per_batch
        upper = batch_no * seconds_per_batch
        in_window = (combined['time'] > lower) & (combined['time'] <= upper)
        combined.loc[in_window, 'batch'] = batch_no

    train = combined[combined['set'] == 'train']
    test = combined[combined['set'] == 'test']
    for part in (train, test):
        part.drop(['set'], inplace = True, axis = 1)
    del combined
    return train, test

In [0]:
def create_signal_mod(train):
    """Replace out-of-range samples in the noisy stretch of the training signal.

    Rows whose index label lies strictly between `left` and `right` are treated
    as the noisy stretch. For every open_channels value seen in batch 8, noisy
    samples of that channel whose signal falls outside the channel's
    [low, high] thresholds are overwritten with the mean signal of the same
    channel taken over all rows outside the noisy stretch.

    Args:
        train: DataFrame with 'batch', 'open_channels' and 'signal' columns.
            It is modified in place.

    Returns:
        The same DataFrame.

    Raises:
        KeyError: if batch 8 contains an open_channels value without thresholds.
    """
    left = 3641000
    right = 3829000
    # open_channels value -> [lowest, highest] signal accepted in the noisy stretch
    thresh_dict = {
        3: [0.1, 2.0],
        2: [-1.1, 0.7],
        1: [-2.3, -0.6],
        0: [-3.8, -2],
    }

    in_noisy_range = (left < train.index) & (train.index < right)
    for ch in train[train['batch']==8]['open_channels'].unique():
        is_channel = train['open_channels'] == ch
        idxs_noisy = is_channel & in_noisy_range
        idxs_not_noisy = is_channel & ~idxs_noisy
        mean = train.loc[idxs_not_noisy, 'signal'].mean()

        low, high = thresh_dict[ch]
        signal_values = train['signal'].values
        idxs_outlier = idxs_noisy & ((high < signal_values) | (signal_values < low))
        # Write through .loc: the chained form train['signal'][mask] = mean assigns
        # to a temporary and leaves `train` untouched under pandas copy-on-write.
        train.loc[idxs_outlier, 'signal'] = mean
    return train

In [0]:
# !pip install pykalman

In [0]:
# from pykalman import KalmanFilter

# def Kalman1D(observations, damping=1):
#     # To return the smoothed time series data
#     observation_covariance = damping
#     initial_value_guess = observations[0]
#     transition_matrix = 1
#     transition_covariance = 0.1
#     initial_value_guess
#     kf = KalmanFilter(
#             initial_state_mean=initial_value_guess,
#             initial_state_covariance=observation_covariance,
#             observation_covariance=observation_covariance,
#             transition_covariance=transition_covariance,
#             transition_matrices=transition_matrix
#         )
#     pred_state, state_cov = kf.smooth(observations)
#     return pred_state

In [0]:
# create batches of 4000 observations
def batching(df, batch_size):
    """Add a 'group' column that numbers consecutive chunks of `batch_size` index labels.

    Rows are grouped by `index // batch_size` and the groups are numbered
    0, 1, 2, ... in order of first appearance.

    Args:
        df: DataFrame with an integer index. It is modified in place.
        batch_size: number of index labels per group.

    Returns:
        The same DataFrame with a uint16 'group' column.
    """
    # GroupBy.ngroup() is the documented way to get one group number per row;
    # routing it through agg(['ngroup']) depended on agg accepting a
    # non-reducing method and produced an (n, 1) array.
    group_ids = df.groupby(df.index // batch_size, sort=False).ngroup()
    # Assign positionally, so the frame's index does not need to be unique.
    df['group'] = group_ids.to_numpy().astype(np.uint16)
    return df

In [0]:
# normalize the data (standard scaler). We can also try other scalers for a better score!
def normalize(train, test):
    """Standardise 'signal' in both frames using the training mean and standard deviation.

    Both frames are modified in place and returned as (train, test).
    """
    # Statistics come from train only and are captured before either frame changes.
    mu = train['signal'].mean()
    sigma = train['signal'].std()
    for frame in (train, test):
        frame['signal'] = (frame['signal'] - mu) / sigma
    return train, test

In [0]:
def add_category(train, test):
    """Add a 0/1 'category' column that flags fixed index ranges in each frame.

    The flagged ranges are the segments the author marks as having more than 9
    open-channel classes (for test: "potentially"). Both frames are modified in
    place and returned as (train, test).
    """
    # Half-open [start, stop) ranges of index labels that receive category 1.
    train_segments = ((2000000, 2500000), (4500000, 5000000))
    test_segments = ((500000, 600000), (700000, 800000))

    for frame in (train, test):
        frame["category"] = 0

    for frame, segments in ((train, train_segments), (test, test_segments)):
        for start, stop in segments:
            # .loc label slices include the end label, hence stop - 1.
            frame.loc[start:stop - 1, 'category'] = 1

    return train, test

In [0]:
# signal processing features
def calc_gradients(s, n_grads = 4):
    '''
    Repeatedly apply np.gradient to a pandas Series.

    Returns a DataFrame with columns grad_1 ... grad_<n_grads>, where grad_k is
    np.gradient applied k times to the values of `s`. Every column has as many
    samples as `s`.
    '''
    columns = {}
    current = s.values
    for order in range(1, n_grads + 1):
        current = np.gradient(current)
        columns['grad_' + str(order)] = current

    return pd.DataFrame(columns)

def calc_low_pass(s, n_filts=10):
    '''
    Low-pass filter the signal at several cut-off frequencies.

    For each of `n_filts` normalised cut-offs, log-spaced between 10**-2 and
    10**-0.3, a first-order Butterworth low-pass is applied twice:
      * lowpass_lf_<wn>: signal.lfilter (one forward pass, i.e. delayed),
        with the filter state initialised from the first sample;
      * lowpass_ff_<wn>: signal.filtfilt (forward-backward pass, no delay).
    Returns a DataFrame with two columns per cut-off.
    '''
    x = s.values
    columns = {}
    for wn in np.logspace(-2, -0.3, n_filts):
        b, a = signal.butter(1, Wn=wn, btype='low')
        initial_state = signal.lfilter_zi(b, a) * x[0]
        forward_only, _ = signal.lfilter(b, a, x, zi=initial_state)
        tag = str('%.4f' % wn)
        columns['lowpass_lf_' + tag] = forward_only
        columns['lowpass_ff_' + tag] = signal.filtfilt(b, a, x)

    return pd.DataFrame(columns)

def calc_high_pass(s, n_filts=10):
    '''
    High-pass filter the signal at several cut-off frequencies.

    For each of `n_filts` normalised cut-offs, log-spaced between 10**-2 and
    10**-0.1, a first-order Butterworth high-pass is applied twice:
      * highpass_lf_<wn>: signal.lfilter (one forward pass, i.e. delayed),
        with the filter state initialised from the first sample;
      * highpass_ff_<wn>: signal.filtfilt (forward-backward pass, no delay).
    Returns a DataFrame with two columns per cut-off.
    '''
    x = s.values
    columns = {}
    for wn in np.logspace(-2, -0.1, n_filts):
        b, a = signal.butter(1, Wn=wn, btype='high')
        initial_state = signal.lfilter_zi(b, a) * x[0]
        forward_only, _ = signal.lfilter(b, a, x, zi=initial_state)
        tag = str('%.4f' % wn)
        columns['highpass_lf_' + tag] = forward_only
        columns['highpass_ff_' + tag] = signal.filtfilt(b, a, x)

    return pd.DataFrame(columns)

def calc_ewm(s, windows=[10, 50, 100, 500, 1000]):
    '''
    Exponentially weighted mean and standard deviation of the signal.

    For every span `w` in `windows`, adds columns ewm_mean_<w> and ewm_std_<w>
    computed with s.ewm(span=w, min_periods=1). Missing values (such as the
    first standard deviation) are replaced by 0.
    '''
    columns = {}
    for span in windows:
        weighted = s.ewm(span=span, min_periods=1)
        columns['ewm_mean_' + str(span)] = weighted.mean()
        columns['ewm_std_' + str(span)] = weighted.std()

    # The std of a single observation is NaN; store 0 instead.
    return pd.DataFrame(columns).fillna(value=0)


def add_features(s):
    '''
    Build the full signal-processing feature table for one Series.

    Concatenates, column-wise, the Series itself with the outputs of
    calc_gradients, calc_low_pass, calc_high_pass and calc_ewm (in that order).
    '''
    parts = [s]
    for build in (calc_gradients, calc_low_pass, calc_high_pass, calc_ewm):
        parts.append(build(s))

    return pd.concat(parts, axis=1)


# TODO: reconsider signal_size
# 500000? or 4000?
def divide_and_add_features(s, signal_size=500000):
    '''
    Compute add_features separately on consecutive bags of "signal_size" samples.

    The signal is first divided by 15.0. It is then cut into
    int(len(s) / signal_size) consecutive bags; each bag gets a fresh 0-based
    index and is passed to add_features. The per-bag tables are stacked
    row-wise and returned. Samples after the last complete bag are not
    processed.
    '''
    scaled = s / 15.0

    n_bags = int(scaled.shape[0] / signal_size)
    featured_bags = []
    for bag_no in tqdm(range(n_bags)):
        start = bag_no * signal_size
        stop = (bag_no + 1) * signal_size
        bag = scaled[start:stop].copy().reset_index(drop=True)
        featured_bags.append(add_features(bag))

    return pd.concat(featured_bags, axis=0)

In [0]:
# get lead and lags features
def lag_with_pct_change(df, windows):
    """Add lagged and leading copies of 'signal', shifted within each 'group'.

    For every `window` in `windows` two columns are added:
    signal_shift_pos_<window> (the value `window` rows earlier in the group) and
    signal_shift_neg_<window> (the value `window` rows later in the group).
    Positions that would reach outside the group are filled with 0.
    The frame is modified in place and returned.
    """
    for window in windows:
        for direction, offset in (('pos', window), ('neg', -1 * window)):
            shifted = df.groupby('group')['signal'].shift(offset)
            df['signal_shift_' + direction + '_' + str(window)] = shifted.fillna(0)
    return df

In [0]:
def calc_roll_stats(df, windows, group='group'):
    '''
    Add rolling statistics of 'signal', computed separately inside each group.

    For every `window` in `windows` the following columns are added:
      <group>roll_mean_<w>, <group>roll_std_<w>, <group>roll_min_<w>,
      <group>roll_max_<w>, <group>roll_range<w> (max - min) and
      roll_q10_<w>, roll_q25_<w>, roll_q50_<w>, roll_q75_<w>, roll_q90_<w>.
    A window must be completely filled (min_periods=window), so the first
    window-1 rows of every group are NaN.

    Args:
        df: DataFrame with a 'signal' column and the `group` column. It is
            modified in place. Results are written back positionally, so rows
            must already be ordered by `group`.
        windows: iterable of window lengths.
        group: name of the column that delimits the rolling windows.

    Returns:
        The same DataFrame.
    '''
    quantiles = (('q10', 0.10), ('q25', 0.25), ('q50', 0.50), ('q75', 0.75), ('q90', 0.90))

    for window in windows:
        # One rolling object per window serves every statistic below.
        rolled = df.groupby(group)['signal'].rolling(window=window, min_periods=window)
        suffix = str(window)

        df[group + 'roll_mean_' + suffix] = rolled.mean().values
        df[group + 'roll_std_' + suffix] = rolled.std().values
        df[group + 'roll_min_' + suffix] = rolled.min().values
        df[group + 'roll_max_' + suffix] = rolled.max().values
        df[group + 'roll_range' + suffix] = df[group + 'roll_max_' + suffix] - df[group + 'roll_min_' + suffix]

        # The quantiles follow the `group` argument as well; they were previously
        # tied to the literal 'group' column. Their names keep the historical
        # un-prefixed form so existing feature names do not change.
        for label, q in quantiles:
            df['roll_' + label + '_' + suffix] = rolled.quantile(q).values

    return df

In [0]:
def calc_expand_stats(df, group='group'):
    """Add expanding (cumulative) statistics of 'signal' inside each group.

    Adds expanding_mean, expanding_std, expanding_max, expanding_min (NaN
    replaced by 0) and expanding_range = expanding_max - expanding_min.
    Results are written back positionally. The frame is modified in place and
    returned.
    """
    for stat in ('mean', 'std', 'max', 'min'):
        expanding = df.groupby(group)['signal'].expanding()
        df['expanding_' + stat] = getattr(expanding, stat)().fillna(0).values
    df['expanding_range'] = df['expanding_max'] - df['expanding_min']

    return df

In [0]:
# Main entry point for feature engineering. Try adding other features here and check whether your score improves :).
def run_feat_engineering(df, batch_size):
    """Add the group id, lead/lag, rolling-statistic and squared-signal features.

    Args:
        df: DataFrame with a 'signal' column.
        batch_size: number of rows per 'group' (see batching).

    Returns:
        The DataFrame returned by the chained feature helpers.

    Variants the author left disabled: rolling windows [50000, 100000] per
    'batch' (500000 rows) instead of per 'group' (4000 rows), and the expanding
    statistics from calc_expand_stats.
    """
    lag_windows = np.asarray(range(1, 3), dtype=np.int32)
    roll_windows = [3, 10, 50, 100,  500, 1000]

    # Group ids first: the lag and rolling helpers work inside each 'group'.
    df = batching(df, batch_size = batch_size)
    df = lag_with_pct_change(df, lag_windows)
    df = calc_roll_stats(df, roll_windows)

    # Squared signal as an extra feature.
    df['signal_2'] = df['signal'] ** 2

    return df

In [0]:
# fillna with the mean and select features for training
def feature_selection(train, test):
    """Pick the model features and fill their missing values.

    Every train column except the bookkeeping/target columns is a feature.
    +/-inf are turned into NaN in copies of both frames, and each feature's NaNs
    are then replaced by that feature's mean over train and test combined.

    Returns:
        (train, test, features): the cleaned copies and the feature names.
    """
    excluded = ('index', 'group', 'open_channels', 'time', 'batch', 'train_group', 'test_group')
    features = [name for name in train.columns if name not in excluded]

    train, test = (frame.replace([np.inf, -np.inf], np.nan) for frame in (train, test))

    for feature in features:
        pooled_mean = pd.concat([train[feature], test[feature]], axis = 0).mean()
        train[feature] = train[feature].fillna(pooled_mean)
        test[feature] = test[feature].fillna(pooled_mean)
    return train, test, features

In [0]:
# Main function: stratified K-fold cross-validation of a LightGBM regressor on 'open_channels'.
def run_cv_model_by_batch(train, test, splits, batch_col, feats, sample_submission):
    """Train one LightGBM regressor per fold and return averaged test predictions.

    The folds come from a shuffled StratifiedKFold on 'open_channels' (the
    GroupKFold variant is no longer used). Each fold model predicts its
    validation rows (collected as out-of-fold predictions) and the whole test
    frame; test predictions are averaged over the folds.

    Args:
        train: DataFrame containing `feats` and 'open_channels'.
        test: DataFrame containing `feats`.
        splits: number of folds; also the divisor for the test average.
        batch_col: unused, kept for call compatibility.
        feats: list of feature column names.
        sample_submission: unused, kept for call compatibility.

    Returns:
        (preds_, oof_): raw (unrounded) regression outputs for the test rows
        and for the train rows, both in positional row order.
    """
    oof_ = np.zeros(len(train))
    preds_ = np.zeros(len(test))
    target = ['open_channels']
    # Honour `splits`: the fold count was previously fixed at 5 while the test
    # average divided by the global SPLITS, so any other value mis-scaled preds_.
    kf = StratifiedKFold(n_splits = splits, shuffle = True, random_state = 42)

    # Identical for every fold, so built once.
    params = {'boosting_type': 'gbdt',
            'metric': 'rmse',
            'objective': 'regression',
            'n_jobs': -1,
            'seed': 236,
            'num_leaves': 280,
            'learning_rate': 0.026623466966581126,
            'max_depth': 73,
            'lambda_l1': 2.959759088169741,
            'lambda_l2': 1.331172832164913,
            'bagging_fraction': 0.9655406551472153,
            'bagging_freq': 9,
            'colsample_bytree': 0.6867118652742716}

    for n_fold, (tr_idx, val_idx) in enumerate(kf.split(train, train[target])):
        train_x, train_y = train.iloc[tr_idx], train[target].iloc[tr_idx]
        valid_x, valid_y = train.iloc[val_idx], train[target].iloc[val_idx]
        print(f'Our training dataset shape is {train_x.shape}')
        print(f'Our validation dataset shape is {valid_x.shape}')

        gc.collect()

        train_set = lgb.Dataset(train_x[feats], train_y)
        val_set = lgb.Dataset(valid_x[feats], valid_y)

        # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments
        # of older LightGBM releases; newer releases expect callbacks -- confirm
        # against the installed version.
        lgb_model = lgb.train(
            params,
            train_set, num_boost_round = 10000,
            early_stopping_rounds = 50,
            valid_sets = [train_set, val_set],
            valid_names = ['train', 'eval'],
            verbose_eval = 100,)

        preds_f = lgb_model.predict(valid_x[feats])
        # The regressor is scored as a classifier: clip to [0, 10] and round.
        f1_score_ = f1_score(valid_y,  np.round(np.clip(preds_f, 0, 10)).astype(int), average = 'macro')
        print(f'Training fold {n_fold + 1} completed. macro f1 score : {f1_score_ :1.5f}')
        oof_[val_idx] += preds_f
        te_preds = lgb_model.predict(test[feats])
        preds_ += te_preds / splits

    return preds_, oof_

In [0]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the smallest dtype whose range holds their values.

    Columns of dtype int16/int32/int64/float16/float32/float64 are considered;
    'open_channels' is always left untouched. Integer columns become the first
    of int8/int16/int32/int64 whose limits strictly contain the column's
    min and max. Other numeric columns become float16 or float32 on the same
    rule, and float64 otherwise. The frame is modified in place and returned.

    Args:
        df: DataFrame to shrink.
        verbose: when true, print the final size and the percentage saved.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        if col == 'open_channels':
            continue
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # No candidate fits only when a value sits on the int64 limits;
            # the column is then left as it is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            chosen = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    chosen = candidate
                    break
            df[col] = df[col].astype(chosen)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [27]:
# Load the signals and class probabilities, label the 50-second batches,
# clean the noisy training stretch and standardise the signal.
print('Reading Data Started...')
train, test, sample_submission = read_data()
train, test = batching_10(train, test)
train = create_signal_mod(train)

# Disabled alternative: smooth the raw signal with the Kalman1D helper, which is
# itself commented out in an earlier cell.
# # Kalman Filter
# observation_covariance = .0015
# train['signal'] = Kalman1D(train.signal.values,observation_covariance)
# test['signal'] = Kalman1D(test.signal.values,observation_covariance)

train, test = normalize(train, test)
print('Reading and Normalizing Data Completed')

Reading Data Started...
Reading and Normalizing Data Completed


In [28]:
print('Creating Features')
print('Feature Engineering Started...')

# Flag the index ranges marked as having more than 9 open-channel classes.
train, test = add_category(train, test)

# train, test = batching_10(train, test)

# Per-group lag, rolling and squared-signal features.
train = run_feat_engineering(train, batch_size = GROUP_BATCH_SIZE)
test = run_feat_engineering(test, batch_size = GROUP_BATCH_SIZE)

print('Reduce memory usage...')
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

# tr_clean = pd.read_csv('./train_clean.csv', dtype={'time': np.float32, 'signal': np.float32, 'open_channels':np.int32})
# ts_clean = pd.read_csv('./test_clean.csv', dtype={'time': np.float32, 'signal': np.float32})
# tr_clean, rs_clean = batching_10(tr_clean, ts_clean)
# tr_clean = create_signal_mod(tr_clean)

# Signal-processing features (gradients, filters, EWM) computed per 500000-row
# bag. The 'signal' column is dropped from the result because train/test
# already carry it.
pre_train = divide_and_add_features(train.signal, signal_size=500000)
pre_test = divide_and_add_features(test.signal, signal_size=500000)
pre_train.drop('signal', axis=1, inplace=True)
pre_test.drop('signal', axis=1, inplace=True)
pre_train.reset_index(inplace = True, drop = True)
pre_test.reset_index(inplace = True, drop = True)
train = pd.concat([train, pre_train], axis=1)
test = pd.concat([test, pre_test], axis=1)

# del pre_train, pre_test, tr_clean, ts_clean
del pre_train, pre_test
# Call the collector: the bare name `gc.collect` only references the function
# and frees nothing.
gc.collect()

print('Reduce memory usage...')
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

train, test, features = feature_selection(train, test)
print('Feature Engineering Completed...')

Creating Features
Feature Engineering Started...
Reduce memory usage...
Mem. usage decreased to 1004.00 Mb (69.4% reduction)


  0%|          | 0/10 [00:00<?, ?it/s]

Mem. usage decreased to 417.60 Mb (68.5% reduction)


100%|██████████| 10/10 [00:09<00:00,  1.04it/s]
100%|██████████| 4/4 [00:03<00:00,  1.02it/s]


Reduce memory usage...
Mem. usage decreased to 1518.99 Mb (48.5% reduction)
Mem. usage decreased to 623.59 Mb (47.9% reduction)
Feature Engineering Completed...


In [29]:
# Repeat the downcast on the final feature frames.
# NOTE(review): the recorded output of this cell reports a 0.0% reduction, so it
# looks redundant after the previous cell -- confirm before removing.
print('Reduce memory usage...')
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Reduce memory usage...
Mem. usage decreased to 1518.99 Mb (0.0% reduction)
Mem. usage decreased to 623.59 Mb (0.0% reduction)


In [0]:
def train_grouping(train, test):
    """Tag index ranges of train and test with a model-group number (0-4).

    Writes 'train_group' into train and 'test_group' into test. Both frames are
    modified in place and returned as (train, test).

    .loc label slices include both ends, so neighbouring ranges share their
    boundary label; the ranges are applied in order and the later one wins.
    """
    # (first label, last label, group); trailing comments are the author's batch numbers
    train_ranges = (
        (0, 1000000, 0),        # batch 0 and 1
        (1000000, 1500000, 1),  # batch 2
        (1500000, 2000000, 2),  # batch 3
        (2000000, 2500000, 3),  # batch 4
        (2500000, 3000000, 4),  # batch 5
        (3000000, 3500000, 1),  # batch 6
        (3500000, 4000000, 2),  # batch 7
        (4000000, 4500000, 4),  # batch 8
        (4500000, 5000001, 3),  # batch 9
    )
    # A last label of None means "through the end of the frame".
    test_ranges = (
        (0, 100000, 0),
        (100000, 200000, 2),
        (200000, 300000, 4),
        (300000, 400000, 0),
        (400000, 500000, 1),
        (500000, 600000, 3),
        (600000, 700000, 4),
        (700000, 800000, 3),
        (800000, 900000, 0),
        (900000, 1000000, 2),
        (1000000, None, 0),
    )

    for first, last, group_id in train_ranges:
        train.loc[first:last, 'train_group'] = group_id
    for first, last, group_id in test_ranges:
        test.loc[first:last, 'test_group'] = group_id

    return train, test

In [31]:
# print(f'Training lgb model with {SPLITS} folds of GroupKFold Started...')
# run_cv_model_by_batch(train, test, SPLITS, 'group', features, sample_submission)
# print('Training completed...')

# Tag every row with its model group. This must be a tuple unpacking: the
# previous `train. test = ...` parsed as an assignment to the attribute
# `train.test` and left the names train/test unassigned.
train, test = train_grouping(train, test)
preds = []
oofs = []
print('Training started')
# One cross-validated model per group; groups are numbered 0-4.
for i in range(5):
  print(f'Training train_group {i+1} started...')
  group_train = train[train['train_group'] == i]
  group_test = test[test['test_group'] == i]
  pred, oof = run_cv_model_by_batch(group_train, group_test, SPLITS, 'group', features, sample_submission)
  # Results are kept per group and put back into row order by
  # sort_pred_data / sort_oof_data.
  preds.append(pred)
  oofs.append(oof)
  print(f'Training train_group {i+1} completed...')
print('Training completed...')

Training started
Training train_group 1 started...
Our training dataset shape is (800000, 137)
Our validation dataset shape is (200000, 137)
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 0.0192802	eval's rmse: 0.0209293
[200]	train's rmse: 0.0127243	eval's rmse: 0.0166619
[300]	train's rmse: 0.0107137	eval's rmse: 0.0165816
Early stopping, best iteration is:
[264]	train's rmse: 0.0113591	eval's rmse: 0.0165659
Training fold 1 completed. macro f1 score : 0.99746
Our training dataset shape is (800000, 137)
Our validation dataset shape is (200000, 137)
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 0.0192884	eval's rmse: 0.0206367
[200]	train's rmse: 0.0127833	eval's rmse: 0.0164833
Early stopping, best iteration is:
[207]	train's rmse: 0.0126253	eval's rmse: 0.0164672
Training fold 2 completed. macro f1 score : 0.99728
Our training dataset shape is (800000, 137)
Our validation dataset shape is (200000, 137)
Training u

In [0]:
def sort_oof_data(arr):
  """Interleave the per-group out-of-fold arrays into one flat list.

  Args:
    arr: sequence of five array-likes supporting slicing and .tolist(),
      indexed by group number.

  Returns:
    A list made of all of arr[0], then the first 500000 entries of groups
    1, 2, 3, 4, then the remaining entries of groups 1, 2, 4, 3 (in that
    order).

  NOTE(review): this order presumably mirrors the index ranges assigned in
  train_grouping -- keep the two in sync.
  """
  half = 500000
  pieces = [arr[0][0:]]
  pieces += [arr[g][0:half] for g in (1, 2, 3, 4)]
  pieces += [arr[g][half:] for g in (1, 2, 4, 3)]

  ordered = []
  for piece in pieces:
    ordered.extend(piece.tolist())
  return ordered

# Flat list of out-of-fold predictions, rebuilt from the per-group arrays in `oofs`.
full_oof = sort_oof_data(oofs)

In [0]:
def sort_pred_data(arr):
  """Interleave the per-group test prediction arrays into one flat list.

  Args:
    arr: sequence of five array-likes supporting slicing and .tolist(),
      indexed by group number.

  Returns:
    A list built by concatenating the slices listed in `layout`, in order.

  NOTE(review): the layout presumably mirrors the 100000-row ranges assigned
  in train_grouping (test_group) -- keep the two in sync.
  """
  step = 100000
  # (group, start, stop); stop=None means "to the end of that group's array"
  layout = (
      (0, 0, step),
      (2, 0, step),
      (4, 0, step),
      (0, step, 2 * step),
      (1, 0, step),
      (3, 0, step),
      (4, step, None),
      (3, step, None),
      (0, 2 * step, 3 * step),
      (2, step, None),
      (0, 3 * step, None),
  )

  ordered = []
  for group_id, start, stop in layout:
    ordered.extend(arr[group_id][start:stop].tolist())
  return ordered

# Flat list of test predictions, rebuilt from the per-group arrays in `preds`.
full_pred = sort_pred_data(preds)

In [34]:
# Turn the regression outputs into class labels (clip to [0, 10], round) and
# report the macro F1 of the out-of-fold predictions against the training labels.
f1_score_ = f1_score(train.open_channels,  np.round(np.clip(full_oof, 0, 10)).astype(int), average = 'macro')
print(f'Training completed. oof macro f1 score : {f1_score_:1.5f}')
# Apply the same clip-and-round to the test predictions and write the submission;
# float columns are written with 4 decimals.
sample_submission['open_channels'] = np.round(np.clip(full_pred, 0, 10)).astype(int)
sample_submission.to_csv('lgb_submission.csv', index=False, float_format='%.4f')

Training completed. oof macro f1 score : 0.94008


In [38]:
# Upload lgb_submission.csv to the competition with the message 'lgb'.
!kaggle competitions submit -f './lgb_submission.csv' -m 'lgb' liverpool-ion-switching

100% 21.0M/21.0M [00:04<00:00, 4.65MB/s]
Successfully submitted to University of Liverpool - Ion Switching

training with clean_kalman
> Training completed. oof macro f1 score : 0.93940  
> LB : 0.940  

clean_kalman  
add category  
add 20shifted
> Training completed. oof macro f1 score : 0.93941  
> LB : 0.940

clean_kalman  
add category  
add 20shifted  
add target encoding(group)
> Training completed. oof macro f1 score : 0.93880  
> LB : 0.653

clean_kalman + add category + add 20shifted
> Training completed. oof macro f1 score : 0.93953  
> LB : 0.942  

clean_kalman + add category + add 20shifted + target encoding(category)  
> Training completed. oof macro f1 score : 0.93842  
> LB : 0.941