In [None]:
import datetime
import gc
import os
import time
import warnings
from itertools import product

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import numpy as npp  # original (likely mistyped) alias, kept in case other cells reference it
import pandas as pd
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostRegressor
from joblib import Parallel, delayed
from scipy import stats
from scipy.signal import convolve
from scipy.signal import hann
from scipy.signal import hilbert
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from tqdm import tqdm_notebook
from tsfresh.feature_extraction import feature_calculators

%matplotlib inline
pd.options.display.precision = 15
warnings.filterwarnings("ignore")

In [None]:
# Create a training file with simple derived features

def add_trend_feature(arr, abs_values=False):
    """Return the slope of a straight line fitted to ``arr`` against its positions.

    Parameters
    ----------
    arr : array-like
        One-dimensional sequence of values.
    abs_values : bool, default False
        When True, the line is fitted to ``np.abs(arr)`` instead of ``arr``.

    Returns
    -------
    The first (and only) coefficient of the fitted ``LinearRegression``.
    """
    values = np.abs(arr) if abs_values else arr
    # Positions 0..n-1 form the single regressor column.
    positions = np.arange(len(values)).reshape(-1, 1)
    model = LinearRegression()
    model.fit(positions, values)
    return model.coef_[0]

def classic_sta_lta(x, length_sta, length_lta):
    """Compute the ratio of a short-term to a long-term average of ``x ** 2``.

    Both averages are derived from one cumulative sum of the squared signal.

    Parameters
    ----------
    x : array-like
        One-dimensional signal (NumPy array or pandas Series).
    length_sta : int
        Short-term window length in samples; must be positive.
    length_lta : int
        Long-term window length in samples; must be positive.

    Returns
    -------
    numpy.ndarray
        Float64 array with the same length as ``x``. The first
        ``length_lta - 1`` entries are zero.

    Raises
    ------
    ValueError
        If either window length is not positive.
    """
    if length_sta <= 0 or length_lta <= 0:
        raise ValueError('length_sta and length_lta must be positive')

    # Convert to float64 *before* squaring so narrow integer dtypes cannot
    # overflow. ``np.float64`` replaces the ``np.float`` alias, which was
    # removed in NumPy 1.24.
    sta = np.cumsum(np.asarray(x, dtype=np.float64) ** 2)

    # Copy for LTA
    lta = sta.copy()

    # Windowed sums via differences of the cumulative sum, then averages.
    sta[length_sta:] = sta[length_sta:] - sta[:-length_sta]
    sta /= length_sta
    lta[length_lta:] = lta[length_lta:] - lta[:-length_lta]
    lta /= length_lta

    # Pad zeros where the long-term window is not yet full.
    sta[:length_lta - 1] = 0

    # Avoid division by zero by setting zero values to tiny float
    dtiny = np.finfo(0.0).tiny
    lta[lta < dtiny] = dtiny

    return sta / lta

def calc_change_rate(x):
    """Return the mean relative change between consecutive samples of ``x``.

    The relative change is ``(x[i + 1] - x[i]) / x[i]``. Changes that are
    exactly zero, NaN or infinite are discarded before averaging.

    Parameters
    ----------
    x : array-like
        One-dimensional signal; a pandas Series, NumPy array or list.

    Returns
    -------
    float
        Mean of the remaining changes, or NaN when none remain.
    """
    # np.asarray lets plain arrays and lists work; the original relied on the
    # pandas-only ``.values`` attribute.
    values = np.asarray(x, dtype=np.float64)
    # Division by a zero sample yields inf/NaN, which is filtered out below.
    with np.errstate(divide='ignore', invalid='ignore'):
        change = np.diff(values) / values[:-1]
    change = change[np.isfinite(change) & (change != 0)]
    if change.size == 0:
        return np.nan
    return np.mean(change)

In [None]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
pd.options.display.precision = 15

import lightgbm as lgb
import xgboost as xgb
import time
import datetime
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
import gc
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from scipy.signal import hilbert
from scipy.signal import hann
from scipy.signal import convolve
from scipy import stats
from sklearn.kernel_ridge import KernelRidge
from itertools import product

from tsfresh.feature_extraction import feature_calculators
from joblib import Parallel, delayed
# Create a training file with simple derived features

def add_trend_feature(arr, abs_values=False):
    """Return the slope of a straight line fitted to ``arr`` against its positions.

    Parameters
    ----------
    arr : array-like
        One-dimensional sequence of values.
    abs_values : bool, default False
        When True, the line is fitted to ``np.abs(arr)`` instead of ``arr``.

    Returns
    -------
    The first (and only) coefficient of the fitted ``LinearRegression``.
    """
    values = np.abs(arr) if abs_values else arr
    # Positions 0..n-1 form the single regressor column.
    positions = np.arange(len(values)).reshape(-1, 1)
    model = LinearRegression()
    model.fit(positions, values)
    return model.coef_[0]

def classic_sta_lta(x, length_sta, length_lta):
    """Compute the ratio of a short-term to a long-term average of ``x ** 2``.

    Both averages are derived from one cumulative sum of the squared signal.

    Parameters
    ----------
    x : array-like
        One-dimensional signal (NumPy array or pandas Series).
    length_sta : int
        Short-term window length in samples; must be positive.
    length_lta : int
        Long-term window length in samples; must be positive.

    Returns
    -------
    numpy.ndarray
        Float64 array with the same length as ``x``. The first
        ``length_lta - 1`` entries are zero.

    Raises
    ------
    ValueError
        If either window length is not positive.
    """
    if length_sta <= 0 or length_lta <= 0:
        raise ValueError('length_sta and length_lta must be positive')

    # Convert to float64 *before* squaring so narrow integer dtypes cannot
    # overflow. ``np.float64`` replaces the ``np.float`` alias, which was
    # removed in NumPy 1.24.
    sta = np.cumsum(np.asarray(x, dtype=np.float64) ** 2)

    # Copy for LTA
    lta = sta.copy()

    # Windowed sums via differences of the cumulative sum, then averages.
    sta[length_sta:] = sta[length_sta:] - sta[:-length_sta]
    sta /= length_sta
    lta[length_lta:] = lta[length_lta:] - lta[:-length_lta]
    lta /= length_lta

    # Pad zeros where the long-term window is not yet full.
    sta[:length_lta - 1] = 0

    # Avoid division by zero by setting zero values to tiny float
    dtiny = np.finfo(0.0).tiny
    lta[lta < dtiny] = dtiny

    return sta / lta

def calc_change_rate(x):
    """Return the mean relative change between consecutive samples of ``x``.

    The relative change is ``(x[i + 1] - x[i]) / x[i]``. Changes that are
    exactly zero, NaN or infinite are discarded before averaging.

    Parameters
    ----------
    x : array-like
        One-dimensional signal; a pandas Series, NumPy array or list.

    Returns
    -------
    float
        Mean of the remaining changes, or NaN when none remain.
    """
    # np.asarray lets plain arrays and lists work; the original relied on the
    # pandas-only ``.values`` attribute.
    values = np.asarray(x, dtype=np.float64)
    # Division by a zero sample yields inf/NaN, which is filtered out below.
    with np.errstate(divide='ignore', invalid='ignore'):
        change = np.diff(values) / values[:-1]
    change = change[np.isfinite(change) & (change != 0)]
    if change.size == 0:
        return np.nan
    return np.mean(change)
class FeatureGenerator(object):
    """Turns acoustic segments into one row of engineered features each.

    In ``'train'`` mode the training CSV is streamed in chunks of
    ``chunk_size`` rows and every chunk becomes one segment. In any other mode
    the segment ids listed in the sample submission are read from individual
    CSV files.

    Parameters
    ----------
    dtype : str
        ``'train'`` selects the training file; any other value selects the
        test segments.
    n_jobs : int, default 1
        Number of joblib workers used by :meth:`generate`.
    chunk_size : int or None, default None
        Rows per training segment (required in train mode). In test mode it is
        the number of trailing rows kept from each file; ``None`` keeps the
        whole file.
    input_dir : str, default '../input'
        Directory holding ``train.csv``, ``sample_submission.csv`` and the
        ``test/`` folder.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive, or is missing in train mode.
    """

    def __init__(self, dtype, n_jobs=1, chunk_size=None, input_dir='../input'):
        if chunk_size is not None and chunk_size <= 0:
            raise ValueError('chunk_size must be a positive integer')
        self.chunk_size = chunk_size
        self.dtype = dtype
        self.filename = None
        self.n_jobs = n_jobs
        self.input_dir = input_dir
        self.test_files = []
        if self.dtype == 'train':
            if self.chunk_size is None:
                # Previously this surfaced as an opaque TypeError on the division below.
                raise ValueError("chunk_size is required when dtype == 'train'")
            self.filename = f'{input_dir}/train.csv'
            # 629145481 is the hard-coded row count used to size the progress
            # bar. The division is floored, so a trailing partial chunk is not
            # counted in the total.
            self.total_data = int(629145481 / self.chunk_size)
        else:
            submission = pd.read_csv(f'{input_dir}/sample_submission.csv')
            for seg_id in submission.seg_id.values:
                self.test_files.append((seg_id, f'{input_dir}/test/{seg_id}.csv'))
            self.total_data = int(len(submission))

    def read_chunks(self):
        """Yield ``(seg_id, x, y)`` tuples, one per segment.

        ``x`` is the ``acoustic_data`` column as a float64 array. In train
        mode ``y`` is the last ``time_to_failure`` value of the chunk and the
        id is ``'train_<chunk number>'``. In test mode ``y`` is the
        placeholder ``-999``.
        """
        if self.dtype == 'train':
            iter_df = pd.read_csv(self.filename, iterator=True, chunksize=self.chunk_size,
                                  dtype={'acoustic_data': np.float64, 'time_to_failure': np.float64})
            for counter, df in enumerate(iter_df):
                x = df.acoustic_data.values
                y = df.time_to_failure.values[-1]
                seg_id = 'train_' + str(counter)
                del df
                yield seg_id, x, y
        else:
            for seg_id, f in self.test_files:
                df = pd.read_csv(f, dtype={'acoustic_data': np.float64})
                x = df.acoustic_data.values
                if self.chunk_size is not None:
                    # Keep only the trailing chunk_size samples.
                    x = x[-self.chunk_size:]
                del df
                yield seg_id, x, -999

    def get_features(self, x, y, seg_id):
        """
        Gets three groups of features: from the original data and from the real
        and imaginary parts of its FFT.

        FFT-based features are stored under the same names prefixed with
        ``fftr_`` (real part) and ``ffti_`` (imaginary part).
        """
        x = pd.Series(x)

        zc = np.fft.fft(x)
        realFFT = pd.Series(np.real(zc))
        imagFFT = pd.Series(np.imag(zc))

        main_dict = self.features(x, y, seg_id)
        for prefix, series in (('fftr', realFFT), ('ffti', imagFFT)):
            for k, v in self.features(series, y, seg_id).items():
                if k not in ['target', 'seg_id']:
                    main_dict[f'{prefix}_{k}'] = v

        return main_dict

    def features(self, x, y, seg_id):
        """Compute the feature dictionary for one pandas Series ``x``.

        The dictionary starts with ``'target'`` (``y``) and ``'seg_id'``,
        followed by the features in the order they are computed below.

        NOTE(review): the large commented-out catalogue of experimental
        features that used to live in this method was removed for
        readability; only the features that were actually computed remain.
        """
        feature_dict = dict()
        feature_dict['target'] = y
        feature_dict['seg_id'] = seg_id

        # lists with parameters to iterate over them
        percentiles = [1, 5, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 99]
        percentiles1 = [5, 10, 20]
        spans = [300, 3000, 30000, 50000]
        windows = [10, 50, 100, 500, 1000, 10000]
        borders = list(range(-4000, 4001, 1000))

        feature_dict['std_first_50000'] = x[:50000].std()

        # percentiles on original and absolute values
        feature_dict['percentile_25'] = np.percentile(x, 25)

        abs_x = np.abs(x)
        for p in percentiles1:
            feature_dict[f'abs_percentile_{p}'] = np.percentile(abs_x, p)

        # exponential rolling statistics
        for s in spans:
            # ewm().std() is called without ``skipna``: pandas 2.x rejects that
            # keyword there. The outer Series.std keeps it.
            feature_dict[f'exp_Moving_std_{s}_std'] = x.ewm(span=s).std().std(skipna=True)

        feature_dict['iqr'] = np.subtract(*np.percentile(x, [75, 25]))

        # counts of large / small absolute values in the trailing samples
        for slice_length in [50000, 100000, 150000]:
            abs_tail = abs_x[-slice_length:]
            for threshold in [10, 50, 100]:
                feature_dict[f'count_big_{slice_length}_threshold_{threshold}'] = (abs_tail > threshold).sum()
                feature_dict[f'count_big_{slice_length}_less_threshold_{threshold}'] = (abs_tail < threshold).sum()

        for i, j in zip(borders, borders[1:]):
            feature_dict[f'range_{i}_{j}'] = feature_calculators.range_count(x, i, j)

        feature_dict['num_peaks_10'] = feature_calculators.number_peaks(x, 10)

        # statistics on rolling windows of various sizes; each rolling std is
        # computed once and reused by every feature that needs it
        roll_std = {w: x.rolling(w).std().dropna().values for w in windows}

        feature_dict['min_roll_std_1000'] = roll_std[1000].min()
        feature_dict['min_roll_std_10000'] = roll_std[10000].min()

        feature_dict['std_roll_std_10'] = roll_std[10].std()

        for w in [1000, 10000]:
            feature_dict[f'abs_max_roll_std_{w}'] = np.abs(roll_std[w]).max()

        mean_percentiles = [70, 99, 10]
        for w in windows:
            x_roll_mean = x.rolling(w).mean().dropna().values

            for p, value in zip(percentiles, np.percentile(roll_std[w], percentiles)):
                feature_dict[f'percentile_roll_std_{p}_window_{w}'] = value

            for p, value in zip(mean_percentiles, np.percentile(x_roll_mean, mean_percentiles)):
                feature_dict[f'percentile_roll_mean_{p}_window_{w}'] = value

        return feature_dict

    def generate(self):
        """Compute features for every segment and return them as a DataFrame.

        Segments from :meth:`read_chunks` are processed by ``n_jobs`` joblib
        workers using the threading backend; each result dictionary becomes
        one row.
        """
        res = Parallel(n_jobs=self.n_jobs,
                       backend='threading')(delayed(self.get_features)(x, y, s)
                                            for s, x, y in tqdm_notebook(self.read_chunks(), total=self.total_data))
        return pd.DataFrame(list(res))


In [None]:
# Build the per-segment feature tables for the training and test data.
training_fg = FeatureGenerator(dtype='train', n_jobs=20, chunk_size=150000)
test_fg = FeatureGenerator(dtype='test', n_jobs=20, chunk_size=150000)

training_data = training_fg.generate()
test_data = test_fg.generate()

# Separate the bookkeeping columns from the feature matrices.
train_y1 = training_data['target']
test_segs = test_data['seg_id']
train_X1 = training_data.drop(columns=['target', 'seg_id'])
test_X1 = test_data.drop(columns=['target', 'seg_id'])

In [None]:
train_X = train_X1.iloc[:-1,1:]
train_X.head(3)
train_X.shape

In [None]:
test_X = test_X1.iloc[:,1:]
print('test_X',test_X.shape)
test_X.head(3)

In [None]:
print(train_y1.shape)
print(train_y1.head(3))
train_y = train_y1.iloc[:-1]
print(train_y.shape)
train_y.head(3)

In [None]:
# Replace non-finite entries (NaN, +inf, -inf) in each training column with the
# mean of that column's finite values, and remember the mean per column so the
# same value can be reused for the test set.
# The original only acted on columns containing NaN and let +inf leak into the
# mean; StandardScaler cannot handle infinite values.
means_dict = {}
for col in train_X.columns:
    finite_mask = np.isfinite(train_X[col])
    if finite_mask.all():
        continue
    print(col)
    mean_value = train_X.loc[finite_mask, col].mean()
    train_X[col] = train_X[col].where(finite_mask, mean_value)
    means_dict[col] = mean_value

In [None]:
# Fill non-finite test entries (NaN, +inf, -inf) with the training mean.
for col in test_X.columns:
    finite_mask = np.isfinite(test_X[col])
    if finite_mask.all():
        continue
    if col in means_dict:
        fill_value = means_dict[col]
    else:
        # The training column needed no imputation, so no mean was stored;
        # the original raised KeyError here. Use the finite training mean.
        fill_value = train_X.loc[np.isfinite(train_X[col]), col].mean()
    test_X[col] = test_X[col].where(finite_mask, fill_value)

In [None]:
# Standardise both feature matrices with statistics learned from the training set only.
scaler = StandardScaler().fit(train_X)

scaled_train_X = pd.DataFrame(scaler.transform(train_X), columns=train_X.columns)
scaled_test_X = pd.DataFrame(scaler.transform(test_X), columns=test_X.columns)

In [None]:
# Names of the features in the "purchased" set, selected by name from the scaled matrices.
purchased = [
    'abs_max_roll_std_10000',
    'ffti_abs_percentile_20',
    'ffti_count_big_100000_less_threshold_100',
    'ffti_count_big_100000_threshold_50',
    'ffti_count_big_150000_less_threshold_50',
    'ffti_count_big_150000_threshold_50',
    'ffti_percentile_roll_std_10_window_10',
    'ffti_percentile_roll_std_25_window_10',
    'ffti_percentile_roll_std_50_window_1000',
    'ffti_percentile_roll_std_50_window_500',
    'ffti_percentile_roll_std_70_window_500',
    'ffti_percentile_roll_std_80_window_10000',
    'ffti_percentile_roll_std_80_window_50',
    'ffti_percentile_roll_std_80_window_500',
    'ffti_percentile_roll_std_99_window_500',
    'ffti_range_-4000_-3000',
    'ffti_range_2000_3000',
    'fftr_count_big_100000_threshold_100',
    'fftr_exp_Moving_std_30000_std',
    'fftr_exp_Moving_std_3000_std',
    'fftr_exp_Moving_std_300_std',
    'fftr_exp_Moving_std_50000_std',
    'fftr_percentile_roll_mean_10_window_10',
    'fftr_percentile_roll_mean_99_window_10',
    'fftr_percentile_roll_std_20_window_10',
    'fftr_percentile_roll_std_60_window_500',
    'fftr_percentile_roll_std_70_window_10',
    'fftr_percentile_roll_std_70_window_1000',
    'fftr_percentile_roll_std_70_window_50',
    'fftr_percentile_roll_std_75_window_1000',
    'fftr_percentile_roll_std_75_window_10000',
    'fftr_percentile_roll_std_75_window_50',
    'fftr_percentile_roll_std_99_window_100',
    'fftr_percentile_roll_std_99_window_500',
    'fftr_range_-3000_-2000',
    'fftr_range_-4000_-3000',
    'fftr_std_roll_std_10',
    'percentile_roll_mean_99_window_50',
    'percentile_roll_std_10_window_1000',
    'percentile_roll_std_10_window_50',
    'percentile_roll_std_10_window_500',
    'percentile_roll_std_1_window_1000',
    'percentile_roll_std_20_window_100',
    'percentile_roll_std_20_window_1000',
    'percentile_roll_std_25_window_10',
    'percentile_roll_std_25_window_100',
    'percentile_roll_std_25_window_50',
    'percentile_roll_std_25_window_500',
    'percentile_roll_std_40_window_500',
    'percentile_roll_std_50_window_1000',
    'percentile_roll_std_5_window_1000',
    'percentile_roll_std_5_window_10000',
    'percentile_roll_std_5_window_50',
    'percentile_roll_std_5_window_500',
    'percentile_roll_std_90_window_10000',
    'percentile_roll_std_95_window_500',
    'std_first_50000',
]

In [None]:
# Names of the features in the "candidate" set, selected by name from the scaled matrices.
# The original list named 'ffti_percentile_roll_std_60_window_500' and
# 'ffti_percentile_roll_std_70_window_10000' twice, which duplicated those
# columns in the selected frames; each name now appears once, at its first position.
candidate = [
    'count_big_100000_less_threshold_10',
    'ffti_abs_percentile_10',
    'ffti_abs_percentile_5',
    'ffti_count_big_100000_threshold_100',
    'ffti_count_big_50000_less_threshold_50',
    'ffti_percentile_roll_std_40_window_100',
    'ffti_percentile_roll_std_40_window_50',
    'ffti_percentile_roll_std_50_window_10',
    'ffti_percentile_roll_std_50_window_100',
    'ffti_percentile_roll_std_50_window_50',
    'ffti_percentile_roll_std_60_window_10',
    'ffti_percentile_roll_std_60_window_100',
    'ffti_percentile_roll_std_60_window_50',
    'ffti_percentile_roll_std_60_window_500',
    'ffti_percentile_roll_std_70_window_10000',
    'ffti_percentile_roll_std_75_window_10000',
    'ffti_range_-3000_-2000',
    'fftr_abs_max_roll_std_1000',
    'fftr_abs_percentile_10',
    'fftr_percentile_roll_std_25_window_10',
    'fftr_percentile_roll_std_30_window_10',
    'fftr_percentile_roll_std_40_window_100',
    'fftr_percentile_roll_std_40_window_50',
    'fftr_percentile_roll_std_50_window_100',
    'fftr_percentile_roll_std_60_window_1000',
    'fftr_percentile_roll_std_70_window_100',
    'fftr_percentile_roll_std_80_window_100',
    'fftr_percentile_roll_std_80_window_500',
    'iqr',
    'min_roll_std_1000',
    'min_roll_std_10000',
    'num_peaks_10',
    'percentile_25',
    'percentile_roll_std_10_window_10',
    'percentile_roll_std_10_window_100',
    'percentile_roll_std_1_window_10000',
    'percentile_roll_std_1_window_50',
    'percentile_roll_std_1_window_500',
    'percentile_roll_std_20_window_500',
    'percentile_roll_std_25_window_1000',
    'percentile_roll_std_30_window_100',
    'percentile_roll_std_30_window_1000',
    'percentile_roll_std_30_window_500',
    'percentile_roll_std_40_window_100',
    'percentile_roll_std_40_window_1000',
    'percentile_roll_std_50_window_100',
    'percentile_roll_std_5_window_100',
    'percentile_roll_std_60_window_10000',
    'percentile_roll_std_80_window_100',
    'fftr_percentile_roll_std_70_window_10000',
    'percentile_roll_std_40_window_50',
    'ffti_percentile_roll_std_25_window_10',
    'fftr_range_2000_3000',
    'fftr_percentile_roll_std_60_window_50',
    'fftr_percentile_roll_std_70_window_10',
    'ffti_percentile_roll_mean_70_window_10',
    'percentile_roll_std_95_window_1000',
]

In [None]:
train_X_purchased = scaled_train_X[purchased]
test_X_purchased = scaled_test_X[purchased]
print(train_X_purchased.shape)

In [None]:
train_X_candidate = scaled_train_X[candidate]
test_X_candidate  = scaled_test_X[candidate]
print(train_X_candidate.shape)

In [None]:
columns_purchased = train_X_purchased.columns
columns_candidate = train_X_candidate.columns

In [None]:
train_X_partial = scaled_train_X[columns_purchased]
test_X_partial = scaled_test_X[columns_purchased]
print(train_X_partial.shape, test_X_partial.shape)

In [None]:
#import required packages
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import gc
from hyperopt import hp, tpe, Trials, STATUS_OK
from hyperopt.fmin import fmin
from hyperopt.pyll.stochastic import sample
#optional but advised
import warnings
warnings.filterwarnings('ignore')

# ---- Global hyperopt parameters ----
# Number of hyperopt evaluation rounds.
NUM_EVALS = 1000
# Number of cross-validation folds on the data in each evaluation round.
N_FOLDS = 5

# ---- LightGBM parameters ----
# Maximum number of leaves per tree.
LGBM_MAX_LEAVES = 1 << 10
# Maximum tree depth.
LGBM_MAX_DEPTH = 25
# Regression metric. Note that 'rmse' is more commonly used.
EVAL_METRIC_LGBM_REG = 'mae'
# Classification metric.
EVAL_METRIC_LGBM_CLASS = 'auc'

# ---- XGBoost parameters ----
# Maximum number of leaves when using histogram splitting.
XGB_MAX_LEAVES = 1 << 12
# Maximum tree depth.
XGB_MAX_DEPTH = 25
# Regression metric.
EVAL_METRIC_XGB_REG = 'mae'
# Classification metric.
EVAL_METRIC_XGB_CLASS = 'auc'

# ---- CatBoost parameters ----
# Maximum tree depth.
CB_MAX_DEPTH = 8
# Regression objective.
OBJECTIVE_CB_REG = 'MAE'
# Classification objective.
OBJECTIVE_CB_CLASS = 'Logloss'

# ---- Optional output ----
BEST_SCORE = 0

# Class: set to True when solving a classification problem; the default (False) is regression.
# data: the feature matrix, one sample per row.
# labels: the target (y) value for each sample.
def quick_hyperopt(data, labels, package='lgbm', num_evals=NUM_EVALS, diagnostic=False, Class=False):
    
    #==========
    #LightGBM
    #==========
    
    if package=='lgbm':
        
        print('Running {} rounds of LightGBM parameter optimisation:'.format(num_evals))
        #clear space
        gc.collect()
        
        integer_params = ['max_depth',
                         'num_leaves',
                          'max_bin',
                         'min_data_in_leaf',
                         'min_data_in_bin']
        
        def objective(space_params):
            
            #cast integer params from float to int
            for param in integer_params:
                space_params[param] = int(space_params[param])
            
            #extract nested conditional parameters
            if space_params['boosting']['boosting'] == 'goss':
                top_rate = space_params['boosting'].get('top_rate')
                other_rate = space_params['boosting'].get('other_rate')
                #0 <= top_rate + other_rate <= 1
                top_rate = max(top_rate, 0)
                top_rate = min(top_rate, 0.5)
                other_rate = max(other_rate, 0)
                other_rate = min(other_rate, 0.5)
                space_params['top_rate'] = top_rate
                space_params['other_rate'] = other_rate
            
            subsample = space_params['boosting'].get('subsample', 1.0)
            space_params['boosting'] = space_params['boosting']['boosting']
            space_params['subsample'] = subsample
            
            if Class:
                cv_results = lgb.cv(space_params, train, nfold = N_FOLDS, stratified=True,
                                    early_stopping_rounds=100, metrics=EVAL_METRIC_LGBM_CLASS, seed=42)
                best_loss = 1 - cv_results['auc-mean'][-1]
                
            else:
                cv_results = lgb.cv(space_params, train, nfold = N_FOLDS, stratified=False,
                                    early_stopping_rounds=100, metrics=EVAL_METRIC_LGBM_REG, seed=42)
                best_loss = cv_results['l1-mean'][-1] #'l2-mean' for rmse
            
            return{'loss':best_loss, 'status': STATUS_OK }
        
        train = lgb.Dataset(data, labels)
                
        #integer and string parameters, used with hp.choice()
        boosting_list = [{'boosting': 'gbdt',
                          'subsample': hp.uniform('subsample', 0.5, 1)},
                         {'boosting': 'goss',
                          'subsample': 1.0,
                         'top_rate': hp.uniform('top_rate', 0, 0.5),
                         'other_rate': hp.uniform('other_rate', 0, 0.5)}] #if including 'dart', make sure to set 'n_estimators'
        
        if Class:
            metric_list = ['auc'] #modify as required for other classification metrics
            objective_list = ['binary', 'cross_entropy']
        
        else:
            metric_list = ['MAE', 'RMSE'] 
            objective_list = ['huber', 'gamma', 'fair', 'tweedie']
        
        
        space ={'boosting' : hp.choice('boosting', boosting_list),
                'num_leaves' : hp.quniform('num_leaves', 2, LGBM_MAX_LEAVES, 1),
                'max_depth': hp.quniform('max_depth', 2, LGBM_MAX_DEPTH, 1),
                'max_bin': hp.quniform('max_bin', 32, 255, 1),
                'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 256, 1),
                'min_data_in_bin': hp.quniform('min_data_in_bin', 1, 256, 1),
                'min_gain_to_split' : hp.quniform('min_gain_to_split', 0.1, 5, 0.01),
                'lambda_l1' : hp.uniform('lambda_l1', 0, 5),
                'lambda_l2' : hp.uniform('lambda_l2', 0, 5),
                'learning_rate' : hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
                'metric' : hp.choice('metric', metric_list),
                'objective' : hp.choice('objective', objective_list),
                'feature_fraction' : hp.quniform('feature_fraction', 0.5, 1, 0.01),
                'bagging_fraction' : hp.quniform('bagging_fraction', 0.5, 1, 0.01)
            }
        
        #optional: activate GPU for LightGBM
        #follow compilation steps here:
        #https://www.kaggle.com/vinhnguyen/gpu-acceleration-for-lightgbm/
        #then uncomment lines below:
        #space['device'] = 'gpu'
        #space['gpu_platform_id'] = 0,
        #space['gpu_device_id'] =  0

        trials = Trials()
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=num_evals, 
                    trials=trials)
                
        #fmin() will return the index of values chosen from the lists/arrays in 'space'
        #to obtain actual values, index values are used to subset the original lists/arrays
        best['boosting'] = boosting_list[best['boosting']]['boosting']#nested dict, index twice
        best['metric'] = metric_list[best['metric']]
        best['objective'] = objective_list[best['objective']]
                
        #cast floats of integer params to int
        for param in integer_params:
            best[param] = int(best[param])
        
        print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')
        if diagnostic:
            return(best, trials)
        else:
            return(best)
    
    #==========
    #XGBoost
    #==========
    
    if package=='xgb':
        
        print('Running {} rounds of XGBoost parameter optimisation:'.format(num_evals))
        #clear space
        gc.collect()
        
        integer_params = ['max_depth']
        
        def objective(space_params):
            
            for param in integer_params:
                space_params[param] = int(space_params[param])
                
            #extract multiple nested tree_method conditional parameters
            #libera te tutemet ex inferis
            if space_params['tree_method']['tree_method'] == 'hist':
                max_bin = space_params['tree_method'].get('max_bin')
                space_params['max_bin'] = int(max_bin)
                if space_params['tree_method']['grow_policy']['grow_policy']['grow_policy'] == 'depthwise':
                    grow_policy = space_params['tree_method'].get('grow_policy').get('grow_policy').get('grow_policy')
                    space_params['grow_policy'] = grow_policy
                    space_params['tree_method'] = 'hist'
                else:
                    max_leaves = space_params['tree_method']['grow_policy']['grow_policy'].get('max_leaves')
                    space_params['grow_policy'] = 'lossguide'
                    space_params['max_leaves'] = int(max_leaves)
                    space_params['tree_method'] = 'hist'
            else:
                space_params['tree_method'] = space_params['tree_method'].get('tree_method')
                
            #for classification replace EVAL_METRIC_XGB_REG with EVAL_METRIC_XGB_CLASS
            cv_results = xgb.cv(space_params, train, nfold=N_FOLDS, metrics=[EVAL_METRIC_XGB_REG],
                             early_stopping_rounds=100, stratified=False, seed=42)
            
            best_loss = cv_results['test-mae-mean'].iloc[-1] #or 'test-rmse-mean' if using RMSE
            #for classification, comment out the line above and uncomment the line below:
            #best_loss = 1 - cv_results['test-auc-mean'].iloc[-1]
            #if necessary, replace 'test-auc-mean' with 'test-[your-preferred-metric]-mean'
            return{'loss':best_loss, 'status': STATUS_OK }
        
        train = xgb.DMatrix(data, labels)
        
        #integer and string parameters, used with hp.choice()
        boosting_list = ['gbtree', 'gblinear'] #if including 'dart', make sure to set 'n_estimators'
        metric_list = ['MAE', 'RMSE'] 
        #for classification comment out the line above and uncomment the line below
        #metric_list = ['auc']
        #modify as required for other classification metrics classification
        
        tree_method = [{'tree_method' : 'exact'},
               {'tree_method' : 'approx'},
               {'tree_method' : 'hist',
                'max_bin': hp.quniform('max_bin', 2**3, 2**7, 1),
                'grow_policy' : {'grow_policy': {'grow_policy':'depthwise'},
                                'grow_policy' : {'grow_policy':'lossguide',
                                                  'max_leaves': hp.quniform('max_leaves', 32, XGB_MAX_LEAVES, 1)}}}]
        
        #if using GPU, replace 'exact' with 'gpu_exact' and 'hist' with
        #'gpu_hist' in the nested dictionary above
        
        objective_list_reg = ['reg:linear', 'reg:gamma', 'reg:tweedie']
        objective_list_class = ['reg:logistic', 'binary:logistic']
        #for classification change line below to 'objective_list = objective_list_class'
        objective_list = objective_list_reg
        
        space ={'boosting' : hp.choice('boosting', boosting_list),
                'tree_method' : hp.choice('tree_method', tree_method),
                'max_depth': hp.quniform('max_depth', 2, XGB_MAX_DEPTH, 1),
                'reg_alpha' : hp.uniform('reg_alpha', 0, 5),
                'reg_lambda' : hp.uniform('reg_lambda', 0, 5),
                'min_child_weight' : hp.uniform('min_child_weight', 0, 5),
                'gamma' : hp.uniform('gamma', 0, 5),
                'learning_rate' : hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
                'eval_metric' : hp.choice('eval_metric', metric_list),
                'objective' : hp.choice('objective', objective_list),
                'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1, 0.01),
                'colsample_bynode' : hp.quniform('colsample_bynode', 0.1, 1, 0.01),
                'colsample_bylevel' : hp.quniform('colsample_bylevel', 0.1, 1, 0.01),
                'subsample' : hp.quniform('subsample', 0.5, 1, 0.05),
                'nthread' : -1
            }
        
        trials = Trials()
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=num_evals, 
                    trials=trials)
        
        best['tree_method'] = tree_method[best['tree_method']]['tree_method']
        best['boosting'] = boosting_list[best['boosting']]
        best['eval_metric'] = metric_list[best['eval_metric']]
        best['objective'] = objective_list[best['objective']]
        
        #cast floats of integer params to int
        for param in integer_params:
            best[param] = int(best[param])
        if 'max_leaves' in best:
            best['max_leaves'] = int(best['max_leaves'])
        if 'max_bin' in best:
            best['max_bin'] = int(best['max_bin'])
        
        print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')
        
        if diagnostic:
            return(best, trials)
        else:
            return(best)
    
    #==========
    #CatBoost
    #==========
    
    if package=='cb':
        
        print('Running {} rounds of CatBoost parameter optimisation:'.format(num_evals))
        
        #clear memory 
        gc.collect()
            
        integer_params = ['depth',
                          #'one_hot_max_size', #for categorical data
                          'min_data_in_leaf',
                          'max_bin']
        
        def objective(space_params):
                        
            #cast integer params from float to int
            for param in integer_params:
                space_params[param] = int(space_params[param])
                
            #extract nested conditional parameters
            if space_params['bootstrap_type']['bootstrap_type'] == 'Bayesian':
                bagging_temp = space_params['bootstrap_type'].get('bagging_temperature')
                space_params['bagging_temperature'] = bagging_temp
                
            if space_params['grow_policy']['grow_policy'] == 'LossGuide':
                max_leaves = space_params['grow_policy'].get('max_leaves')
                space_params['max_leaves'] = int(max_leaves)
                
            space_params['bootstrap_type'] = space_params['bootstrap_type']['bootstrap_type']
            space_params['grow_policy'] = space_params['grow_policy']['grow_policy']
                           
            #random_strength cannot be < 0
            space_params['random_strength'] = max(space_params['random_strength'], 0)
            #fold_len_multiplier cannot be < 1
            space_params['fold_len_multiplier'] = max(space_params['fold_len_multiplier'], 1)
                       
            #for classification set stratified=True
            cv_results = cb.cv(train, space_params, fold_count=N_FOLDS, 
                             early_stopping_rounds=25, stratified=False, partition_random_seed=42)
           
            best_loss = cv_results['test-MAE-mean'].iloc[-1] #'test-RMSE-mean' for RMSE
            #for classification, comment out the line above and uncomment the line below:
            #best_loss = cv_results['test-Logloss-mean'].iloc[-1]
            #if necessary, replace 'test-Logloss-mean' with 'test-[your-preferred-metric]-mean'
            
            return{'loss':best_loss, 'status': STATUS_OK}
        
        train = cb.Pool(data, labels.astype('float32'))
        
        #integer and string parameters, used with hp.choice()
        bootstrap_type = [{'bootstrap_type':'Poisson'}, 
                           {'bootstrap_type':'Bayesian',
                            'bagging_temperature' : hp.loguniform('bagging_temperature', np.log(1), np.log(50))},
                          {'bootstrap_type':'Bernoulli'}] 
        #LEB = ['No', 'AnyImprovement', 'Armijo'] #remove 'Armijo' if not using GPU
        LEB = ['No', 'AnyImprovement'] #remove 'Armijo' if not using GPU
        #score_function = ['Correlation', 'L2', 'NewtonCorrelation', 'NewtonL2']
        grow_policy = [{'grow_policy':'SymmetricTree'},
                       {'grow_policy':'Depthwise'},
                       {'grow_policy':'Lossguide',
                        'max_leaves': hp.quniform('max_leaves', 2, 32, 1)}]
        eval_metric_list_reg = ['MAE', 'RMSE', 'Poisson']
        eval_metric_list_class = ['Logloss', 'AUC', 'F1']
        #for classification change line below to 'eval_metric_list = eval_metric_list_class'
        eval_metric_list = eval_metric_list_reg
                
        space ={'depth': hp.quniform('depth', 2, CB_MAX_DEPTH, 1),
                'max_bin' : hp.quniform('max_bin', 1, 32, 1), #if using CPU just set this to 254
                'l2_leaf_reg' : hp.uniform('l2_leaf_reg', 0, 5),
                'min_data_in_leaf' : hp.quniform('min_data_in_leaf', 1, 50, 1),
                'random_strength' : hp.loguniform('random_strength', np.log(0.005), np.log(5)),
                #'one_hot_max_size' : hp.quniform('one_hot_max_size', 2, 16, 1), #uncomment if using categorical features
                'bootstrap_type' : hp.choice('bootstrap_type', bootstrap_type),
                'learning_rate' : hp.uniform('learning_rate', 0.05, 0.25),
                'eval_metric' : hp.choice('eval_metric', eval_metric_list),
                'objective' : OBJECTIVE_CB_REG,
                #'score_function' : hp.choice('score_function', score_function), #crashes kernel - reason unknown
                'leaf_estimation_backtracking' : hp.choice('leaf_estimation_backtracking', LEB),
                'grow_policy': hp.choice('grow_policy', grow_policy),
                #'colsample_bylevel' : hp.quniform('colsample_bylevel', 0.1, 1, 0.01),# CPU only
                'fold_len_multiplier' : hp.loguniform('fold_len_multiplier', np.log(1.01), np.log(2.5)),
                'od_type' : 'Iter',
                'od_wait' : 25,
                'task_type' : 'GPU',
                'verbose' : 0
            }
        
        #optional: run CatBoost without GPU
        #uncomment line below
        space['task_type'] = 'CPU'
            
        trials = Trials()
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=num_evals, 
                    trials=trials)
        
        #unpack nested dicts first
        best['bootstrap_type'] = bootstrap_type[best['bootstrap_type']]['bootstrap_type']
        best['grow_policy'] = grow_policy[best['grow_policy']]['grow_policy']
        best['eval_metric'] = eval_metric_list[best['eval_metric']]
        
        #best['score_function'] = score_function[best['score_function']] 
        #best['leaf_estimation_method'] = LEM[best['leaf_estimation_method']] #CPU only
        best['leaf_estimation_backtracking'] = LEB[best['leaf_estimation_backtracking']]        
        
        #cast floats of integer params to int
        for param in integer_params:
            best[param] = int(best[param])
        if 'max_leaves' in best:
            best['max_leaves'] = int(best['max_leaves'])
        
        print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')
        
        if diagnostic:
            return(best, trials)
        else:
            return(best)
    
    else:
        print('Package not recognised. Please use "lgbm" for LightGBM, "xgb" for XGBoost or "cb" for CatBoost.')     

In [None]:
# Tune LightGBM on the purchased-feature matrix with 200 hyperopt evaluations.
params = quick_hyperopt(data=train_X_partial,
                        labels=train_y,
                        package='lgbm',
                        num_evals=200)

In [None]:
# LightGBM on augmented data: for each of N_aug rounds, build a perturbed copy of the
# training matrix, stack it under the original, run 5-fold CV and average every fold's
# test prediction into `predictions1`.
predictions1 = np.zeros(len(test_X_partial))
feature_importance_df = pd.DataFrame()

N_aug = 30               # number of augmentation rounds
AUG_FEATURE_RATIO = 0.5  # share of each row's features replaced by values from random rows
AUG_SEED = 42            # seeds the augmentation draws so a re-run reproduces them

n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)


def make_augmented_copy(features, ratio, rng):
    """Return a float64 copy of `features` with part of every row resampled.

    For each row an independent, uniformly random subset of
    floor(n_columns * ratio) columns is chosen. Each chosen cell is overwritten
    with the value of the same column taken from a row drawn uniformly at random
    (with replacement, so a row may draw itself). All other cells keep their value.

    features : pandas.DataFrame of numeric features.
    ratio    : fraction of columns to resample per row.
    rng      : numpy.random.RandomState supplying the random draws.
    """
    values = features.values.astype('float64')
    n_rows, n_cols = values.shape
    n_swapped = int(np.floor(n_cols * ratio))
    augmented = values.copy()
    if n_rows > 0 and n_swapped > 0:
        # argsort of i.i.d. uniforms is a uniform random permutation of the columns;
        # its first n_swapped entries are therefore a uniform random subset per row.
        swap_cols = np.argsort(rng.random_sample((n_rows, n_cols)), axis=1)[:, :n_swapped]
        donor_rows = rng.randint(0, n_rows, size=(n_rows, n_swapped))
        row_ids = np.arange(n_rows)[:, None]
        augmented[row_ids, swap_cols] = values[donor_rows, swap_cols]
    return pd.DataFrame(augmented, index=features.index, columns=features.columns)


aug_rng = np.random.RandomState(AUG_SEED)
# Targets for the stacked matrix: the augmented rows reuse the targets of their source rows.
y_all = np.append(train_y, train_y)

for i_aug in tqdm_notebook(range(0, N_aug)):
    train_aug = make_augmented_copy(train_X_partial, AUG_FEATURE_RATIO, aug_rng)
    train_all = pd.concat([train_X_partial, train_aug])

    # NOTE(review): every augmented row shares the untouched half of its features and its
    # target with an original row, and the folds are drawn over the stacked matrix, so a
    # validation row can have its twin in the training fold. The early-stopping metric is
    # therefore optimistic.
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_all, y_all)):
        strLog = "fold {}".format(fold_)
        print(strLog)

        X_tr, X_val = train_all.iloc[trn_idx], train_all.iloc[val_idx]
        y_tr, y_val = y_all[trn_idx], y_all[val_idx]

        model = lgb.LGBMRegressor(**params, n_estimators = 20000, n_jobs = -1)
        model.fit(X_tr,
                  y_tr,
                  eval_set=[(X_tr, y_tr), (X_val, y_val)],
                  eval_metric='mae',
                  verbose=1000,
                  early_stopping_rounds=500)

        # Equal-weight average over all folds of all augmentation rounds.
        predictions1 += model.predict(test_X_partial, num_iteration=model.best_iteration_) / (folds.n_splits*N_aug)

In [None]:
# CatBoost, 5-fold CV on the purchased-feature matrix.
# `oof` collects out-of-fold predictions for the CV score; `predictions4` is the
# fold-averaged test prediction.
oof = np.zeros(len(train_X_partial))
train_score = []

# Sized from the matrix that is actually predicted on below.
predictions4 = np.zeros(len(test_X_partial))

n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)

feature_importance_df = pd.DataFrame()
#run model
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_X_partial,train_y)):
    strLog = "fold {}".format(fold_)
    print(strLog)
    X_tr, X_val = train_X_partial.iloc[trn_idx], train_X_partial.iloc[val_idx]
    y_tr, y_val = train_y.iloc[trn_idx], train_y.iloc[val_idx]

    # `objective` is CatBoost's alias for `loss_function`, so only one of them is passed.
    model = CatBoostRegressor(n_estimators=25000, verbose=-1, loss_function="MAE", boosting_type="Ordered", task_type="GPU")
    model.fit(X_tr,
              y_tr,
              eval_set=[(X_val, y_val)],
              verbose=2500,
              early_stopping_rounds=500)
    oof[val_idx] = model.predict(X_val)

    train_score.append(model.best_score_['learn']["MAE"])
    predictions4 += model.predict(test_X_partial) / (folds.n_splits)
cv_score = mean_absolute_error(train_y, oof)
print('cv_score',cv_score)

In [None]:
# Candidate columns followed by purchased columns, for the wider CatBoost model.
# NOTE(review): plain concatenation, no de-duplication - a name present in both
# lists would be selected twice; confirm the two lists are disjoint.
combine = np.concatenate((columns_candidate, columns_purchased))

train_X_partial_cat, test_X_partial_cat = (
    frame[combine] for frame in (scaled_train_X, scaled_test_X)
)
print(train_X_partial_cat.shape, test_X_partial_cat.shape)

In [None]:
# CatBoost, 5-fold CV on the combined (candidate + purchased) feature matrix.
# `oof` collects out-of-fold predictions for the CV score; `predictions2` is the
# fold-averaged test prediction.
oof = np.zeros(len(train_X_partial_cat))
train_score = []

# Sized from the matrix that is actually predicted on below.
predictions2 = np.zeros(len(test_X_partial_cat))

n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)

feature_importance_df = pd.DataFrame()
#run model
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_X_partial_cat,train_y)):
    strLog = "fold {}".format(fold_)
    print(strLog)
    X_tr, X_val = train_X_partial_cat.iloc[trn_idx], train_X_partial_cat.iloc[val_idx]
    y_tr, y_val = train_y.iloc[trn_idx], train_y.iloc[val_idx]

    # `objective` is CatBoost's alias for `loss_function`, so only one of them is passed.
    model = CatBoostRegressor(n_estimators=25000, verbose=-1, loss_function="MAE", boosting_type="Ordered", task_type="GPU")
    model.fit(X_tr,
              y_tr,
              eval_set=[(X_val, y_val)],
              verbose=2500,
              early_stopping_rounds=500)
    oof[val_idx] = model.predict(X_val)

    train_score.append(model.best_score_['learn']["MAE"])
    predictions2 += model.predict(test_X_partial_cat) / (folds.n_splits)
cv_score = mean_absolute_error(train_y, oof)
print('cv_score',cv_score)

In [None]:
# Write the final blend: 20% augmented LightGBM (predictions1), 60% CatBoost on the
# combined features (predictions2), 20% CatBoost on the purchased features (predictions4).
submission = pd.read_csv('../input/sample_submission.csv', index_col='seg_id')
# Assign by key: attribute assignment would silently create a plain attribute instead of
# a column if 'time_to_failure' were ever missing from the sample file.
submission['time_to_failure'] = 0.2*predictions1+predictions2*0.6+predictions4*0.2
submission.to_csv('submission.csv',index=True)