In [None]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import os 
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Import the data

In [None]:
PATH = '../input/predict-volcanic-eruptions-ingv-oe/'

# Derive both directories from PATH so the dataset location is defined in one place.
# os.listdir() returns entries in arbitrary, filesystem-dependent order; sorting keeps
# the row order of the feature tables (and therefore the seeded splits) reproducible.
train_list = sorted(os.listdir(PATH + 'train'))
test_list = sorted(os.listdir(PATH + 'test'))
train_time = pd.read_csv(PATH + 'train.csv')

Train and Test size

In [None]:
# Report how many segment files were found in each directory listing.
print(f'Number of train files: {len(train_list)}')
print(f'Number of test files: {len(test_list)}')

In [None]:
# Read the first listed training segment; it is used as the worked example below.
example = pd.read_csv(f"{PATH}train/{train_list[0]}")

We can transform each signal file into a single row of features.

In [None]:
# Preview the first five rows of the example training segment.
example.head(5)

In [None]:
# Read the first listed test segment for comparison with the training example.
example_test = pd.read_csv(f"{PATH}test/{test_list[0]}")

In [None]:
# Preview the first five rows of the example test segment.
example_test.head(5)

In [None]:
# File name of the example segment; the part before the first '.' is parsed as an
# integer and matched against train_time.segment_id in the cells below.
train_list[0]

In [None]:
# Label table read from train.csv; later cells use its segment_id and
# time_to_eruption columns.
train_time

Look at one of the train signals

In [None]:
# Draw every column of the example segment in its own subplot.
# The trailing ';' suppresses the text repr of the returned axes.
example.plot(figsize=(15,15), subplots=True);

In [None]:
# Label row of the example segment: the file stem, parsed as int, is the segment id.
train_time.loc[train_time['segment_id'].eq(int(train_list[0].split('.')[0]))]

In [None]:
# describe() without its first ('count') row, flattened to one row per (column, statistic).
example.fillna(0).describe().iloc[1:].unstack().to_frame().reset_index()

In [None]:
# Long-format statistics table: one row per (column, statistic) pair with its value.
process = (
    example.fillna(0)
    .describe()
    .iloc[1:]
    .unstack()
    .to_frame(name='value')
    .reset_index()
)
# Combined label "<column>_<statistic>" that later becomes the feature column name.
process['feature'] = process['level_0'] + '_' + process['level_1']

In [None]:
# Inspect the long-format table: one row per (column, statistic) pair plus the
# combined 'feature' label.
process

In [None]:
# Keep only feature -> value and transpose, so the features become columns of one row.
# NOTE: this cell rebinds `process`, so it cannot be re-run on its own output.
process = (
    process
    .drop(columns=['level_0', 'level_1'])
    .set_index('feature')
    .transpose()
)

In [None]:
# After the transpose: a single 'value' row with one column per feature.
process

In [None]:
# Attach the target: time_to_eruption of the row whose segment_id matches the file stem.
process['time'] = train_time.loc[
    train_time['segment_id'] == int(train_list[0].split('.')[0]),
    'time_to_eruption',
].values[0]

In [None]:
# Same single-row frame, now with the 'time' target column appended.
process

In [None]:
# Per-column skewness of the example segment, shown as a single row.
example.fillna(0).skew().to_frame().T

# Preprocessing Train and Test

Create a function for data preparation

In [None]:
def create_frame(data, data_time=None, type_data='train'):
    """Collapse one segment (rows = samples, columns = signals) into a single feature row.

    For every column of ``data`` the following statistics are computed after
    replacing missing values with 0: mean, std, min, 25%, 50%, 75%, max
    (from ``describe()``), skew, mad (mean absolute deviation), kurtosis and
    the remaining quantiles from 0% to 95% in steps of 5%.

    Parameters
    ----------
    data : pandas.DataFrame
        Segment data. Column labels must be strings, because they are
        concatenated with the statistic name as ``"<column>_<statistic>"``.
    data_time : scalar, optional
        Target value stored in the ``'time'`` column when ``type_data == 'train'``.
    type_data : str, default 'train'
        ``'train'`` appends the ``'time'`` column; any other value leaves it out.

    Returns
    -------
    pandas.DataFrame
        One row (index ``'value'``) with one column per ``"<column>_<statistic>"``
        feature, plus ``'time'`` for training data.
    """
    data = data.fillna(0)

    # Basic statistics: describe() without its first ('count') row.
    # .copy() so the row assignments below write to an independent frame, not a slice.
    data_transform = data.describe().iloc[1:, :].copy()

    # Additional statistics.
    # Skewness (asymmetry coefficient).
    data_transform.loc['skew'] = data.skew().tolist()

    # Mean absolute deviation around the mean. Computed explicitly because
    # DataFrame.mad() was removed in pandas 2.0.
    data_transform.loc['mad'] = (data - data.mean()).abs().mean().tolist()

    # Kurtosis: a measure of how peaked the distribution is.
    data_transform.loc['kurtosis'] = data.kurtosis().tolist()

    # Extra quantiles, computed from the signal itself (not from the statistics
    # table). 25/50/75 are skipped because describe() already provides them.
    extra_percents = [i for i in range(0, 100, 5) if i not in (25, 50, 75)]
    quantiles = data.quantile([i / 100 for i in extra_percents])
    quantiles.index = [f"{i}%" for i in extra_percents]
    data_transform = pd.concat([data_transform, quantiles])

    # Flatten the (statistic x column) table into one row named "<column>_<statistic>".
    data_transform = pd.DataFrame(data_transform.unstack()).reset_index()
    data_transform = data_transform.rename(columns={0: 'value'})
    data_transform['feature'] = data_transform['level_0'] + '_' + data_transform['level_1']
    data_transform = data_transform.drop(['level_0', 'level_1'], axis=1).set_index('feature').T

    if type_data == 'train':
        data_transform['time'] = data_time
    return data_transform

In [None]:
# Collect the per-segment feature rows in a list and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole table on every
# iteration.
train_frames = []

for file in tqdm(train_list):
    df = pd.read_csv(PATH + 'train/' + file)
    # The file stem, parsed as int, is the segment id used to look up the target.
    data_time = train_time[train_time.segment_id == int(file.split('.')[0])].time_to_eruption.values[0]
    train_frames.append(create_frame(df, data_time, type_data='train'))

# ignore_index=True gives a fresh 0..n-1 index (same as reset_index(drop=True)).
all_train = pd.concat(train_frames, ignore_index=True)

In [None]:
# Collect the per-segment feature rows in a list and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole table on every
# iteration.
test_frames = []

for file in tqdm(test_list):
    df = pd.read_csv(PATH + 'test/' + file)
    test_frames.append(create_frame(df, data_time=None, type_data='test'))

# Rows stay in test_list order; ignore_index=True gives a fresh 0..n-1 index.
all_test = pd.concat(test_frames, ignore_index=True)

In [None]:
# First five feature rows of the training table.
all_train.head(5)

In [None]:
# First five feature rows of the test table.
all_test.head(5)

# Modeling

In [None]:
# Target is the 'time' column; the features are every other column.
y = all_train['time']
X = all_train.drop(columns='time')

# Work on a copy so the original test feature table stays untouched.
test = all_test.copy()

#  Baseline

In [None]:
# First split: hold out 25% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=10)

# Second split: carve 25% of the remaining rows out as the validation set.
# This rebinds X_train / y_train, so the two calls must run in this order.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, shuffle=True, random_state=10)

In [None]:
def mape(y_true, y_pred):
    """Mean absolute percentage error, returned as a fraction (not multiplied by 100).

    Computes ``mean(|y_pred - y_true| / |y_true|)`` element-wise. No guard is
    applied for zeros in ``y_true``.
    """
    relative_error = (y_pred - y_true) / y_true
    return np.mean(np.abs(relative_error))

In [None]:
# Wrap the training and validation splits in CatBoost Pools.
train_dataset = Pool(data=X_train, label=y_train)
eval_dataset = Pool(data=X_val, label=y_val)

# Baseline: CatBoost with MAPE as the loss; all other settings are left at their defaults.
clf = CatBoostRegressor(loss_function='MAPE')

# The validation Pool is the eval_set, and use_best_model=True is requested.
clf.fit(
    train_dataset,
    eval_set=eval_dataset,
    use_best_model=True,
    verbose=0,
)

In [None]:
# Score the baseline on the held-out test split, which was not passed to fit().
y_pred = clf.predict(Pool(data=X_test))
    
# MAPE is printed as a fraction; RMSE is the square root of sklearn's MSE.
print(f"MAPE: {mape(y_test, y_pred)}")
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")

# Using KFold with some parameters

We are going to use KFold with CatBoostRegressor. We did not use GridSearch, in order to save time.

In [None]:
# Redo the 75/25 hold-out split with the same seed as the baseline. This rebinds
# X_train / y_train to the full 75% portion, since the baseline cell had carved a
# validation set out of them.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=10)

In [None]:
n_fold = 5
cv = KFold(n_splits=n_fold, shuffle=True, random_state=10)
# Accumulates the test predictions of every fold model; averaged after the loop.
prediction = np.zeros(len(test))
mape_, mae, rmse = [], [], []

params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 6,
    'eval_metric': 'RMSE',
}

for fold, (train_index, val_index) in enumerate(cv.split(X)):
    X_fold_train, y_fold_train = X.iloc[train_index, :], y.iloc[train_index]
    X_fold_val, y_fold_val = X.iloc[val_index, :], y.iloc[val_index]

    clf = CatBoostRegressor(**params)

    train_dataset = Pool(data=X_fold_train, label=y_fold_train)
    eval_dataset = Pool(data=X_fold_val, label=y_fold_val)

    clf.fit(train_dataset,
            use_best_model=True,
            verbose=0,
            eval_set=eval_dataset)

    # Score on this fold's validation rows, which the model was not trained on.
    # The folds are drawn from all of X, so scoring on a hold-out subset of X
    # would mostly measure rows the model was fitted on.
    # NOTE: the same rows serve as eval_set with use_best_model=True, so these
    # scores are still somewhat optimistic.
    y_pred = clf.predict(Pool(data=X_fold_val))

    fold_mape = mape(y_fold_val, y_pred)
    fold_mae = mean_absolute_error(y_fold_val, y_pred)
    fold_rmse = np.sqrt(mean_squared_error(y_fold_val, y_pred))

    mape_.append(fold_mape)
    mae.append(fold_mae)
    rmse.append(fold_rmse)

    print(f"fold: {fold}, MAPE: {fold_mape}")
    print(f"fold: {fold}, MAE: {fold_mae}")
    print(f"fold: {fold}, RMSE: {fold_rmse}")

    # test array predictions
    prediction += clf.predict(Pool(data=test))

# Average the fold models' predictions for the submission.
prediction /= n_fold

print('CV mean MAPE:  {0:.4f}, std: {1:.4f}.'.format(np.mean(mape_), np.std(mape_)))
print('CV mean MAE: {0:.4f}, std: {1:.4f}.'.format(np.mean(mae), np.std(mae)))
print('CV mean RMSE: {0:.4f}, std: {1:.4f}.'.format(np.mean(rmse), np.std(rmse)))

https://catboost.ai/docs/concepts/python-usages-examples.html

In [None]:
# Load the sample submission and show its first five rows.
sub_example = pd.read_csv(f"{PATH}sample_submission.csv")
sub_example.head(5)

In [None]:
# Segment ids of the test files, in test_list order: the file stem parsed as int.
test_index = [int(file_name.split('.')[0]) for file_name in test_list]

In [None]:
# First five parsed test segment ids.
test_index[:5]

In [None]:
# test_index and prediction both follow test_list order, so they line up row by row.
submission = pd.DataFrame({
    'segment_id': test_index,
    'time_to_eruption': prediction,
})
submission.to_csv('submission.csv', index=False, header=True)

In [None]:
# First five rows of the submission table.
submission.head(5)