In [1]:
import numpy as np
import pandas as pd

In [5]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, max_error, mean_absolute_percentage_error
from scipy.stats import kurtosis, skew

In [3]:
# Raise the float display precision before previewing the frame
# (presumably so small time_to_failure differences stay visible -- confirm).
pd.options.display.precision = 15

# Read the training CSV with explicit dtypes for both columns.
train = pd.read_csv(
    '../input/LANL-Earthquake-Prediction/train.csv',
    dtype={
        'acoustic_data': np.int16,
        'time_to_failure': np.float64,
    },
)
train.head()

In [62]:
# Divide the input file in segments of the size specified and summarise each
# segment with six statistics of its acoustic signal.

rows = 600_000 #TODO: Optimize this parameter

# Integer division: only complete segments are used, so any trailing rows that
# do not fill a whole segment are ignored.
segments = train.shape[0] // rows

print("Number of segments:", segments)

feature_columns = ['ave', 'std', 'max', 'min', 'ske', 'kur']
feature_rows = []
targets = []

for segment in range(segments):
    seg = train.iloc[segment*rows:(segment + 1)*rows]
    x = seg['acoustic_data'].values
    # The target of a segment is the time_to_failure at its last row.
    y = seg['time_to_failure'].values[-1]
    targets.append(y)
    feature_rows.append(
        [x.mean(), x.std(), x.max(), x.min(), skew(x), kurtosis(x)]
    )

# Build both frames once instead of writing them cell by cell with .loc.
X_train = pd.DataFrame(feature_rows, columns=feature_columns, dtype=np.float64)
y_train = pd.DataFrame({'time_to_failure': targets}, dtype=np.float64)

In [63]:
# Preview the first rows of the per-segment feature table.
X_train.head()

In [64]:
# Standardise the features; the fitted scaler is kept so the same
# transformation can be re-applied to other data later.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [65]:
def plot(y_train, y_pred, limit=20):
    """Scatter actual against predicted targets with a y = x reference line.

    Parameters
    ----------
    y_train : object exposing ``.values`` (e.g. a pandas DataFrame or Series)
        Actual target values; flattened to one dimension before plotting.
    y_pred : array-like
        Predicted values, one per actual value.
    limit : float, default 20
        Upper bound of both axes and end point of the reference diagonal.
    """
    plt.figure(figsize=(6, 6))
    plt.scatter(y_train.values.flatten(), y_pred)
    plt.xlim(0, limit)
    plt.ylim(0, limit)
    plt.xlabel('actual', fontsize=12)
    plt.ylabel('predicted', fontsize=12)
    # One diagonal from (0, 0) to (limit, limit). Passing nested tuples made
    # matplotlib treat the input as 2-D and draw two overlapping lines.
    # The explicit colour keeps the line distinct from the scatter points.
    plt.plot([0, limit], [0, limit], color='C1')
    plt.show()

In [66]:
def score(y_train, y_pred):
    """Print the max error, MAE and MAPE of ``y_pred`` against ``y_train``.

    Parameters
    ----------
    y_train : object exposing ``.values`` (e.g. a pandas DataFrame or Series)
        Actual target values; flattened to one dimension before scoring.
    y_pred : array-like
        Predicted values, one per actual value.

    Returns
    -------
    None
        The three metrics are printed with three decimals.
    """
    y_train_flatten = y_train.values.flatten()
    # Named max_err rather than max so the builtin max() is not shadowed.
    max_err = max_error(y_train_flatten, y_pred)
    mae = mean_absolute_error(y_train_flatten, y_pred)
    # NOTE(review): scikit-learn's MAPE is presumably a fraction, not a
    # 0-100 percentage -- confirm before interpreting the printed value.
    mape = mean_absolute_percentage_error(y_train_flatten, y_pred)
    print(f'Max Error: {max_err:0.3f}')
    print(f'Mean Absolute Error: {mae:0.3f}')
    print(f'Mean Absolute Percentage Error: {mape:0.3f}')

# CatBoost with Root Mean Square Error

In [67]:
# Fit a CatBoost regressor with the RMSE loss on the scaled features, then
# predict on those same rows (in-sample predictions).
m_rmse = CatBoostRegressor(loss_function='RMSE')
m_rmse.fit(
    X=X_train_scaled,
    y=y_train.values.flatten(),
    silent=True,
)
y_pred_m_rmse = m_rmse.predict(X_train_scaled)

In [68]:
# Actual vs. predicted for the RMSE-loss model (predictions are on the training rows).
plot(y_train, y_pred_m_rmse)

In [69]:
# Error metrics for the RMSE-loss model, computed on the training rows.
score(y_train, y_pred_m_rmse)

# CatBoost with Mean Absolute Error

In [70]:
# Fit a CatBoost regressor with the MAE loss on the scaled features, then
# predict on those same rows (in-sample predictions).
m_mae = CatBoostRegressor(loss_function='MAE')
m_mae.fit(
    X=X_train_scaled,
    y=y_train.values.flatten(),
    silent=True,
)
y_pred_m_mae = m_mae.predict(X_train_scaled)

In [71]:
# Actual vs. predicted for the MAE-loss model (predictions are on the training rows).
plot(y_train, y_pred_m_mae)

In [72]:
# Error metrics for the MAE-loss model, computed on the training rows.
score(y_train, y_pred_m_mae)

# CatBoost with Mean Absolute Percentage Error

In [73]:
# Fit a CatBoost regressor with the MAPE loss on the scaled features, then
# predict on those same rows (in-sample predictions).
m_mape = CatBoostRegressor(loss_function='MAPE')
m_mape.fit(
    X=X_train_scaled,
    y=y_train.values.flatten(),
    silent=True,
)
y_pred_m_mape = m_mape.predict(X_train_scaled)

In [74]:
# Actual vs. predicted for the MAPE-loss model (predictions are on the training rows).
plot(y_train, y_pred_m_mape)

In [75]:
# Error metrics for the MAPE-loss model, computed on the training rows.
score(y_train, y_pred_m_mape)