In [145]:
import numpy as np
import pandas as pd

In [146]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

In [147]:
# Load the training CSV with explicit column dtypes: the acoustic signal as
# 16-bit integers and the time-to-failure target as 64-bit floats.
column_types = {'acoustic_data': np.int16, 'time_to_failure': np.float64}
train = pd.read_csv('../input/train.csv', dtype=column_types)

In [148]:
# Display floats with 15 decimal places, then preview the first rows.
# NOTE(review): presumably the high precision is there to make small changes in
# time_to_failure visible between consecutive rows - confirm.
pd.set_option('display.precision', 15)
train.head()

In [149]:
# Split the signal into consecutive, non-overlapping windows of `rows` samples
# and summarise each window with four statistics. Trailing samples that do not
# fill a complete window are dropped.

rows = 600_000
segments = len(train) // rows

print(segments)


def _summarise(window):
    """Return the mean, standard deviation, maximum and minimum of one window."""
    return {'ave': window.mean(), 'std': window.std(),
            'max': window.max(), 'min': window.min()}


feature_rows = []
targets = []
for start in range(0, segments * rows, rows):
    chunk = train.iloc[start:start + rows]
    feature_rows.append(_summarise(chunk['acoustic_data'].values))
    # The label of a window is the time to failure at its final sample.
    targets.append(chunk['time_to_failure'].values[-1])

# One row per window; everything is stored as float64, max/min included.
X_train = pd.DataFrame(feature_rows, columns=['ave', 'std', 'max', 'min'],
                       dtype=np.float64)
y_train = pd.DataFrame(targets, columns=['time_to_failure'], dtype=np.float64)

In [150]:
# Preview the first five rows of the per-segment feature table.
X_train.head(n=5)

In [151]:
# Standardise the features: the scaler is fitted on X_train and then applied
# to that same table, producing the array the models are trained on.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)

# Step 3 - CatBoost with Mean Absolute Error

In [152]:
# Fit a CatBoost regressor with the MAE objective, then predict on the same
# scaled training features it was fitted on.
m = CatBoostRegressor(loss_function='MAE')
m.fit(X=X_train_scaled, y=y_train.values.ravel(), silent=True)
y_pred = m.predict(X_train_scaled)

In [153]:
# Predicted vs. actual time to failure for the model fitted in the previous
# cell (loss_function='MAE'); points on the diagonal are perfect predictions.
plt.figure(figsize=(6, 6))
plt.scatter(y_train.values.flatten(), y_pred)
plt.xlim(0, 20)
plt.ylim(0, 20)
plt.xlabel('actual', fontsize=12)
plt.ylabel('predicted', fontsize=12)
plt.title('CatBoost (MAE loss)', fontsize=12)
# Reference line y = x. x and y are passed as flat sequences so exactly one
# line is drawn; lists of (x, y) tuples would be read as 2-D arrays and
# plotted column by column, giving two overlapping lines.
plt.plot([0, 20], [0, 20], color='C1')
plt.show()

In [154]:
# Mean absolute error of the current predictions against the training targets
# (the same segments the model was fitted on), printed to three decimals.
y_true = y_train.values.ravel()
score = mean_absolute_error(y_true=y_true, y_pred=y_pred)
print(f'Score: {score:0.3f}')

# Step 4 - CatBoost with Root Mean Squared Error

In [155]:
# Fit a CatBoost regressor with the RMSE objective, then predict on the same
# scaled training features it was fitted on.
m = CatBoostRegressor(loss_function='RMSE')
m.fit(X=X_train_scaled, y=y_train.values.ravel(), silent=True)
y_pred = m.predict(X_train_scaled)

In [156]:
# Predicted vs. actual time to failure for the model fitted in the previous
# cell (loss_function='RMSE'); points on the diagonal are perfect predictions.
plt.figure(figsize=(6, 6))
plt.scatter(y_train.values.flatten(), y_pred)
plt.xlim(0, 20)
plt.ylim(0, 20)
plt.xlabel('actual', fontsize=12)
plt.ylabel('predicted', fontsize=12)
plt.title('CatBoost (RMSE loss)', fontsize=12)
# Reference line y = x. x and y are passed as flat sequences so exactly one
# line is drawn; lists of (x, y) tuples would be read as 2-D arrays and
# plotted column by column, giving two overlapping lines.
plt.plot([0, 20], [0, 20], color='C1')
plt.show()

In [157]:
# Mean absolute error of the current predictions against the training targets
# (the same segments the model was fitted on), printed to three decimals.
y_true = y_train.values.ravel()
score = mean_absolute_error(y_true=y_true, y_pred=y_pred)
print(f'Score: {score:0.3f}')

# Step 5 - CatBoost with Mean Absolute Percentage Error

In [158]:
# Fit a CatBoost regressor with the MAPE objective, then predict on the same
# scaled training features it was fitted on.
m = CatBoostRegressor(loss_function='MAPE')
m.fit(X=X_train_scaled, y=y_train.values.ravel(), silent=True)
y_pred = m.predict(X_train_scaled)

In [159]:
# Predicted vs. actual time to failure for the model fitted in the previous
# cell (loss_function='MAPE'); points on the diagonal are perfect predictions.
plt.figure(figsize=(6, 6))
plt.scatter(y_train.values.flatten(), y_pred)
plt.xlim(0, 20)
plt.ylim(0, 20)
plt.xlabel('actual', fontsize=12)
plt.ylabel('predicted', fontsize=12)
plt.title('CatBoost (MAPE loss)', fontsize=12)
# Reference line y = x. x and y are passed as flat sequences so exactly one
# line is drawn; lists of (x, y) tuples would be read as 2-D arrays and
# plotted column by column, giving two overlapping lines.
plt.plot([0, 20], [0, 20], color='C1')
plt.show()

In [160]:
# Mean absolute error of the current predictions against the training targets
# (the same segments the model was fitted on), printed to three decimals.
y_true = y_train.values.ravel()
score = mean_absolute_error(y_true=y_true, y_pred=y_pred)
print(f'Score: {score:0.3f}')