In [1]:
from dask.distributed import Client
import dask.dataframe as dd
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
pd.options.display.precision = 15

In [3]:
client = Client(n_workers=11, threads_per_worker=1, processes=True, memory_limit='2GB', ip= '0.0.0.0')

In [4]:
client

0,1
Client  Scheduler: tcp://192.168.3.10:44057  Dashboard: http://192.168.3.10:8787/status,Cluster  Workers: 11  Cores: 11  Memory: 22.00 GB


In [5]:
train = dd.read_csv('train_csv/*', dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})

In [6]:
train_head = train.head(20000000)

In [7]:
# Create a training file with simple derived features
def get_train_data(df):
    rows = 150_000
    slide_rows = 2000
    segments = int(np.floor((df.shape[0]-rows+slide_rows) / slide_rows))

    X_train = pd.DataFrame(index=range(segments), dtype=np.float64,
                           columns=['ave', 'std', 'max', 'min', 'fft_ave', 'fft_std', 'fft_max', 'fft_min'])
    y_train = pd.DataFrame(index=range(segments), dtype=np.float64,
                           columns=['time_to_failure'])
    
    

    for segment in range(segments):
        seg = df.iloc[segment*slide_rows:segment*slide_rows+rows]
        x = seg['acoustic_data'].values
        y = seg['time_to_failure'].values[-1]

        y_train.loc[segment, 'time_to_failure'] = y

        X_train.loc[segment, 'ave'] = x.mean()
        X_train.loc[segment, 'std'] = x.std()
        X_train.loc[segment, 'max'] = x.max()
        X_train.loc[segment, 'min'] = x.min()
        fft_val = np.abs(np.fft.fft(x))[1:]
        X_train.loc[segment, 'fft_ave'] = fft_val.mean()
        X_train.loc[segment, 'fft_std'] = fft_val.std()
        X_train.loc[segment, 'fft_max'] = fft_val.max()
        X_train.loc[segment, 'fft_min'] = fft_val.min()
    X_train['label'] = y_train['time_to_failure']
    return X_train

In [8]:
train_data = train.map_partitions(get_train_data).compute()

In [9]:
X_train, X_test, y_train, y_test = train_test_split(train_data.drop(columns='label'), train_data['label'], test_size=0.2, random_state=42)

In [10]:
from xgboost import XGBModel

In [11]:
def fair_obj(preds, dtrain):
    """y = c * abs(x) - c * np.log(abs(abs(x) + c))"""
    x = dtrain - preds
    c = 1
    den = abs(x) + c
    grad = c*x / den
    hess = c*c / den ** 2
    return grad, hess
params = {'max_depth': 11,
          'learning_rate': 0.3, 
          'n_estimators': 2000,
#           'objective': fair_obj
         }
xgb_model = XGBModel(**params)

In [12]:
xgb_model.fit(X_train, y_train)

XGBModel(base_score=0.5, booster='gbtree', colsample_bylevel=1,
     colsample_bytree=1, gamma=0, learning_rate=0.3, max_delta_step=0,
     max_depth=11, min_child_weight=1, missing=None, n_estimators=2000,
     n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
     reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
     subsample=1)

In [13]:
y_pred = xgb_model.predict(X_test)
mean_absolute_error(y_pred=y_pred, y_true=y_test)

0.37758047622512986

In [14]:
submission = pd.read_csv('sample_submission.csv')

In [15]:
test = pd.DataFrame(columns=X_train.columns, dtype=np.float64, index=submission.seg_id)
for seg_id in test.index:
    seg = pd.read_csv('test/' + seg_id + '.csv')
    
    x = seg['acoustic_data'].values
    
    test.loc[seg_id, 'ave'] = x.mean()
    test.loc[seg_id, 'std'] = x.std()
    test.loc[seg_id, 'max'] = x.max()
    test.loc[seg_id, 'min'] = x.min()
    fft_val = np.abs(np.fft.fft(x))[1:]
    test.loc[seg_id, 'fft_ave'] = fft_val.mean()
    test.loc[seg_id, 'fft_std'] = fft_val.std()
    test.loc[seg_id, 'fft_max'] = fft_val.max()
    test.loc[seg_id, 'fft_min'] = fft_val.min()

In [16]:
submission['time_to_failure'] = xgb_model.predict(test)

In [17]:
submission.set_index('seg_id').to_csv('submission.csv')

In [18]:
train_head = train.head(4000000)

In [19]:
rows = 150000
sample = train_head.iloc[18888:18888+rows]['acoustic_data']