In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files are present in the input directory
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

In [None]:
import matplotlib.pyplot as plt
from tqdm import tqdm #Makes iterations look better
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Read the training CSV with explicit column dtypes: 16-bit integers for the
# signal column and 64-bit floats for the target column.
train_dtypes = {'acoustic_data': np.int16, 'time_to_failure': np.float64}
train_data = pd.read_csv('../input/train.csv', dtype=train_dtypes)


In [None]:
# Show floats with 10 digits of precision in every frame displayed from here on
pd.set_option('display.precision', 10)

# Preview the first five rows of the training data
train_data.head(5)

In [None]:
# Report how many rows and columns the training data has
print("Rows: {}, Columns: {}".format(*train_data.shape))

In [None]:
from scipy.stats import kurtosis, skew
from statsmodels.robust import mad
# Number of consecutive rows of train_data summarised into one training example.
segment_size = 150000
# Only whole windows are used; any trailing partial window is dropped.
num_segments = train_data.shape[0] // segment_size

# Declare every feature up front so the column set and its order are explicit
# instead of being grown implicitly while the loop runs.
feature_columns = ['ave', 'std', 'min', 'max', 'kurtosis', 'skew', 'mad']

# Pull the raw columns out once; slicing NumPy arrays avoids building a
# temporary DataFrame for every window.
acoustic_values = train_data['acoustic_data'].values
failure_times = train_data['time_to_failure'].values

feature_rows = []
targets = []
for i in tqdm(range(num_segments)):
    start = i * segment_size
    stop = start + segment_size
    x = acoustic_values[start:stop]
    feature_rows.append({
        'ave': x.mean(),
        'std': x.std(),
        'min': x.min(),
        'max': x.max(),
        'kurtosis': kurtosis(x),
        'skew': skew(x),
        'mad': mad(x),
    })
    # The target of a window is the time_to_failure value of its last row.
    targets.append(failure_times[stop - 1])

# Build both frames in one shot (default 0..num_segments-1 index, float64
# columns) rather than writing one scalar at a time with .loc.
X_train = pd.DataFrame(feature_rows, columns=feature_columns, dtype=np.float64)
y_train = pd.DataFrame({'time_to_failure': targets}, dtype=np.float64)
    

In [None]:
# Preview the first rows of the training feature table
X_train.head()

In [None]:
# Preview the first rows of the training target table
y_train.head()

In [None]:
# Print the shapes of the feature and target tables side by side
shape_report = "X_train Shape: {}, y_train Shape: {}".format(
    X_train.shape,
    y_train.shape,
)
print(shape_report)

In [None]:
# Fit the scaler on the training features; fit() returns the estimator itself,
# so the fitted scaler stays available under the same name for later cells.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Display the scaled training features
X_train_scaled

In [None]:
# Create testing data / handle the testing part
from scipy.stats import kurtosis, skew
from statsmodels.robust import mad
# The sample submission is indexed by seg_id; each id names one CSV file
# under ../input/test/.
submission_files = pd.read_csv('../input/sample_submission.csv', index_col='seg_id')

# Compute the same summary statistics as for the training windows, one record
# per test segment, and assemble the frame once at the end instead of writing
# individual cells with .loc.
test_rows = []
for seg_id in tqdm(submission_files.index):
    segment = pd.read_csv('../input/test/' + seg_id + '.csv')
    x = segment['acoustic_data'].values
    test_rows.append({
        'ave': x.mean(),
        'std': x.std(),
        'max': x.max(),
        'min': x.min(),
        'kurtosis': kurtosis(x),
        'skew': skew(x),
        'mad': mad(x),
    })

# Reuse X_train's columns so the test features line up with the training
# features in both name and order.
X_test = pd.DataFrame(
    test_rows,
    index=submission_files.index,
    columns=X_train.columns,
    dtype=np.float64,
)
X_test

In [None]:
# Apply the scaler that was fitted on X_train to the test features
X_test_scaled = scaler.transform(X_test)
# Display the scaled test features
X_test_scaled

In [None]:
# from sklearn.ensemble import RandomForestRegressor
# make_scorer is imported from the public sklearn.metrics namespace; the
# sklearn.metrics.scorer module is private and missing from newer releases.
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

# Least-squares loss is the regressor's default, so `loss` is left unset rather
# than spelled 'ls', a name newer scikit-learn releases no longer accept.
# random_state is fixed so repeated runs give the same scores.
model = GradientBoostingRegressor(learning_rate=0.1, n_estimators=200, random_state=42)
# model = RandomForestRegressor(criterion="mae",n_estimators=100).fit(X_train_scaled,y_train.values.flatten())
# y_predictions = model.predict(X_train_scaled)
# y_predictions

# 10-fold cross-validation scored with mean absolute error; make_scorer keeps
# the metric's sign, so the printed value is the mean MAE across the folds.
cv_mae_scores = cross_val_score(
    model,
    X_train_scaled,
    y_train.values.flatten(),
    cv=10,
    scoring=make_scorer(mean_absolute_error),
)
print(np.mean(cv_mae_scores))

In [None]:
#mean_absolute_error(y_train.values.flatten(),y_predictions)

In [None]:
# y_test_predictions = model.predict(X_test_scaled)
# submission_files['time_to_failure'] = y_test_predictions
# submission_files.to_csv('submission1.csv')