In [1]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import os
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
import matplotlib.pyplot as plt

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,CuDNNLSTM,BatchNormalization,Dropout
from tensorflow.keras.optimizers import Adam,Adadelta
import xgboost

In [5]:
def generate_features(X):
    strain=[]
    strain.append(X.mean())
    strain.append(X.std())
    strain.append(X.min())
    strain.append(X.kurtosis())
    strain.append(X.skew())
    strain.append(np.quantile(X,0.01))
    strain.append(np.quantile(X,0.05))
    strain.append(np.quantile(X,0.95))
    strain.append(np.quantile(X,0.99))
    strain.append(np.abs(X).max())
    strain.append(np.abs(X).mean())
    strain.append(np.abs(X).std())
    return pd.Series(strain)


In [None]:
train = pd.read_csv('train.csv',chunksize=150000,dtype={'acoustic_data':np.int16,'time_to_failure':np.float64})
X_train_1 = pd.DataFrame()
y_train = pd.Series()
X_train_2 = pd.DataFrame()

for df in train:
    X_train_2 = X_train_2.append(pd.Series(df['acoustic_data'].values),ignore_index=True)
    y_train = y_train.append(pd.Series(df['time_to_failure'].values[-1]))
    ch = generate_features(df['acoustic_data'])
    X_train_1 = X_train_1.append(ch, ignore_index=True)
X_train_2=X_train_2.drop(4194,0)
A=y_train.to_frame('labels')
A.reset_index(drop=True,inplace=True)
A.drop(4194,0,inplace=True)
A.to_csv('labels.csv')
X_train_1.drop(4194,0,inplace=True)
X_train_1.to_csv('Statistic_for_each_series.csv')
X_train_2.to_csv('Serialized_data_01.csv',)
y_train_1=A.copy()

In [6]:
X_train_1=pd.read_csv('Statistic_for_each_series.csv')
X_train_2=pd.read_csv('Serialized_data_01.csv')

In [8]:
y_train_1=pd.read_csv('labels.csv')

In [10]:
y_train_1=y_train_1.labels

In [7]:
X_test = pd.DataFrame()
for i in os.listdir('test'):
    test=pd.read_csv("test/"+i)
    ch = generate_features(test['acoustic_data'])
    X_test = X_test.append(ch, ignore_index=True)

In [None]:
pca = PCA(n_components=2000)
pca.fit(X_train_2)

In [None]:
new_data=pca.transform(X_train_2)

# XGBRegressor

In [12]:
xg=xgboost.XGBRegressor()
parameters = {'max_depth': [3,4,5,6],'min_child_weight':[1, 2, 3, 4, 5, 6],
              'n_estimators':[100,200,300,400],
              'reg_alpha': [0.05, 0.1, 1, 2, 3],
              'reg_lambda': [0.05, 0.1, 1, 2, 3]}
clf = GridSearchCV(xg, parameters, cv=5,verbose=1,n_jobs=3)
clf.fit(X_train_1,y_train_1)

Fitting 5 folds for each of 2400 candidates, totalling 12000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    6.6s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   29.6s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  1.9min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:  3.2min
[Parallel(n_jobs=3)]: Done 1244 tasks      | elapsed:  5.3min
[Parallel(n_jobs=3)]: Done 1794 tasks      | elapsed:  7.9min
[Parallel(n_jobs=3)]: Done 2444 tasks      | elapsed: 11.2min
[Parallel(n_jobs=3)]: Done 3194 tasks      | elapsed: 14.6min
[Parallel(n_jobs=3)]: Done 4044 tasks      | elapsed: 19.8min
[Parallel(n_jobs=3)]: Done 4994 tasks      | elapsed: 25.4min
[Parallel(n_jobs=3)]: Done 6044 tasks      | elapsed: 31.3min
[Parallel(n_jobs=3)]: Done 7194 tasks      | elapsed: 38.9min
[Parallel(n_jobs=3)]: Done 8444 tasks      | elapsed: 47.8min
[Parallel(n_jobs=3)]: Done 9794 tasks      | elapsed: 57.5min
[Parallel(n_jobs=3)]: Done 11244 tasks      | elapsed: 69.2mi

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1),
       fit_params=None, iid='warn', n_jobs=3,
       param_grid={'max_depth': [3, 4, 5, 6], 'min_child_weight': [1, 2, 3, 4, 5, 6], 'n_estimators': [100, 200, 300, 400], 'reg_alpha': [0.05, 0.1, 1, 2, 3], 'reg_lambda': [0.05, 0.1, 1, 2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [13]:
clf.best_params_

{'max_depth': 3,
 'min_child_weight': 6,
 'n_estimators': 100,
 'reg_alpha': 2,
 'reg_lambda': 3}

In [14]:
xg=xgboost.XGBRegressor(**clf.best_params_)

In [15]:
xg.fit(X_train_1,y_train_1)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=6, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=2,
       reg_lambda=3, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [16]:
xg.score(X_train_1,y_train_1)

0.8830470395383401

In [19]:
X_train_1

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0,4.884113,5.101106,-98.0,33.662481,-0.024061,-8.00,-2.0,11.0,18.00,104.0,5.576567,4.333325
1,1,4.725767,6.588824,-154.0,98.758517,0.390561,-11.00,-2.0,12.0,21.00,181.0,5.734167,5.732777
2,2,4.906393,6.967397,-106.0,33.555211,0.217391,-15.00,-3.0,13.0,26.00,140.0,6.152647,5.895945
3,3,4.902240,6.922305,-199.0,116.548172,0.757278,-12.00,-2.0,12.0,22.00,199.0,5.933960,6.061214
4,4,4.908720,7.301110,-126.0,52.977905,0.064531,-15.00,-2.0,12.0,26.00,145.0,6.110587,6.329485
5,5,4.913513,5.434111,-144.0,50.215147,-0.100697,-10.00,-2.0,12.0,19.00,144.0,5.695167,4.608383
6,6,4.855660,5.687823,-78.0,23.173004,0.208810,-12.00,-2.0,12.0,21.00,120.0,5.791007,4.732118
7,7,4.505427,5.854512,-134.0,52.388738,-0.176333,-11.00,-2.0,11.0,20.00,139.0,5.415000,5.025126
8,8,4.717833,7.789643,-156.0,65.360261,-0.160166,-16.00,-3.0,13.0,26.00,168.0,6.152273,6.714605
9,9,4.730960,6.890459,-126.0,53.760207,0.150779,-14.00,-3.0,12.0,23.00,152.0,5.925120,5.895191


In [17]:
mean_squared_error(xg.predict(X_train_1),y_train_1)

1.5775522738362344

In [18]:
pred=xg.predict(X_test)

ValueError: feature_names mismatch: ['Unnamed: 0', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'] ['0 ', '1 ', '2 ', '3 ', '4 ', '5 ', '6 ', '7 ', '8 ', '9 ', '10', '11']
expected 2, 0, 4, 1, 8, 9, 5, 3, Unnamed: 0, 7, 6 in input data
training data did not have the following fields: 8 , 7 , 1 , 2 , 4 , 3 , 0 , 9 , 5 , 6 

In [None]:
sub=pd.read_csv('sample_submission.csv')
sub['seg_id']=[i.split('.')[0] for i in os.listdir('test')]
sub.time_to_failure=pred
sub.to_csv('submission.csv',index=False)

# XGB Regressor on preprocessed serizlied data

In [None]:
def serialized_mean(data):
    mean_series=data.values.reshape(-1,30).mean(1)
    return mean_series

In [None]:
train_data=[]
for i in X_train_2.values.reshape(4194,-1,30):
    temp = i.reshape(-1,30).mean(1)
    train_data.append(temp)

In [None]:
train_data=np.asarray(train_data)

In [None]:
xg_2=xgboost.XGBRegressor(n_estimators=400,max_depth=3,min_child_weight=5)
parameters = {'reg_alpha': [0.05, 0.1, 1, 2, 3], 'reg_lambda': [0.05, 0.1, 1, 2, 3]}
clf = GridSearchCV(xg_2, parameters, cv=5,verbose=1,n_jobs=4)
clf.fit(train_data,y_train_1)

In [None]:
clf.best_params_

In [None]:
xg_2=xgboost.XGBRegressor(n_estimators=400,max_depth=3,min_child_weight=5,reg_alpha=1,reg_lambda=3)
xg_2.fit(train_data,y_train_1)

In [None]:
xg_2.score(train_data,y_train_1)

In [None]:
xg_2.score(X_preprocess,y_train_1)

In [None]:
mean_squared_error(train_data,y_train_1)

In [None]:
mean_squared_error(xg_2.predict(X_preprocess),y_train_1)

In [None]:
X_test_2={}
for i in os.listdir('test'):
    key=i.split('.')[0]
    test=pd.read_csv("test/"+i)
    ch = serialized_mean(test)
    X_test_2[key]=ch

In [None]:
new_df=pd.DataFrame.from_dict(X_test_2,orient='index')

In [None]:
new_df['time_to_failure']=xg_2.predict(new_df.values)

In [None]:
final_df=new_df[['time_to_failure']]

In [None]:
final_df.index.name='seg_id'

In [None]:
final_df.to_csv('submission_5.csv')

In [None]:
temp_df=pd.read_csv('submission_4.csv')

In [None]:
a=temp_df.time_to_failure.values
b=final_df.time_to_failure.values

In [None]:
test_pre=xg_2.predict(pd.DataFrame(X_preprocess_test))

In [None]:
sub=pd.read_csv('sample_submission.csv')
sub['seg_id']=[i.split('.')[0] for i in os.listdir('test')]
sub.time_to_failure=test_pre
sub.to_csv('submission_3.csv',index=False)

# RandomForestry Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
regr = RandomForestRegressor(random_state=42)
parameters = {'max_depth': [2,3,4], 'n_estimators': [500,600]}
clf = GridSearchCV(regr, parameters, cv=5,verbose=1,n_jobs=4)
clf.fit(X_train_1,y_train_1)

In [None]:
regr = RandomForestRegressor(random_state=42,max_depth= 2,n_estimators=500)
regr.fit(X_train_1,y_train_1.values.reshape(-1))

In [None]:
rf_pre_train_1=regr.predict(X_train_1)

# RandomForestry for Serialized

In [None]:
regr = RandomForestRegressor(random_state=42,max_features='sqrt')
parameters = {'max_depth': [2,3,4], 'n_estimators': [500,600]}
clf = GridSearchCV(regr, parameters, cv=5,verbose=1,n_jobs=4)
clf.fit(train_data,y_train_1)

# LSTM Model

In [None]:
model=Sequential()
model.add(Dense(64,input_dim=(X_train_1.shape[1]),activation='tanh'))
model.add(Dense(128,activation='tanh'))
model.add(Dense(128,activation='tanh'))
model.add(Dense(64,activation='relu'))
model.add(Dense(1))
model.compile(loss="mse", optimizer=Adam(lr=1e-3,decay=1e-6))


In [None]:
hist=model.fit(X_train_1,y_train_1,epochs=30,batch_size=50)

In [None]:
prediction=model.predict(X_train_1)

In [None]:
mean_squared_error(prediction,y_train_1)