In [1]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import os
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
import matplotlib.pyplot as plt

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,CuDNNLSTM,BatchNormalization,Dropout
from tensorflow.keras.optimizers import Adam,Adadelta
import xgboost

In [5]:
def generate_features(X):
    """Summarise one signal segment as a fixed-length vector of 12 statistics.

    Parameters
    ----------
    X : pandas.Series
        Numeric samples of a single segment (``kurtosis`` and ``skew`` are
        called as Series methods).

    Returns
    -------
    pandas.Series
        In order: mean, std, min, kurtosis, skew, the 1% / 5% / 95% / 99%
        quantiles, then max, mean and std of the absolute values.
    """
    magnitude = np.abs(X)
    tail_quantiles = [np.quantile(X, q) for q in (0.01, 0.05, 0.95, 0.99)]

    stats = [X.mean(), X.std(), X.min(), X.kurtosis(), X.skew()]
    stats.extend(tail_quantiles)
    stats.extend([magnitude.max(), magnitude.mean(), magnitude.std()])
    return pd.Series(stats)


In [None]:
# Build the training tables: one row per complete 150,000-sample chunk of
# train.csv.
CHUNK_ROWS = 150000
train = pd.read_csv('train.csv',chunksize=CHUNK_ROWS,dtype={'acoustic_data':np.int16,'time_to_failure':np.float64})

# Per-chunk results are collected in plain lists and turned into frames once
# at the end. Growing a DataFrame/Series with .append() re-copies everything
# on every iteration, and those methods were removed in pandas 2.0.
stat_rows = []
raw_rows = []
labels = []
for df in train:
    # The last chunk of the file may hold fewer than CHUNK_ROWS samples.
    # Skip it here instead of dropping a hard-coded row index afterwards.
    if len(df) < CHUNK_ROWS:
        continue
    raw_rows.append(df['acoustic_data'].values)
    # The label of a chunk is the time_to_failure at its last sample.
    labels.append(df['time_to_failure'].values[-1])
    stat_rows.append(generate_features(df['acoustic_data']))

# X_train_1: summary statistics per chunk; X_train_2: raw samples per chunk.
X_train_1 = pd.DataFrame(stat_rows).reset_index(drop=True)
X_train_2 = pd.DataFrame(np.stack(raw_rows))
y_train = pd.Series(labels, dtype=np.float64)
A = y_train.to_frame('labels')

# Cache the three tables on disk. The row index is written as the first
# column of each file (to_csv default).
A.to_csv('labels.csv')
X_train_1.to_csv('Statistic_for_each_series.csv')
X_train_2.to_csv('Serialized_data_01.csv')
y_train_1 = A.copy()

In [None]:
# Reload the cached feature tables. Both files were saved with the row index
# as their first column, so index_col=0 restores it as the index instead of
# letting it leak into the features as an extra "Unnamed: 0" column.
X_train_1=pd.read_csv('Statistic_for_each_series.csv',index_col=0)
X_train_2=pd.read_csv('Serialized_data_01.csv',index_col=0)

In [None]:
# Reload the cached labels. labels.csv was saved with the row index as its
# first column; index_col=0 keeps y_train_1 a single-column ('labels') frame
# rather than adding an "Unnamed: 0" column next to the target.
y_train_1=pd.read_csv('labels.csv',index_col=0)

In [None]:
# Compute the summary statistics for every segment file found in test/.
# Rows are gathered in a list and turned into a frame once: DataFrame.append
# re-copied the frame on each iteration and no longer exists in pandas 2.0.
test_rows = []
for i in os.listdir('test'):
    test = pd.read_csv(os.path.join('test', i))
    test_rows.append(generate_features(test['acoustic_data']))
# One row per file, in os.listdir order, with a plain 0..n-1 index.
X_test = pd.DataFrame(test_rows).reset_index(drop=True)

In [None]:
# Fit a 2000-component PCA on the raw per-chunk signal table X_train_2.
pca = PCA(n_components=2000)
pca.fit(X_train_2)

In [None]:
# Project the raw signal table onto the fitted principal components.
# NOTE(review): new_data is not referenced by any later visible cell.
new_data=pca.transform(X_train_2)

# XGBRegressor

In [None]:
# 5-fold grid search over tree depth and minimum child weight for an
# XGBoost regressor trained on the summary-statistic features.
xg = xgboost.XGBRegressor(n_estimators=100, max_depth=3, min_child_weight=4)
parameters = {
    'max_depth': [3, 4, 5, 6],
    'min_child_weight': [1, 2, 3, 4, 5, 6],
}
clf = GridSearchCV(estimator=xg, param_grid=parameters, cv=5, verbose=1, n_jobs=3)
clf.fit(X_train_1, y_train_1)

In [None]:
# Re-create the regressor with fixed hyper-parameters and fit it on all
# training rows of the summary-statistic features.
xg=xgboost.XGBRegressor(n_estimators=100,max_depth=3,min_child_weight=4)
xg.fit(X_train_1,y_train_1)

In [None]:
# Score of the fitted model on the same data it was trained on (in-sample,
# so not an estimate of generalisation).
xg.score(X_train_1,y_train_1)

In [None]:
# In-sample mean squared error. Arguments follow the documented
# (y_true, y_pred) order; the original call had them swapped.
mean_squared_error(y_train_1,xg.predict(X_train_1))

In [None]:
# Predict time_to_failure for the test-segment summary features.
pred=xg.predict(X_test)

In [None]:
# Write the predictions in the layout of sample_submission.csv.
sub=pd.read_csv('sample_submission.csv')
# Segment id = file name up to the first dot. This assumes os.listdir('test')
# returns the files in the same order as when the test features were built.
sub['seg_id']=[i.split('.')[0] for i in os.listdir('test')]
# Item assignment always targets the column; attribute assignment
# (sub.time_to_failure = ...) would silently set a plain Python attribute
# if the column were ever missing from the template.
sub['time_to_failure']=pred
sub.to_csv('submission.csv',index=False)

# XGB Regressor on preprocessed serialized data

In [None]:
def serialized_mean(data):
    """Average consecutive groups of 30 values of ``data``.

    ``data.values`` is laid out as rows of 30 elements (its total size must
    be a multiple of 30) and the mean of each row is returned as a 1-D
    numpy array.
    """
    windows = np.reshape(data.values, (-1, 30))
    return np.mean(windows, axis=1)

In [None]:
# Down-sample every training row by averaging consecutive windows of 30
# samples. The row count is taken from the data instead of the hard-coded
# 4194, and the per-row Python loop is replaced by one vectorised reshape:
# (rows, windows, 30) averaged over the last axis.
n_rows = X_train_2.shape[0]
# Kept as a list of 1-D arrays (one per row), matching what the loop built.
train_data = list(X_train_2.values.reshape(n_rows, -1, 30).mean(axis=2))

In [None]:
# Stack the per-row window means into a single 2-D array.
train_data=np.asarray(train_data)

In [None]:
# 5-fold grid search over the L1 (reg_alpha) and L2 (reg_lambda)
# regularisation strengths of an XGBoost regressor trained on the
# down-sampled signal rows.
xg_2=xgboost.XGBRegressor(n_estimators=400,max_depth=3,min_child_weight=5)
parameters = {'reg_alpha': [0.05, 0.1, 1, 2, 3], 'reg_lambda': [0.05, 0.1, 1, 2, 3]}
clf = GridSearchCV(xg_2, parameters, cv=5,verbose=1,n_jobs=4)
clf.fit(train_data,y_train_1)

In [None]:
# Show the parameter combination selected by the grid search.
clf.best_params_

In [None]:
# Refit on all training rows with fixed regularisation values.
# NOTE(review): reg_alpha=1 / reg_lambda=3 are presumably the best_params_
# found by the search above — confirm.
xg_2=xgboost.XGBRegressor(n_estimators=400,max_depth=3,min_child_weight=5,reg_alpha=1,reg_lambda=3)
xg_2.fit(train_data,y_train_1)

In [None]:
# Score of xg_2 on the data it was trained on (in-sample).
xg_2.score(train_data,y_train_1)

In [None]:
# NOTE(review): X_preprocess is not defined in any visible cell, so unless it
# is created elsewhere this raises NameError on a fresh kernel. Presumably an
# earlier name for train_data — confirm, then rename or delete this cell.
xg_2.score(X_preprocess,y_train_1)

In [None]:
# In-sample mean squared error of xg_2. The error must be computed between
# the labels and the model's predictions; the original call passed the raw
# feature matrix (train_data) in place of the predictions.
mean_squared_error(y_train_1,xg_2.predict(train_data))

In [None]:
# NOTE(review): X_preprocess is not defined in any visible cell (NameError on
# a fresh kernel unless it is created elsewhere). Presumably an earlier name
# for train_data — confirm.
mean_squared_error(xg_2.predict(X_preprocess),y_train_1)

In [None]:
# Map each test segment id (file name up to the first dot) to its
# window-averaged signal.
X_test_2 = {}
for fname in os.listdir('test'):
    segment = pd.read_csv("test/" + fname)
    X_test_2[fname.split('.')[0]] = serialized_mean(segment)

In [None]:
# One row per test segment: dict keys (segment ids) become the row index.
new_df=pd.DataFrame.from_dict(X_test_2,orient='index')

In [None]:
# Predict from the feature columns only. Selecting them explicitly keeps the
# cell re-runnable: on a second run new_df already holds 'time_to_failure',
# and feeding that column back to the model would change the input width.
feature_cols = [c for c in new_df.columns if c != 'time_to_failure']
new_df['time_to_failure']=xg_2.predict(new_df[feature_cols].values)

In [None]:
# Keep only the prediction column (as a one-column frame) for the submission.
final_df=new_df[['time_to_failure']]

In [None]:
# Name the index so it is written under the 'seg_id' header.
final_df.index.name='seg_id'

In [None]:
# Write the submission; the index (segment ids) is included as first column.
final_df.to_csv('submission_5.csv')

In [None]:
# Load an earlier submission for comparison.
# NOTE(review): submission_4.csv is not written by any visible cell; it must
# already exist on disk for this cell to run.
temp_df=pd.read_csv('submission_4.csv')

In [None]:
# Prediction arrays of the earlier (a) and the current (b) submission.
# NOTE(review): a and b are not used by any later visible cell.
a=temp_df.time_to_failure.values
b=final_df.time_to_failure.values

In [None]:
# NOTE(review): X_preprocess_test is not defined in any visible cell, so
# unless it is created elsewhere this raises NameError on a fresh kernel.
# Presumably an earlier form of the down-sampled test data — confirm.
test_pre=xg_2.predict(pd.DataFrame(X_preprocess_test))

In [None]:
# Write the test_pre predictions in the layout of sample_submission.csv.
sub=pd.read_csv('sample_submission.csv')
# Segment id = file name up to the first dot, in os.listdir('test') order;
# the predictions are assumed to be in that same order.
sub['seg_id']=[i.split('.')[0] for i in os.listdir('test')]
# Item assignment always targets the column; attribute assignment
# (sub.time_to_failure = ...) would silently set a plain Python attribute
# if the column were ever missing from the template.
sub['time_to_failure']=test_pre
sub.to_csv('submission_3.csv',index=False)

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# 5-fold grid search over depth and forest size for a random forest trained
# on the summary-statistic features.
regr = RandomForestRegressor(random_state=42)
parameters = {'max_depth': [2,3,4], 'n_estimators': [500,600]}
clf = GridSearchCV(regr, parameters, cv=5,verbose=1,n_jobs=4)
# Pass the target as the 1-D 'labels' column: handing the whole one-column
# frame to a random forest triggers a column-vector conversion warning on
# every fit, and the later refit cell already flattens the labels.
clf.fit(X_train_1,y_train_1['labels'])

In [None]:
# Refit a random forest with fixed depth and size on all training rows;
# the labels are flattened to a 1-D array before fitting.
regr = RandomForestRegressor(random_state=42,max_depth= 2,n_estimators=500)
regr.fit(X_train_1,y_train_1.values.reshape(-1))

In [None]:
# In-sample predictions of the fitted random forest.
rf_pre_train_1=regr.predict(X_train_1)

# Random Forest on serialized data

In [None]:
# 5-fold grid search over depth and forest size for a random forest trained
# on the down-sampled signal rows; each split considers sqrt(n_features)
# candidate features.
regr = RandomForestRegressor(random_state=42,max_features='sqrt')
parameters = {'max_depth': [2,3,4], 'n_estimators': [500,600]}
clf = GridSearchCV(regr, parameters, cv=5,verbose=1,n_jobs=4)
# Pass the target as the 1-D 'labels' column: handing the whole one-column
# frame to a random forest triggers a column-vector conversion warning on
# every fit.
clf.fit(train_data,y_train_1['labels'])

# Dense neural network model (no LSTM layers are used)

In [None]:
# Fully connected regression network on the summary-statistic features:
# three tanh layers, one relu layer and a single linear output unit,
# trained with mean squared error.
model = Sequential([
    Dense(64, input_dim=X_train_1.shape[1], activation='tanh'),
    Dense(128, activation='tanh'),
    Dense(128, activation='tanh'),
    Dense(64, activation='relu'),
    Dense(1),
])
model.compile(loss="mse", optimizer=Adam(lr=1e-3, decay=1e-6))


In [None]:
# Train for 30 epochs with mini-batches of 50 rows; hist holds the object
# returned by fit().
hist=model.fit(X_train_1,y_train_1,epochs=30,batch_size=50)

In [None]:
# In-sample predictions of the network.
prediction=model.predict(X_train_1)

In [None]:
# In-sample mean squared error of the network. Arguments follow the
# documented (y_true, y_pred) order; the original call had them swapped.
mean_squared_error(y_train_1,prediction)