In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the raw pump sensor readings and preview the first rows.
sensor_csv = '../input/pump-sensor-data/sensor.csv'
data = pd.read_csv(sensor_csv)
data.head()

In [None]:
# Remove the 'Unnamed: 0' column and the sensor features that are not used downstream.
unwanted_columns = [
    'Unnamed: 0',
    'sensor_01', 'sensor_03', 'sensor_14', 'sensor_15', 'sensor_16',
    'sensor_17', 'sensor_18', 'sensor_19', 'sensor_20', 'sensor_21',
    'sensor_22', 'sensor_23', 'sensor_24', 'sensor_25', 'sensor_26',
    'sensor_27', 'sensor_28', 'sensor_29', 'sensor_30', 'sensor_31',
    'sensor_33', 'sensor_34', 'sensor_36', 'sensor_37', 'sensor_48',
]
data.drop(columns=unwanted_columns, inplace=True)

In [None]:
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle
from sklearn.metrics import roc_auc_score

In [None]:
# Impute missing values with per-column means computed on the first 50% of the
# rows only, so statistics from the later rows are not used for filling.
n_fit_rows = int((data.shape[0]*50)/100)
# Every column except the first and the last one
# (presumably 'timestamp' and 'machine_status' -- confirm against the CSV layout).
sensor_cols = data.columns[1:-1]
mean = data.iloc[0:n_fit_rows][sensor_cols].mean().values
for col, fill_value in tqdm(zip(sensor_cols, mean), total=len(sensor_cols)):
    data[col] = data[col].fillna(fill_value)
# Persist the fill values. The context manager closes (and flushes) the handle,
# which the previous bare open() call never did. Passing a file object rather
# than a path keeps the file name exactly 'NA_replace' (no '.npy' suffix added).
with open('NA_replace','wb') as na_file:
    np.save(na_file, mean)

In [None]:
def map_fun(x):
    """Encode a machine status value as a binary label.

    Returns 0 when ``x`` equals the string 'NORMAL' and 1 for any other value.
    """
    return 0 if x=='NORMAL' else 1

# Encode the status column as a binary target with map_fun (0 for 'NORMAL', 1 otherwise).
# Guard against re-running this cell: once the column is numeric, mapping it again
# would turn every value into 1, because no number compares equal to 'NORMAL'.
if not pd.api.types.is_numeric_dtype(data['machine_status']):
    data['machine_status']=data['machine_status'].map(map_fun)

In [None]:
# Names for the rolling-statistic features: five per input column, in the
# order median, mean, std, min, max, followed by the label column.
stat_templates = ('s{0}_median', 's{0}_mean', 's{0}_std', 's{0}_min', 's{0}_max')
columns = [
    template.format(col[7:])  # col[7:] drops the first 7 characters to select the sensor number
    for col in data.columns[1:-1]
    for template in stat_templates
]
columns.append('machine_status')

In [None]:
# Show the generated feature-column names (ending with 'machine_status') for inspection.
columns

In [None]:
w=10  # rolling window length, in rows

# One (n_rows, k) array per column of `data` after the first:
# k=5 rolling statistics for a feature column, k=1 for the label column.
X = []
for name in data.columns[1:]:
    windowed = data[name].rolling(w)
    if name == 'machine_status':
        # Class label: the rolling max flags the whole window as a failure
        # window if even a single row inside it is a failure.
        stats = (windowed.max(),)
    else:
        # Median, mean, std, min and max of each window, in that order.
        stats = (windowed.median(), windowed.mean(), windowed.std(),
                 windowed.min(), windowed.max())
    X.append(np.hstack([np.array(stat).reshape(-1,1) for stat in stats]))

In [None]:
# Join the per-column feature blocks side by side in a single call; stacking
# them pairwise in a loop re-copies the accumulated array on every iteration.
temp_data = np.hstack(X)

# Column order follows the order in which the blocks were appended to X,
# which is the order the names in `columns` were generated in.
data_df = pd.DataFrame(temp_data, columns=columns)
data_df

In [None]:
# Re-align features, labels and timestamps so that every feature row is paired
# with a label taken 2*w rows further down (a later window).
# NOTE(review): this cell rebinds data_df from itself, so running it a second
# time trims and shifts the frame again -- run it exactly once per kernel.
# Keep index labels w-1 onward: with rolling(w) the first w-1 rows are NaN
# (assumes data_df still has its default 0..N-1 integer index).
data_df=data_df.loc[w-1:]
# Labels starting 2*w positions into the trimmed frame, i.e. row k will
# receive the label that currently sits at row k + 2*w.
temp1=data_df['machine_status'].iloc[w+w:].values
# Timestamps from the raw frame, positions w up to (not including) len(data)-(2*w-1).
# NOTE(review): the first kept feature row has index w-1 but gets the timestamp
# of raw row w, i.e. one row after its window ends -- confirm this is intended.
temp2=data['timestamp'].iloc[w:-(w+w-1)].values
# Drop the last 2*w rows, which have no label 2*w rows ahead; .copy() makes an
# independent frame so the column assignments below do not write into a slice.
data_df=data_df.iloc[:-(w+w)].copy()
# All three pieces have len(data) - 3*w + 1 rows when data_df has one row per row of data.
data_df['machine_status']=temp1
data_df['timestamp']=temp2


In [None]:
# Show the feature frame after the label/timestamp re-alignment.
data_df

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm
from sklearn.metrics import  recall_score, confusion_matrix

In [None]:
# Positional split in row order: first 50% train, next 25% cv, last 25% test.
X = data_df
n_rows = X.shape[0]
train_end = int((n_rows*50)/100)
cv_end = int((n_rows*75)/100)
X_train = X.iloc[0:train_end]
X_cv = X.iloc[train_end:cv_end]
X_test = X.iloc[cv_end:]
for heading, part in (('train shape:', X_train), ('cv shape:', X_cv), ('test shape:', X_test)):
    print(heading, part.shape)

In [None]:
def _failures_grouped_by_date(frame):
    """Return the failure rows of ``frame`` and a groupby of them by calendar date.

    Failure rows are those with machine_status == 1; their 'timestamp' column is
    converted to datetimes on a copy, so ``frame`` itself is left untouched.
    """
    failures = frame[frame['machine_status']==1].copy()
    failures['timestamp'] = pd.to_datetime(failures['timestamp'])
    return failures, failures.groupby(by=failures['timestamp'].dt.date)

# Dates on which the train split contains failure windows.
X_train_fail, g1 = _failures_grouped_by_date(X_train)
print('train failure dates:\n',g1.groups.keys())

# Dates on which the cv split contains failure windows.
X_cv_fail, g2 = _failures_grouped_by_date(X_cv)
print('cv failure dates:\n',g2.groups.keys())

In [None]:
# Separate the target column from the features for each split.
# The right-hand side of each line is evaluated before the names are rebound,
# so the label is read from the frame that still contains it.
target_col = 'machine_status'
y_train, X_train = X_train[target_col], X_train.drop(columns=[target_col])
y_cv, X_cv = X_cv[target_col], X_cv.drop(columns=[target_col])
y_test, X_test = X_test[target_col], X_test.drop(columns=[target_col])

In [None]:
# Min-max scale the feature columns using statistics learned from the training
# split only; cv/test are transformed with the same parameters, so their values
# may fall outside [0, 1].
# A single MinMaxScaler scales each column independently, giving the same values
# as one scaler per column, while leaving one fitted object that can be reused
# (the per-column loop discarded every scaler except the last).
# The last column is skipped, as before (presumably 'timestamp' -- confirm).
feature_cols = list(X_train.columns[0:-1])
scaler = MinMaxScaler()
X_train[feature_cols] = scaler.fit_transform(X_train[feature_cols].to_numpy())
X_cv[feature_cols] = scaler.transform(X_cv[feature_cols].to_numpy())
X_test[feature_cols] = scaler.transform(X_test[feature_cols].to_numpy())

In [None]:
def plot_confusion_matrix(test_y, predict_y):
    """Plot the confusion matrix and the recall matrix as two heatmaps.

    Parameters
    ----------
    test_y : true class labels.
    predict_y : predicted class labels, aligned with ``test_y``.

    Both matrices use the fixed label order [0, 1]. The recall matrix is the
    confusion matrix with every row divided by its row sum, so each row sums
    to 1 (a row whose sum is zero yields NaN entries). Returns None; the
    figures are shown via ``plt.show()``.
    """
    labels = [0,1]
    C = confusion_matrix(test_y, predict_y,labels=labels)
    # Row-normalise: divide each row by the number of samples of that true class.
    A = C / C.sum(axis=1, keepdims=True)

    # Draw both matrices with identical heatmap settings, one figure each.
    for heading, matrix in (("Confusion matrix", C), ("Recall matrix (Row sum=1)", A)):
        print("-"*20, heading, "-"*20)
        plt.figure(figsize=(5,4))
        sns.heatmap(matrix, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
        plt.xlabel('Predicted Class')
        plt.ylabel('Original Class')
        plt.show()

In [None]:
# Train a random forest on the training split and report confusion/recall
# matrices followed by ROC AUC for the train, cv and test splits.
feature_cols = X_train.columns[0:-1]  # every column except the last one
clf = RandomForestClassifier(max_features='sqrt',n_jobs=-1,random_state=1)
clf.fit(X_train[feature_cols], y_train)

evaluation_sets = (
    ('train confusion and recall matrix:\n', 'train AUC score:', X_train, y_train),
    ('cv confusion and recall matrix:\n', 'cv AUC score:', X_cv, y_cv),
    ('test confusion and recall matrix:\n', 'test AUC score:', X_test, y_test),
)

# All confusion/recall plots first ...
for matrix_heading, auc_heading, features, target in evaluation_sets:
    print(matrix_heading)
    plot_confusion_matrix(target, clf.predict(features[feature_cols]))

# ... then the AUC scores, computed from the predicted probability of class 1.
for matrix_heading, auc_heading, features, target in evaluation_sets:
    print(auc_heading, roc_auc_score(target, clf.predict_proba(features[feature_cols])[:,1]))